From 97ad92ab2d5be1c8f5126abddd04dfded4a059c5 Mon Sep 17 00:00:00 2001
From: Ritchie Vink
Date: Thu, 26 Oct 2023 07:58:01 +0200
Subject: [PATCH] chore(rust): inline parquet2 (#12026)

---
 .../logical/categorical/string_cache.rs | 3 +-
 crates/polars-core/src/datatypes/aliases.rs | 57 +-
 crates/polars-core/src/datatypes/mod.rs | 1 -
 crates/polars-error/Cargo.toml | 1 -
 crates/polars-error/src/lib.rs | 7 -
 crates/polars-parquet/Cargo.toml | 34 +-
 crates/polars-parquet/LICENSE | 2 +-
 crates/polars-parquet/src/arrow/mod.rs | 2 +-
 .../arrow/read/deserialize/binary/basic.rs | 8 +-
 .../read/deserialize/binary/dictionary.rs | 2 +-
 .../arrow/read/deserialize/binary/nested.rs | 6 +-
 .../arrow/read/deserialize/boolean/basic.rs | 8 +-
 .../arrow/read/deserialize/boolean/nested.rs | 6 +-
 .../arrow/read/deserialize/dictionary/mod.rs | 10 +-
 .../read/deserialize/dictionary/nested.rs | 8 +-
 .../deserialize/fixed_size_binary/basic.rs | 8 +-
 .../fixed_size_binary/dictionary.rs | 2 +-
 .../deserialize/fixed_size_binary/nested.rs | 6 +-
 .../src/arrow/read/deserialize/mod.rs | 4 +-
 .../src/arrow/read/deserialize/nested.rs | 2 +-
 .../arrow/read/deserialize/nested_utils.rs | 6 +-
 .../src/arrow/read/deserialize/null/mod.rs | 12 +-
 .../src/arrow/read/deserialize/null/nested.rs | 2 +-
 .../arrow/read/deserialize/primitive/basic.rs | 10 +-
 .../read/deserialize/primitive/dictionary.rs | 4 +-
 .../read/deserialize/primitive/integer.rs | 12 +-
 .../read/deserialize/primitive/nested.rs | 8 +-
 .../src/arrow/read/deserialize/simple.rs | 8 +-
 .../src/arrow/read/deserialize/utils.rs | 14 +-
 crates/polars-parquet/src/arrow/read/file.rs | 2 +-
 .../src/arrow/read/indexes/binary.rs | 2 +-
 .../src/arrow/read/indexes/boolean.rs | 2 +-
 .../arrow/read/indexes/fixed_len_binary.rs | 2 +-
 .../src/arrow/read/indexes/mod.rs | 22 +-
 .../src/arrow/read/indexes/primitive.rs | 8 +-
 crates/polars-parquet/src/arrow/read/mod.rs | 20 +-
 .../src/arrow/read/row_group.rs | 14 +-
 .../src/arrow/read/schema/convert.rs | 10 +-
 .../src/arrow/read/schema/metadata.rs | 2 +-
 .../src/arrow/read/schema/mod.rs | 4 +-
 .../src/arrow/read/statistics/binary.rs | 3 +-
 .../src/arrow/read/statistics/boolean.rs | 3 +-
 .../src/arrow/read/statistics/fixlen.rs | 2 +-
 .../src/arrow/read/statistics/mod.rs | 13 +-
 .../src/arrow/read/statistics/primitive.rs | 7 +-
 .../src/arrow/read/statistics/utf8.rs | 3 +-
 .../src/arrow/write/binary/basic.rs | 10 +-
 .../src/arrow/write/binary/nested.rs | 6 +-
 .../src/arrow/write/boolean/basic.rs | 14 +-
 .../src/arrow/write/boolean/nested.rs | 6 +-
 .../src/arrow/write/dictionary.rs | 12 +-
 crates/polars-parquet/src/arrow/write/file.rs | 8 +-
 .../src/arrow/write/fixed_len_bytes.rs | 8 +-
 crates/polars-parquet/src/arrow/write/mod.rs | 21 +-
 .../src/arrow/write/nested/mod.rs | 6 +-
 .../polars-parquet/src/arrow/write/pages.rs | 12 +-
 .../src/arrow/write/primitive/basic.rs | 12 +-
 .../src/arrow/write/primitive/nested.rs | 10 +-
 .../src/arrow/write/row_group.rs | 8 +-
 .../polars-parquet/src/arrow/write/schema.rs | 12 +-
 crates/polars-parquet/src/arrow/write/sink.rs | 4 +-
 .../src/arrow/write/utf8/basic.rs | 10 +-
 .../src/arrow/write/utf8/nested.rs | 6 +-
 .../polars-parquet/src/arrow/write/utils.rs | 14 +-
 crates/polars-parquet/src/lib.rs | 1 +
 .../src/parquet/bloom_filter/hash.rs | 17 +
 .../src/parquet/bloom_filter/mod.rs | 71 +
 .../src/parquet/bloom_filter/read.rs | 51 +
 .../src/parquet/bloom_filter/split_block.rs | 82 ++
 .../polars-parquet/src/parquet/compression.rs | 385 ++++++
 .../src/parquet/deserialize/binary.rs | 70 +
.../src/parquet/deserialize/boolean.rs | 39 + .../src/parquet/deserialize/filtered_rle.rs | 274 ++++ .../src/parquet/deserialize/fixed_len.rs | 107 ++ .../src/parquet/deserialize/hybrid_rle.rs | 204 +++ .../src/parquet/deserialize/mod.rs | 17 + .../src/parquet/deserialize/native.rs | 97 ++ .../src/parquet/deserialize/utils.rs | 174 +++ .../src/parquet/encoding/bitpacked/decode.rs | 211 +++ .../src/parquet/encoding/bitpacked/encode.rs | 54 + .../src/parquet/encoding/bitpacked/mod.rs | 220 ++++ .../src/parquet/encoding/bitpacked/pack.rs | 108 ++ .../src/parquet/encoding/bitpacked/unpack.rs | 137 ++ .../encoding/delta_bitpacked/decoder.rs | 362 +++++ .../encoding/delta_bitpacked/encoder.rs | 122 ++ .../parquet/encoding/delta_bitpacked/mod.rs | 90 ++ .../encoding/delta_byte_array/decoder.rs | 106 ++ .../encoding/delta_byte_array/encoder.rs | 32 + .../parquet/encoding/delta_byte_array/mod.rs | 33 + .../delta_length_byte_array/decoder.rs | 80 ++ .../delta_length_byte_array/encoder.rs | 19 + .../encoding/delta_length_byte_array/mod.rs | 50 + .../src/parquet/encoding/hybrid_rle/bitmap.rs | 102 ++ .../parquet/encoding/hybrid_rle/decoder.rs | 142 ++ .../parquet/encoding/hybrid_rle/encoder.rs | 166 +++ .../src/parquet/encoding/hybrid_rle/mod.rs | 263 ++++ .../src/parquet/encoding/mod.rs | 27 + .../src/parquet/encoding/plain_byte_array.rs | 46 + .../src/parquet/encoding/uleb128.rs | 97 ++ .../src/parquet/encoding/zigzag_leb128.rs | 69 + crates/polars-parquet/src/parquet/error.rs | 134 ++ .../src/parquet/indexes/index.rs | 322 +++++ .../src/parquet/indexes/intervals.rs | 137 ++ .../polars-parquet/src/parquet/indexes/mod.rs | 234 ++++ .../parquet/metadata/column_chunk_metadata.rs | 210 +++ .../src/parquet/metadata/column_descriptor.rs | 50 + .../src/parquet/metadata/column_order.rs | 30 + .../src/parquet/metadata/file_metadata.rs | 129 ++ .../src/parquet/metadata/mod.rs | 17 + .../src/parquet/metadata/row_metadata.rs | 103 ++ .../src/parquet/metadata/schema_descriptor.rs | 141 ++ .../src/parquet/metadata/sort.rs | 94 ++ crates/polars-parquet/src/parquet/mod.rs | 37 + crates/polars-parquet/src/parquet/page/mod.rs | 428 ++++++ .../src/parquet/parquet_bridge.rs | 704 ++++++++++ .../src/parquet/read/column/mod.rs | 204 +++ .../src/parquet/read/column/stream.rs | 51 + .../src/parquet/read/compression.rs | 286 ++++ .../src/parquet/read/indexes/deserialize.rs | 27 + .../src/parquet/read/indexes/mod.rs | 4 + .../src/parquet/read/indexes/read.rs | 131 ++ .../polars-parquet/src/parquet/read/levels.rs | 27 + .../src/parquet/read/metadata.rs | 101 ++ crates/polars-parquet/src/parquet/read/mod.rs | 237 ++++ .../src/parquet/read/page/indexed_reader.rs | 204 +++ .../src/parquet/read/page/mod.rs | 18 + .../src/parquet/read/page/reader.rs | 306 +++++ .../src/parquet/read/page/stream.rs | 138 ++ .../polars-parquet/src/parquet/read/stream.rs | 88 ++ .../parquet/schema/io_message/from_message.rs | 1159 +++++++++++++++++ .../src/parquet/schema/io_message/mod.rs | 3 + .../parquet/schema/io_thrift/from_thrift.rs | 134 ++ .../src/parquet/schema/io_thrift/mod.rs | 85 ++ .../src/parquet/schema/io_thrift/to_thrift.rs | 82 ++ .../polars-parquet/src/parquet/schema/mod.rs | 7 + .../src/parquet/schema/types/basic_type.rs | 16 + .../parquet/schema/types/converted_type.rs | 238 ++++ .../src/parquet/schema/types/mod.rs | 17 + .../src/parquet/schema/types/parquet_type.rs | 206 +++ .../src/parquet/schema/types/physical_type.rs | 58 + .../src/parquet/schema/types/spec.rs | 181 +++ .../src/parquet/statistics/binary.rs | 51 + 
.../src/parquet/statistics/boolean.rs | 72 + .../parquet/statistics/fixed_len_binary.rs | 76 ++ .../src/parquet/statistics/mod.rs | 134 ++ .../src/parquet/statistics/primitive.rs | 70 + crates/polars-parquet/src/parquet/types.rs | 141 ++ .../src/parquet/write/column_chunk.rs | 208 +++ .../src/parquet/write/compression.rs | 160 +++ .../src/parquet/write/dyn_iter.rs | 65 + .../polars-parquet/src/parquet/write/file.rs | 279 ++++ .../src/parquet/write/indexes/mod.rs | 4 + .../src/parquet/write/indexes/serialize.rs | 78 ++ .../src/parquet/write/indexes/write.rs | 46 + .../polars-parquet/src/parquet/write/mod.rs | 57 + .../polars-parquet/src/parquet/write/page.rs | 243 ++++ .../src/parquet/write/row_group.rs | 200 +++ .../src/parquet/write/statistics.rs | 323 +++++ .../src/parquet/write/stream.rs | 192 +++ crates/polars-utils/Cargo.toml | 1 + crates/polars-utils/src/aliases.rs | 57 + py-polars/Cargo.lock | 32 +- 162 files changed, 13635 insertions(+), 308 deletions(-) create mode 100644 crates/polars-parquet/src/parquet/bloom_filter/hash.rs create mode 100644 crates/polars-parquet/src/parquet/bloom_filter/mod.rs create mode 100644 crates/polars-parquet/src/parquet/bloom_filter/read.rs create mode 100644 crates/polars-parquet/src/parquet/bloom_filter/split_block.rs create mode 100644 crates/polars-parquet/src/parquet/compression.rs create mode 100644 crates/polars-parquet/src/parquet/deserialize/binary.rs create mode 100644 crates/polars-parquet/src/parquet/deserialize/boolean.rs create mode 100644 crates/polars-parquet/src/parquet/deserialize/filtered_rle.rs create mode 100644 crates/polars-parquet/src/parquet/deserialize/fixed_len.rs create mode 100644 crates/polars-parquet/src/parquet/deserialize/hybrid_rle.rs create mode 100644 crates/polars-parquet/src/parquet/deserialize/mod.rs create mode 100644 crates/polars-parquet/src/parquet/deserialize/native.rs create mode 100644 crates/polars-parquet/src/parquet/deserialize/utils.rs create mode 100644 crates/polars-parquet/src/parquet/encoding/bitpacked/decode.rs create mode 100644 crates/polars-parquet/src/parquet/encoding/bitpacked/encode.rs create mode 100644 crates/polars-parquet/src/parquet/encoding/bitpacked/mod.rs create mode 100644 crates/polars-parquet/src/parquet/encoding/bitpacked/pack.rs create mode 100644 crates/polars-parquet/src/parquet/encoding/bitpacked/unpack.rs create mode 100644 crates/polars-parquet/src/parquet/encoding/delta_bitpacked/decoder.rs create mode 100644 crates/polars-parquet/src/parquet/encoding/delta_bitpacked/encoder.rs create mode 100644 crates/polars-parquet/src/parquet/encoding/delta_bitpacked/mod.rs create mode 100644 crates/polars-parquet/src/parquet/encoding/delta_byte_array/decoder.rs create mode 100644 crates/polars-parquet/src/parquet/encoding/delta_byte_array/encoder.rs create mode 100644 crates/polars-parquet/src/parquet/encoding/delta_byte_array/mod.rs create mode 100644 crates/polars-parquet/src/parquet/encoding/delta_length_byte_array/decoder.rs create mode 100644 crates/polars-parquet/src/parquet/encoding/delta_length_byte_array/encoder.rs create mode 100644 crates/polars-parquet/src/parquet/encoding/delta_length_byte_array/mod.rs create mode 100644 crates/polars-parquet/src/parquet/encoding/hybrid_rle/bitmap.rs create mode 100644 crates/polars-parquet/src/parquet/encoding/hybrid_rle/decoder.rs create mode 100644 crates/polars-parquet/src/parquet/encoding/hybrid_rle/encoder.rs create mode 100644 crates/polars-parquet/src/parquet/encoding/hybrid_rle/mod.rs create mode 100644 
crates/polars-parquet/src/parquet/encoding/mod.rs create mode 100644 crates/polars-parquet/src/parquet/encoding/plain_byte_array.rs create mode 100644 crates/polars-parquet/src/parquet/encoding/uleb128.rs create mode 100644 crates/polars-parquet/src/parquet/encoding/zigzag_leb128.rs create mode 100644 crates/polars-parquet/src/parquet/error.rs create mode 100644 crates/polars-parquet/src/parquet/indexes/index.rs create mode 100644 crates/polars-parquet/src/parquet/indexes/intervals.rs create mode 100644 crates/polars-parquet/src/parquet/indexes/mod.rs create mode 100644 crates/polars-parquet/src/parquet/metadata/column_chunk_metadata.rs create mode 100644 crates/polars-parquet/src/parquet/metadata/column_descriptor.rs create mode 100644 crates/polars-parquet/src/parquet/metadata/column_order.rs create mode 100644 crates/polars-parquet/src/parquet/metadata/file_metadata.rs create mode 100644 crates/polars-parquet/src/parquet/metadata/mod.rs create mode 100644 crates/polars-parquet/src/parquet/metadata/row_metadata.rs create mode 100644 crates/polars-parquet/src/parquet/metadata/schema_descriptor.rs create mode 100644 crates/polars-parquet/src/parquet/metadata/sort.rs create mode 100644 crates/polars-parquet/src/parquet/mod.rs create mode 100644 crates/polars-parquet/src/parquet/page/mod.rs create mode 100644 crates/polars-parquet/src/parquet/parquet_bridge.rs create mode 100644 crates/polars-parquet/src/parquet/read/column/mod.rs create mode 100644 crates/polars-parquet/src/parquet/read/column/stream.rs create mode 100644 crates/polars-parquet/src/parquet/read/compression.rs create mode 100644 crates/polars-parquet/src/parquet/read/indexes/deserialize.rs create mode 100644 crates/polars-parquet/src/parquet/read/indexes/mod.rs create mode 100644 crates/polars-parquet/src/parquet/read/indexes/read.rs create mode 100644 crates/polars-parquet/src/parquet/read/levels.rs create mode 100644 crates/polars-parquet/src/parquet/read/metadata.rs create mode 100644 crates/polars-parquet/src/parquet/read/mod.rs create mode 100644 crates/polars-parquet/src/parquet/read/page/indexed_reader.rs create mode 100644 crates/polars-parquet/src/parquet/read/page/mod.rs create mode 100644 crates/polars-parquet/src/parquet/read/page/reader.rs create mode 100644 crates/polars-parquet/src/parquet/read/page/stream.rs create mode 100644 crates/polars-parquet/src/parquet/read/stream.rs create mode 100644 crates/polars-parquet/src/parquet/schema/io_message/from_message.rs create mode 100644 crates/polars-parquet/src/parquet/schema/io_message/mod.rs create mode 100644 crates/polars-parquet/src/parquet/schema/io_thrift/from_thrift.rs create mode 100644 crates/polars-parquet/src/parquet/schema/io_thrift/mod.rs create mode 100644 crates/polars-parquet/src/parquet/schema/io_thrift/to_thrift.rs create mode 100644 crates/polars-parquet/src/parquet/schema/mod.rs create mode 100644 crates/polars-parquet/src/parquet/schema/types/basic_type.rs create mode 100644 crates/polars-parquet/src/parquet/schema/types/converted_type.rs create mode 100644 crates/polars-parquet/src/parquet/schema/types/mod.rs create mode 100644 crates/polars-parquet/src/parquet/schema/types/parquet_type.rs create mode 100644 crates/polars-parquet/src/parquet/schema/types/physical_type.rs create mode 100644 crates/polars-parquet/src/parquet/schema/types/spec.rs create mode 100644 crates/polars-parquet/src/parquet/statistics/binary.rs create mode 100644 crates/polars-parquet/src/parquet/statistics/boolean.rs create mode 100644 
crates/polars-parquet/src/parquet/statistics/fixed_len_binary.rs create mode 100644 crates/polars-parquet/src/parquet/statistics/mod.rs create mode 100644 crates/polars-parquet/src/parquet/statistics/primitive.rs create mode 100644 crates/polars-parquet/src/parquet/types.rs create mode 100644 crates/polars-parquet/src/parquet/write/column_chunk.rs create mode 100644 crates/polars-parquet/src/parquet/write/compression.rs create mode 100644 crates/polars-parquet/src/parquet/write/dyn_iter.rs create mode 100644 crates/polars-parquet/src/parquet/write/file.rs create mode 100644 crates/polars-parquet/src/parquet/write/indexes/mod.rs create mode 100644 crates/polars-parquet/src/parquet/write/indexes/serialize.rs create mode 100644 crates/polars-parquet/src/parquet/write/indexes/write.rs create mode 100644 crates/polars-parquet/src/parquet/write/mod.rs create mode 100644 crates/polars-parquet/src/parquet/write/page.rs create mode 100644 crates/polars-parquet/src/parquet/write/row_group.rs create mode 100644 crates/polars-parquet/src/parquet/write/statistics.rs create mode 100644 crates/polars-parquet/src/parquet/write/stream.rs diff --git a/crates/polars-core/src/chunked_array/logical/categorical/string_cache.rs b/crates/polars-core/src/chunked_array/logical/categorical/string_cache.rs index f39a1523446c..f0bcbafca52f 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/string_cache.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/string_cache.rs @@ -7,9 +7,8 @@ use hashbrown::hash_map::RawEntryMut; use once_cell::sync::Lazy; use smartstring::{LazyCompact, SmartString}; -use crate::datatypes::PlIdHashMap; +use crate::datatypes::{InitHashMaps2, PlIdHashMap}; use crate::hashing::_HASHMAP_INIT_SIZE; -use crate::prelude::InitHashMaps; /// We use atomic reference counting to determine how many threads use the /// string cache. If the refcount is zero, we may clear the string cache. 
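The string-cache hunk above swaps its import to `InitHashMaps2`, and the aliases hunk that follows moves the generic `InitHashMaps` constructors into `polars-utils`, keeping only the `IdBuildHasher`-backed `PlIdHashMap` impl in polars-core. A minimal, self-contained sketch of that constructor-trait pattern, using std's `HashMap` with a stand-in hasher (the type and trait names in the sketch are illustrative, not the crate's API):

```rust
use std::collections::hash_map::DefaultHasher;
use std::collections::HashMap;
use std::hash::BuildHasherDefault;

// Stand-ins for the non-default build-hashers behind the Polars aliases
// (`PlHashMap`, `PlIdHashMap`, ...); everything here is illustrative.
type IdLikeBuildHasher = BuildHasherDefault<DefaultHasher>;
type MyHashMap<K, V> = HashMap<K, V, IdLikeBuildHasher>;

// Same shape as the `InitHashMaps`/`InitHashMaps2` traits in this patch:
// uniform constructors for maps whose hasher is not the std default.
trait InitHashMaps {
    type HashMap;
    fn new() -> Self::HashMap;
    fn with_capacity(capacity: usize) -> Self::HashMap;
}

impl<K, V> InitHashMaps for MyHashMap<K, V> {
    type HashMap = Self;

    fn new() -> Self {
        Self::with_capacity_and_hasher(0, Default::default())
    }

    fn with_capacity(capacity: usize) -> Self {
        Self::with_capacity_and_hasher(capacity, Default::default())
    }
}

fn main() {
    // Resolves to the trait constructor: the inherent `with_capacity` is only
    // provided for maps using the default hasher, hence the helper trait.
    let mut m = MyHashMap::<&str, u32>::with_capacity(8);
    m.insert("threads_using_string_cache", 0);
    assert_eq!(m["threads_using_string_cache"], 0);
}
```

The trait exists because `with_capacity`/`new` are only defined inherently for the default hasher, so type aliases built on custom hashers get their constructors from a trait instead.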
diff --git a/crates/polars-core/src/datatypes/aliases.rs b/crates/polars-core/src/datatypes/aliases.rs index 421e7cd3c8a4..d5ce2da0974b 100644 --- a/crates/polars-core/src/datatypes/aliases.rs +++ b/crates/polars-core/src/datatypes/aliases.rs @@ -1,4 +1,5 @@ pub use arrow::legacy::index::{IdxArr, IdxSize}; +pub use polars_utils::aliases::{InitHashMaps, PlHashMap, PlHashSet, PlIndexMap, PlIndexSet}; use super::*; use crate::hashing::IdBuildHasher; @@ -21,14 +22,10 @@ pub type IdxType = UInt32Type; #[cfg(feature = "bigidx")] pub type IdxType = UInt64Type; -pub type PlHashMap = hashbrown::HashMap; -/// This hashmap has the uses an IdHasher +/// This hashmap uses an IdHasher pub type PlIdHashMap = hashbrown::HashMap; -pub type PlHashSet = hashbrown::HashSet; -pub type PlIndexMap = indexmap::IndexMap; -pub type PlIndexSet = indexmap::IndexSet; -pub trait InitHashMaps { +pub trait InitHashMaps2 { type HashMap; fn new() -> Self::HashMap; @@ -36,53 +33,7 @@ pub trait InitHashMaps { fn with_capacity(capacity: usize) -> Self::HashMap; } -impl InitHashMaps for PlHashMap { - type HashMap = Self; - - fn new() -> Self::HashMap { - Self::with_capacity_and_hasher(0, Default::default()) - } - - fn with_capacity(capacity: usize) -> Self { - Self::with_capacity_and_hasher(capacity, Default::default()) - } -} -impl InitHashMaps for PlHashSet { - type HashMap = Self; - - fn new() -> Self::HashMap { - Self::with_capacity_and_hasher(0, Default::default()) - } - - fn with_capacity(capacity: usize) -> Self { - Self::with_capacity_and_hasher(capacity, Default::default()) - } -} - -impl InitHashMaps for PlIndexSet { - type HashMap = Self; - - fn new() -> Self::HashMap { - Self::with_capacity_and_hasher(0, Default::default()) - } - - fn with_capacity(capacity: usize) -> Self::HashMap { - Self::with_capacity_and_hasher(capacity, Default::default()) - } -} - -impl InitHashMaps for PlIndexMap { - type HashMap = Self; - - fn new() -> Self::HashMap { - Self::with_capacity_and_hasher(0, Default::default()) - } - - fn with_capacity(capacity: usize) -> Self::HashMap { - Self::with_capacity_and_hasher(capacity, Default::default()) - } -} -impl InitHashMaps for PlIdHashMap { +impl InitHashMaps2 for PlIdHashMap { type HashMap = Self; fn new() -> Self::HashMap { diff --git a/crates/polars-core/src/datatypes/mod.rs b/crates/polars-core/src/datatypes/mod.rs index a5f1ef643070..9151f6915037 100644 --- a/crates/polars-core/src/datatypes/mod.rs +++ b/crates/polars-core/src/datatypes/mod.rs @@ -21,7 +21,6 @@ use std::fmt::{Display, Formatter}; use std::hash::{Hash, Hasher}; use std::ops::{Add, AddAssign, Div, Mul, Rem, Sub, SubAssign}; -use ahash::RandomState; pub use aliases::*; pub use any_value::*; use arrow::compute::comparison::Simd8; diff --git a/crates/polars-error/Cargo.toml b/crates/polars-error/Cargo.toml index 60e4800f073f..64b81ed950e6 100644 --- a/crates/polars-error/Cargo.toml +++ b/crates/polars-error/Cargo.toml @@ -12,7 +12,6 @@ description = "Error definitions for the Polars DataFrame library" arrow-format = { version = "0.8.1", optional = true } avro-schema = { workspace = true, optional = true } object_store = { workspace = true, optional = true } -parquet2 = { workspace = true, optional = true } regex = { workspace = true, optional = true } simdutf8 = { workspace = true } thiserror = { workspace = true } diff --git a/crates/polars-error/src/lib.rs b/crates/polars-error/src/lib.rs index be7a72debe32..c44131717b81 100644 --- a/crates/polars-error/src/lib.rs +++ b/crates/polars-error/src/lib.rs @@ -91,13 +91,6 @@ impl 
From for PolarsError { } } -#[cfg(feature = "parquet2")] -impl From for PolarsError { - fn from(err: parquet2::error::Error) -> Self { - polars_err!(ComputeError: "parquet error: {err:?}") - } -} - #[cfg(feature = "avro-schema")] impl From for PolarsError { fn from(value: avro_schema::error::Error) -> Self { diff --git a/crates/polars-parquet/Cargo.toml b/crates/polars-parquet/Cargo.toml index 673d000740ac..f1b8791c8d70 100644 --- a/crates/polars-parquet/Cargo.toml +++ b/crates/polars-parquet/Cargo.toml @@ -20,14 +20,26 @@ ethnum = { workspace = true } fallible-streaming-iterator = { workspace = true, optional = true } futures = { workspace = true, optional = true } num-traits = { workspace = true } -parquet2 = { workspace = true, optional = true, default-features = true, features = ["async"] } -polars-error = { workspace = true, features = ["parquet2"] } +polars-error = { workspace = true } +polars-utils = { workspace = true } simdutf8 = { workspace = true } -[features] -bloom_filter = ["parquet2/bloom_filter"] -async = ["futures"] +parquet-format-safe = "0.2" +seq-macro = { version = "0.3", default-features = false } +streaming-decompression = "0.1" + +async-stream = { version = "0.3.3", optional = true } + +brotli = { version = "^3.3", optional = true } +flate2 = { version = "^1.0", optional = true, default-features = false } +lz4 = { version = "1.24", optional = true } +serde = { version = "^1.0", optional = true, features = ["derive"] } +snap = { version = "^1.1", optional = true } +zstd = { version = "^0.12", optional = true, default-features = false } +xxhash-rust = { version = "0.8", optional = true, features = ["xxh64"] } + +[features] compression = [ "zstd", "gzip", @@ -37,8 +49,10 @@ compression = [ ] # compression backends -zstd = ["parquet2/zstd"] -snappy = ["parquet2/snappy"] -gzip = ["parquet2/gzip"] -lz4 = ["parquet2/lz4"] -brotli = ["parquet2/brotli"] +snappy = ["snap"] +gzip = ["flate2/rust_backend"] +gzip_zlib_ng = ["flate2/zlib-ng"] + +async = ["async-stream", "futures", "parquet-format-safe/async"] +bloom_filter = ["xxhash-rust"] +serde_types = ["serde"] diff --git a/crates/polars-parquet/LICENSE b/crates/polars-parquet/LICENSE index a4b4b70523c3..7fd76611dd29 100644 --- a/crates/polars-parquet/LICENSE +++ b/crates/polars-parquet/LICENSE @@ -1,5 +1,5 @@ Some of the code in this crate is subject to the Apache 2 license below, as it -was taken from the arrow2 Rust crate in October 2023. Later changes are subject +was taken from the arrow2 and parquet2 Rust crate in October 2023. Later changes are subject to the MIT license in ../../LICENSE. 
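With parquet2 inlined, the Cargo.toml hunk above adds the compression backends (`snap`, `flate2`, `lz4`, `brotli`, `zstd`) as direct optional dependencies and maps them to crate features, instead of forwarding to `parquet2/*` features. A hedged sketch of how such feature-gated backends are typically dispatched; this is illustrative only, not the contents of the new `src/parquet/compression.rs`, and the enum and function below are assumed names:

```rust
/// Illustrative only: feature-gated dispatch over an optional compression
/// backend, mirroring the `snappy = ["snap"]` feature mapping declared above.
pub enum Compression {
    Uncompressed,
    Snappy,
}

pub fn compress(codec: Compression, input: &[u8], output: &mut Vec<u8>) -> Result<(), String> {
    match codec {
        Compression::Uncompressed => {
            output.extend_from_slice(input);
            Ok(())
        },
        #[cfg(feature = "snappy")]
        Compression::Snappy => {
            // The `snap` crate is only compiled in when the `snappy` feature is enabled.
            let start = output.len();
            output.resize(start + snap::raw::max_compress_len(input.len()), 0);
            let n = snap::raw::Encoder::new()
                .compress(input, &mut output[start..])
                .map_err(|e| e.to_string())?;
            output.truncate(start + n);
            Ok(())
        },
        #[cfg(not(feature = "snappy"))]
        Compression::Snappy => Err("enable the `snappy` feature to compress with snappy".to_string()),
    }
}

fn main() {
    let mut out = Vec::new();
    compress(Compression::Uncompressed, b"parquet page bytes", &mut out).unwrap();
    assert_eq!(out, b"parquet page bytes");
}
```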
diff --git a/crates/polars-parquet/src/arrow/mod.rs b/crates/polars-parquet/src/arrow/mod.rs index 1ccb35dfeccf..aff9a98c9670 100644 --- a/crates/polars-parquet/src/arrow/mod.rs +++ b/crates/polars-parquet/src/arrow/mod.rs @@ -3,6 +3,6 @@ pub mod write; #[cfg(feature = "io_parquet_bloom_filter")] #[cfg_attr(docsrs, doc(cfg(feature = "io_parquet_bloom_filter")))] -pub use parquet2::bloom_filter; +pub use crate::parquet::bloom_filter; const ARROW_SCHEMA_META_KEY: &str = "ARROW:schema"; diff --git a/crates/polars-parquet/src/arrow/read/deserialize/binary/basic.rs b/crates/polars-parquet/src/arrow/read/deserialize/binary/basic.rs index 902fa69d0031..8bde2ce8ea46 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/binary/basic.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/binary/basic.rs @@ -5,10 +5,6 @@ use arrow::array::{Array, BinaryArray, Utf8Array}; use arrow::bitmap::MutableBitmap; use arrow::datatypes::{DataType, PhysicalType}; use arrow::offset::Offset; -use parquet2::deserialize::SliceFilteredIter; -use parquet2::encoding::{delta_length_byte_array, hybrid_rle, Encoding}; -use parquet2::page::{split_buffer, DataPage, DictPage}; -use parquet2::schema::Repetition; use polars_error::{to_compute_err, PolarsResult}; use super::super::utils::{ @@ -17,6 +13,10 @@ use super::super::utils::{ }; use super::super::{utils, Pages}; use super::utils::*; +use crate::parquet::deserialize::SliceFilteredIter; +use crate::parquet::encoding::{delta_length_byte_array, hybrid_rle, Encoding}; +use crate::parquet::page::{split_buffer, DataPage, DictPage}; +use crate::parquet::schema::Repetition; #[derive(Debug)] pub(super) struct Required<'a> { diff --git a/crates/polars-parquet/src/arrow/read/deserialize/binary/dictionary.rs b/crates/polars-parquet/src/arrow/read/deserialize/binary/dictionary.rs index d4c91dbc5d72..20548f201b46 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/binary/dictionary.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/binary/dictionary.rs @@ -4,7 +4,6 @@ use arrow::array::{Array, BinaryArray, DictionaryArray, DictionaryKey, Utf8Array use arrow::bitmap::MutableBitmap; use arrow::datatypes::{DataType, PhysicalType}; use arrow::offset::Offset; -use parquet2::page::DictPage; use polars_error::PolarsResult; use super::super::dictionary::*; @@ -12,6 +11,7 @@ use super::super::utils::MaybeNext; use super::super::Pages; use super::utils::{Binary, SizedBinaryIter}; use crate::arrow::read::deserialize::nested_utils::{InitNested, NestedState}; +use crate::parquet::page::DictPage; /// An iterator adapter over [`Pages`] assumed to be encoded as parquet's dictionary-encoded binary representation #[derive(Debug)] diff --git a/crates/polars-parquet/src/arrow/read/deserialize/binary/nested.rs b/crates/polars-parquet/src/arrow/read/deserialize/binary/nested.rs index 37c0a35006f6..750d81dae1b6 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/binary/nested.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/binary/nested.rs @@ -4,9 +4,6 @@ use arrow::array::Array; use arrow::bitmap::MutableBitmap; use arrow::datatypes::DataType; use arrow::offset::Offset; -use parquet2::encoding::Encoding; -use parquet2::page::{split_buffer, DataPage, DictPage}; -use parquet2::schema::Repetition; use polars_error::PolarsResult; use super::super::nested_utils::*; @@ -15,6 +12,9 @@ use super::super::utils::MaybeNext; use super::basic::{deserialize_plain, finish, Dict, ValuesDictionary}; use super::utils::*; use crate::arrow::read::Pages; +use 
crate::parquet::encoding::Encoding; +use crate::parquet::page::{split_buffer, DataPage, DictPage}; +use crate::parquet::schema::Repetition; #[derive(Debug)] enum State<'a> { diff --git a/crates/polars-parquet/src/arrow/read/deserialize/boolean/basic.rs b/crates/polars-parquet/src/arrow/read/deserialize/boolean/basic.rs index 413cfd15da35..1c736d6d8a8c 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/boolean/basic.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/boolean/basic.rs @@ -4,10 +4,6 @@ use arrow::array::BooleanArray; use arrow::bitmap::utils::BitmapIter; use arrow::bitmap::MutableBitmap; use arrow::datatypes::DataType; -use parquet2::deserialize::SliceFilteredIter; -use parquet2::encoding::Encoding; -use parquet2::page::{split_buffer, DataPage, DictPage}; -use parquet2::schema::Repetition; use polars_error::PolarsResult; use super::super::utils::{ @@ -15,6 +11,10 @@ use super::super::utils::{ FilteredOptionalPageValidity, MaybeNext, OptionalPageValidity, }; use super::super::{utils, Pages}; +use crate::parquet::deserialize::SliceFilteredIter; +use crate::parquet::encoding::Encoding; +use crate::parquet::page::{split_buffer, DataPage, DictPage}; +use crate::parquet::schema::Repetition; #[derive(Debug)] struct Values<'a>(BitmapIter<'a>); diff --git a/crates/polars-parquet/src/arrow/read/deserialize/boolean/nested.rs b/crates/polars-parquet/src/arrow/read/deserialize/boolean/nested.rs index d3a8c0b305c4..e10a6dfb2d2e 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/boolean/nested.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/boolean/nested.rs @@ -4,14 +4,14 @@ use arrow::array::BooleanArray; use arrow::bitmap::utils::BitmapIter; use arrow::bitmap::MutableBitmap; use arrow::datatypes::DataType; -use parquet2::encoding::Encoding; -use parquet2::page::{split_buffer, DataPage, DictPage}; -use parquet2::schema::Repetition; use polars_error::PolarsResult; use super::super::nested_utils::*; use super::super::utils::MaybeNext; use super::super::{utils, Pages}; +use crate::parquet::encoding::Encoding; +use crate::parquet::page::{split_buffer, DataPage, DictPage}; +use crate::parquet::schema::Repetition; // The state of a `DataPage` of `Boolean` parquet boolean type #[allow(clippy::large_enum_variant)] diff --git a/crates/polars-parquet/src/arrow/read/deserialize/dictionary/mod.rs b/crates/polars-parquet/src/arrow/read/deserialize/dictionary/mod.rs index acb795eb04d4..9bd21d9946e1 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/dictionary/mod.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/dictionary/mod.rs @@ -5,17 +5,17 @@ use std::collections::VecDeque; use arrow::array::{Array, DictionaryArray, DictionaryKey, PrimitiveArray}; use arrow::bitmap::MutableBitmap; use arrow::datatypes::DataType; -use parquet2::deserialize::SliceFilteredIter; -use parquet2::encoding::hybrid_rle::HybridRleDecoder; -use parquet2::encoding::Encoding; -use parquet2::page::{DataPage, DictPage, Page}; -use parquet2::schema::Repetition; use super::utils::{ self, dict_indices_decoder, extend_from_decoder, get_selected_rows, DecodedState, Decoder, FilteredOptionalPageValidity, MaybeNext, OptionalPageValidity, }; use super::Pages; +use crate::parquet::deserialize::SliceFilteredIter; +use crate::parquet::encoding::hybrid_rle::HybridRleDecoder; +use crate::parquet::encoding::Encoding; +use crate::parquet::page::{DataPage, DictPage, Page}; +use crate::parquet::schema::Repetition; // The state of a `DataPage` of `Primitive` parquet primitive type 
#[derive(Debug)] diff --git a/crates/polars-parquet/src/arrow/read/deserialize/dictionary/nested.rs b/crates/polars-parquet/src/arrow/read/deserialize/dictionary/nested.rs index 03aded4b2b97..7da9ff48314e 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/dictionary/nested.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/dictionary/nested.rs @@ -3,16 +3,16 @@ use std::collections::VecDeque; use arrow::array::{Array, DictionaryArray, DictionaryKey}; use arrow::bitmap::MutableBitmap; use arrow::datatypes::DataType; -use parquet2::encoding::hybrid_rle::HybridRleDecoder; -use parquet2::encoding::Encoding; -use parquet2::page::{DataPage, DictPage, Page}; -use parquet2::schema::Repetition; use polars_error::{polars_err, PolarsResult}; use super::super::super::Pages; use super::super::nested_utils::*; use super::super::utils::{dict_indices_decoder, not_implemented, MaybeNext, PageState}; use super::finish_key; +use crate::parquet::encoding::hybrid_rle::HybridRleDecoder; +use crate::parquet::encoding::Encoding; +use crate::parquet::page::{DataPage, DictPage, Page}; +use crate::parquet::schema::Repetition; // The state of a required DataPage with a boolean physical type #[derive(Debug)] diff --git a/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/basic.rs b/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/basic.rs index e51122eec41e..1d7a0bf8dc68 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/basic.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/basic.rs @@ -3,10 +3,6 @@ use std::collections::VecDeque; use arrow::array::FixedSizeBinaryArray; use arrow::bitmap::MutableBitmap; use arrow::datatypes::DataType; -use parquet2::deserialize::SliceFilteredIter; -use parquet2::encoding::{hybrid_rle, Encoding}; -use parquet2::page::{split_buffer, DataPage, DictPage}; -use parquet2::schema::Repetition; use polars_error::PolarsResult; use super::super::utils::{ @@ -16,6 +12,10 @@ use super::super::utils::{ }; use super::super::Pages; use super::utils::FixedSizeBinary; +use crate::parquet::deserialize::SliceFilteredIter; +use crate::parquet::encoding::{hybrid_rle, Encoding}; +use crate::parquet::page::{split_buffer, DataPage, DictPage}; +use crate::parquet::schema::Repetition; pub(super) type Dict = Vec; diff --git a/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/dictionary.rs b/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/dictionary.rs index 346f092fab84..f092e94314ef 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/dictionary.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/dictionary.rs @@ -3,13 +3,13 @@ use std::collections::VecDeque; use arrow::array::{Array, DictionaryArray, DictionaryKey, FixedSizeBinaryArray}; use arrow::bitmap::MutableBitmap; use arrow::datatypes::DataType; -use parquet2::page::DictPage; use polars_error::PolarsResult; use super::super::dictionary::*; use super::super::utils::MaybeNext; use super::super::Pages; use crate::arrow::read::deserialize::nested_utils::{InitNested, NestedState}; +use crate::parquet::page::DictPage; /// An iterator adapter over [`Pages`] assumed to be encoded as parquet's dictionary-encoded binary representation #[derive(Debug)] diff --git a/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/nested.rs b/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/nested.rs index 11a1e3f044a7..54ef413363a8 100644 --- 
a/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/nested.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/nested.rs @@ -3,9 +3,6 @@ use std::collections::VecDeque; use arrow::array::FixedSizeBinaryArray; use arrow::bitmap::MutableBitmap; use arrow::datatypes::DataType; -use parquet2::encoding::Encoding; -use parquet2::page::{DataPage, DictPage}; -use parquet2::schema::Repetition; use polars_error::PolarsResult; use super::super::utils::{not_implemented, MaybeNext, PageState}; @@ -16,6 +13,9 @@ use crate::arrow::read::deserialize::fixed_size_binary::basic::{ use crate::arrow::read::deserialize::nested_utils::{next, NestedDecoder}; use crate::arrow::read::deserialize::utils::Pushable; use crate::arrow::read::{InitNested, NestedState, Pages}; +use crate::parquet::encoding::Encoding; +use crate::parquet::page::{DataPage, DictPage}; +use crate::parquet::schema::Repetition; #[derive(Debug)] enum State<'a> { diff --git a/crates/polars-parquet/src/arrow/read/deserialize/mod.rs b/crates/polars-parquet/src/arrow/read/deserialize/mod.rs index 3b8281373c96..8e43c0e100c1 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/mod.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/mod.rs @@ -14,13 +14,13 @@ mod utils; use arrow::array::{Array, DictionaryKey, FixedSizeListArray, ListArray, MapArray}; use arrow::datatypes::{DataType, Field, IntervalUnit}; use arrow::offset::Offsets; -use parquet2::read::get_page_iterator as _get_page_iterator; -use parquet2::schema::types::PrimitiveType; use simple::page_iter_to_arrays; pub use self::nested_utils::{init_nested, InitNested, NestedArrayIter, NestedState}; pub use self::struct_::StructIterator; use super::*; +use crate::parquet::read::get_page_iterator as _get_page_iterator; +use crate::parquet::schema::types::PrimitiveType; /// Creates a new iterator of compressed pages. 
pub fn get_page_iterator( diff --git a/crates/polars-parquet/src/arrow/read/deserialize/nested.rs b/crates/polars-parquet/src/arrow/read/deserialize/nested.rs index 6741c05ee852..3d768c8c8745 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/nested.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/nested.rs @@ -2,11 +2,11 @@ use arrow::array::PrimitiveArray; use arrow::datatypes::{DataType, Field}; use arrow::match_integer_type; use ethnum::I256; -use parquet2::schema::types::PrimitiveType; use polars_error::polars_bail; use super::nested_utils::{InitNested, NestedArrayIter}; use super::*; +use crate::parquet::schema::types::PrimitiveType; /// Converts an iterator of arrays to a trait object returning trait objects #[inline] diff --git a/crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs b/crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs index f27841f7f5b4..da88b18a9731 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs @@ -2,14 +2,14 @@ use std::collections::VecDeque; use arrow::array::Array; use arrow::bitmap::MutableBitmap; -use parquet2::encoding::hybrid_rle::HybridRleDecoder; -use parquet2::page::{split_buffer, DataPage, DictPage, Page}; -use parquet2::read::levels::get_bit_width; use polars_error::PolarsResult; use super::super::Pages; pub use super::utils::Zip; use super::utils::{DecodedState, MaybeNext, PageState}; +use crate::parquet::encoding::hybrid_rle::HybridRleDecoder; +use crate::parquet::page::{split_buffer, DataPage, DictPage, Page}; +use crate::parquet::read::levels::get_bit_width; /// trait describing deserialized repetition and definition levels pub trait Nested: std::fmt::Debug + Send + Sync { diff --git a/crates/polars-parquet/src/arrow/read/deserialize/null/mod.rs b/crates/polars-parquet/src/arrow/read/deserialize/null/mod.rs index ad6227a45f2e..b65ccae1c75f 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/null/mod.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/null/mod.rs @@ -3,9 +3,9 @@ mod nested; use arrow::array::NullArray; use arrow::datatypes::DataType; pub(super) use nested::NestedIter; -use parquet2::page::Page; use super::super::{ArrayIter, Pages}; +use crate::parquet::page::Page; /// Converts [`Pages`] to an [`ArrayIter`] pub fn iter_to_arrays<'a, I>( @@ -56,14 +56,14 @@ where mod tests { use arrow::array::NullArray; use arrow::datatypes::DataType; - use parquet2::encoding::Encoding; - use parquet2::error::Error as ParquetError; - use parquet2::metadata::Descriptor; - use parquet2::page::{DataPage, DataPageHeader, DataPageHeaderV1, Page}; - use parquet2::schema::types::{PhysicalType, PrimitiveType}; use polars_error::*; use super::iter_to_arrays; + use crate::parquet::encoding::Encoding; + use crate::parquet::error::Error as ParquetError; + use crate::parquet::metadata::Descriptor; + use crate::parquet::page::{DataPage, DataPageHeader, DataPageHeaderV1, Page}; + use crate::parquet::schema::types::{PhysicalType, PrimitiveType}; #[test] fn limit() { diff --git a/crates/polars-parquet/src/arrow/read/deserialize/null/nested.rs b/crates/polars-parquet/src/arrow/read/deserialize/null/nested.rs index 0da98d13f17d..ccd3b160b674 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/null/nested.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/null/nested.rs @@ -2,12 +2,12 @@ use std::collections::VecDeque; use arrow::array::NullArray; use arrow::datatypes::DataType; -use 
parquet2::page::{DataPage, DictPage}; use polars_error::PolarsResult; use super::super::nested_utils::*; use super::super::{utils, Pages}; use crate::arrow::read::deserialize::utils::DecodedState; +use crate::parquet::page::{DataPage, DictPage}; impl<'a> utils::PageState<'a> for usize { fn len(&self) -> usize { diff --git a/crates/polars-parquet/src/arrow/read/deserialize/primitive/basic.rs b/crates/polars-parquet/src/arrow/read/deserialize/primitive/basic.rs index acd6ea5ae785..a918b578cd85 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/primitive/basic.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/primitive/basic.rs @@ -4,15 +4,15 @@ use arrow::array::MutablePrimitiveArray; use arrow::bitmap::MutableBitmap; use arrow::datatypes::DataType; use arrow::types::NativeType; -use parquet2::deserialize::SliceFilteredIter; -use parquet2::encoding::{hybrid_rle, Encoding}; -use parquet2::page::{split_buffer, DataPage, DictPage}; -use parquet2::schema::Repetition; -use parquet2::types::{decode, NativeType as ParquetNativeType}; use polars_error::PolarsResult; use super::super::utils::{get_selected_rows, FilteredOptionalPageValidity, OptionalPageValidity}; use super::super::{utils, Pages}; +use crate::parquet::deserialize::SliceFilteredIter; +use crate::parquet::encoding::{hybrid_rle, Encoding}; +use crate::parquet::page::{split_buffer, DataPage, DictPage}; +use crate::parquet::schema::Repetition; +use crate::parquet::types::{decode, NativeType as ParquetNativeType}; #[derive(Debug)] pub(super) struct FilteredRequiredValues<'a> { diff --git a/crates/polars-parquet/src/arrow/read/deserialize/primitive/dictionary.rs b/crates/polars-parquet/src/arrow/read/deserialize/primitive/dictionary.rs index 6f476cbafe79..3996f388f31b 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/primitive/dictionary.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/primitive/dictionary.rs @@ -4,8 +4,6 @@ use arrow::array::{Array, DictionaryArray, DictionaryKey, PrimitiveArray}; use arrow::bitmap::MutableBitmap; use arrow::datatypes::DataType; use arrow::types::NativeType; -use parquet2::page::DictPage; -use parquet2::types::NativeType as ParquetNativeType; use polars_error::PolarsResult; use super::super::dictionary::{nested_next_dict, *}; @@ -13,6 +11,8 @@ use super::super::nested_utils::{InitNested, NestedState}; use super::super::utils::MaybeNext; use super::super::Pages; use super::basic::deserialize_plain; +use crate::parquet::page::DictPage; +use crate::parquet::types::NativeType as ParquetNativeType; fn read_dict(data_type: DataType, op: F, dict: &DictPage) -> Box where diff --git a/crates/polars-parquet/src/arrow/read/deserialize/primitive/integer.rs b/crates/polars-parquet/src/arrow/read/deserialize/primitive/integer.rs index 8472a54bda3d..973b70537b37 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/primitive/integer.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/primitive/integer.rs @@ -5,12 +5,6 @@ use arrow::bitmap::MutableBitmap; use arrow::datatypes::DataType; use arrow::types::NativeType; use num_traits::AsPrimitive; -use parquet2::deserialize::SliceFilteredIter; -use parquet2::encoding::delta_bitpacked::Decoder; -use parquet2::encoding::Encoding; -use parquet2::page::{split_buffer, DataPage, DictPage}; -use parquet2::schema::Repetition; -use parquet2::types::NativeType as ParquetNativeType; use polars_error::{to_compute_err, PolarsResult}; use super::super::{utils, Pages}; @@ -18,6 +12,12 @@ use super::basic::{finish, PrimitiveDecoder, State 
as PrimitiveState}; use crate::arrow::read::deserialize::utils::{ get_selected_rows, FilteredOptionalPageValidity, OptionalPageValidity, }; +use crate::parquet::deserialize::SliceFilteredIter; +use crate::parquet::encoding::delta_bitpacked::Decoder; +use crate::parquet::encoding::Encoding; +use crate::parquet::page::{split_buffer, DataPage, DictPage}; +use crate::parquet::schema::Repetition; +use crate::parquet::types::NativeType as ParquetNativeType; /// The state of a [`DataPage`] of an integer parquet type (i32 or i64) #[derive(Debug)] diff --git a/crates/polars-parquet/src/arrow/read/deserialize/primitive/nested.rs b/crates/polars-parquet/src/arrow/read/deserialize/primitive/nested.rs index 3b87d3fcde92..11b59b70ffbd 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/primitive/nested.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/primitive/nested.rs @@ -4,15 +4,15 @@ use arrow::array::PrimitiveArray; use arrow::bitmap::MutableBitmap; use arrow::datatypes::DataType; use arrow::types::NativeType; -use parquet2::encoding::Encoding; -use parquet2::page::{DataPage, DictPage}; -use parquet2::schema::Repetition; -use parquet2::types::{decode, NativeType as ParquetNativeType}; use polars_error::PolarsResult; use super::super::nested_utils::*; use super::super::{utils, Pages}; use super::basic::{deserialize_plain, Values, ValuesDictionary}; +use crate::parquet::encoding::Encoding; +use crate::parquet::page::{DataPage, DictPage}; +use crate::parquet::schema::Repetition; +use crate::parquet::types::{decode, NativeType as ParquetNativeType}; // The state of a `DataPage` of `Primitive` parquet primitive type #[allow(clippy::large_enum_variant)] diff --git a/crates/polars-parquet/src/arrow/read/deserialize/simple.rs b/crates/polars-parquet/src/arrow/read/deserialize/simple.rs index 119b569116f5..60171432ec98 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/simple.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/simple.rs @@ -3,14 +3,14 @@ use arrow::datatypes::{DataType, IntervalUnit, TimeUnit}; use arrow::match_integer_type; use arrow::types::{days_ms, i256, NativeType}; use ethnum::I256; -use parquet2::schema::types::{ - PhysicalType, PrimitiveLogicalType, PrimitiveType, TimeUnit as ParquetTimeUnit, -}; -use parquet2::types::int96_to_i64_ns; use polars_error::{polars_bail, PolarsResult}; use super::super::{ArrayIter, Pages}; use super::{binary, boolean, fixed_size_binary, null, primitive}; +use crate::parquet::schema::types::{ + PhysicalType, PrimitiveLogicalType, PrimitiveType, TimeUnit as ParquetTimeUnit, +}; +use crate::parquet::types::int96_to_i64_ns; /// Converts an iterator of arrays to a trait object returning trait objects #[inline] diff --git a/crates/polars-parquet/src/arrow/read/deserialize/utils.rs b/crates/polars-parquet/src/arrow/read/deserialize/utils.rs index 8e3eb98147c9..767180ed0db4 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/utils.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/utils.rs @@ -2,16 +2,16 @@ use std::collections::VecDeque; use arrow::bitmap::utils::BitmapIter; use arrow::bitmap::MutableBitmap; -use parquet2::deserialize::{ - FilteredHybridEncoded, FilteredHybridRleDecoderIter, HybridDecoderBitmapIter, HybridEncoded, -}; -use parquet2::encoding::hybrid_rle; -use parquet2::indexes::Interval; -use parquet2::page::{split_buffer, DataPage, DictPage, Page}; -use parquet2::schema::Repetition; use polars_error::{polars_err, to_compute_err, PolarsError, PolarsResult}; use super::super::Pages; +use 
crate::parquet::deserialize::{ + FilteredHybridEncoded, FilteredHybridRleDecoderIter, HybridDecoderBitmapIter, HybridEncoded, +}; +use crate::parquet::encoding::hybrid_rle; +use crate::parquet::indexes::Interval; +use crate::parquet::page::{split_buffer, DataPage, DictPage, Page}; +use crate::parquet::schema::Repetition; pub fn not_implemented(page: &DataPage) -> PolarsError { let is_optional = page.descriptor.primitive_type.field_info.repetition == Repetition::Optional; diff --git a/crates/polars-parquet/src/arrow/read/file.rs b/crates/polars-parquet/src/arrow/read/file.rs index 109011175c7b..0d026e4d74a9 100644 --- a/crates/polars-parquet/src/arrow/read/file.rs +++ b/crates/polars-parquet/src/arrow/read/file.rs @@ -3,11 +3,11 @@ use std::io::{Read, Seek}; use arrow::array::Array; use arrow::chunk::Chunk; use arrow::datatypes::Schema; -use parquet2::indexes::FilteredPage; use polars_error::PolarsResult; use super::{RowGroupDeserializer, RowGroupMetaData}; use crate::arrow::read::read_columns_many; +use crate::parquet::indexes::FilteredPage; /// An iterator of [`Chunk`]s coming from row groups of a parquet file. /// diff --git a/crates/polars-parquet/src/arrow/read/indexes/binary.rs b/crates/polars-parquet/src/arrow/read/indexes/binary.rs index 83de6a6f525a..5e115feb4253 100644 --- a/crates/polars-parquet/src/arrow/read/indexes/binary.rs +++ b/crates/polars-parquet/src/arrow/read/indexes/binary.rs @@ -1,10 +1,10 @@ use arrow::array::{Array, BinaryArray, PrimitiveArray, Utf8Array}; use arrow::datatypes::{DataType, PhysicalType}; use arrow::trusted_len::TrustedLen; -use parquet2::indexes::PageIndex; use polars_error::{to_compute_err, PolarsResult}; use super::ColumnPageStatistics; +use crate::parquet::indexes::PageIndex; pub fn deserialize( indexes: &[PageIndex>], diff --git a/crates/polars-parquet/src/arrow/read/indexes/boolean.rs b/crates/polars-parquet/src/arrow/read/indexes/boolean.rs index 5c809673eba1..b6414e24a621 100644 --- a/crates/polars-parquet/src/arrow/read/indexes/boolean.rs +++ b/crates/polars-parquet/src/arrow/read/indexes/boolean.rs @@ -1,7 +1,7 @@ use arrow::array::{BooleanArray, PrimitiveArray}; -use parquet2::indexes::PageIndex; use super::ColumnPageStatistics; +use crate::parquet::indexes::PageIndex; pub fn deserialize(indexes: &[PageIndex]) -> ColumnPageStatistics { ColumnPageStatistics { diff --git a/crates/polars-parquet/src/arrow/read/indexes/fixed_len_binary.rs b/crates/polars-parquet/src/arrow/read/indexes/fixed_len_binary.rs index c6cede8dd466..1a99e0b3e73a 100644 --- a/crates/polars-parquet/src/arrow/read/indexes/fixed_len_binary.rs +++ b/crates/polars-parquet/src/arrow/read/indexes/fixed_len_binary.rs @@ -2,9 +2,9 @@ use arrow::array::{Array, FixedSizeBinaryArray, MutableFixedSizeBinaryArray, Pri use arrow::datatypes::{DataType, PhysicalType, PrimitiveType}; use arrow::trusted_len::TrustedLen; use arrow::types::{i256, NativeType}; -use parquet2::indexes::PageIndex; use super::ColumnPageStatistics; +use crate::parquet::indexes::PageIndex; pub fn deserialize(indexes: &[PageIndex>], data_type: DataType) -> ColumnPageStatistics { ColumnPageStatistics { diff --git a/crates/polars-parquet/src/arrow/read/indexes/mod.rs b/crates/polars-parquet/src/arrow/read/indexes/mod.rs index 1abd34c5a968..60be17db158f 100644 --- a/crates/polars-parquet/src/arrow/read/indexes/mod.rs +++ b/crates/polars-parquet/src/arrow/read/indexes/mod.rs @@ -1,12 +1,12 @@ //! 
API to perform page-level filtering (also known as indexes) -use parquet2::error::Error as ParquetError; -use parquet2::indexes::{ +use crate::parquet::error::Error as ParquetError; +use crate::parquet::indexes::{ select_pages, BooleanIndex, ByteIndex, FixedLenByteIndex, Index as ParquetIndex, NativeIndex, PageLocation, }; -use parquet2::metadata::{ColumnChunkMetaData, RowGroupMetaData}; -use parquet2::read::{read_columns_indexes as _read_columns_indexes, read_pages_locations}; -use parquet2::schema::types::PhysicalType as ParquetPhysicalType; +use crate::parquet::metadata::{ColumnChunkMetaData, RowGroupMetaData}; +use crate::parquet::read::{read_columns_indexes as _read_columns_indexes, read_pages_locations}; +use crate::parquet::schema::types::PhysicalType as ParquetPhysicalType; mod binary; mod boolean; @@ -18,10 +18,10 @@ use std::io::{Read, Seek}; use arrow::array::{Array, UInt64Array}; use arrow::datatypes::{DataType, Field, PhysicalType, PrimitiveType}; -pub use parquet2::indexes::{FilteredPage, Interval}; use polars_error::{polars_bail, PolarsResult}; use super::get_field_pages; +pub use crate::parquet::indexes::{FilteredPage, Interval}; /// Page statistics of an Arrow field. #[derive(Debug, PartialEq)] @@ -83,7 +83,7 @@ fn deserialize( let index = index.as_any().downcast_ref::>().unwrap(); Ok(primitive::deserialize_i32(&index.indexes, data_type).into()) }, - parquet2::schema::types::PhysicalType::Int64 => { + crate::parquet::schema::types::PhysicalType::Int64 => { let index = index.as_any().downcast_ref::>().unwrap(); Ok( primitive::deserialize_i64( @@ -94,7 +94,7 @@ fn deserialize( .into(), ) }, - parquet2::schema::types::PhysicalType::FixedLenByteArray(_) => { + crate::parquet::schema::types::PhysicalType::FixedLenByteArray(_) => { let index = index.as_any().downcast_ref::().unwrap(); Ok(fixed_len_binary::deserialize(&index.indexes, data_type).into()) }, @@ -108,7 +108,7 @@ fn deserialize( let index = index.as_any().downcast_ref::>().unwrap(); Ok(primitive::deserialize_i32(&index.indexes, data_type).into()) }, - parquet2::schema::types::PhysicalType::Int64 => { + crate::parquet::schema::types::PhysicalType::Int64 => { let index = index.as_any().downcast_ref::>().unwrap(); Ok( primitive::deserialize_i64( @@ -119,7 +119,7 @@ fn deserialize( .into(), ) }, - parquet2::schema::types::PhysicalType::FixedLenByteArray(_) => { + crate::parquet::schema::types::PhysicalType::FixedLenByteArray(_) => { let index = index.as_any().downcast_ref::().unwrap(); Ok(fixed_len_binary::deserialize(&index.indexes, data_type).into()) }, @@ -153,7 +153,7 @@ fn deserialize( .into(), ) }, - parquet2::schema::types::PhysicalType::Int96 => { + crate::parquet::schema::types::PhysicalType::Int96 => { let index = index .as_any() .downcast_ref::>() diff --git a/crates/polars-parquet/src/arrow/read/indexes/primitive.rs b/crates/polars-parquet/src/arrow/read/indexes/primitive.rs index fd551c35a2b0..c2aba43f07cc 100644 --- a/crates/polars-parquet/src/arrow/read/indexes/primitive.rs +++ b/crates/polars-parquet/src/arrow/read/indexes/primitive.rs @@ -3,11 +3,13 @@ use arrow::datatypes::{DataType, TimeUnit}; use arrow::trusted_len::TrustedLen; use arrow::types::{i256, NativeType}; use ethnum::I256; -use parquet2::indexes::PageIndex; -use parquet2::schema::types::{PrimitiveLogicalType, PrimitiveType, TimeUnit as ParquetTimeUnit}; -use parquet2::types::int96_to_i64_ns; use super::ColumnPageStatistics; +use crate::parquet::indexes::PageIndex; +use crate::parquet::schema::types::{ + PrimitiveLogicalType, PrimitiveType, 
TimeUnit as ParquetTimeUnit, +}; +use crate::parquet::types::int96_to_i64_ns; #[inline] fn deserialize_int32>>( diff --git a/crates/polars-parquet/src/arrow/read/mod.rs b/crates/polars-parquet/src/arrow/read/mod.rs index 1a8cdc4c05b7..02d1336d11fc 100644 --- a/crates/polars-parquet/src/arrow/read/mod.rs +++ b/crates/polars-parquet/src/arrow/read/mod.rs @@ -19,17 +19,22 @@ pub use deserialize::{ pub use file::{FileReader, RowGroupReader}; #[cfg(feature = "async")] use futures::{AsyncRead, AsyncSeek}; -// re-exports of parquet2's relevant APIs -pub use parquet2::{ +use polars_error::PolarsResult; +pub use row_group::*; +pub use schema::{infer_schema, FileMetaData}; + +#[cfg(feature = "async")] +pub use crate::parquet::read::{get_page_stream, read_metadata_async as _read_metadata_async}; +// re-exports of crate::parquet's relevant APIs +pub use crate::parquet::{ error::Error as ParquetError, fallible_streaming_iterator, metadata::{ColumnChunkMetaData, ColumnDescriptor, RowGroupMetaData}, page::{CompressedDataPage, DataPageHeader, Page}, read::{ - decompress, get_column_iterator, get_page_stream, - read_columns_indexes as _read_columns_indexes, read_metadata as _read_metadata, - read_metadata_async as _read_metadata_async, read_pages_locations, BasicDecompressor, - Decompressor, MutStreamingIterator, PageFilter, PageReader, ReadColumnIterator, State, + decompress, get_column_iterator, read_columns_indexes as _read_columns_indexes, + read_metadata as _read_metadata, read_pages_locations, BasicDecompressor, Decompressor, + MutStreamingIterator, PageFilter, PageReader, ReadColumnIterator, State, }, schema::types::{ GroupLogicalType, ParquetType, PhysicalType, PrimitiveConvertedType, PrimitiveLogicalType, @@ -38,9 +43,6 @@ pub use parquet2::{ types::int96_to_i64_ns, FallibleStreamingIterator, }; -use polars_error::PolarsResult; -pub use row_group::*; -pub use schema::{infer_schema, FileMetaData}; /// Trait describing a [`FallibleStreamingIterator`] of [`Page`] pub trait Pages: diff --git a/crates/polars-parquet/src/arrow/read/row_group.rs b/crates/polars-parquet/src/arrow/read/row_group.rs index 24c7c9c64d40..a988a6d8c562 100644 --- a/crates/polars-parquet/src/arrow/read/row_group.rs +++ b/crates/polars-parquet/src/arrow/read/row_group.rs @@ -3,13 +3,13 @@ use std::io::{Read, Seek}; use arrow::array::Array; use arrow::chunk::Chunk; use arrow::datatypes::Field; -use parquet2::indexes::FilteredPage; -use parquet2::metadata::ColumnChunkMetaData; -use parquet2::read::{BasicDecompressor, IndexedPageReader, PageMetaData, PageReader}; use polars_error::PolarsResult; use super::{ArrayIter, RowGroupMetaData}; use crate::arrow::read::column_iter_to_arrays; +use crate::parquet::indexes::FilteredPage; +use crate::parquet::metadata::ColumnChunkMetaData; +use crate::parquet::read::{BasicDecompressor, IndexedPageReader, PageMetaData, PageReader}; /// An [`Iterator`] of [`Chunk`] that (dynamically) adapts a vector of iterators of [`Array`] into /// an iterator of [`Chunk`]. @@ -132,8 +132,12 @@ where } type Pages = Box< - dyn Iterator> - + Sync + dyn Iterator< + Item = std::result::Result< + crate::parquet::page::CompressedPage, + crate::parquet::error::Error, + >, + > + Sync + Send, >; diff --git a/crates/polars-parquet/src/arrow/read/schema/convert.rs b/crates/polars-parquet/src/arrow/read/schema/convert.rs index 549eaf654d1d..3f1e961c0dd4 100644 --- a/crates/polars-parquet/src/arrow/read/schema/convert.rs +++ b/crates/polars-parquet/src/arrow/read/schema/convert.rs @@ -1,12 +1,12 @@ //! 
This module has entry points, [`parquet_to_arrow_schema`] and the more configurable [`parquet_to_arrow_schema_with_options`]. use arrow::datatypes::{DataType, Field, IntervalUnit, TimeUnit}; -use parquet2::schema::types::{ + +use crate::arrow::read::schema::SchemaInferenceOptions; +use crate::parquet::schema::types::{ FieldInfo, GroupConvertedType, GroupLogicalType, IntegerType, ParquetType, PhysicalType, PrimitiveConvertedType, PrimitiveLogicalType, PrimitiveType, TimeUnit as ParquetTimeUnit, }; -use parquet2::schema::Repetition; - -use crate::arrow::read::schema::SchemaInferenceOptions; +use crate::parquet::schema::Repetition; /// Converts [`ParquetType`]s to a [`Field`], ignoring parquet fields that do not contain /// any physical column. @@ -399,10 +399,10 @@ pub(crate) fn to_data_type( #[cfg(test)] mod tests { use arrow::datatypes::{DataType, Field, TimeUnit}; - use parquet2::metadata::SchemaDescriptor; use polars_error::*; use super::*; + use crate::parquet::metadata::SchemaDescriptor; #[test] fn test_flat_primitives() -> PolarsResult<()> { diff --git a/crates/polars-parquet/src/arrow/read/schema/metadata.rs b/crates/polars-parquet/src/arrow/read/schema/metadata.rs index c3056cd63597..557f4fedfab4 100644 --- a/crates/polars-parquet/src/arrow/read/schema/metadata.rs +++ b/crates/polars-parquet/src/arrow/read/schema/metadata.rs @@ -2,10 +2,10 @@ use arrow::datatypes::{Metadata, Schema}; use arrow::io::ipc::read::deserialize_schema; use base64::engine::general_purpose; use base64::Engine as _; -pub use parquet2::metadata::KeyValue; use polars_error::{polars_bail, PolarsResult}; use super::super::super::ARROW_SCHEMA_META_KEY; +pub use crate::parquet::metadata::KeyValue; /// Reads an arrow schema from Parquet's file metadata. Returns `None` if no schema was found. 
/// # Errors diff --git a/crates/polars-parquet/src/arrow/read/schema/mod.rs b/crates/polars-parquet/src/arrow/read/schema/mod.rs index 2d4c8b5da54f..a5242105574d 100644 --- a/crates/polars-parquet/src/arrow/read/schema/mod.rs +++ b/crates/polars-parquet/src/arrow/read/schema/mod.rs @@ -7,11 +7,11 @@ mod metadata; pub(crate) use convert::*; pub use convert::{parquet_to_arrow_schema, parquet_to_arrow_schema_with_options}; pub use metadata::read_schema_from_metadata; -pub use parquet2::metadata::{FileMetaData, KeyValue, SchemaDescriptor}; -pub use parquet2::schema::types::ParquetType; use polars_error::PolarsResult; use self::metadata::parse_key_value_metadata; +pub use crate::parquet::metadata::{FileMetaData, KeyValue, SchemaDescriptor}; +pub use crate::parquet::schema::types::ParquetType; /// Options when inferring schemas from Parquet pub struct SchemaInferenceOptions { diff --git a/crates/polars-parquet/src/arrow/read/statistics/binary.rs b/crates/polars-parquet/src/arrow/read/statistics/binary.rs index 925d81176e2b..7931cec42c1c 100644 --- a/crates/polars-parquet/src/arrow/read/statistics/binary.rs +++ b/crates/polars-parquet/src/arrow/read/statistics/binary.rs @@ -1,8 +1,9 @@ use arrow::array::{MutableArray, MutableBinaryArray}; use arrow::offset::Offset; -use parquet2::statistics::{BinaryStatistics, Statistics as ParquetStatistics}; use polars_error::PolarsResult; +use crate::parquet::statistics::{BinaryStatistics, Statistics as ParquetStatistics}; + pub(super) fn push( from: Option<&dyn ParquetStatistics>, min: &mut dyn MutableArray, diff --git a/crates/polars-parquet/src/arrow/read/statistics/boolean.rs b/crates/polars-parquet/src/arrow/read/statistics/boolean.rs index 23a5504124ce..07a823f3cf2f 100644 --- a/crates/polars-parquet/src/arrow/read/statistics/boolean.rs +++ b/crates/polars-parquet/src/arrow/read/statistics/boolean.rs @@ -1,7 +1,8 @@ use arrow::array::{MutableArray, MutableBooleanArray}; -use parquet2::statistics::{BooleanStatistics, Statistics as ParquetStatistics}; use polars_error::PolarsResult; +use crate::parquet::statistics::{BooleanStatistics, Statistics as ParquetStatistics}; + pub(super) fn push( from: Option<&dyn ParquetStatistics>, min: &mut dyn MutableArray, diff --git a/crates/polars-parquet/src/arrow/read/statistics/fixlen.rs b/crates/polars-parquet/src/arrow/read/statistics/fixlen.rs index b5e03eaa38f2..8758ce3c05e4 100644 --- a/crates/polars-parquet/src/arrow/read/statistics/fixlen.rs +++ b/crates/polars-parquet/src/arrow/read/statistics/fixlen.rs @@ -1,11 +1,11 @@ use arrow::array::*; use arrow::types::{days_ms, i256}; use ethnum::I256; -use parquet2::statistics::{FixedLenStatistics, Statistics as ParquetStatistics}; use polars_error::PolarsResult; use super::super::{convert_days_ms, convert_i128}; use crate::arrow::read::convert_i256; +use crate::parquet::statistics::{FixedLenStatistics, Statistics as ParquetStatistics}; pub(super) fn push_i128( from: Option<&dyn ParquetStatistics>, diff --git a/crates/polars-parquet/src/arrow/read/statistics/mod.rs b/crates/polars-parquet/src/arrow/read/statistics/mod.rs index ada51dbc2d39..0dcf04105ca7 100644 --- a/crates/polars-parquet/src/arrow/read/statistics/mod.rs +++ b/crates/polars-parquet/src/arrow/read/statistics/mod.rs @@ -1,4 +1,4 @@ -//! APIs exposing `parquet2`'s statistics as arrow's statistics. +//! APIs exposing `crate::parquet`'s statistics as arrow's statistics. 
use std::collections::VecDeque; use std::sync::Arc; @@ -7,16 +7,17 @@ use arrow::datatypes::{DataType, Field, IntervalUnit, PhysicalType}; use arrow::types::i256; use arrow::with_match_primitive_type; use ethnum::I256; -use parquet2::metadata::RowGroupMetaData; -use parquet2::schema::types::{ +use polars_error::{polars_bail, PolarsResult}; + +use crate::parquet::metadata::RowGroupMetaData; +use crate::parquet::schema::types::{ PhysicalType as ParquetPhysicalType, PrimitiveType as ParquetPrimitiveType, }; -use parquet2::statistics::{ +use crate::parquet::statistics::{ BinaryStatistics, BooleanStatistics, FixedLenStatistics, PrimitiveStatistics, Statistics as ParquetStatistics, }; -use parquet2::types::int96_to_i64_ns; -use polars_error::{polars_bail, PolarsResult}; +use crate::parquet::types::int96_to_i64_ns; mod binary; mod boolean; diff --git a/crates/polars-parquet/src/arrow/read/statistics/primitive.rs b/crates/polars-parquet/src/arrow/read/statistics/primitive.rs index ecfa2e18972f..e6a48ed67628 100644 --- a/crates/polars-parquet/src/arrow/read/statistics/primitive.rs +++ b/crates/polars-parquet/src/arrow/read/statistics/primitive.rs @@ -1,11 +1,12 @@ use arrow::array::*; use arrow::datatypes::TimeUnit; use arrow::types::NativeType; -use parquet2::schema::types::{PrimitiveLogicalType, TimeUnit as ParquetTimeUnit}; -use parquet2::statistics::{PrimitiveStatistics, Statistics as ParquetStatistics}; -use parquet2::types::NativeType as ParquetNativeType; use polars_error::PolarsResult; +use crate::parquet::schema::types::{PrimitiveLogicalType, TimeUnit as ParquetTimeUnit}; +use crate::parquet::statistics::{PrimitiveStatistics, Statistics as ParquetStatistics}; +use crate::parquet::types::NativeType as ParquetNativeType; + pub fn timestamp(logical_type: Option<&PrimitiveLogicalType>, time_unit: TimeUnit, x: i64) -> i64 { let unit = if let Some(PrimitiveLogicalType::Timestamp { unit, .. 
}) = logical_type { unit diff --git a/crates/polars-parquet/src/arrow/read/statistics/utf8.rs b/crates/polars-parquet/src/arrow/read/statistics/utf8.rs index a716e8d22b8a..b12d6b7b11a9 100644 --- a/crates/polars-parquet/src/arrow/read/statistics/utf8.rs +++ b/crates/polars-parquet/src/arrow/read/statistics/utf8.rs @@ -1,8 +1,9 @@ use arrow::array::{MutableArray, MutableUtf8Array}; use arrow::offset::Offset; -use parquet2::statistics::{BinaryStatistics, Statistics as ParquetStatistics}; use polars_error::PolarsResult; +use crate::parquet::statistics::{BinaryStatistics, Statistics as ParquetStatistics}; + pub(super) fn push( from: Option<&dyn ParquetStatistics>, min: &mut dyn MutableArray, diff --git a/crates/polars-parquet/src/arrow/write/binary/basic.rs b/crates/polars-parquet/src/arrow/write/binary/basic.rs index ee2cb022e699..3390a90c43e3 100644 --- a/crates/polars-parquet/src/arrow/write/binary/basic.rs +++ b/crates/polars-parquet/src/arrow/write/binary/basic.rs @@ -1,14 +1,16 @@ use arrow::array::{Array, BinaryArray}; use arrow::bitmap::Bitmap; use arrow::offset::Offset; -use parquet2::encoding::{delta_bitpacked, Encoding}; -use parquet2::page::DataPage; -use parquet2::schema::types::PrimitiveType; -use parquet2::statistics::{serialize_statistics, BinaryStatistics, ParquetStatistics, Statistics}; use polars_error::{polars_bail, PolarsResult}; use super::super::{utils, WriteOptions}; use crate::arrow::read::schema::is_nullable; +use crate::parquet::encoding::{delta_bitpacked, Encoding}; +use crate::parquet::page::DataPage; +use crate::parquet::schema::types::PrimitiveType; +use crate::parquet::statistics::{ + serialize_statistics, BinaryStatistics, ParquetStatistics, Statistics, +}; pub(crate) fn encode_plain( array: &BinaryArray, diff --git a/crates/polars-parquet/src/arrow/write/binary/nested.rs b/crates/polars-parquet/src/arrow/write/binary/nested.rs index d72917df942d..3b20b3af4936 100644 --- a/crates/polars-parquet/src/arrow/write/binary/nested.rs +++ b/crates/polars-parquet/src/arrow/write/binary/nested.rs @@ -1,14 +1,14 @@ use arrow::array::{Array, BinaryArray}; use arrow::offset::Offset; -use parquet2::encoding::Encoding; -use parquet2::page::DataPage; -use parquet2::schema::types::PrimitiveType; use polars_error::PolarsResult; use super::super::{nested, utils, WriteOptions}; use super::basic::{build_statistics, encode_plain}; use crate::arrow::read::schema::is_nullable; use crate::arrow::write::Nested; +use crate::parquet::encoding::Encoding; +use crate::parquet::page::DataPage; +use crate::parquet::schema::types::PrimitiveType; pub fn array_to_page( array: &BinaryArray, diff --git a/crates/polars-parquet/src/arrow/write/boolean/basic.rs b/crates/polars-parquet/src/arrow/write/boolean/basic.rs index c18a9b0bfb24..466039c9b55d 100644 --- a/crates/polars-parquet/src/arrow/write/boolean/basic.rs +++ b/crates/polars-parquet/src/arrow/write/boolean/basic.rs @@ -1,15 +1,15 @@ use arrow::array::*; -use parquet2::encoding::hybrid_rle::bitpacked_encode; -use parquet2::encoding::Encoding; -use parquet2::page::DataPage; -use parquet2::schema::types::PrimitiveType; -use parquet2::statistics::{ - serialize_statistics, BooleanStatistics, ParquetStatistics, Statistics, -}; use polars_error::PolarsResult; use super::super::{utils, WriteOptions}; use crate::arrow::read::schema::is_nullable; +use crate::parquet::encoding::hybrid_rle::bitpacked_encode; +use crate::parquet::encoding::Encoding; +use crate::parquet::page::DataPage; +use crate::parquet::schema::types::PrimitiveType; +use 
crate::parquet::statistics::{ + serialize_statistics, BooleanStatistics, ParquetStatistics, Statistics, +}; fn encode(iterator: impl Iterator, buffer: &mut Vec) -> PolarsResult<()> { // encode values using bitpacking diff --git a/crates/polars-parquet/src/arrow/write/boolean/nested.rs b/crates/polars-parquet/src/arrow/write/boolean/nested.rs index 3ee9cfba328f..eb7a66cfd32c 100644 --- a/crates/polars-parquet/src/arrow/write/boolean/nested.rs +++ b/crates/polars-parquet/src/arrow/write/boolean/nested.rs @@ -1,13 +1,13 @@ use arrow::array::{Array, BooleanArray}; -use parquet2::encoding::Encoding; -use parquet2::page::DataPage; -use parquet2::schema::types::PrimitiveType; use polars_error::PolarsResult; use super::super::{nested, utils, WriteOptions}; use super::basic::{build_statistics, encode_plain}; use crate::arrow::read::schema::is_nullable; use crate::arrow::write::Nested; +use crate::parquet::encoding::Encoding; +use crate::parquet::page::DataPage; +use crate::parquet::schema::types::PrimitiveType; pub fn array_to_page( array: &BooleanArray, diff --git a/crates/polars-parquet/src/arrow/write/dictionary.rs b/crates/polars-parquet/src/arrow/write/dictionary.rs index e996e78770d5..519e65d28ad6 100644 --- a/crates/polars-parquet/src/arrow/write/dictionary.rs +++ b/crates/polars-parquet/src/arrow/write/dictionary.rs @@ -1,12 +1,6 @@ use arrow::array::{Array, DictionaryArray, DictionaryKey}; use arrow::bitmap::{Bitmap, MutableBitmap}; use arrow::datatypes::DataType; -use parquet2::encoding::hybrid_rle::encode_u32; -use parquet2::encoding::Encoding; -use parquet2::page::{DictPage, Page}; -use parquet2::schema::types::PrimitiveType; -use parquet2::statistics::{serialize_statistics, ParquetStatistics}; -use parquet2::write::DynIter; use polars_error::{polars_bail, PolarsResult}; use super::binary::{ @@ -22,6 +16,12 @@ use super::utf8::{build_statistics as utf8_build_statistics, encode_plain as utf use super::{nested, Nested, WriteOptions}; use crate::arrow::read::schema::is_nullable; use crate::arrow::write::{slice_nested_leaf, utils}; +use crate::parquet::encoding::hybrid_rle::encode_u32; +use crate::parquet::encoding::Encoding; +use crate::parquet::page::{DictPage, Page}; +use crate::parquet::schema::types::PrimitiveType; +use crate::parquet::statistics::{serialize_statistics, ParquetStatistics}; +use crate::parquet::write::DynIter; fn serialize_def_levels_simple( validity: Option<&Bitmap>, diff --git a/crates/polars-parquet/src/arrow/write/file.rs b/crates/polars-parquet/src/arrow/write/file.rs index b0fad55b1e1d..a44a914ad46f 100644 --- a/crates/polars-parquet/src/arrow/write/file.rs +++ b/crates/polars-parquet/src/arrow/write/file.rs @@ -1,12 +1,12 @@ use std::io::Write; use arrow::datatypes::Schema; -use parquet2::metadata::{KeyValue, SchemaDescriptor}; -use parquet2::write::{RowGroupIter, WriteOptions as FileWriteOptions}; use polars_error::{PolarsError, PolarsResult}; use super::schema::schema_to_metadata_key; use super::{to_parquet_schema, ThriftFileMetaData, WriteOptions}; +use crate::parquet::metadata::{KeyValue, SchemaDescriptor}; +use crate::parquet::write::{RowGroupIter, WriteOptions as FileWriteOptions}; /// Attaches [`Schema`] to `key_value_metadata` pub fn add_arrow_schema( @@ -23,7 +23,7 @@ pub fn add_arrow_schema( /// An interface to write a parquet to a [`Write`] pub struct FileWriter { - writer: parquet2::write::FileWriter, + writer: crate::parquet::write::FileWriter, schema: Schema, options: WriteOptions, } @@ -56,7 +56,7 @@ impl FileWriter { let created_by = Some("Arrow2 
- Native Rust implementation of Arrow".to_string()); Ok(Self { - writer: parquet2::write::FileWriter::new( + writer: crate::parquet::write::FileWriter::new( writer, parquet_schema, FileWriteOptions { diff --git a/crates/polars-parquet/src/arrow/write/fixed_len_bytes.rs b/crates/polars-parquet/src/arrow/write/fixed_len_bytes.rs index 0531f66ffa4d..c1ce9754a4ed 100644 --- a/crates/polars-parquet/src/arrow/write/fixed_len_bytes.rs +++ b/crates/polars-parquet/src/arrow/write/fixed_len_bytes.rs @@ -1,14 +1,14 @@ use arrow::array::{Array, FixedSizeBinaryArray, PrimitiveArray}; use arrow::types::i256; -use parquet2::encoding::Encoding; -use parquet2::page::DataPage; -use parquet2::schema::types::PrimitiveType; -use parquet2::statistics::{serialize_statistics, FixedLenStatistics}; use polars_error::PolarsResult; use super::binary::ord_binary; use super::{utils, WriteOptions}; use crate::arrow::read::schema::is_nullable; +use crate::parquet::encoding::Encoding; +use crate::parquet::page::DataPage; +use crate::parquet::schema::types::PrimitiveType; +use crate::parquet::statistics::{serialize_statistics, FixedLenStatistics}; pub(crate) fn encode_plain(array: &FixedSizeBinaryArray, is_optional: bool, buffer: &mut Vec) { // append the non-null values diff --git a/crates/polars-parquet/src/arrow/write/mod.rs b/crates/polars-parquet/src/arrow/write/mod.rs index a1db4a4f4147..7276d90639dd 100644 --- a/crates/polars-parquet/src/arrow/write/mod.rs +++ b/crates/polars-parquet/src/arrow/write/mod.rs @@ -32,20 +32,23 @@ use arrow::datatypes::*; use arrow::types::{days_ms, i256, NativeType}; pub use nested::{num_values, write_rep_and_def}; pub use pages::{to_leaves, to_nested, to_parquet_leaves}; -pub use parquet2::compression::{BrotliLevel, CompressionOptions, GzipLevel, ZstdLevel}; -pub use parquet2::encoding::Encoding; -pub use parquet2::metadata::{ +pub use utils::write_def_levels; + +pub use crate::parquet::compression::{BrotliLevel, CompressionOptions, GzipLevel, ZstdLevel}; +pub use crate::parquet::encoding::Encoding; +pub use crate::parquet::metadata::{ Descriptor, FileMetaData, KeyValue, SchemaDescriptor, ThriftFileMetaData, }; -pub use parquet2::page::{CompressedDataPage, CompressedPage, Page}; -use parquet2::schema::types::PrimitiveType as ParquetPrimitiveType; -pub use parquet2::schema::types::{FieldInfo, ParquetType, PhysicalType as ParquetPhysicalType}; -pub use parquet2::write::{ +pub use crate::parquet::page::{CompressedDataPage, CompressedPage, Page}; +use crate::parquet::schema::types::PrimitiveType as ParquetPrimitiveType; +pub use crate::parquet::schema::types::{ + FieldInfo, ParquetType, PhysicalType as ParquetPhysicalType, +}; +pub use crate::parquet::write::{ compress, write_metadata_sidecar, Compressor, DynIter, DynStreamingIterator, RowGroupIter, Version, }; -pub use parquet2::{fallible_streaming_iterator, FallibleStreamingIterator}; -pub use utils::write_def_levels; +pub use crate::parquet::{fallible_streaming_iterator, FallibleStreamingIterator}; /// Currently supported options to write to parquet #[derive(Debug, Clone, Copy, PartialEq, Eq)] diff --git a/crates/polars-parquet/src/arrow/write/nested/mod.rs b/crates/polars-parquet/src/arrow/write/nested/mod.rs index 4fed334a820f..c53d266255c5 100644 --- a/crates/polars-parquet/src/arrow/write/nested/mod.rs +++ b/crates/polars-parquet/src/arrow/write/nested/mod.rs @@ -2,13 +2,13 @@ mod def; mod rep; use arrow::offset::Offset; -use parquet2::encoding::hybrid_rle::encode_u32; -use parquet2::read::levels::get_bit_width; -use 
parquet2::write::Version; use polars_error::PolarsResult; pub use rep::num_values; use super::Nested; +use crate::parquet::encoding::hybrid_rle::encode_u32; +use crate::parquet::read::levels::get_bit_width; +use crate::parquet::write::Version; fn write_levels_v1) -> PolarsResult<()>>( buffer: &mut Vec, diff --git a/crates/polars-parquet/src/arrow/write/pages.rs b/crates/polars-parquet/src/arrow/write/pages.rs index 46698d083f96..2a3a3ac47f22 100644 --- a/crates/polars-parquet/src/arrow/write/pages.rs +++ b/crates/polars-parquet/src/arrow/write/pages.rs @@ -4,13 +4,13 @@ use arrow::array::{Array, ListArray, MapArray, StructArray}; use arrow::bitmap::Bitmap; use arrow::datatypes::PhysicalType; use arrow::offset::{Offset, OffsetsBuffer}; -use parquet2::page::Page; -use parquet2::schema::types::{ParquetType, PrimitiveType as ParquetPrimitiveType}; -use parquet2::write::DynIter; use polars_error::{polars_bail, PolarsResult}; use super::{array_to_pages, Encoding, WriteOptions}; use crate::arrow::read::schema::is_nullable; +use crate::parquet::page::Page; +use crate::parquet::schema::types::{ParquetType, PrimitiveType as ParquetPrimitiveType}; +use crate::parquet::write::DynIter; #[derive(Debug, Clone, PartialEq)] pub struct ListNested { @@ -259,11 +259,13 @@ mod tests { use arrow::array::*; use arrow::bitmap::Bitmap; use arrow::datatypes::*; - use parquet2::schema::types::{GroupLogicalType, PrimitiveConvertedType, PrimitiveLogicalType}; - use parquet2::schema::Repetition; use super::super::{FieldInfo, ParquetPhysicalType, ParquetPrimitiveType}; use super::*; + use crate::parquet::schema::types::{ + GroupLogicalType, PrimitiveConvertedType, PrimitiveLogicalType, + }; + use crate::parquet::schema::Repetition; #[test] fn test_struct() { diff --git a/crates/polars-parquet/src/arrow/write/primitive/basic.rs b/crates/polars-parquet/src/arrow/write/primitive/basic.rs index 81d7b5cba943..a83e1f22d45d 100644 --- a/crates/polars-parquet/src/arrow/write/primitive/basic.rs +++ b/crates/polars-parquet/src/arrow/write/primitive/basic.rs @@ -1,16 +1,16 @@ use arrow::array::{Array, PrimitiveArray}; use arrow::types::NativeType; -use parquet2::encoding::delta_bitpacked::encode; -use parquet2::encoding::Encoding; -use parquet2::page::DataPage; -use parquet2::schema::types::PrimitiveType; -use parquet2::statistics::{serialize_statistics, PrimitiveStatistics}; -use parquet2::types::NativeType as ParquetNativeType; use polars_error::{polars_bail, PolarsResult}; use super::super::{utils, WriteOptions}; use crate::arrow::read::schema::is_nullable; use crate::arrow::write::utils::ExactSizedIter; +use crate::parquet::encoding::delta_bitpacked::encode; +use crate::parquet::encoding::Encoding; +use crate::parquet::page::DataPage; +use crate::parquet::schema::types::PrimitiveType; +use crate::parquet::statistics::{serialize_statistics, PrimitiveStatistics}; +use crate::parquet::types::NativeType as ParquetNativeType; pub(crate) fn encode_plain( array: &PrimitiveArray, diff --git a/crates/polars-parquet/src/arrow/write/primitive/nested.rs b/crates/polars-parquet/src/arrow/write/primitive/nested.rs index a5cb2229de6f..22f6ec7d8148 100644 --- a/crates/polars-parquet/src/arrow/write/primitive/nested.rs +++ b/crates/polars-parquet/src/arrow/write/primitive/nested.rs @@ -1,16 +1,16 @@ use arrow::array::{Array, PrimitiveArray}; use arrow::types::NativeType as ArrowNativeType; -use parquet2::encoding::Encoding; -use parquet2::page::DataPage; -use parquet2::schema::types::PrimitiveType; -use 
parquet2::statistics::serialize_statistics; -use parquet2::types::NativeType; use polars_error::PolarsResult; use super::super::{nested, utils, WriteOptions}; use super::basic::{build_statistics, encode_plain}; use crate::arrow::read::schema::is_nullable; use crate::arrow::write::Nested; +use crate::parquet::encoding::Encoding; +use crate::parquet::page::DataPage; +use crate::parquet::schema::types::PrimitiveType; +use crate::parquet::statistics::serialize_statistics; +use crate::parquet::types::NativeType; pub fn array_to_page( array: &PrimitiveArray, diff --git a/crates/polars-parquet/src/arrow/write/row_group.rs b/crates/polars-parquet/src/arrow/write/row_group.rs index 6d2269c178b9..88ede63c3d75 100644 --- a/crates/polars-parquet/src/arrow/write/row_group.rs +++ b/crates/polars-parquet/src/arrow/write/row_group.rs @@ -1,16 +1,16 @@ use arrow::array::Array; use arrow::chunk::Chunk; use arrow::datatypes::Schema; -use parquet2::error::Error as ParquetError; -use parquet2::schema::types::ParquetType; -use parquet2::write::Compressor; -use parquet2::FallibleStreamingIterator; use polars_error::{polars_bail, to_compute_err, PolarsError, PolarsResult}; use super::{ array_to_columns, to_parquet_schema, DynIter, DynStreamingIterator, Encoding, RowGroupIter, SchemaDescriptor, WriteOptions, }; +use crate::parquet::error::Error as ParquetError; +use crate::parquet::schema::types::ParquetType; +use crate::parquet::write::Compressor; +use crate::parquet::FallibleStreamingIterator; /// Maps a [`Chunk`] and parquet-specific options to an [`RowGroupIter`] used to /// write to parquet diff --git a/crates/polars-parquet/src/arrow/write/schema.rs b/crates/polars-parquet/src/arrow/write/schema.rs index 89fa6c7ef99a..e4a84a7d7906 100644 --- a/crates/polars-parquet/src/arrow/write/schema.rs +++ b/crates/polars-parquet/src/arrow/write/schema.rs @@ -2,16 +2,16 @@ use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; use arrow::io::ipc::write::{default_ipc_fields, schema_to_bytes}; use base64::engine::general_purpose; use base64::Engine as _; -use parquet2::metadata::KeyValue; -use parquet2::schema::types::{ - GroupConvertedType, GroupLogicalType, IntegerType, ParquetType, PhysicalType, - PrimitiveConvertedType, PrimitiveLogicalType, TimeUnit as ParquetTimeUnit, -}; -use parquet2::schema::Repetition; use polars_error::{polars_bail, PolarsResult}; use super::super::ARROW_SCHEMA_META_KEY; use crate::arrow::write::decimal_length_from_precision; +use crate::parquet::metadata::KeyValue; +use crate::parquet::schema::types::{ + GroupConvertedType, GroupLogicalType, IntegerType, ParquetType, PhysicalType, + PrimitiveConvertedType, PrimitiveLogicalType, TimeUnit as ParquetTimeUnit, +}; +use crate::parquet::schema::Repetition; pub fn schema_to_metadata_key(schema: &Schema) -> KeyValue { let serialized_schema = schema_to_bytes(schema, &default_ipc_fields(&schema.fields)); diff --git a/crates/polars-parquet/src/arrow/write/sink.rs b/crates/polars-parquet/src/arrow/write/sink.rs index 16ffd4176d1d..1ddef77aa687 100644 --- a/crates/polars-parquet/src/arrow/write/sink.rs +++ b/crates/polars-parquet/src/arrow/write/sink.rs @@ -7,12 +7,12 @@ use arrow::chunk::Chunk; use arrow::datatypes::Schema; use futures::future::BoxFuture; use futures::{AsyncWrite, AsyncWriteExt, FutureExt, Sink, TryFutureExt}; -use parquet2::metadata::KeyValue; -use parquet2::write::{FileStreamer, WriteOptions as ParquetWriteOptions}; use polars_error::{polars_bail, to_compute_err, PolarsError, PolarsResult}; use super::file::add_arrow_schema; use 
super::{Encoding, SchemaDescriptor, WriteOptions}; +use crate::parquet::metadata::KeyValue; +use crate::parquet::write::{FileStreamer, WriteOptions as ParquetWriteOptions}; /// Sink that writes array [`chunks`](Chunk) as a Parquet file. /// diff --git a/crates/polars-parquet/src/arrow/write/utf8/basic.rs b/crates/polars-parquet/src/arrow/write/utf8/basic.rs index cb64dfd561f5..f1d874683fcd 100644 --- a/crates/polars-parquet/src/arrow/write/utf8/basic.rs +++ b/crates/polars-parquet/src/arrow/write/utf8/basic.rs @@ -1,14 +1,16 @@ use arrow::array::{Array, Utf8Array}; use arrow::offset::Offset; -use parquet2::encoding::Encoding; -use parquet2::page::DataPage; -use parquet2::schema::types::PrimitiveType; -use parquet2::statistics::{serialize_statistics, BinaryStatistics, ParquetStatistics, Statistics}; use polars_error::{polars_bail, PolarsResult}; use super::super::binary::{encode_delta, ord_binary}; use super::super::{utils, WriteOptions}; use crate::arrow::read::schema::is_nullable; +use crate::parquet::encoding::Encoding; +use crate::parquet::page::DataPage; +use crate::parquet::schema::types::PrimitiveType; +use crate::parquet::statistics::{ + serialize_statistics, BinaryStatistics, ParquetStatistics, Statistics, +}; pub(crate) fn encode_plain( array: &Utf8Array, diff --git a/crates/polars-parquet/src/arrow/write/utf8/nested.rs b/crates/polars-parquet/src/arrow/write/utf8/nested.rs index a0a8640dde9f..1cc0a1f0523b 100644 --- a/crates/polars-parquet/src/arrow/write/utf8/nested.rs +++ b/crates/polars-parquet/src/arrow/write/utf8/nested.rs @@ -1,14 +1,14 @@ use arrow::array::{Array, Utf8Array}; use arrow::offset::Offset; -use parquet2::encoding::Encoding; -use parquet2::page::DataPage; -use parquet2::schema::types::PrimitiveType; use polars_error::PolarsResult; use super::super::{nested, utils, WriteOptions}; use super::basic::{build_statistics, encode_plain}; use crate::arrow::read::schema::is_nullable; use crate::arrow::write::Nested; +use crate::parquet::encoding::Encoding; +use crate::parquet::page::DataPage; +use crate::parquet::schema::types::PrimitiveType; pub fn array_to_page( array: &Utf8Array, diff --git a/crates/polars-parquet/src/arrow/write/utils.rs b/crates/polars-parquet/src/arrow/write/utils.rs index c7424e6e6f4d..6497240374c6 100644 --- a/crates/polars-parquet/src/arrow/write/utils.rs +++ b/crates/polars-parquet/src/arrow/write/utils.rs @@ -1,14 +1,14 @@ use arrow::bitmap::Bitmap; -use parquet2::compression::CompressionOptions; -use parquet2::encoding::hybrid_rle::encode_bool; -use parquet2::encoding::Encoding; -use parquet2::metadata::Descriptor; -use parquet2::page::{DataPage, DataPageHeader, DataPageHeaderV1, DataPageHeaderV2}; -use parquet2::schema::types::PrimitiveType; -use parquet2::statistics::ParquetStatistics; use polars_error::PolarsResult; use super::{Version, WriteOptions}; +use crate::parquet::compression::CompressionOptions; +use crate::parquet::encoding::hybrid_rle::encode_bool; +use crate::parquet::encoding::Encoding; +use crate::parquet::metadata::Descriptor; +use crate::parquet::page::{DataPage, DataPageHeader, DataPageHeaderV1, DataPageHeaderV2}; +use crate::parquet::schema::types::PrimitiveType; +use crate::parquet::statistics::ParquetStatistics; fn encode_iter_v1>(buffer: &mut Vec, iter: I) -> PolarsResult<()> { buffer.extend_from_slice(&[0; 4]); diff --git a/crates/polars-parquet/src/lib.rs b/crates/polars-parquet/src/lib.rs index 4b64c583ce23..ae45ad4df442 100644 --- a/crates/polars-parquet/src/lib.rs +++ b/crates/polars-parquet/src/lib.rs @@ -1,3 
+1,4 @@ #![allow(clippy::len_without_is_empty)] pub mod arrow; pub use arrow::{read, write}; +pub mod parquet; diff --git a/crates/polars-parquet/src/parquet/bloom_filter/hash.rs b/crates/polars-parquet/src/parquet/bloom_filter/hash.rs new file mode 100644 index 000000000000..c535faa44d76 --- /dev/null +++ b/crates/polars-parquet/src/parquet/bloom_filter/hash.rs @@ -0,0 +1,17 @@ +use xxhash_rust::xxh64::xxh64; + +use crate::parquet::types::NativeType; + +const SEED: u64 = 0; + +/// (xxh64) hash of a [`NativeType`]. +#[inline] +pub fn hash_native(value: T) -> u64 { + xxh64(value.to_le_bytes().as_ref(), SEED) +} + +/// (xxh64) hash of a sequence of bytes (e.g. ByteArray). +#[inline] +pub fn hash_byte>(value: A) -> u64 { + xxh64(value.as_ref(), SEED) +} diff --git a/crates/polars-parquet/src/parquet/bloom_filter/mod.rs b/crates/polars-parquet/src/parquet/bloom_filter/mod.rs new file mode 100644 index 000000000000..218715d7ac5f --- /dev/null +++ b/crates/polars-parquet/src/parquet/bloom_filter/mod.rs @@ -0,0 +1,71 @@ +//! API to read and use bloom filters +mod hash; +mod read; +mod split_block; + +pub use hash::{hash_byte, hash_native}; +pub use read::read; +pub use split_block::{insert, is_in_set}; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn basics() { + let mut bitset = vec![0; 32]; + + // insert + for a in 0..10i64 { + let hash = hash_native(a); + insert(&mut bitset, hash); + } + + // bloom filter produced by parquet-mr/spark for a column of i64 (0..=10) + /* + import pyspark.sql // 3.2.1 + spark = pyspark.sql.SparkSession.builder.getOrCreate() + spark.conf.set("parquet.bloom.filter.enabled", True) + spark.conf.set("parquet.bloom.filter.expected.ndv", 10) + spark.conf.set("parquet.bloom.filter.max.bytes", 32) + + data = [(i % 10,) for i in range(100)] + df = spark.createDataFrame(data, ["id"]).repartition(1) + + df.write.parquet("bla.parquet", mode = "overwrite") + */ + let expected: &[u8] = &[ + 24, 130, 24, 8, 134, 8, 68, 6, 2, 101, 128, 10, 64, 2, 38, 78, 114, 1, 64, 38, 1, 192, + 194, 152, 64, 70, 0, 36, 56, 121, 64, 0, + ]; + assert_eq!(bitset, expected); + + // check + for a in 0..11i64 { + let hash = hash_native(a); + + let valid = is_in_set(&bitset, hash); + + assert_eq!(a < 10, valid); + } + } + + #[test] + fn binary() { + let mut bitset = vec![0; 32]; + + // insert + for a in 0..10i64 { + let value = format!("a{}", a); + let hash = hash_byte(value); + insert(&mut bitset, hash); + } + + // bloom filter produced by parquet-mr/spark for a column of i64 f"a{i}" for i in 0..10 + let expected: &[u8] = &[ + 200, 1, 80, 20, 64, 68, 8, 109, 6, 37, 4, 67, 144, 80, 96, 32, 8, 132, 43, 33, 0, 5, + 99, 65, 2, 0, 224, 44, 64, 78, 96, 4, + ]; + assert_eq!(bitset, expected); + } +} diff --git a/crates/polars-parquet/src/parquet/bloom_filter/read.rs b/crates/polars-parquet/src/parquet/bloom_filter/read.rs new file mode 100644 index 000000000000..50c90a2e407a --- /dev/null +++ b/crates/polars-parquet/src/parquet/bloom_filter/read.rs @@ -0,0 +1,51 @@ +use std::io::{Read, Seek, SeekFrom}; + +use parquet_format_safe::thrift::protocol::TCompactInputProtocol; +use parquet_format_safe::{ + BloomFilterAlgorithm, BloomFilterCompression, BloomFilterHeader, SplitBlockAlgorithm, + Uncompressed, +}; + +use crate::parquet::error::Error; +use crate::parquet::metadata::ColumnChunkMetaData; + +/// Reads the bloom filter associated to [`ColumnChunkMetaData`] into `bitset`. +/// Results in an empty `bitset` if there is no associated bloom filter or the algorithm is not supported. 
+/// # Error +/// Errors if the column contains no metadata or the filter can't be read or deserialized. +pub fn read( + column_metadata: &ColumnChunkMetaData, + mut reader: &mut R, + bitset: &mut Vec, +) -> Result<(), Error> { + let offset = column_metadata.metadata().bloom_filter_offset; + + let offset = if let Some(offset) = offset { + offset as u64 + } else { + bitset.clear(); + return Ok(()); + }; + reader.seek(SeekFrom::Start(offset))?; + + // deserialize header + let mut prot = TCompactInputProtocol::new(&mut reader, usize::MAX); // max is ok since `BloomFilterHeader` never allocates + let header = BloomFilterHeader::read_from_in_protocol(&mut prot)?; + + if header.algorithm != BloomFilterAlgorithm::BLOCK(SplitBlockAlgorithm {}) { + bitset.clear(); + return Ok(()); + } + if header.compression != BloomFilterCompression::UNCOMPRESSED(Uncompressed {}) { + bitset.clear(); + return Ok(()); + } + + let length: usize = header.num_bytes.try_into()?; + + bitset.clear(); + bitset.try_reserve(length)?; + reader.by_ref().take(length as u64).read_to_end(bitset)?; + + Ok(()) +} diff --git a/crates/polars-parquet/src/parquet/bloom_filter/split_block.rs b/crates/polars-parquet/src/parquet/bloom_filter/split_block.rs new file mode 100644 index 000000000000..576f4d5f1aba --- /dev/null +++ b/crates/polars-parquet/src/parquet/bloom_filter/split_block.rs @@ -0,0 +1,82 @@ +use std::convert::TryInto; + +/// magic numbers taken from https://github.com/apache/parquet-format/blob/master/BloomFilter.md +const SALT: [u32; 8] = [ + 1203114875, 1150766481, 2284105051, 2729912477, 1884591559, 770785867, 2667333959, 1550580529, +]; + +fn hash_to_block_index(hash: u64, len: usize) -> usize { + let number_of_blocks = len as u64 / 32; + let low_hash = hash >> 32; + let block_index = ((low_hash * number_of_blocks) >> 32) as u32; + block_index as usize +} + +fn new_mask(x: u32) -> [u32; 8] { + let mut a = [0u32; 8]; + for i in 0..8 { + let mask = x.wrapping_mul(SALT[i]); + let mask = mask >> 27; + let mask = 0x1 << mask; + a[i] = mask; + } + a +} + +/// loads a block from the bitset to the stack +#[inline] +fn load_block(bitset: &[u8]) -> [u32; 8] { + let mut a = [0u32; 8]; + let bitset = bitset.chunks_exact(4).take(8); + for (a, chunk) in a.iter_mut().zip(bitset) { + *a = u32::from_le_bytes(chunk.try_into().unwrap()) + } + a +} + +/// assigns a block from the stack to `bitset` +#[inline] +fn unload_block(block: [u32; 8], bitset: &mut [u8]) { + let bitset = bitset.chunks_exact_mut(4).take(8); + for (a, chunk) in block.iter().zip(bitset) { + let a = a.to_le_bytes(); + chunk[0] = a[0]; + chunk[1] = a[1]; + chunk[2] = a[2]; + chunk[3] = a[3]; + } +} + +/// Returns whether the `hash` is in the set +pub fn is_in_set(bitset: &[u8], hash: u64) -> bool { + let block_index = hash_to_block_index(hash, bitset.len()); + let key = hash as u32; + + let mask = new_mask(key); + let slice = &bitset[block_index * 32..(block_index + 1) * 32]; + let block_mask = load_block(slice); + + for i in 0..8 { + if mask[i] & block_mask[i] == 0 { + return false; + } + } + true +} + +/// Inserts a new hash to the set +pub fn insert(bitset: &mut [u8], hash: u64) { + let block_index = hash_to_block_index(hash, bitset.len()); + let key = hash as u32; + + let mask = new_mask(key); + let slice = &bitset[block_index * 32..(block_index + 1) * 32]; + let mut block_mask = load_block(slice); + + for i in 0..8 { + block_mask[i] |= mask[i]; + + let mut_slice = &mut bitset[block_index * 32..(block_index + 1) * 32]; + unload_block(block_mask, mut_slice) + } +} 
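Note for reviewers (illustrative, not part of the patch): the new `bloom_filter` module is composed of `read`, which copies a column chunk's serialized filter into a byte `bitset` (left empty when the chunk has no filter or uses an unsupported algorithm/compression), plus `hash_native`/`hash_byte` and `is_in_set` for membership probes against that bitset. A minimal sketch of row-group pruning built on these functions; the `can_skip_row_group` helper and the `polars_parquet::parquet::...` import paths are assumptions for illustration:

    use std::io::{Read, Seek};

    use polars_parquet::parquet::bloom_filter::{hash_native, is_in_set, read};
    use polars_parquet::parquet::error::Error;
    use polars_parquet::parquet::metadata::ColumnChunkMetaData;

    /// Hypothetical helper: returns `true` when the bloom filter proves that `value`
    /// cannot occur in this column chunk, so the enclosing row group may be skipped.
    fn can_skip_row_group<R: Read + Seek>(
        column: &ColumnChunkMetaData,
        reader: &mut R,
        value: i64,
    ) -> Result<bool, Error> {
        let mut bitset = vec![];
        read(column, reader, &mut bitset)?;
        if bitset.is_empty() {
            // No (supported) bloom filter was written for this chunk; cannot prune.
            return Ok(false);
        }
        // A negative probe is exact; a positive probe may be a false positive.
        Ok(!is_in_set(&bitset, hash_native(value)))
    }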
diff --git a/crates/polars-parquet/src/parquet/compression.rs b/crates/polars-parquet/src/parquet/compression.rs new file mode 100644 index 000000000000..9ec187ddee9b --- /dev/null +++ b/crates/polars-parquet/src/parquet/compression.rs @@ -0,0 +1,385 @@ +//! Functionality to compress and decompress data according to the parquet specification +pub use super::parquet_bridge::{ + BrotliLevel, Compression, CompressionOptions, GzipLevel, ZstdLevel, +}; +use crate::parquet::error::{Error, Result}; + +fn inner_compress Result, F: Fn(&[u8], &mut [u8]) -> Result>( + input: &[u8], + output: &mut Vec, + get_length: G, + compress: F, +) -> Result<()> { + let original_length = output.len(); + let max_required_length = get_length(input.len())?; + + output.resize(original_length + max_required_length, 0); + let compressed_size = compress(input, &mut output[original_length..])?; + + output.truncate(original_length + compressed_size); + Ok(()) +} + +/// Compresses data stored in slice `input_buf` and writes the compressed result +/// to `output_buf`. +/// Note that you'll need to call `clear()` before reusing the same `output_buf` +/// across different `compress` calls. +pub fn compress( + compression: CompressionOptions, + input_buf: &[u8], + output_buf: &mut Vec, +) -> Result<()> { + match compression { + #[cfg(feature = "brotli")] + CompressionOptions::Brotli(level) => { + use std::io::Write; + const BROTLI_DEFAULT_BUFFER_SIZE: usize = 4096; + const BROTLI_DEFAULT_LG_WINDOW_SIZE: u32 = 22; // recommended between 20-22 + + let q = level.unwrap_or_default(); + let mut encoder = brotli::CompressorWriter::new( + output_buf, + BROTLI_DEFAULT_BUFFER_SIZE, + q.compression_level(), + BROTLI_DEFAULT_LG_WINDOW_SIZE, + ); + encoder.write_all(input_buf)?; + encoder.flush().map_err(|e| e.into()) + }, + #[cfg(not(feature = "brotli"))] + CompressionOptions::Brotli(_) => Err(Error::FeatureNotActive( + crate::parquet::error::Feature::Brotli, + "compress to brotli".to_string(), + )), + #[cfg(feature = "gzip")] + CompressionOptions::Gzip(level) => { + use std::io::Write; + let level = level.unwrap_or_default(); + let mut encoder = flate2::write::GzEncoder::new(output_buf, level.into()); + encoder.write_all(input_buf)?; + encoder.try_finish().map_err(|e| e.into()) + }, + #[cfg(not(feature = "gzip"))] + CompressionOptions::Gzip(_) => Err(Error::FeatureNotActive( + crate::parquet::error::Feature::Gzip, + "compress to gzip".to_string(), + )), + #[cfg(feature = "snappy")] + CompressionOptions::Snappy => inner_compress( + input_buf, + output_buf, + |len| Ok(snap::raw::max_compress_len(len)), + |input, output| Ok(snap::raw::Encoder::new().compress(input, output)?), + ), + #[cfg(not(feature = "snappy"))] + CompressionOptions::Snappy => Err(Error::FeatureNotActive( + crate::parquet::error::Feature::Snappy, + "compress to snappy".to_string(), + )), + #[cfg(feature = "lz4")] + CompressionOptions::Lz4Raw => inner_compress( + input_buf, + output_buf, + |len| Ok(lz4::block::compress_bound(len)?), + |input, output| { + let compressed_size = lz4::block::compress_to_buffer(input, None, false, output)?; + Ok(compressed_size) + }, + ), + #[cfg(all(not(feature = "lz4"), not(feature = "lz4_flex")))] + CompressionOptions::Lz4Raw => Err(Error::FeatureNotActive( + crate::parquet::error::Feature::Lz4, + "compress to lz4".to_string(), + )), + #[cfg(feature = "zstd")] + CompressionOptions::Zstd(level) => { + use std::io::Write; + let level = level.map(|v| v.compression_level()).unwrap_or_default(); + + let mut encoder = 
zstd::Encoder::new(output_buf, level)?; + encoder.write_all(input_buf)?; + match encoder.finish() { + Ok(_) => Ok(()), + Err(e) => Err(e.into()), + } + }, + #[cfg(not(feature = "zstd"))] + CompressionOptions::Zstd(_) => Err(Error::FeatureNotActive( + crate::parquet::error::Feature::Zstd, + "compress to zstd".to_string(), + )), + CompressionOptions::Uncompressed => Err(Error::InvalidParameter( + "Compressing uncompressed".to_string(), + )), + _ => Err(Error::FeatureNotSupported(format!( + "Compression {:?} is not supported", + compression, + ))), + } +} + +/// Decompresses data stored in slice `input_buf` and writes output to `output_buf`. +/// Returns the total number of bytes written. +pub fn decompress(compression: Compression, input_buf: &[u8], output_buf: &mut [u8]) -> Result<()> { + match compression { + #[cfg(feature = "brotli")] + Compression::Brotli => { + use std::io::Read; + const BROTLI_DEFAULT_BUFFER_SIZE: usize = 4096; + brotli::Decompressor::new(input_buf, BROTLI_DEFAULT_BUFFER_SIZE) + .read_exact(output_buf) + .map_err(|e| e.into()) + }, + #[cfg(not(feature = "brotli"))] + Compression::Brotli => Err(Error::FeatureNotActive( + crate::parquet::error::Feature::Brotli, + "decompress with brotli".to_string(), + )), + #[cfg(feature = "gzip")] + Compression::Gzip => { + use std::io::Read; + let mut decoder = flate2::read::GzDecoder::new(input_buf); + decoder.read_exact(output_buf).map_err(|e| e.into()) + }, + #[cfg(not(feature = "gzip"))] + Compression::Gzip => Err(Error::FeatureNotActive( + crate::parquet::error::Feature::Gzip, + "decompress with gzip".to_string(), + )), + #[cfg(feature = "snappy")] + Compression::Snappy => { + use snap::raw::{decompress_len, Decoder}; + + let len = decompress_len(input_buf)?; + if len > output_buf.len() { + return Err(Error::OutOfSpec(String::from("snappy header out of spec"))); + } + Decoder::new() + .decompress(input_buf, output_buf) + .map_err(|e| e.into()) + .map(|_| ()) + }, + #[cfg(not(feature = "snappy"))] + Compression::Snappy => Err(Error::FeatureNotActive( + crate::parquet::error::Feature::Snappy, + "decompress with snappy".to_string(), + )), + #[cfg(all(feature = "lz4_flex", not(feature = "lz4")))] + Compression::Lz4Raw => lz4_flex::block::decompress_into(input_buf, output_buf) + .map(|_| {}) + .map_err(|e| e.into()), + #[cfg(feature = "lz4")] + Compression::Lz4Raw => { + lz4::block::decompress_to_buffer(input_buf, Some(output_buf.len() as i32), output_buf) + .map(|_| {}) + .map_err(|e| e.into()) + }, + #[cfg(all(not(feature = "lz4"), not(feature = "lz4_flex")))] + Compression::Lz4Raw => Err(Error::FeatureNotActive( + crate::parquet::error::Feature::Lz4, + "decompress with lz4".to_string(), + )), + + #[cfg(any(feature = "lz4_flex", feature = "lz4"))] + Compression::Lz4 => try_decompress_hadoop(input_buf, output_buf).or_else(|_| { + lz4_decompress_to_buffer(input_buf, Some(output_buf.len() as i32), output_buf) + .map(|_| {}) + }), + + #[cfg(all(not(feature = "lz4_flex"), not(feature = "lz4")))] + Compression::Lz4 => Err(Error::FeatureNotActive( + crate::parquet::error::Feature::Lz4, + "decompress with legacy lz4".to_string(), + )), + + #[cfg(feature = "zstd")] + Compression::Zstd => { + use std::io::Read; + let mut decoder = zstd::Decoder::new(input_buf)?; + decoder.read_exact(output_buf).map_err(|e| e.into()) + }, + #[cfg(not(feature = "zstd"))] + Compression::Zstd => Err(Error::FeatureNotActive( + crate::parquet::error::Feature::Zstd, + "decompress with zstd".to_string(), + )), + Compression::Uncompressed => 
Err(Error::InvalidParameter( + "Compressing uncompressed".to_string(), + )), + _ => Err(Error::FeatureNotSupported(format!( + "Compression {:?} is not supported", + compression, + ))), + } +} + +/// Try to decompress the buffer as if it was compressed with the Hadoop Lz4Codec. +/// Translated from the apache arrow c++ function [TryDecompressHadoop](https://github.com/apache/arrow/blob/bf18e6e4b5bb6180706b1ba0d597a65a4ce5ca48/cpp/src/arrow/util/compression_lz4.cc#L474). +/// Returns error if decompression failed. +#[cfg(any(feature = "lz4", feature = "lz4_flex"))] +fn try_decompress_hadoop(input_buf: &[u8], output_buf: &mut [u8]) -> Result<()> { + // Parquet files written with the Hadoop Lz4Codec use their own framing. + // The input buffer can contain an arbitrary number of "frames", each + // with the following structure: + // - bytes 0..3: big-endian uint32_t representing the frame decompressed size + // - bytes 4..7: big-endian uint32_t representing the frame compressed size + // - bytes 8...: frame compressed data + // + // The Hadoop Lz4Codec source code can be found here: + // https://github.com/apache/hadoop/blob/trunk/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/main/native/src/codec/Lz4Codec.cc + + const SIZE_U32: usize = std::mem::size_of::(); + const PREFIX_LEN: usize = SIZE_U32 * 2; + let mut input_len = input_buf.len(); + let mut input = input_buf; + let mut output_len = output_buf.len(); + let mut output: &mut [u8] = output_buf; + while input_len >= PREFIX_LEN { + let mut bytes = [0; SIZE_U32]; + bytes.copy_from_slice(&input[0..4]); + let expected_decompressed_size = u32::from_be_bytes(bytes); + let mut bytes = [0; SIZE_U32]; + bytes.copy_from_slice(&input[4..8]); + let expected_compressed_size = u32::from_be_bytes(bytes); + input = &input[PREFIX_LEN..]; + input_len -= PREFIX_LEN; + + if input_len < expected_compressed_size as usize { + return Err(Error::oos("Not enough bytes for Hadoop frame")); + } + + if output_len < expected_decompressed_size as usize { + return Err(Error::oos("Not enough bytes to hold advertised output")); + } + let decompressed_size = lz4_decompress_to_buffer( + &input[..expected_compressed_size as usize], + Some(output_len as i32), + output, + )?; + if decompressed_size != expected_decompressed_size as usize { + return Err(Error::oos("unexpected decompressed size")); + } + input_len -= expected_compressed_size as usize; + output_len -= expected_decompressed_size as usize; + if input_len > expected_compressed_size as usize { + input = &input[expected_compressed_size as usize..]; + output = &mut output[expected_decompressed_size as usize..]; + } else { + break; + } + } + if input_len == 0 { + Ok(()) + } else { + Err(Error::oos("Not all input are consumed")) + } +} + +#[cfg(feature = "lz4")] +#[inline] +fn lz4_decompress_to_buffer( + src: &[u8], + uncompressed_size: Option, + buffer: &mut [u8], +) -> Result { + let size = lz4::block::decompress_to_buffer(src, uncompressed_size, buffer)?; + Ok(size) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn test_roundtrip(c: CompressionOptions, data: &[u8]) { + let offset = 2048; + + // Compress to a buffer that already has data is possible + let mut compressed = vec![2; offset]; + compress(c, data, &mut compressed).expect("Error when compressing"); + + // data is compressed... 
+ assert!(compressed.len() - offset < data.len()); + + let mut decompressed = vec![0; data.len()]; + decompress(c.into(), &compressed[offset..], &mut decompressed) + .expect("Error when decompressing"); + assert_eq!(data, decompressed.as_slice()); + } + + fn test_codec(c: CompressionOptions) { + let sizes = vec![1000, 10000, 100000]; + for size in sizes { + let data = (0..size).map(|x| (x % 255) as u8).collect::>(); + test_roundtrip(c, &data); + } + } + + #[test] + fn test_codec_snappy() { + test_codec(CompressionOptions::Snappy); + } + + #[test] + fn test_codec_gzip_default() { + test_codec(CompressionOptions::Gzip(None)); + } + + #[test] + fn test_codec_gzip_low_compression() { + test_codec(CompressionOptions::Gzip(Some( + GzipLevel::try_new(1).unwrap(), + ))); + } + + #[test] + fn test_codec_gzip_high_compression() { + test_codec(CompressionOptions::Gzip(Some( + GzipLevel::try_new(10).unwrap(), + ))); + } + + #[test] + fn test_codec_brotli_default() { + test_codec(CompressionOptions::Brotli(None)); + } + + #[test] + fn test_codec_brotli_low_compression() { + test_codec(CompressionOptions::Brotli(Some( + BrotliLevel::try_new(1).unwrap(), + ))); + } + + #[test] + fn test_codec_brotli_high_compression() { + test_codec(CompressionOptions::Brotli(Some( + BrotliLevel::try_new(11).unwrap(), + ))); + } + + #[test] + fn test_codec_lz4_raw() { + test_codec(CompressionOptions::Lz4Raw); + } + + #[test] + fn test_codec_zstd_default() { + test_codec(CompressionOptions::Zstd(None)); + } + + #[cfg(feature = "zstd")] + #[test] + fn test_codec_zstd_low_compression() { + test_codec(CompressionOptions::Zstd(Some( + ZstdLevel::try_new(1).unwrap(), + ))); + } + + #[cfg(feature = "zstd")] + #[test] + fn test_codec_zstd_high_compression() { + test_codec(CompressionOptions::Zstd(Some( + ZstdLevel::try_new(21).unwrap(), + ))); + } +} diff --git a/crates/polars-parquet/src/parquet/deserialize/binary.rs b/crates/polars-parquet/src/parquet/deserialize/binary.rs new file mode 100644 index 000000000000..683b45e3bdc2 --- /dev/null +++ b/crates/polars-parquet/src/parquet/deserialize/binary.rs @@ -0,0 +1,70 @@ +use super::utils; +use crate::parquet::encoding::hybrid_rle; +use crate::parquet::encoding::plain_byte_array::BinaryIter; +use crate::parquet::error::Error; +use crate::parquet::page::{split_buffer, DataPage}; +use crate::parquet::parquet_bridge::{Encoding, Repetition}; + +#[derive(Debug)] +pub struct Dictionary<'a, P> { + pub indexes: hybrid_rle::HybridRleDecoder<'a>, + pub dict: P, +} + +impl<'a, P> Dictionary<'a, P> { + pub fn try_new(page: &'a DataPage, dict: P) -> Result { + let indexes = utils::dict_indices_decoder(page)?; + + Ok(Self { indexes, dict }) + } + + #[inline] + pub fn len(&self) -> usize { + self.indexes.size_hint().0 + } +} + +#[allow(clippy::large_enum_variant)] +pub enum BinaryPageState<'a, P> { + Optional(utils::DefLevelsDecoder<'a>, BinaryIter<'a>), + Required(BinaryIter<'a>), + RequiredDictionary(Dictionary<'a, P>), + OptionalDictionary(utils::DefLevelsDecoder<'a>, Dictionary<'a, P>), +} + +impl<'a, P> BinaryPageState<'a, P> { + pub fn try_new(page: &'a DataPage, dict: Option

) -> Result { + let is_optional = + page.descriptor.primitive_type.field_info.repetition == Repetition::Optional; + + match (page.encoding(), dict, is_optional) { + (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), false) => { + Dictionary::try_new(page, dict).map(Self::RequiredDictionary) + }, + (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), true) => { + Ok(Self::OptionalDictionary( + utils::DefLevelsDecoder::try_new(page)?, + Dictionary::try_new(page, dict)?, + )) + }, + (Encoding::Plain, _, true) => { + let (_, _, values) = split_buffer(page)?; + + let validity = utils::DefLevelsDecoder::try_new(page)?; + let values = BinaryIter::new(values, None); + + Ok(Self::Optional(validity, values)) + }, + (Encoding::Plain, _, false) => { + let (_, _, values) = split_buffer(page)?; + let values = BinaryIter::new(values, Some(page.num_values())); + + Ok(Self::Required(values)) + }, + _ => Err(Error::FeatureNotSupported(format!( + "Viewing page for encoding {:?} for binary type", + page.encoding(), + ))), + } + } +} diff --git a/crates/polars-parquet/src/parquet/deserialize/boolean.rs b/crates/polars-parquet/src/parquet/deserialize/boolean.rs new file mode 100644 index 000000000000..435e2b0abd30 --- /dev/null +++ b/crates/polars-parquet/src/parquet/deserialize/boolean.rs @@ -0,0 +1,39 @@ +use super::utils; +use crate::parquet::encoding::hybrid_rle::BitmapIter; +use crate::parquet::error::Error; +use crate::parquet::page::{split_buffer, DataPage}; +use crate::parquet::parquet_bridge::{Encoding, Repetition}; + +// The state of a `DataPage` of `Boolean` parquet boolean type +#[derive(Debug)] +#[allow(clippy::large_enum_variant)] +pub enum BooleanPageState<'a> { + Optional(utils::DefLevelsDecoder<'a>, BitmapIter<'a>), + Required(&'a [u8], usize), +} + +impl<'a> BooleanPageState<'a> { + pub fn try_new(page: &'a DataPage) -> Result { + let is_optional = + page.descriptor.primitive_type.field_info.repetition == Repetition::Optional; + + match (page.encoding(), is_optional) { + (Encoding::Plain, true) => { + let validity = utils::DefLevelsDecoder::try_new(page)?; + + let (_, _, values) = split_buffer(page)?; + let values = BitmapIter::new(values, 0, values.len() * 8); + + Ok(Self::Optional(validity, values)) + }, + (Encoding::Plain, false) => { + let (_, _, values) = split_buffer(page)?; + Ok(Self::Required(values, page.num_values())) + }, + _ => Err(Error::InvalidParameter(format!( + "Viewing page for encoding {:?} for boolean type not supported", + page.encoding(), + ))), + } + } +} diff --git a/crates/polars-parquet/src/parquet/deserialize/filtered_rle.rs b/crates/polars-parquet/src/parquet/deserialize/filtered_rle.rs new file mode 100644 index 000000000000..57c95c77b401 --- /dev/null +++ b/crates/polars-parquet/src/parquet/deserialize/filtered_rle.rs @@ -0,0 +1,274 @@ +use std::collections::VecDeque; + +use super::{HybridDecoderBitmapIter, HybridEncoded}; +use crate::parquet::encoding::hybrid_rle::BitmapIter; +use crate::parquet::error::Error; +use crate::parquet::indexes::Interval; + +/// Type definition of a [`FilteredHybridBitmapIter`] of [`HybridDecoderBitmapIter`]. 
+pub type FilteredHybridRleDecoderIter<'a> = + FilteredHybridBitmapIter<'a, HybridDecoderBitmapIter<'a>>; + +/// The decoding state of the hybrid-RLE decoder with a maximum definition level of 1 +/// that supports skipped runs +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum FilteredHybridEncoded<'a> { + /// a bitmap (values, offset, length, skipped_set) + Bitmap { + values: &'a [u8], + offset: usize, + length: usize, + }, + Repeated { + is_set: bool, + length: usize, + }, + /// When the run was skipped - contains the number of set values on the skipped run + Skipped(usize), +} + +fn is_set_count(values: &[u8], offset: usize, length: usize) -> usize { + BitmapIter::new(values, offset, length) + .filter(|x| *x) + .count() +} + +impl<'a> FilteredHybridEncoded<'a> { + /// Returns the length of the run in number of items + #[inline] + pub fn len(&self) -> usize { + match self { + FilteredHybridEncoded::Bitmap { length, .. } => *length, + FilteredHybridEncoded::Repeated { length, .. } => *length, + FilteredHybridEncoded::Skipped(_) => 0, + } + } + + #[must_use] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +/// An [`Iterator`] adapter over [`HybridEncoded`] that yields [`FilteredHybridEncoded`]. +/// +/// This iterator adapter is used in combination with +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FilteredHybridBitmapIter<'a, I: Iterator, Error>>> { + iter: I, + current: Option<(HybridEncoded<'a>, usize)>, + // a run may end in the middle of an interval, in which case we must + // split the interval in parts. This tracks the current interval being computed + current_interval: Option, + selected_rows: VecDeque, + current_items_in_runs: usize, + + total_items: usize, +} + +impl<'a, I: Iterator, Error>>> FilteredHybridBitmapIter<'a, I> { + pub fn new(iter: I, selected_rows: VecDeque) -> Self { + let total_items = selected_rows.iter().map(|x| x.length).sum(); + Self { + iter, + current: None, + current_interval: None, + selected_rows, + current_items_in_runs: 0, + total_items, + } + } + + fn advance_current_interval(&mut self, length: usize) { + if let Some(interval) = &mut self.current_interval { + interval.start += length; + interval.length -= length; + self.total_items -= length; + } + } + + /// Returns the number of elements remaining. Note that each run + /// of the iterator contains more than one element - this is _not_ equivalent to size_hint.
+ pub fn len(&self) -> usize { + self.total_items + } + + #[must_use] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +impl<'a, I: Iterator, Error>>> Iterator + for FilteredHybridBitmapIter<'a, I> +{ + type Item = Result, Error>; + + fn next(&mut self) -> Option { + let interval = if let Some(interval) = self.current_interval { + interval + } else { + self.current_interval = self.selected_rows.pop_front(); + self.current_interval?; // case where iteration finishes + return self.next(); + }; + + let (run, offset) = if let Some((run, offset)) = self.current { + (run, offset) + } else { + // a new run + let run = self.iter.next()?; // no run => something wrong since intervals should only slice items up all runs' length + match run { + Ok(run) => { + self.current = Some((run, 0)); + }, + Err(e) => return Some(Err(e)), + } + return self.next(); + }; + + // one of three things can happen: + // * the start of the interval is not aligned with the start of the run => issue a `Skipped` and advance the run / next run + // * the run contains this interval => consume the interval and keep the run + // * the run contains part of this interval => consume the run and keep the interval + + match run { + HybridEncoded::Repeated(is_set, full_run_length) => { + let run_length = full_run_length - offset; + // interval.start is from the start of the first run; discount `current_items_in_runs` + // to get the start from the current run's offset + let interval_start = interval.start - self.current_items_in_runs; + + if interval_start > 0 { + // we need to skip values from the run + let to_skip = interval_start; + + // we only skip up to a run (yield a single skip per multiple runs) + let max_skip = full_run_length - offset; + let to_skip = to_skip.min(max_skip); + + let set = if is_set { to_skip } else { 0 }; + + self.current_items_in_runs += to_skip; + + self.current = if to_skip == max_skip { + None + } else { + Some((run, offset + to_skip)) + }; + + return Some(Ok(FilteredHybridEncoded::Skipped(set))); + }; + + // slice the bitmap according to current interval + // note that interval start is from the start of the first run. 
+ let new_offset = offset + interval_start; + + if interval_start > run_length { + let set = if is_set { run_length } else { 0 }; + + self.advance_current_interval(run_length); + self.current_items_in_runs += run_length; + self.current = None; + Some(Ok(FilteredHybridEncoded::Skipped(set))) + } else { + let length = if run_length > interval.length { + // interval is fully consumed + self.current_items_in_runs += interval.length; + + // fetch next interval + self.total_items -= interval.length; + self.current_interval = self.selected_rows.pop_front(); + + self.current = Some((run, offset + interval.length)); + + interval.length + } else { + // the run is consumed and the interval is shortened accordingly + self.current_items_in_runs += run_length; + + // the interval may cover two runs; shorten the length + // to its maximum allowed for this run + let length = run_length.min(full_run_length - new_offset); + + self.advance_current_interval(length); + + self.current = None; + length + }; + Some(Ok(FilteredHybridEncoded::Repeated { is_set, length })) + } + }, + HybridEncoded::Bitmap(values, full_run_length) => { + let run_length = full_run_length - offset; + // interval.start is from the start of the first run; discount `current_items_in_runs` + // to get the start from the current run's offset + let interval_start = interval.start - self.current_items_in_runs; + + if interval_start > 0 { + // we need to skip values from the run + let to_skip = interval_start; + + // we only skip up to a run (yield a single skip per multiple runs) + let max_skip = full_run_length - offset; + let to_skip = to_skip.min(max_skip); + + let set = is_set_count(values, offset, to_skip); + + self.current_items_in_runs += to_skip; + + self.current = if to_skip == max_skip { + None + } else { + Some((run, offset + to_skip)) + }; + + return Some(Ok(FilteredHybridEncoded::Skipped(set))); + }; + + // slice the bitmap according to current interval + // note that interval start is from the start of the first run. 
+ let new_offset = offset + interval_start; + + if interval_start > run_length { + let set = is_set_count(values, offset, full_run_length); + + self.advance_current_interval(run_length); + self.current_items_in_runs += run_length; + self.current = None; + Some(Ok(FilteredHybridEncoded::Skipped(set))) + } else { + let length = if run_length > interval.length { + // interval is fully consumed + self.current_items_in_runs += interval.length; + + // fetch next interval + self.total_items -= interval.length; + self.current_interval = self.selected_rows.pop_front(); + + self.current = Some((run, offset + interval.length)); + + interval.length + } else { + // the run is consumed and the interval is shortened accordingly + self.current_items_in_runs += run_length; + + // the interval may cover two runs; shorten the length + // to its maximum allowed for this run + let length = run_length.min(full_run_length - new_offset); + + self.advance_current_interval(length); + + self.current = None; + length + }; + Some(Ok(FilteredHybridEncoded::Bitmap { + values, + offset: new_offset, + length, + })) + } + }, + } + } +} diff --git a/crates/polars-parquet/src/parquet/deserialize/fixed_len.rs b/crates/polars-parquet/src/parquet/deserialize/fixed_len.rs new file mode 100644 index 000000000000..b0885cc6ad4c --- /dev/null +++ b/crates/polars-parquet/src/parquet/deserialize/fixed_len.rs @@ -0,0 +1,107 @@ +use super::utils; +use crate::parquet::encoding::hybrid_rle; +use crate::parquet::error::Error; +use crate::parquet::page::{split_buffer, DataPage}; +use crate::parquet::parquet_bridge::{Encoding, Repetition}; +use crate::parquet::schema::types::PhysicalType; + +#[derive(Debug)] +pub struct FixexBinaryIter<'a> { + values: std::slice::ChunksExact<'a, u8>, +} + +impl<'a> FixexBinaryIter<'a> { + pub fn new(values: &'a [u8], size: usize) -> Self { + let values = values.chunks_exact(size); + Self { values } + } +} + +impl<'a> Iterator for FixexBinaryIter<'a> { + type Item = &'a [u8]; + + #[inline] + fn next(&mut self) -> Option { + self.values.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.values.size_hint() + } +} + +#[derive(Debug)] +pub struct Dictionary<'a, P> { + pub indexes: hybrid_rle::HybridRleDecoder<'a>, + pub dict: P, +} + +impl<'a, P> Dictionary<'a, P> { + pub fn try_new(page: &'a DataPage, dict: P) -> Result { + let indexes = utils::dict_indices_decoder(page)?; + + Ok(Self { indexes, dict }) + } + + #[inline] + pub fn len(&self) -> usize { + self.indexes.size_hint().0 + } +} + +#[allow(clippy::large_enum_variant)] +pub enum FixedLenBinaryPageState<'a, P> { + Optional(utils::DefLevelsDecoder<'a>, FixexBinaryIter<'a>), + Required(FixexBinaryIter<'a>), + RequiredDictionary(Dictionary<'a, P>), + OptionalDictionary(utils::DefLevelsDecoder<'a>, Dictionary<'a, P>), +} + +impl<'a, P> FixedLenBinaryPageState<'a, P> { + pub fn try_new(page: &'a DataPage, dict: Option
) -> Result { + let is_optional = + page.descriptor.primitive_type.field_info.repetition == Repetition::Optional; + + let size: usize = if let PhysicalType::FixedLenByteArray(size) = + page.descriptor.primitive_type.physical_type + { + size + } else { + return Err(Error::InvalidParameter( + "FixedLenBinaryPageState must be initialized by pages of FixedLenByteArray" + .to_string(), + )); + }; + + match (page.encoding(), dict, is_optional) { + (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), false) => { + Dictionary::try_new(page, dict).map(Self::RequiredDictionary) + }, + (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), true) => { + Ok(Self::OptionalDictionary( + utils::DefLevelsDecoder::try_new(page)?, + Dictionary::try_new(page, dict)?, + )) + }, + (Encoding::Plain, _, true) => { + let (_, _, values) = split_buffer(page)?; + + let validity = utils::DefLevelsDecoder::try_new(page)?; + let values = FixexBinaryIter::new(values, size); + + Ok(Self::Optional(validity, values)) + }, + (Encoding::Plain, _, false) => { + let (_, _, values) = split_buffer(page)?; + let values = FixexBinaryIter::new(values, size); + + Ok(Self::Required(values)) + }, + _ => Err(Error::FeatureNotSupported(format!( + "Viewing page for encoding {:?} for binary type", + page.encoding(), + ))), + } + } +} diff --git a/crates/polars-parquet/src/parquet/deserialize/hybrid_rle.rs b/crates/polars-parquet/src/parquet/deserialize/hybrid_rle.rs new file mode 100644 index 000000000000..746dd27b330d --- /dev/null +++ b/crates/polars-parquet/src/parquet/deserialize/hybrid_rle.rs @@ -0,0 +1,204 @@ +use crate::parquet::encoding::hybrid_rle::{self, BitmapIter}; +use crate::parquet::error::Error; + +/// The decoding state of the hybrid-RLE decoder with a maximum definition level of 1 +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum HybridEncoded<'a> { + /// a bitmap + Bitmap(&'a [u8], usize), + /// A repeated item. The first attribute corresponds to whether the value is set + /// the second attribute corresponds to the number of repetitions. + Repeated(bool, usize), +} + +impl<'a> HybridEncoded<'a> { + /// Returns the length of the run in number of items + #[inline] + pub fn len(&self) -> usize { + match self { + HybridEncoded::Bitmap(_, length) => *length, + HybridEncoded::Repeated(_, length) => *length, + } + } + + #[must_use] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +pub trait HybridRleRunsIterator<'a>: Iterator, Error>> { + /// Number of elements remaining. This may not be the items of the iterator - an item + /// of the iterator may contain more than one element. + fn number_of_elements(&self) -> usize; +} + +/// An iterator of [`HybridEncoded`], adapter over [`hybrid_rle::HybridEncoded`]. +#[derive(Debug, Clone)] +pub struct HybridRleIter<'a, I> +where + I: Iterator, Error>>, +{ + iter: I, + length: usize, + consumed: usize, +} + +impl<'a, I> HybridRleIter<'a, I> +where + I: Iterator, Error>>, +{ + /// Returns a new [`HybridRleIter`] + #[inline] + pub fn new(iter: I, length: usize) -> Self { + Self { + iter, + length, + consumed: 0, + } + } + + /// the number of elements in the iterator. Note that this _is not_ the number of runs. 
+ #[inline] + pub fn len(&self) -> usize { + self.length - self.consumed + } + + #[must_use] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +impl<'a, I> HybridRleRunsIterator<'a> for HybridRleIter<'a, I> +where + I: Iterator, Error>>, +{ + fn number_of_elements(&self) -> usize { + self.len() + } +} + +impl<'a, I> Iterator for HybridRleIter<'a, I> +where + I: Iterator, Error>>, +{ + type Item = Result, Error>; + + #[inline] + fn next(&mut self) -> Option { + if self.consumed == self.length { + return None; + }; + let run = self.iter.next()?; + + Some(run.map(|run| match run { + hybrid_rle::HybridEncoded::Bitpacked(pack) => { + // a pack has at most `pack.len() * 8` bits + let pack_size = pack.len() * 8; + + let additional = pack_size.min(self.len()); + + self.consumed += additional; + HybridEncoded::Bitmap(pack, additional) + }, + hybrid_rle::HybridEncoded::Rle(value, length) => { + let is_set = value[0] == 1; + + let additional = length.min(self.len()); + + self.consumed += additional; + HybridEncoded::Repeated(is_set, additional) + }, + })) + } + + fn size_hint(&self) -> (usize, Option) { + self.iter.size_hint() + } +} + +/// Type definition for a [`HybridRleIter`] using [`hybrid_rle::Decoder`]. +pub type HybridDecoderBitmapIter<'a> = HybridRleIter<'a, hybrid_rle::Decoder<'a>>; + +#[derive(Debug)] +enum HybridBooleanState<'a> { + /// a bitmap + Bitmap(BitmapIter<'a>), + /// A repeated item. The first attribute corresponds to whether the value is set + /// the second attribute corresponds to the number of repetitions. + Repeated(bool, usize), +} + +/// An iterator adapter that maps an iterator of [`HybridEncoded`] into an iterator +/// over [`bool`]. +#[derive(Debug)] +pub struct HybridRleBooleanIter<'a, I> +where + I: Iterator, Error>>, +{ + iter: I, + current_run: Option>, +} + +impl<'a, I> HybridRleBooleanIter<'a, I> +where + I: Iterator, Error>>, +{ + pub fn new(iter: I) -> Self { + Self { + iter, + current_run: None, + } + } +} + +impl<'a, I> Iterator for HybridRleBooleanIter<'a, I> +where + I: HybridRleRunsIterator<'a>, +{ + type Item = Result; + + #[inline] + fn next(&mut self) -> Option { + if let Some(run) = &mut self.current_run { + match run { + HybridBooleanState::Bitmap(bitmap) => bitmap.next().map(Ok), + HybridBooleanState::Repeated(value, remaining) => if *remaining == 0 { + None + } else { + *remaining -= 1; + Some(*value) + } + .map(Ok), + } + } else if let Some(run) = self.iter.next() { + let run = run.map(|run| match run { + HybridEncoded::Bitmap(bitmap, length) => { + HybridBooleanState::Bitmap(BitmapIter::new(bitmap, 0, length)) + }, + HybridEncoded::Repeated(value, length) => { + HybridBooleanState::Repeated(value, length) + }, + }); + match run { + Ok(run) => { + self.current_run = Some(run); + self.next() + }, + Err(e) => Some(Err(e)), + } + } else { + None + } + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let exact = self.iter.number_of_elements(); + (exact, Some(exact)) + } +} + +/// Type definition for a [`HybridRleBooleanIter`] using [`hybrid_rle::Decoder`]. +pub type HybridRleDecoderIter<'a> = HybridRleBooleanIter<'a, HybridDecoderBitmapIter<'a>>; diff --git a/crates/polars-parquet/src/parquet/deserialize/mod.rs b/crates/polars-parquet/src/parquet/deserialize/mod.rs new file mode 100644 index 000000000000..f16fbb901bce --- /dev/null +++ b/crates/polars-parquet/src/parquet/deserialize/mod.rs @@ -0,0 +1,17 @@ +// TODO! fix and/or prune? 
+#![allow(ambiguous_glob_reexports)] +mod binary; +mod boolean; +mod filtered_rle; +mod fixed_len; +mod hybrid_rle; +mod native; +mod utils; + +pub use binary::*; +pub use boolean::*; +pub use filtered_rle::*; +pub use fixed_len::*; +pub use hybrid_rle::*; +pub use native::*; +pub use utils::{DefLevelsDecoder, OptionalValues, SliceFilteredIter}; diff --git a/crates/polars-parquet/src/parquet/deserialize/native.rs b/crates/polars-parquet/src/parquet/deserialize/native.rs new file mode 100644 index 000000000000..11a9cf2c3e26 --- /dev/null +++ b/crates/polars-parquet/src/parquet/deserialize/native.rs @@ -0,0 +1,97 @@ +use super::utils; +use crate::parquet::encoding::hybrid_rle; +use crate::parquet::error::Error; +use crate::parquet::page::{split_buffer, DataPage}; +use crate::parquet::parquet_bridge::{Encoding, Repetition}; +use crate::parquet::types::{decode, NativeType}; + +/// Typedef of an iterator over PLAIN page values +pub type Casted<'a, T> = std::iter::Map, fn(&'a [u8]) -> T>; + +/// Views the values of the data page as [`Casted`] to [`NativeType`]. +pub fn native_cast(page: &DataPage) -> Result, Error> { + let (_, _, values) = split_buffer(page)?; + if values.len() % std::mem::size_of::() != 0 { + return Err(Error::oos( + "A primitive page data's len must be a multiple of the type", + )); + } + + Ok(values + .chunks_exact(std::mem::size_of::()) + .map(decode::)) +} + +#[derive(Debug)] +pub struct Dictionary<'a, P> { + pub indexes: hybrid_rle::HybridRleDecoder<'a>, + pub dict: P, +} + +impl<'a, P> Dictionary<'a, P> { + pub fn try_new(page: &'a DataPage, dict: P) -> Result { + let indexes = utils::dict_indices_decoder(page)?; + + Ok(Self { dict, indexes }) + } + + pub fn len(&self) -> usize { + self.indexes.size_hint().0 + } + + #[must_use] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +/// The deserialization state of a `DataPage` of `Primitive` parquet primitive type +#[derive(Debug)] +#[allow(clippy::large_enum_variant)] +pub enum NativePageState<'a, T, P> +where + T: NativeType, +{ + /// A page of optional values + Optional(utils::DefLevelsDecoder<'a>, Casted<'a, T>), + /// A page of required values + Required(Casted<'a, T>), + /// A page of required, dictionary-encoded values + RequiredDictionary(Dictionary<'a, P>), + /// A page of optional, dictionary-encoded values + OptionalDictionary(utils::DefLevelsDecoder<'a>, Dictionary<'a, P>), +} + +impl<'a, T: NativeType, P> NativePageState<'a, T, P> { + /// Tries to create [`NativePageState`] + /// # Error + /// Errors iff the page is not a `NativePageState` + pub fn try_new(page: &'a DataPage, dict: Option
) -> Result { + let is_optional = + page.descriptor.primitive_type.field_info.repetition == Repetition::Optional; + + match (page.encoding(), dict, is_optional) { + (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), false) => { + Dictionary::try_new(page, dict).map(Self::RequiredDictionary) + }, + (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), true) => { + Ok(Self::OptionalDictionary( + utils::DefLevelsDecoder::try_new(page)?, + Dictionary::try_new(page, dict)?, + )) + }, + (Encoding::Plain, _, true) => { + let validity = utils::DefLevelsDecoder::try_new(page)?; + let values = native_cast(page)?; + + Ok(Self::Optional(validity, values)) + }, + (Encoding::Plain, _, false) => native_cast(page).map(Self::Required), + _ => Err(Error::FeatureNotSupported(format!( + "Viewing page for encoding {:?} for native type {}", + page.encoding(), + std::any::type_name::() + ))), + } + } +} diff --git a/crates/polars-parquet/src/parquet/deserialize/utils.rs b/crates/polars-parquet/src/parquet/deserialize/utils.rs new file mode 100644 index 000000000000..0c89d09d4648 --- /dev/null +++ b/crates/polars-parquet/src/parquet/deserialize/utils.rs @@ -0,0 +1,174 @@ +use std::collections::VecDeque; + +use super::hybrid_rle::{HybridDecoderBitmapIter, HybridRleIter}; +use crate::parquet::encoding::hybrid_rle::{self, HybridRleDecoder}; +use crate::parquet::error::Error; +use crate::parquet::indexes::Interval; +use crate::parquet::page::{split_buffer, DataPage}; +use crate::parquet::read::levels::get_bit_width; + +pub(super) fn dict_indices_decoder(page: &DataPage) -> Result { + let (_, _, indices_buffer) = split_buffer(page)?; + + // SPEC: Data page format: the bit width used to encode the entry ids stored as 1 byte (max bit width = 32), + // SPEC: followed by the values encoded using RLE/Bit packed described above (with the given bit width). + let bit_width = indices_buffer[0]; + if bit_width > 32 { + return Err(Error::oos( + "Bit width of dictionary pages cannot be larger than 32", + )); + } + let indices_buffer = &indices_buffer[1..]; + + hybrid_rle::HybridRleDecoder::try_new(indices_buffer, bit_width as u32, page.num_values()) +} + +/// Decoder of definition levels. +#[derive(Debug)] +pub enum DefLevelsDecoder<'a> { + /// When the maximum definition level is 1, the definition levels are RLE-encoded and + /// the bitpacked runs are bitmaps. This variant contains [`HybridDecoderBitmapIter`] + /// that decodes the runs, but not the individual values + Bitmap(HybridDecoderBitmapIter<'a>), + /// When the maximum definition level is larger than 1 + Levels(HybridRleDecoder<'a>, u32), +} + +impl<'a> DefLevelsDecoder<'a> { + pub fn try_new(page: &'a DataPage) -> Result { + let (_, def_levels, _) = split_buffer(page)?; + + let max_def_level = page.descriptor.max_def_level; + Ok(if max_def_level == 1 { + let iter = hybrid_rle::Decoder::new(def_levels, 1); + let iter = HybridRleIter::new(iter, page.num_values()); + Self::Bitmap(iter) + } else { + let iter = HybridRleDecoder::try_new( + def_levels, + get_bit_width(max_def_level), + page.num_values(), + )?; + Self::Levels(iter, max_def_level as u32) + }) + } +} + +/// Iterator adapter to convert an iterator of non-null values and an iterator over validity +/// into an iterator of optional values. 
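+///
+/// A minimal usage sketch (illustrative only, assuming this module's `Error` type is in scope):
+/// pairing a validity iterator with a values iterator yields optional values.
+/// ```ignore
+/// let validity = [true, false, true].into_iter().map(Ok::<bool, Error>);
+/// let values = [1i32, 2].into_iter();
+/// let decoded = OptionalValues::new(validity, values)
+///     .collect::<Result<Vec<Option<i32>>, Error>>()?;
+/// assert_eq!(decoded, vec![Some(1), None, Some(2)]);
+/// ```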
+#[derive(Debug, Clone)] +pub struct OptionalValues>, I: Iterator> { + validity: V, + values: I, +} + +impl>, I: Iterator> OptionalValues { + pub fn new(validity: V, values: I) -> Self { + Self { validity, values } + } +} + +impl>, I: Iterator> Iterator + for OptionalValues +{ + type Item = Result, Error>; + + #[inline] + fn next(&mut self) -> Option { + self.validity + .next() + .map(|x| x.map(|x| if x { self.values.next() } else { None })) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.validity.size_hint() + } +} + +/// An iterator adapter that converts an iterator over items into an iterator over slices of +/// those N items. +/// +/// This iterator is best used with iterators that implement `nth` since skipping items +/// allows this iterator to skip sequences of items without having to call each of them. +#[derive(Debug, Clone)] +pub struct SliceFilteredIter { + iter: I, + selected_rows: VecDeque, + current_remaining: usize, + current: usize, // position in the slice + total_length: usize, +} + +impl SliceFilteredIter { + /// Return a new [`SliceFilteredIter`] + pub fn new(iter: I, selected_rows: VecDeque) -> Self { + let total_length = selected_rows.iter().map(|i| i.length).sum(); + Self { + iter, + selected_rows, + current_remaining: 0, + current: 0, + total_length, + } + } +} + +impl> Iterator for SliceFilteredIter { + type Item = T; + + #[inline] + fn next(&mut self) -> Option { + if self.current_remaining == 0 { + if let Some(interval) = self.selected_rows.pop_front() { + // skip the hole between the previous start and this start + // (start + length) - start + let item = self.iter.nth(interval.start - self.current); + self.current = interval.start + interval.length; + self.current_remaining = interval.length - 1; + self.total_length -= 1; + item + } else { + None + } + } else { + self.current_remaining -= 1; + self.total_length -= 1; + self.iter.next() + } + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + (self.total_length, Some(self.total_length)) + } +} + +#[cfg(test)] +mod test { + use std::collections::VecDeque; + + use super::*; + + #[test] + fn basic() { + let iter = 0..=100; + + let intervals = vec![ + Interval::new(0, 2), + Interval::new(20, 11), + Interval::new(31, 1), + ]; + + let a: VecDeque = intervals.clone().into_iter().collect(); + let mut a = SliceFilteredIter::new(iter, a); + + let expected: Vec = intervals + .into_iter() + .flat_map(|interval| interval.start..(interval.start + interval.length)) + .collect(); + + assert_eq!(expected, a.by_ref().collect::>()); + assert_eq!((0, Some(0)), a.size_hint()); + } +} diff --git a/crates/polars-parquet/src/parquet/encoding/bitpacked/decode.rs b/crates/polars-parquet/src/parquet/encoding/bitpacked/decode.rs new file mode 100644 index 000000000000..ea7bde3fd45b --- /dev/null +++ b/crates/polars-parquet/src/parquet/encoding/bitpacked/decode.rs @@ -0,0 +1,211 @@ +use super::{Packed, Unpackable, Unpacked}; +use crate::parquet::error::Error; + +/// An [`Iterator`] of [`Unpackable`] unpacked from a bitpacked slice of bytes. +/// # Implementation +/// This iterator unpacks bytes in chunks and does not allocate. +#[derive(Debug, Clone)] +pub struct Decoder<'a, T: Unpackable> { + packed: std::slice::Chunks<'a, u8>, + num_bits: usize, + remaining: usize, // in number of items + current_pack_index: usize, // invariant: < T::PACK_LENGTH + unpacked: T::Unpacked, // has the current unpacked values. 
+} + +#[inline] +fn decode_pack(packed: &[u8], num_bits: usize, unpacked: &mut T::Unpacked) { + if packed.len() < T::Unpacked::LENGTH * num_bits / 8 { + let mut buf = T::Packed::zero(); + buf.as_mut()[..packed.len()].copy_from_slice(packed); + T::unpack(buf.as_ref(), num_bits, unpacked) + } else { + T::unpack(packed, num_bits, unpacked) + } +} + +impl<'a, T: Unpackable> Decoder<'a, T> { + /// Returns a [`Decoder`] with `T` encoded in `packed` with `num_bits`. + pub fn try_new(packed: &'a [u8], num_bits: usize, mut length: usize) -> Result { + let block_size = std::mem::size_of::() * num_bits; + + if num_bits == 0 { + return Err(Error::oos("Bitpacking requires num_bits > 0")); + } + + if packed.len() * 8 < length * num_bits { + return Err(Error::oos(format!( + "Unpacking {length} items with a number of bits {num_bits} requires at least {} bytes.", + length * num_bits / 8 + ))); + } + + let mut packed = packed.chunks(block_size); + let mut unpacked = T::Unpacked::zero(); + if let Some(chunk) = packed.next() { + decode_pack::(chunk, num_bits, &mut unpacked); + } else { + length = 0 + }; + + Ok(Self { + remaining: length, + packed, + num_bits, + unpacked, + current_pack_index: 0, + }) + } +} + +impl<'a, T: Unpackable> Iterator for Decoder<'a, T> { + type Item = T; + + #[inline] // -71% improvement in bench + fn next(&mut self) -> Option { + if self.remaining == 0 { + return None; + } + let result = self.unpacked[self.current_pack_index]; + self.current_pack_index += 1; + self.remaining -= 1; + if self.current_pack_index == T::Unpacked::LENGTH { + if let Some(packed) = self.packed.next() { + decode_pack::(packed, self.num_bits, &mut self.unpacked); + self.current_pack_index = 0; + } + } + Some(result) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + (self.remaining, Some(self.remaining)) + } +} + +#[cfg(test)] +mod tests { + use super::super::tests::case1; + use super::*; + + #[test] + fn test_decode_rle() { + // Test data: 0-7 with bit width 3 + // 0: 000 + // 1: 001 + // 2: 010 + // 3: 011 + // 4: 100 + // 5: 101 + // 6: 110 + // 7: 111 + let num_bits = 3; + let length = 8; + // encoded: 0b10001000u8, 0b11000110, 0b11111010 + let data = vec![0b10001000u8, 0b11000110, 0b11111010]; + + let decoded = Decoder::::try_new(&data, num_bits, length) + .unwrap() + .collect::>(); + assert_eq!(decoded, vec![0, 1, 2, 3, 4, 5, 6, 7]); + } + + #[test] + fn decode_large() { + let (num_bits, expected, data) = case1(); + + let decoded = Decoder::::try_new(&data, num_bits, expected.len()) + .unwrap() + .collect::>(); + assert_eq!(decoded, expected); + } + + #[test] + fn test_decode_bool() { + let num_bits = 1; + let length = 8; + let data = vec![0b10101010]; + + let decoded = Decoder::::try_new(&data, num_bits, length) + .unwrap() + .collect::>(); + assert_eq!(decoded, vec![0, 1, 0, 1, 0, 1, 0, 1]); + } + + #[test] + fn test_decode_u64() { + let num_bits = 1; + let length = 8; + let data = vec![0b10101010]; + + let decoded = Decoder::::try_new(&data, num_bits, length) + .unwrap() + .collect::>(); + assert_eq!(decoded, vec![0, 1, 0, 1, 0, 1, 0, 1]); + } + + #[test] + fn even_case() { + // [0, 1, 2, 3, 4, 5, 6, 0]x99 + let data = &[0b10001000u8, 0b11000110, 0b00011010]; + let num_bits = 3; + let copies = 99; // 8 * 99 % 32 != 0 + let expected = std::iter::repeat(&[0u32, 1, 2, 3, 4, 5, 6, 0]) + .take(copies) + .flatten() + .copied() + .collect::>(); + let data = std::iter::repeat(data) + .take(copies) + .flatten() + .copied() + .collect::>(); + let length = expected.len(); + + let decoded = 
Decoder::::try_new(&data, num_bits, length) + .unwrap() + .collect::>(); + assert_eq!(decoded, expected); + } + + #[test] + fn odd_case() { + // [0, 1, 2, 3, 4, 5, 6, 0]x4 + [2] + let data = &[0b10001000u8, 0b11000110, 0b00011010]; + let num_bits = 3; + let copies = 4; + let expected = std::iter::repeat(&[0u32, 1, 2, 3, 4, 5, 6, 0]) + .take(copies) + .flatten() + .copied() + .chain(std::iter::once(2)) + .collect::>(); + let data = std::iter::repeat(data) + .take(copies) + .flatten() + .copied() + .chain(std::iter::once(0b00000010u8)) + .collect::>(); + let length = expected.len(); + + let decoded = Decoder::::try_new(&data, num_bits, length) + .unwrap() + .collect::>(); + assert_eq!(decoded, expected); + } + + #[test] + fn test_errors() { + // zero length + assert!(Decoder::::try_new(&[], 1, 0).is_ok()); + // no bytes + assert!(Decoder::::try_new(&[], 1, 1).is_err()); + // too few bytes + assert!(Decoder::::try_new(&[1], 1, 8).is_ok()); + assert!(Decoder::::try_new(&[1, 1], 2, 8).is_ok()); + assert!(Decoder::::try_new(&[1], 1, 9).is_err()); + // zero num_bits + assert!(Decoder::::try_new(&[1], 0, 1).is_err()); + } +} diff --git a/crates/polars-parquet/src/parquet/encoding/bitpacked/encode.rs b/crates/polars-parquet/src/parquet/encoding/bitpacked/encode.rs new file mode 100644 index 000000000000..904ff796dd34 --- /dev/null +++ b/crates/polars-parquet/src/parquet/encoding/bitpacked/encode.rs @@ -0,0 +1,54 @@ +use std::convert::TryInto; + +use super::{Packed, Unpackable, Unpacked}; + +/// Encodes (packs) a slice of [`Unpackable`] into bitpacked bytes `packed`, using `num_bits` per value. +/// +/// This function assumes that the maximum value in `unpacked` fits in `num_bits` bits +/// and saturates higher values. +/// +/// Only the first `ceil8(unpacked.len() * num_bits)` of `packed` are populated. +pub fn encode(unpacked: &[T], num_bits: usize, packed: &mut [u8]) { + let chunks = unpacked.chunks_exact(T::Unpacked::LENGTH); + + let remainder = chunks.remainder(); + + let packed_size = (T::Unpacked::LENGTH * num_bits + 7) / 8; + if !remainder.is_empty() { + let packed_chunks = packed.chunks_mut(packed_size); + let mut last_chunk = T::Unpacked::zero(); + for i in 0..remainder.len() { + last_chunk[i] = remainder[i] + } + + chunks + .chain(std::iter::once(last_chunk.as_ref())) + .zip(packed_chunks) + .for_each(|(unpacked, packed)| { + T::pack(&unpacked.try_into().unwrap(), num_bits, packed); + }); + } else { + let packed_chunks = packed.chunks_exact_mut(packed_size); + chunks.zip(packed_chunks).for_each(|(unpacked, packed)| { + T::pack(&unpacked.try_into().unwrap(), num_bits, packed); + }); + } +} + +/// Encodes (packs) a potentially incomplete pack of [`Unpackable`] into bitpacked +/// bytes `packed`, using `num_bits` per value. +/// +/// This function assumes that the maximum value in `unpacked` fits in `num_bits` bits +/// and saturates higher values. +/// +/// Only the first `ceil8(unpacked.len() * num_bits)` of `packed` are populated. 
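+///
+/// A rough sketch (illustrative only): packing an incomplete pack of four `u32` values at
+/// 2 bits each. The output buffer must still hold a full pack, i.e. `32 * 2 / 8 = 8` bytes.
+/// ```ignore
+/// let unpacked = [0u32, 1, 2, 3]; // fewer than a full pack of 32 values
+/// let mut packed = vec![0u8; 8];
+/// encode_pack(&unpacked, 2, &mut packed);
+/// assert_eq!(packed[0], 0b11100100); // values are packed LSB-first
+/// ```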
+#[inline] +pub fn encode_pack(unpacked: &[T], num_bits: usize, packed: &mut [u8]) { + if unpacked.len() < T::Packed::LENGTH { + let mut complete_unpacked = T::Unpacked::zero(); + complete_unpacked.as_mut()[..unpacked.len()].copy_from_slice(unpacked); + T::pack(&complete_unpacked, num_bits, packed) + } else { + T::pack(&unpacked.try_into().unwrap(), num_bits, packed) + } +} diff --git a/crates/polars-parquet/src/parquet/encoding/bitpacked/mod.rs b/crates/polars-parquet/src/parquet/encoding/bitpacked/mod.rs new file mode 100644 index 000000000000..a05ca2040431 --- /dev/null +++ b/crates/polars-parquet/src/parquet/encoding/bitpacked/mod.rs @@ -0,0 +1,220 @@ +mod decode; +mod encode; +mod pack; +mod unpack; + +pub use decode::Decoder; +pub use encode::{encode, encode_pack}; + +/// A byte slice (e.g. `[u8; 8]`) denoting types that represent complete packs. +pub trait Packed: + Copy + + Sized + + AsRef<[u8]> + + AsMut<[u8]> + + std::ops::IndexMut + + for<'a> TryFrom<&'a [u8]> +{ + const LENGTH: usize; + fn zero() -> Self; +} + +impl Packed for [u8; 8] { + const LENGTH: usize = 8; + #[inline] + fn zero() -> Self { + [0; 8] + } +} + +impl Packed for [u8; 16 * 2] { + const LENGTH: usize = 16 * 2; + #[inline] + fn zero() -> Self { + [0; 16 * 2] + } +} + +impl Packed for [u8; 32 * 4] { + const LENGTH: usize = 32 * 4; + #[inline] + fn zero() -> Self { + [0; 32 * 4] + } +} + +impl Packed for [u8; 64 * 64] { + const LENGTH: usize = 64 * 64; + #[inline] + fn zero() -> Self { + [0; 64 * 64] + } +} + +/// A byte slice of [`Unpackable`] denoting complete unpacked arrays. +pub trait Unpacked: + Copy + + Sized + + AsRef<[T]> + + AsMut<[T]> + + std::ops::Index + + std::ops::IndexMut + + for<'a> TryFrom<&'a [T], Error = std::array::TryFromSliceError> +{ + const LENGTH: usize; + fn zero() -> Self; +} + +impl Unpacked for [u8; 8] { + const LENGTH: usize = 8; + #[inline] + fn zero() -> Self { + [0; 8] + } +} + +impl Unpacked for [u16; 16] { + const LENGTH: usize = 16; + #[inline] + fn zero() -> Self { + [0; 16] + } +} + +impl Unpacked for [u32; 32] { + const LENGTH: usize = 32; + #[inline] + fn zero() -> Self { + [0; 32] + } +} + +impl Unpacked for [u64; 64] { + const LENGTH: usize = 64; + #[inline] + fn zero() -> Self { + [0; 64] + } +} + +/// A type representing a type that can be bitpacked and unpacked by this crate. 
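+///
+/// A roundtrip sketch (illustrative only) for `u32` at 3 bits per value:
+/// ```ignore
+/// let unpacked = [7u32; 32];
+/// let mut packed = [0u8; 32 * 4];
+/// <u32 as Unpackable>::pack(&unpacked, 3, &mut packed);
+/// let mut roundtrip = [0u32; 32];
+/// <u32 as Unpackable>::unpack(&packed, 3, &mut roundtrip);
+/// assert_eq!(roundtrip, unpacked);
+/// ```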
+pub trait Unpackable: Copy + Sized + Default { + type Packed: Packed; + type Unpacked: Unpacked; + fn unpack(packed: &[u8], num_bits: usize, unpacked: &mut Self::Unpacked); + fn pack(unpacked: &Self::Unpacked, num_bits: usize, packed: &mut [u8]); +} + +impl Unpackable for u8 { + type Packed = [u8; 8]; + type Unpacked = [u8; 8]; + + #[inline] + fn unpack(packed: &[u8], num_bits: usize, unpacked: &mut Self::Unpacked) { + unpack::unpack8(packed, unpacked, num_bits) + } + + #[inline] + fn pack(packed: &Self::Unpacked, num_bits: usize, unpacked: &mut [u8]) { + pack::pack8(packed, unpacked, num_bits) + } +} + +impl Unpackable for u16 { + type Packed = [u8; 16 * 2]; + type Unpacked = [u16; 16]; + + #[inline] + fn unpack(packed: &[u8], num_bits: usize, unpacked: &mut Self::Unpacked) { + unpack::unpack16(packed, unpacked, num_bits) + } + + #[inline] + fn pack(packed: &Self::Unpacked, num_bits: usize, unpacked: &mut [u8]) { + pack::pack16(packed, unpacked, num_bits) + } +} + +impl Unpackable for u32 { + type Packed = [u8; 32 * 4]; + type Unpacked = [u32; 32]; + + #[inline] + fn unpack(packed: &[u8], num_bits: usize, unpacked: &mut Self::Unpacked) { + unpack::unpack32(packed, unpacked, num_bits) + } + + #[inline] + fn pack(packed: &Self::Unpacked, num_bits: usize, unpacked: &mut [u8]) { + pack::pack32(packed, unpacked, num_bits) + } +} + +impl Unpackable for u64 { + type Packed = [u8; 64 * 64]; + type Unpacked = [u64; 64]; + + #[inline] + fn unpack(packed: &[u8], num_bits: usize, unpacked: &mut Self::Unpacked) { + unpack::unpack64(packed, unpacked, num_bits) + } + + #[inline] + fn pack(packed: &Self::Unpacked, num_bits: usize, unpacked: &mut [u8]) { + pack::pack64(packed, unpacked, num_bits) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + pub fn case1() -> (usize, Vec, Vec) { + let num_bits = 3; + let compressed = vec![ + 0b10001000u8, + 0b11000110, + 0b11111010, + 0b10001000u8, + 0b11000110, + 0b11111010, + 0b10001000u8, + 0b11000110, + 0b11111010, + 0b10001000u8, + 0b11000110, + 0b11111010, + 0b10001000u8, + 0b11000110, + 0b11111010, + ]; + let decompressed = vec![ + 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, + 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, + ]; + (num_bits, decompressed, compressed) + } + + #[test] + fn encode_large() { + let (num_bits, unpacked, expected) = case1(); + let mut packed = vec![0u8; 4 * 32]; + + encode(&unpacked, num_bits, &mut packed); + assert_eq!(&packed[..15], expected); + } + + #[test] + fn test_encode() { + let num_bits = 3; + let unpacked = vec![0, 1, 2, 3, 4, 5, 6, 7]; + + let mut packed = vec![0u8; 4 * 32]; + + encode::(&unpacked, num_bits, &mut packed); + + let expected = vec![0b10001000u8, 0b11000110, 0b11111010]; + + assert_eq!(&packed[..3], expected); + } +} diff --git a/crates/polars-parquet/src/parquet/encoding/bitpacked/pack.rs b/crates/polars-parquet/src/parquet/encoding/bitpacked/pack.rs new file mode 100644 index 000000000000..55183d36d641 --- /dev/null +++ b/crates/polars-parquet/src/parquet/encoding/bitpacked/pack.rs @@ -0,0 +1,108 @@ +/// Macro that generates a packing function taking the number of bits as a const generic +macro_rules! 
pack_impl { + ($t:ty, $bytes:literal, $bits:tt) => { + pub fn pack(input: &[$t; $bits], output: &mut [u8]) { + if NUM_BITS == 0 { + for out in output { + *out = 0; + } + return; + } + assert!(NUM_BITS <= $bytes * 8); + assert!(output.len() >= NUM_BITS * $bytes); + + let mask = match NUM_BITS { + $bits => <$t>::MAX, + _ => ((1 << NUM_BITS) - 1), + }; + + for i in 0..$bits { + let start_bit = i * NUM_BITS; + let end_bit = start_bit + NUM_BITS; + + let start_bit_offset = start_bit % $bits; + let end_bit_offset = end_bit % $bits; + let start_byte = start_bit / $bits; + let end_byte = end_bit / $bits; + if start_byte != end_byte && end_bit_offset != 0 { + let a = input[i] << start_bit_offset; + let val_a = <$t>::to_le_bytes(a); + for i in 0..$bytes { + output[start_byte * $bytes + i] |= val_a[i] + } + + let b = (input[i] >> (NUM_BITS - end_bit_offset)) & mask; + let val_b = <$t>::to_le_bytes(b); + for i in 0..$bytes { + output[end_byte * $bytes + i] |= val_b[i] + } + } else { + let val = (input[i] & mask) << start_bit_offset; + let val = <$t>::to_le_bytes(val); + + for i in 0..$bytes { + output[start_byte * $bytes + i] |= val[i] + } + } + } + } + }; +} + +/// Macro that generates pack functions that accept num_bits as a parameter +macro_rules! pack { + ($name:ident, $t:ty, $bytes:literal, $bits:tt) => { + mod $name { + pack_impl!($t, $bytes, $bits); + } + + /// Pack unpacked `input` into `output` with a bit width of `num_bits` + pub fn $name(input: &[$t; $bits], output: &mut [u8], num_bits: usize) { + // This will get optimised into a jump table + seq_macro::seq!(i in 0..=$bits { + if i == num_bits { + return $name::pack::(input, output); + } + }); + unreachable!("invalid num_bits {}", num_bits); + } + }; +} + +pack!(pack8, u8, 1, 8); +pack!(pack16, u16, 2, 16); +pack!(pack32, u32, 4, 32); +pack!(pack64, u64, 8, 64); + +#[cfg(test)] +mod tests { + use super::super::unpack::*; + use super::*; + + #[test] + fn test_basic() { + let input = [0u16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; + for num_bits in 4..16 { + let mut output = [0u8; 16 * 2]; + pack16(&input, &mut output, num_bits); + let mut other = [0u16; 16]; + unpack16(&output, &mut other, num_bits); + assert_eq!(other, input); + } + } + + #[test] + fn test_u32() { + let input = [ + 0u32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0u32, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, + ]; + for num_bits in 4..32 { + let mut output = [0u8; 32 * 4]; + pack32(&input, &mut output, num_bits); + let mut other = [0u32; 32]; + unpack32(&output, &mut other, num_bits); + assert_eq!(other, input); + } + } +} diff --git a/crates/polars-parquet/src/parquet/encoding/bitpacked/unpack.rs b/crates/polars-parquet/src/parquet/encoding/bitpacked/unpack.rs new file mode 100644 index 000000000000..061b3acef333 --- /dev/null +++ b/crates/polars-parquet/src/parquet/encoding/bitpacked/unpack.rs @@ -0,0 +1,137 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Copied from https://github.com/apache/arrow-rs/blob/6859efa690d4c9530cf8a24053bc6ed81025a164/parquet/src/util/bit_pack.rs + +/// Macro that generates an unpack function taking the number of bits as a const generic +macro_rules! unpack_impl { + ($t:ty, $bytes:literal, $bits:tt) => { + pub fn unpack(input: &[u8], output: &mut [$t; $bits]) { + if NUM_BITS == 0 { + for out in output { + *out = 0; + } + return; + } + + assert!(NUM_BITS <= $bytes * 8); + + let mask = match NUM_BITS { + $bits => <$t>::MAX, + _ => ((1 << NUM_BITS) - 1), + }; + + assert!(input.len() >= NUM_BITS * $bytes); + + let r = |output_idx: usize| { + <$t>::from_le_bytes( + input[output_idx * $bytes..output_idx * $bytes + $bytes] + .try_into() + .unwrap(), + ) + }; + + seq_macro::seq!(i in 0..$bits { + let start_bit = i * NUM_BITS; + let end_bit = start_bit + NUM_BITS; + + let start_bit_offset = start_bit % $bits; + let end_bit_offset = end_bit % $bits; + let start_byte = start_bit / $bits; + let end_byte = end_bit / $bits; + if start_byte != end_byte && end_bit_offset != 0 { + let val = r(start_byte); + let a = val >> start_bit_offset; + let val = r(end_byte); + let b = val << (NUM_BITS - end_bit_offset); + + output[i] = a | (b & mask); + } else { + let val = r(start_byte); + output[i] = (val >> start_bit_offset) & mask; + } + }); + } + }; +} + +/// Macro that generates unpack functions that accept num_bits as a parameter +macro_rules! 
unpack { + ($name:ident, $t:ty, $bytes:literal, $bits:tt) => { + mod $name { + unpack_impl!($t, $bytes, $bits); + } + + /// Unpack packed `input` into `output` with a bit width of `num_bits` + pub fn $name(input: &[u8], output: &mut [$t; $bits], num_bits: usize) { + // This will get optimised into a jump table + seq_macro::seq!(i in 0..=$bits { + if i == num_bits { + return $name::unpack::(input, output); + } + }); + unreachable!("invalid num_bits {}", num_bits); + } + }; +} + +unpack!(unpack8, u8, 1, 8); +unpack!(unpack16, u16, 2, 16); +unpack!(unpack32, u32, 4, 32); +unpack!(unpack64, u64, 8, 64); + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + let input = [0xFF; 4096]; + + for i in 0..=8 { + let mut output = [0; 8]; + unpack8(&input, &mut output, i); + for (idx, out) in output.iter().enumerate() { + assert_eq!(out.trailing_ones() as usize, i, "out[{}] = {}", idx, out); + } + } + + for i in 0..=16 { + let mut output = [0; 16]; + unpack16(&input, &mut output, i); + for (idx, out) in output.iter().enumerate() { + assert_eq!(out.trailing_ones() as usize, i, "out[{}] = {}", idx, out); + } + } + + for i in 0..=32 { + let mut output = [0; 32]; + unpack32(&input, &mut output, i); + for (idx, out) in output.iter().enumerate() { + assert_eq!(out.trailing_ones() as usize, i, "out[{}] = {}", idx, out); + } + } + + for i in 0..=64 { + let mut output = [0; 64]; + unpack64(&input, &mut output, i); + for (idx, out) in output.iter().enumerate() { + assert_eq!(out.trailing_ones() as usize, i, "out[{}] = {}", idx, out); + } + } + } +} diff --git a/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/decoder.rs b/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/decoder.rs new file mode 100644 index 000000000000..378706541e55 --- /dev/null +++ b/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/decoder.rs @@ -0,0 +1,362 @@ +use super::super::{bitpacked, uleb128, zigzag_leb128}; +use crate::parquet::encoding::ceil8; +use crate::parquet::error::Error; + +/// An [`Iterator`] of [`i64`] +#[derive(Debug)] +struct Block<'a> { + // this is the minimum delta that must be added to every value. + min_delta: i64, + _num_mini_blocks: usize, + /// Number of values that each mini block has. + values_per_mini_block: usize, + bitwidths: std::slice::Iter<'a, u8>, + values: &'a [u8], + remaining: usize, // number of elements + current_index: usize, // invariant: < values_per_mini_block + // None represents a relative delta of zero, in which case there is no miniblock. + current_miniblock: Option>, + // number of bytes consumed. 
+ consumed_bytes: usize, +} + +impl<'a> Block<'a> { + pub fn try_new( + mut values: &'a [u8], + num_mini_blocks: usize, + values_per_mini_block: usize, + length: usize, + ) -> Result { + let length = std::cmp::min(length, num_mini_blocks * values_per_mini_block); + + let mut consumed_bytes = 0; + let (min_delta, consumed) = zigzag_leb128::decode(values)?; + consumed_bytes += consumed; + values = &values[consumed..]; + + if num_mini_blocks > values.len() { + return Err(Error::oos( + "Block must contain at least num_mini_blocks bytes (the bitwidths)", + )); + } + let (bitwidths, remaining) = values.split_at(num_mini_blocks); + consumed_bytes += num_mini_blocks; + values = remaining; + + let mut block = Block { + min_delta, + _num_mini_blocks: num_mini_blocks, + values_per_mini_block, + bitwidths: bitwidths.iter(), + remaining: length, + values, + current_index: 0, + current_miniblock: None, + consumed_bytes, + }; + + // Set up first mini-block + block.advance_miniblock()?; + + Ok(block) + } + + fn advance_miniblock(&mut self) -> Result<(), Error> { + // unwrap is ok: we sliced it by num_mini_blocks in try_new + let num_bits = self.bitwidths.next().copied().unwrap() as usize; + + self.current_miniblock = if num_bits > 0 { + let length = std::cmp::min(self.remaining, self.values_per_mini_block); + + let miniblock_length = ceil8(self.values_per_mini_block * num_bits); + if miniblock_length > self.values.len() { + return Err(Error::oos( + "block must contain at least miniblock_length bytes (the mini block)", + )); + } + let (miniblock, remainder) = self.values.split_at(miniblock_length); + + self.values = remainder; + self.consumed_bytes += miniblock_length; + + Some(bitpacked::Decoder::try_new(miniblock, num_bits, length).unwrap()) + } else { + None + }; + self.current_index = 0; + + Ok(()) + } +} + +impl<'a> Iterator for Block<'a> { + type Item = Result; + + fn next(&mut self) -> Option { + if self.remaining == 0 { + return None; + } + let result = self.min_delta + + self + .current_miniblock + .as_mut() + .map(|x| x.next().unwrap_or_default()) + .unwrap_or(0) as i64; + self.current_index += 1; + self.remaining -= 1; + + if self.remaining > 0 && self.current_index == self.values_per_mini_block { + if let Err(e) = self.advance_miniblock() { + return Some(Err(e)); + } + } + + Some(Ok(result)) + } +} + +/// Decoder of parquets' `DELTA_BINARY_PACKED`. Implements `Iterator`. +/// # Implementation +/// This struct does not allocate on the heap. 
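+/// # Example
+/// A sketch mirroring the Spark-validated buffer in the tests below
+/// (block size 128, 4 mini-blocks, 5 values, first value 1, constant delta 1):
+/// ```ignore
+/// let data = &[128, 1, 4, 5, 2, 2, 0, 0, 0, 0];
+/// let decoded = Decoder::try_new(data)?.collect::<Result<Vec<i64>, _>>()?;
+/// assert_eq!(decoded, vec![1, 2, 3, 4, 5]);
+/// ```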
+#[derive(Debug)] +pub struct Decoder<'a> { + num_mini_blocks: usize, + values_per_mini_block: usize, + values_remaining: usize, + next_value: i64, + values: &'a [u8], + current_block: Option>, + // the total number of bytes consumed up to a given point, excluding the bytes on the current_block + consumed_bytes: usize, +} + +impl<'a> Decoder<'a> { + pub fn try_new(mut values: &'a [u8]) -> Result { + let mut consumed_bytes = 0; + let (block_size, consumed) = uleb128::decode(values)?; + consumed_bytes += consumed; + assert_eq!(block_size % 128, 0); + values = &values[consumed..]; + let (num_mini_blocks, consumed) = uleb128::decode(values)?; + let num_mini_blocks = num_mini_blocks as usize; + consumed_bytes += consumed; + values = &values[consumed..]; + let (total_count, consumed) = uleb128::decode(values)?; + let total_count = total_count as usize; + consumed_bytes += consumed; + values = &values[consumed..]; + let (first_value, consumed) = zigzag_leb128::decode(values)?; + consumed_bytes += consumed; + values = &values[consumed..]; + + let values_per_mini_block = block_size as usize / num_mini_blocks; + assert_eq!(values_per_mini_block % 8, 0); + + // If we only have one value (first_value), there are no blocks. + let current_block = if total_count > 1 { + Some(Block::try_new( + values, + num_mini_blocks, + values_per_mini_block, + total_count - 1, + )?) + } else { + None + }; + + Ok(Self { + num_mini_blocks, + values_per_mini_block, + values_remaining: total_count, + next_value: first_value, + values, + current_block, + consumed_bytes, + }) + } + + /// Returns the total number of bytes consumed up to this point by [`Decoder`]. + pub fn consumed_bytes(&self) -> usize { + self.consumed_bytes + self.current_block.as_ref().map_or(0, |b| b.consumed_bytes) + } + + fn load_delta(&mut self) -> Result { + // At this point we must have at least one block and value available + let current_block = self.current_block.as_mut().unwrap(); + if let Some(x) = current_block.next() { + x + } else { + // load next block + self.values = &self.values[current_block.consumed_bytes..]; + self.consumed_bytes += current_block.consumed_bytes; + + let next_block = Block::try_new( + self.values, + self.num_mini_blocks, + self.values_per_mini_block, + self.values_remaining, + ); + match next_block { + Ok(mut next_block) => { + let delta = next_block + .next() + .ok_or_else(|| Error::oos("Missing block"))?; + self.current_block = Some(next_block); + delta + }, + Err(e) => Err(e), + } + } + } +} + +impl<'a> Iterator for Decoder<'a> { + type Item = Result; + + fn next(&mut self) -> Option { + if self.values_remaining == 0 { + return None; + } + + let result = Some(Ok(self.next_value)); + + self.values_remaining -= 1; + if self.values_remaining == 0 { + // do not try to load another block + return result; + } + + let delta = match self.load_delta() { + Ok(delta) => delta, + Err(e) => return Some(Err(e)), + }; + + self.next_value += delta; + result + } + + fn size_hint(&self) -> (usize, Option) { + (self.values_remaining, Some(self.values_remaining)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn single_value() { + // Generated by parquet-rs + // + // header: [128, 1, 4, 1, 2] + // block size: 128, 1 + // mini-blocks: 4 + // elements: 1 + // first_value: 2 <=z> 1 + let data = &[128, 1, 4, 1, 2]; + + let mut decoder = Decoder::try_new(data).unwrap(); + let r = decoder.by_ref().collect::, _>>().unwrap(); + + assert_eq!(&r[..], &[1]); + assert_eq!(decoder.consumed_bytes(), 5); + } + + #[test] + fn 
test_from_spec() { + let expected = (1..=5).collect::>(); + // VALIDATED FROM SPARK==3.1.1 + // header: [128, 1, 4, 5, 2] + // block size: 128, 1 + // mini-blocks: 4 + // elements: 5 + // first_value: 2 <=z> 1 + // block1: [2, 0, 0, 0, 0] + // min_delta: 2 <=z> 1 + // bit_width: 0 + let data = &[128, 1, 4, 5, 2, 2, 0, 0, 0, 0]; + + let mut decoder = Decoder::try_new(data).unwrap(); + let r = decoder.by_ref().collect::, _>>().unwrap(); + + assert_eq!(expected, r); + + assert_eq!(decoder.consumed_bytes(), 10); + } + + #[test] + fn case2() { + let expected = vec![1, 2, 3, 4, 5, 1]; + // VALIDATED FROM SPARK==3.1.1 + // header: [128, 1, 4, 6, 2] + // block size: 128, 1 <=u> 128 + // mini-blocks: 4 <=u> 4 + // elements: 6 <=u> 6 + // first_value: 2 <=z> 1 + // block1: [7, 3, 0, 0, 0] + // min_delta: 7 <=z> -4 + // bit_widths: [3, 0, 0, 0] + // values: [ + // 0b01101101 + // 0b00001011 + // ... + // ] <=b> [3, 3, 3, 3, 0] + let data = &[ + 128, 1, 4, 6, 2, 7, 3, 0, 0, 0, 0b01101101, 0b00001011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // these should not be consumed + 1, 2, 3, + ]; + + let mut decoder = Decoder::try_new(data).unwrap(); + let r = decoder.by_ref().collect::, _>>().unwrap(); + + assert_eq!(expected, r); + assert_eq!(decoder.consumed_bytes(), data.len() - 3); + } + + #[test] + fn multiple_miniblocks() { + #[rustfmt::skip] + let data = &[ + // Header: [128, 1, 4, 65, 100] + 128, 1, // block size <=u> 128 + 4, // number of mini-blocks <=u> 4 + 65, // number of elements <=u> 65 + 100, // first_value <=z> 50 + + // Block 1 header: [7, 3, 4, 0, 0] + 7, // min_delta <=z> -4 + 3, 4, 255, 0, // bit_widths (255 should not be used as only two miniblocks are needed) + + // 32 3-bit values of 0 for mini-block 1 (12 bytes) + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + // 32 4-bit values of 8 for mini-block 2 (16 bytes) + 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, + 0x88, 0x88, + + // these should not be consumed + 1, 2, 3, + ]; + + #[rustfmt::skip] + let expected = [ + // First value + 50, + + // Mini-block 1: 32 deltas of -4 + 46, 42, 38, 34, 30, 26, 22, 18, 14, 10, 6, 2, -2, -6, -10, -14, -18, -22, -26, -30, -34, + -38, -42, -46, -50, -54, -58, -62, -66, -70, -74, -78, + + // Mini-block 2: 32 deltas of 4 + -74, -70, -66, -62, -58, -54, -50, -46, -42, -38, -34, -30, -26, -22, -18, -14, -10, -6, + -2, 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, + ]; + + let mut decoder = Decoder::try_new(data).unwrap(); + let r = decoder.by_ref().collect::, _>>().unwrap(); + + assert_eq!(&expected[..], &r[..]); + assert_eq!(decoder.consumed_bytes(), data.len() - 3); + } +} diff --git a/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/encoder.rs b/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/encoder.rs new file mode 100644 index 000000000000..9bdb861504d1 --- /dev/null +++ b/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/encoder.rs @@ -0,0 +1,122 @@ +use super::super::{bitpacked, uleb128, zigzag_leb128}; +use crate::parquet::encoding::ceil8; + +/// Encodes an iterator of `i64` according to parquet's `DELTA_BINARY_PACKED`. +/// # Implementation +/// * This function does not allocate on the heap. +/// * The number of mini-blocks is always 1. This may change in the future. 
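+///
+/// A sketch mirroring the `constant_delta` test below:
+/// ```ignore
+/// let mut buffer = vec![];
+/// encode(1..=5i64, &mut buffer);
+/// assert_eq!(buffer, vec![128u8, 1, 1, 5, 2, 2, 0]);
+/// ```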
+pub fn encode>(mut iterator: I, buffer: &mut Vec) { + let block_size = 128; + let mini_blocks = 1; + + let mut container = [0u8; 10]; + let encoded_len = uleb128::encode(block_size, &mut container); + buffer.extend_from_slice(&container[..encoded_len]); + + let encoded_len = uleb128::encode(mini_blocks, &mut container); + buffer.extend_from_slice(&container[..encoded_len]); + + let length = iterator.size_hint().1.unwrap(); + let encoded_len = uleb128::encode(length as u64, &mut container); + buffer.extend_from_slice(&container[..encoded_len]); + + let mut values = [0i64; 128]; + let mut deltas = [0u64; 128]; + + let first_value = iterator.next().unwrap_or_default(); + let (container, encoded_len) = zigzag_leb128::encode(first_value); + buffer.extend_from_slice(&container[..encoded_len]); + + let mut prev = first_value; + let mut length = iterator.size_hint().1.unwrap(); + while length != 0 { + let mut min_delta = i64::MAX; + let mut max_delta = i64::MIN; + let mut num_bits = 0; + for (i, integer) in (0..128).zip(&mut iterator) { + let delta = integer - prev; + min_delta = min_delta.min(delta); + max_delta = max_delta.max(delta); + + num_bits = 64 - (max_delta - min_delta).leading_zeros(); + values[i] = delta; + prev = integer; + } + let consumed = std::cmp::min(length - iterator.size_hint().1.unwrap(), 128); + length = iterator.size_hint().1.unwrap(); + let values = &values[..consumed]; + + values.iter().zip(deltas.iter_mut()).for_each(|(v, delta)| { + *delta = (v - min_delta) as u64; + }); + + // + let (container, encoded_len) = zigzag_leb128::encode(min_delta); + buffer.extend_from_slice(&container[..encoded_len]); + + // one miniblock => 1 byte + buffer.push(num_bits as u8); + write_miniblock(buffer, num_bits as usize, deltas); + } +} + +fn write_miniblock(buffer: &mut Vec, num_bits: usize, deltas: [u64; 128]) { + if num_bits > 0 { + let start = buffer.len(); + + // bitpack encode all (deltas.len = 128 which is a multiple of 32) + let bytes_needed = start + ceil8(deltas.len() * num_bits); + buffer.resize(bytes_needed, 0); + bitpacked::encode(deltas.as_ref(), num_bits, &mut buffer[start..]); + + let bytes_needed = start + ceil8(deltas.len() * num_bits); + buffer.truncate(bytes_needed); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn constant_delta() { + // header: [128, 1, 1, 5, 2]: + // block size: 128 <=u> 128, 1 + // mini-blocks: 1 <=u> 1 + // elements: 5 <=u> 5 + // first_value: 2 <=z> 1 + // block1: [2, 0, 0, 0, 0] + // min_delta: 1 <=z> 2 + // bitwidth: 0 + let data = 1..=5; + let expected = vec![128u8, 1, 1, 5, 2, 2, 0]; + + let mut buffer = vec![]; + encode(data, &mut buffer); + assert_eq!(expected, buffer); + } + + #[test] + fn negative_min_delta() { + // max - min = 1 - -4 = 5 + let data = vec![1, 2, 3, 4, 5, 1]; + // header: [128, 1, 4, 6, 2] + // block size: 128 <=u> 128, 1 + // mini-blocks: 1 <=u> 1 + // elements: 6 <=u> 5 + // first_value: 2 <=z> 1 + // block1: [7, 3, 253, 255] + // min_delta: -4 <=z> 7 + // bitwidth: 3 + // values: [5, 5, 5, 5, 0] <=b> [ + // 0b01101101 + // 0b00001011 + // ] + let mut expected = vec![128u8, 1, 1, 6, 2, 7, 3, 0b01101101, 0b00001011]; + expected.extend(std::iter::repeat(0).take(128 * 3 / 8 - 2)); // 128 values, 3 bits, 2 already used + + let mut buffer = vec![]; + encode(data.into_iter(), &mut buffer); + assert_eq!(expected, buffer); + } +} diff --git a/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/mod.rs b/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/mod.rs new file mode 100644 index 
000000000000..2ba0e953b83c --- /dev/null +++ b/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/mod.rs @@ -0,0 +1,90 @@ +mod decoder; +mod encoder; + +pub use decoder::Decoder; +pub use encoder::encode; + +#[cfg(test)] +mod tests { + use super::*; + use crate::parquet::error::Error; + + #[test] + fn basic() -> Result<(), Error> { + let data = vec![1, 3, 1, 2, 3]; + + let mut buffer = vec![]; + encode(data.clone().into_iter(), &mut buffer); + let iter = Decoder::try_new(&buffer)?; + + let result = iter.collect::, _>>()?; + assert_eq!(result, data); + Ok(()) + } + + #[test] + fn negative_value() -> Result<(), Error> { + let data = vec![1, 3, -1, 2, 3]; + + let mut buffer = vec![]; + encode(data.clone().into_iter(), &mut buffer); + let iter = Decoder::try_new(&buffer)?; + + let result = iter.collect::, _>>()?; + assert_eq!(result, data); + Ok(()) + } + + #[test] + fn some() -> Result<(), Error> { + let data = vec![ + -2147483648, + -1777158217, + -984917788, + -1533539476, + -731221386, + -1322398478, + 906736096, + ]; + + let mut buffer = vec![]; + encode(data.clone().into_iter(), &mut buffer); + let iter = Decoder::try_new(&buffer)?; + + let result = iter.collect::, Error>>()?; + assert_eq!(result, data); + Ok(()) + } + + #[test] + fn more_than_one_block() -> Result<(), Error> { + let mut data = vec![1, 3, -1, 2, 3, 10, 1]; + for x in 0..128 { + data.push(x - 10) + } + + let mut buffer = vec![]; + encode(data.clone().into_iter(), &mut buffer); + let iter = Decoder::try_new(&buffer)?; + + let result = iter.collect::, _>>()?; + assert_eq!(result, data); + Ok(()) + } + + #[test] + fn test_another() -> Result<(), Error> { + let data = vec![2, 3, 1, 2, 1]; + + let mut buffer = vec![]; + encode(data.clone().into_iter(), &mut buffer); + let len = buffer.len(); + let mut iter = Decoder::try_new(&buffer)?; + + let result = iter.by_ref().collect::, _>>()?; + assert_eq!(result, data); + + assert_eq!(iter.consumed_bytes(), len); + Ok(()) + } +} diff --git a/crates/polars-parquet/src/parquet/encoding/delta_byte_array/decoder.rs b/crates/polars-parquet/src/parquet/encoding/delta_byte_array/decoder.rs new file mode 100644 index 000000000000..0313e7890394 --- /dev/null +++ b/crates/polars-parquet/src/parquet/encoding/delta_byte_array/decoder.rs @@ -0,0 +1,106 @@ +use super::super::{delta_bitpacked, delta_length_byte_array}; +use crate::parquet::error::Error; + +/// Decodes according to [Delta strings](https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-strings-delta_byte_array--7), +/// prefixes, lengths and values +/// # Implementation +/// This struct does not allocate on the heap. 
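+///
+/// A sketch of the two-phase API (prefix lengths first, then suffix lengths and values),
+/// mirroring the round-trip tests in this module; `encoded` is assumed to hold a
+/// `DELTA_BYTE_ARRAY` buffer:
+/// ```ignore
+/// let mut decoder = Decoder::try_new(&encoded)?;
+/// let prefix_lengths = decoder.by_ref().collect::<Result<Vec<u32>, _>>()?;
+/// let mut lengths = decoder.into_lengths()?;
+/// let suffix_lengths = lengths.by_ref().collect::<Result<Vec<i32>, _>>()?;
+/// let suffixes = lengths.values(); // concatenated suffix bytes
+/// ```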
+#[derive(Debug)] +pub struct Decoder<'a> { + values: &'a [u8], + prefix_lengths: delta_bitpacked::Decoder<'a>, +} + +impl<'a> Decoder<'a> { + pub fn try_new(values: &'a [u8]) -> Result { + let prefix_lengths = delta_bitpacked::Decoder::try_new(values)?; + Ok(Self { + values, + prefix_lengths, + }) + } + + pub fn into_lengths(self) -> Result, Error> { + assert_eq!(self.prefix_lengths.size_hint().0, 0); + delta_length_byte_array::Decoder::try_new( + &self.values[self.prefix_lengths.consumed_bytes()..], + ) + } +} + +impl<'a> Iterator for Decoder<'a> { + type Item = Result; + + fn next(&mut self) -> Option { + self.prefix_lengths.next().map(|x| x.map(|x| x as u32)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_bla() -> Result<(), Error> { + // VALIDATED from spark==3.1.1 + let data = &[ + 128, 1, 4, 2, 0, 0, 0, 0, 0, 0, 128, 1, 4, 2, 10, 0, 0, 0, 0, 0, 72, 101, 108, 108, + 111, 87, 111, 114, 108, 100, + // extra bytes are not from spark, but they should be ignored by the decoder + // because they are beyond the sum of all lengths. + 1, 2, 3, + ]; + // result of encoding + let expected = &["Hello", "World"]; + let expected_lengths = expected.iter().map(|x| x.len() as i32).collect::>(); + let expected_prefixes = vec![0, 0]; + let expected_values = expected.join(""); + let expected_values = expected_values.as_bytes(); + + let mut decoder = Decoder::try_new(data)?; + let prefixes = decoder.by_ref().collect::, _>>()?; + assert_eq!(prefixes, expected_prefixes); + + // move to the lengths + let mut decoder = decoder.into_lengths()?; + + let lengths = decoder.by_ref().collect::, _>>()?; + assert_eq!(lengths, expected_lengths); + + // move to the values + let values = decoder.values(); + assert_eq!(values, expected_values); + Ok(()) + } + + #[test] + fn test_with_prefix() -> Result<(), Error> { + // VALIDATED from spark==3.1.1 + let data = &[ + 128, 1, 4, 2, 0, 6, 0, 0, 0, 0, 128, 1, 4, 2, 10, 4, 0, 0, 0, 0, 72, 101, 108, 108, + 111, 105, 99, 111, 112, 116, 101, 114, + // extra bytes are not from spark, but they should be ignored by the decoder + // because they are beyond the sum of all lengths. 
+ 1, 2, 3, + ]; + // result of encoding + let expected_lengths = vec![5, 7]; + let expected_prefixes = vec![0, 3]; + let expected_values = b"Helloicopter"; + + let mut decoder = Decoder::try_new(data)?; + let prefixes = decoder.by_ref().collect::, _>>()?; + assert_eq!(prefixes, expected_prefixes); + + // move to the lengths + let mut decoder = decoder.into_lengths()?; + + let lengths = decoder.by_ref().collect::, _>>()?; + assert_eq!(lengths, expected_lengths); + + // move to the values + let values = decoder.values(); + assert_eq!(values, expected_values); + Ok(()) + } +} diff --git a/crates/polars-parquet/src/parquet/encoding/delta_byte_array/encoder.rs b/crates/polars-parquet/src/parquet/encoding/delta_byte_array/encoder.rs new file mode 100644 index 000000000000..1e9e071c87be --- /dev/null +++ b/crates/polars-parquet/src/parquet/encoding/delta_byte_array/encoder.rs @@ -0,0 +1,32 @@ +use super::super::delta_bitpacked; +use crate::parquet::encoding::delta_length_byte_array; + +/// Encodes an iterator of according to DELTA_BYTE_ARRAY +pub fn encode<'a, I: Iterator + Clone>(iterator: I, buffer: &mut Vec) { + let mut previous = b"".as_ref(); + + let mut sum_lengths = 0; + let prefixes = iterator + .clone() + .map(|item| { + let prefix_length = item + .iter() + .zip(previous.iter()) + .enumerate() + // find first difference + .find_map(|(length, (lhs, rhs))| (lhs != rhs).then_some(length)) + .unwrap_or(previous.len()); + previous = item; + + sum_lengths += item.len() - prefix_length; + prefix_length as i64 + }) + .collect::>(); + delta_bitpacked::encode(prefixes.iter().copied(), buffer); + + let remaining = iterator + .zip(prefixes) + .map(|(item, prefix)| &item[prefix as usize..]); + + delta_length_byte_array::encode(remaining, buffer); +} diff --git a/crates/polars-parquet/src/parquet/encoding/delta_byte_array/mod.rs b/crates/polars-parquet/src/parquet/encoding/delta_byte_array/mod.rs new file mode 100644 index 000000000000..9eab9c5f6ead --- /dev/null +++ b/crates/polars-parquet/src/parquet/encoding/delta_byte_array/mod.rs @@ -0,0 +1,33 @@ +mod decoder; +mod encoder; + +pub use decoder::Decoder; +pub use encoder::encode; + +#[cfg(test)] +mod tests { + use super::*; + use crate::parquet::error::Error; + + #[test] + fn basic() -> Result<(), Error> { + let data = vec![b"Hello".as_ref(), b"Helicopter"]; + let mut buffer = vec![]; + encode(data.clone().into_iter(), &mut buffer); + + let mut decoder = Decoder::try_new(&buffer)?; + let prefixes = decoder.by_ref().collect::, _>>()?; + assert_eq!(prefixes, vec![0, 3]); + + // move to the lengths + let mut decoder = decoder.into_lengths()?; + + let lengths = decoder.by_ref().collect::, _>>()?; + assert_eq!(lengths, vec![5, 7]); + + // move to the values + let values = decoder.values(); + assert_eq!(values, b"Helloicopter"); + Ok(()) + } +} diff --git a/crates/polars-parquet/src/parquet/encoding/delta_length_byte_array/decoder.rs b/crates/polars-parquet/src/parquet/encoding/delta_length_byte_array/decoder.rs new file mode 100644 index 000000000000..df1dd2daaafb --- /dev/null +++ b/crates/polars-parquet/src/parquet/encoding/delta_length_byte_array/decoder.rs @@ -0,0 +1,80 @@ +use super::super::delta_bitpacked; +use crate::parquet::error::Error; + +/// Decodes [Delta-length byte array](https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-length-byte-array-delta_length_byte_array--6) +/// lengths and values. +/// # Implementation +/// This struct does not allocate on the heap. 
+/// # Example +/// ``` +/// use crate::parquet::parquet::encoding::delta_length_byte_array::Decoder; +/// +/// let expected = &["Hello", "World"]; +/// let expected_lengths = expected.iter().map(|x| x.len() as i32).collect::>(); +/// let expected_values = expected.join(""); +/// let expected_values = expected_values.as_bytes(); +/// let data = &[ +/// 128, 1, 4, 2, 10, 0, 0, 0, 0, 0, 72, 101, 108, 108, 111, 87, 111, 114, 108, 100, +/// ]; +/// +/// let mut decoder = Decoder::try_new(data).unwrap(); +/// +/// // Extract the lengths +/// let lengths = decoder.by_ref().collect::, _>>().unwrap(); +/// assert_eq!(lengths, expected_lengths); +/// +/// // Extract the values. This _must_ be called after consuming all lengths by reference (see above). +/// let values = decoder.into_values(); +/// +/// assert_eq!(values, expected_values); +#[derive(Debug)] +pub struct Decoder<'a> { + values: &'a [u8], + lengths: delta_bitpacked::Decoder<'a>, + total_length: u32, +} + +impl<'a> Decoder<'a> { + pub fn try_new(values: &'a [u8]) -> Result { + let lengths = delta_bitpacked::Decoder::try_new(values)?; + Ok(Self { + values, + lengths, + total_length: 0, + }) + } + + /// Consumes this decoder and returns the slice of concatenated values. + /// # Panics + /// This function panics if this iterator has not been fully consumed. + pub fn into_values(self) -> &'a [u8] { + assert_eq!(self.lengths.size_hint().0, 0); + let start = self.lengths.consumed_bytes(); + &self.values[start..start + self.total_length as usize] + } + + /// Returns the slice of concatenated values. + /// # Panics + /// This function panics if this iterator has not yet been fully consumed. + pub fn values(&self) -> &'a [u8] { + assert_eq!(self.lengths.size_hint().0, 0); + let start = self.lengths.consumed_bytes(); + &self.values[start..start + self.total_length as usize] + } +} + +impl<'a> Iterator for Decoder<'a> { + type Item = Result; + + fn next(&mut self) -> Option { + let result = self.lengths.next(); + match result { + Some(Ok(v)) => { + self.total_length += v as u32; + Some(Ok(v as i32)) + }, + Some(Err(error)) => Some(Err(error)), + None => None, + } + } +} diff --git a/crates/polars-parquet/src/parquet/encoding/delta_length_byte_array/encoder.rs b/crates/polars-parquet/src/parquet/encoding/delta_length_byte_array/encoder.rs new file mode 100644 index 000000000000..fc2121cf68e8 --- /dev/null +++ b/crates/polars-parquet/src/parquet/encoding/delta_length_byte_array/encoder.rs @@ -0,0 +1,19 @@ +use crate::parquet::encoding::delta_bitpacked; + +/// Encodes a clonable iterator of `&[u8]` into `buffer`. This does not allocated on the heap. +/// # Implementation +/// This encoding is equivalent to call [`delta_bitpacked::encode`] on the lengths of the items +/// of the iterator followed by extending the buffer from each item of the iterator. 
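+/// # Example
+/// A minimal sketch of a round-trip with the [`Decoder`] from this module:
+/// ```ignore
+/// let mut buffer = vec![];
+/// encode(["aa", "bbb"].iter().map(|x| x.as_bytes()), &mut buffer);
+/// // `buffer` now holds the delta-bitpacked lengths [2, 3] followed by b"aabbb"
+/// let mut decoder = Decoder::try_new(&buffer).unwrap();
+/// let lengths = decoder.by_ref().collect::<Result<Vec<i32>, _>>().unwrap();
+/// assert_eq!(lengths, vec![2, 3]);
+/// assert_eq!(decoder.values(), b"aabbb");
+/// ```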
+pub fn encode, I: Iterator + Clone>(iterator: I, buffer: &mut Vec) { + let mut total_length = 0; + delta_bitpacked::encode( + iterator.clone().map(|x| { + let len = x.as_ref().len(); + total_length += len; + len as i64 + }), + buffer, + ); + buffer.reserve(total_length); + iterator.for_each(|x| buffer.extend(x.as_ref())) +} diff --git a/crates/polars-parquet/src/parquet/encoding/delta_length_byte_array/mod.rs b/crates/polars-parquet/src/parquet/encoding/delta_length_byte_array/mod.rs new file mode 100644 index 000000000000..91a42e3636ac --- /dev/null +++ b/crates/polars-parquet/src/parquet/encoding/delta_length_byte_array/mod.rs @@ -0,0 +1,50 @@ +mod decoder; +mod encoder; + +pub use decoder::Decoder; +pub use encoder::encode; + +#[cfg(test)] +mod tests { + use super::*; + use crate::parquet::error::Error; + + #[test] + fn basic() -> Result<(), Error> { + let data = vec!["aa", "bbb", "a", "aa", "b"]; + + let mut buffer = vec![]; + encode(data.into_iter().map(|x| x.as_bytes()), &mut buffer); + + let mut iter = Decoder::try_new(&buffer)?; + + let result = iter.by_ref().collect::, _>>()?; + assert_eq!(result, vec![2, 3, 1, 2, 1]); + + let result = iter.values(); + assert_eq!(result, b"aabbbaaab".as_ref()); + Ok(()) + } + + #[test] + fn many_numbers() -> Result<(), Error> { + let mut data = vec![]; + for i in 0..136 { + data.push(format!("a{}", i)) + } + let expected_values = data.join(""); + let expected_lengths = data.iter().map(|x| x.len() as i32).collect::>(); + + let mut buffer = vec![]; + encode(data.into_iter(), &mut buffer); + + let mut iter = Decoder::try_new(&buffer)?; + + let result = iter.by_ref().collect::, _>>()?; + assert_eq!(result, expected_lengths); + + let result = iter.into_values(); + assert_eq!(result, expected_values.as_str().as_bytes()); + Ok(()) + } +} diff --git a/crates/polars-parquet/src/parquet/encoding/hybrid_rle/bitmap.rs b/crates/polars-parquet/src/parquet/encoding/hybrid_rle/bitmap.rs new file mode 100644 index 000000000000..f46f22f84adb --- /dev/null +++ b/crates/polars-parquet/src/parquet/encoding/hybrid_rle/bitmap.rs @@ -0,0 +1,102 @@ +use std::io::Write; + +const BIT_MASK: [u8; 8] = [1, 2, 4, 8, 16, 32, 64, 128]; + +/// Sets bit at position `i` in `byte` +#[inline] +pub fn set(byte: u8, i: usize) -> u8 { + byte | BIT_MASK[i] +} + +/// An [`Iterator`] of bool that decodes a bitmap. +/// This is a specialization of [`super::super::bitpacked::Decoder`] for `num_bits == 1`. +#[derive(Debug)] +pub struct BitmapIter<'a> { + iter: std::slice::Iter<'a, u8>, + current_byte: &'a u8, + remaining: usize, + mask: u8, +} + +impl<'a> BitmapIter<'a> { + /// Returns a new [`BitmapIter`]. 
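+ /// `offset` and `len` are expressed in bits: the iterator skips the first `offset` bits of
+ /// `slice` and decodes the next `len` bits as booleans.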
+ /// # Panics + /// This function panics iff `offset / 8 > slice.len()` + #[inline] + pub fn new(slice: &'a [u8], offset: usize, len: usize) -> Self { + let bytes = &slice[offset / 8..]; + + let mut iter = bytes.iter(); + + let current_byte = iter.next().unwrap_or(&0); + + Self { + iter, + mask: 1u8.rotate_left(offset as u32), + remaining: len, + current_byte, + } + } +} + +impl<'a> Iterator for BitmapIter<'a> { + type Item = bool; + + #[inline] + fn next(&mut self) -> Option { + // easily predictable in branching + if self.remaining == 0 { + return None; + } else { + self.remaining -= 1; + } + let value = self.current_byte & self.mask != 0; + self.mask = self.mask.rotate_left(1); + if self.mask == 1 { + // reached a new byte => try to fetch it from the iterator + if let Some(v) = self.iter.next() { + self.current_byte = v + } + } + Some(value) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + (self.remaining, Some(self.remaining)) + } +} + +/// Writes an iterator of bools into writer, with LSB first. +pub fn encode_bool>( + writer: &mut W, + mut iterator: I, +) -> std::io::Result<()> { + // the length of the iterator. + let length = iterator.size_hint().1.unwrap(); + + let chunks = length / 8; + let reminder = length % 8; + + (0..chunks).try_for_each(|_| { + let mut byte = 0u8; + (0..8).for_each(|i| { + if iterator.next().unwrap() { + byte = set(byte, i) + } + }); + writer.write_all(&[byte]) + })?; + + if reminder != 0 { + let mut last = 0u8; + iterator.enumerate().for_each(|(i, value)| { + if value { + last = set(last, i) + } + }); + writer.write_all(&[last]) + } else { + Ok(()) + } +} diff --git a/crates/polars-parquet/src/parquet/encoding/hybrid_rle/decoder.rs b/crates/polars-parquet/src/parquet/encoding/hybrid_rle/decoder.rs new file mode 100644 index 000000000000..859ed246c0e0 --- /dev/null +++ b/crates/polars-parquet/src/parquet/encoding/hybrid_rle/decoder.rs @@ -0,0 +1,142 @@ +use super::super::{ceil8, uleb128}; +use super::HybridEncoded; +use crate::parquet::error::Error; + +/// An [`Iterator`] of [`HybridEncoded`]. +#[derive(Debug, Clone)] +pub struct Decoder<'a> { + values: &'a [u8], + num_bits: usize, +} + +impl<'a> Decoder<'a> { + /// Returns a new [`Decoder`] + pub fn new(values: &'a [u8], num_bits: usize) -> Self { + Self { values, num_bits } + } + + /// Returns the number of bits being used by this decoder. 
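+ /// This is the bit width of a single value, both for bitpacked runs and for the
+ /// (byte-aligned) repeated value of RLE runs.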
+ #[inline] + pub fn num_bits(&self) -> usize { + self.num_bits + } +} + +impl<'a> Iterator for Decoder<'a> { + type Item = Result, Error>; + + #[inline] // -18% improvement in bench + fn next(&mut self) -> Option { + if self.num_bits == 0 { + return None; + } + + if self.values.is_empty() { + return None; + } + + let (indicator, consumed) = match uleb128::decode(self.values) { + Ok((indicator, consumed)) => (indicator, consumed), + Err(e) => return Some(Err(e)), + }; + self.values = &self.values[consumed..]; + if self.values.is_empty() { + return None; + }; + + if indicator & 1 == 1 { + // is bitpacking + let bytes = (indicator as usize >> 1) * self.num_bits; + let bytes = std::cmp::min(bytes, self.values.len()); + let (result, remaining) = self.values.split_at(bytes); + self.values = remaining; + Some(Ok(HybridEncoded::Bitpacked(result))) + } else { + // is rle + let run_length = indicator as usize >> 1; + // repeated-value := value that is repeated, using a fixed-width of round-up-to-next-byte(bit-width) + let rle_bytes = ceil8(self.num_bits); + let (result, remaining) = self.values.split_at(rle_bytes); + self.values = remaining; + Some(Ok(HybridEncoded::Rle(result, run_length))) + } + } +} + +#[cfg(test)] +mod tests { + use super::super::super::bitpacked; + use super::*; + + #[test] + fn basics_1() { + let bit_width = 1usize; + let length = 5; + let values = vec![ + 2, 0, 0, 0, // length + 0b00000011, 0b00001011, // data + ]; + + let mut decoder = Decoder::new(&values[4..6], bit_width); + + let run = decoder.next().unwrap(); + + if let HybridEncoded::Bitpacked(values) = run.unwrap() { + assert_eq!(values, &[0b00001011]); + let result = bitpacked::Decoder::::try_new(values, bit_width, length) + .unwrap() + .collect::>(); + assert_eq!(result, &[1, 1, 0, 1, 0]); + } else { + panic!() + }; + } + + #[test] + fn basics_2() { + // This test was validated by the result of what pyarrow3 outputs when + // the bitmap is used. + let bit_width = 1; + let values = vec![ + 3, 0, 0, 0, // length + 0b00000101, 0b11101011, 0b00000010, // data + ]; + let expected = &[1, 1, 0, 1, 0, 1, 1, 1, 0, 1]; + + let mut decoder = Decoder::new(&values[4..4 + 3], bit_width); + + let run = decoder.next().unwrap(); + + if let HybridEncoded::Bitpacked(values) = run.unwrap() { + assert_eq!(values, &[0b11101011, 0b00000010]); + let result = bitpacked::Decoder::::try_new(values, bit_width, 10) + .unwrap() + .collect::>(); + assert_eq!(result, expected); + } else { + panic!() + }; + } + + #[test] + fn basics_3() { + let bit_width = 1; + let length = 8; + let values = vec![ + 2, 0, 0, 0, // length + 0b00010000, // data + 0b00000001, + ]; + + let mut decoder = Decoder::new(&values[4..4 + 2], bit_width); + + let run = decoder.next().unwrap(); + + if let HybridEncoded::Rle(values, items) = run.unwrap() { + assert_eq!(values, &[0b00000001]); + assert_eq!(items, length); + } else { + panic!() + }; + } +} diff --git a/crates/polars-parquet/src/parquet/encoding/hybrid_rle/encoder.rs b/crates/polars-parquet/src/parquet/encoding/hybrid_rle/encoder.rs new file mode 100644 index 000000000000..c4523a7da53b --- /dev/null +++ b/crates/polars-parquet/src/parquet/encoding/hybrid_rle/encoder.rs @@ -0,0 +1,166 @@ +use std::io::Write; + +use super::bitpacked_encode; +use crate::parquet::encoding::{bitpacked, ceil8, uleb128}; + +/// RLE-hybrid encoding of `u32`. This currently only yields bitpacked values. 
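+/// # Example
+/// A minimal sketch mirroring the tests below (nine 2-bit values, LSB first):
+/// ```ignore
+/// let mut buffer = vec![];
+/// encode_u32(&mut buffer, vec![0u32, 1, 2, 1, 2, 1, 1, 0, 3].into_iter(), 2).unwrap();
+/// // ULEB128 header (2 groups of 8 values, bitpacked) followed by the packed values
+/// assert_eq!(buffer, vec![(2 << 1 | 1), 0b01_10_01_00, 0b00_01_01_10, 0b00_00_00_11]);
+/// ```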
+pub fn encode_u32>( + writer: &mut W, + iterator: I, + num_bits: u32, +) -> std::io::Result<()> { + let num_bits = num_bits as u8; + // the length of the iterator. + let length = iterator.size_hint().1.unwrap(); + + // write the length + indicator + let mut header = ceil8(length) as u64; + header <<= 1; + header |= 1; // it is bitpacked => first bit is set + let mut container = [0; 10]; + let used = uleb128::encode(header, &mut container); + writer.write_all(&container[..used])?; + + bitpacked_encode_u32(writer, iterator, num_bits as usize)?; + + Ok(()) +} + +const U32_BLOCK_LEN: usize = 32; + +fn bitpacked_encode_u32>( + writer: &mut W, + mut iterator: I, + num_bits: usize, +) -> std::io::Result<()> { + // the length of the iterator. + let length = iterator.size_hint().1.unwrap(); + + let chunks = length / U32_BLOCK_LEN; + let remainder = length - chunks * U32_BLOCK_LEN; + let mut buffer = [0u32; U32_BLOCK_LEN]; + + let compressed_chunk_size = ceil8(U32_BLOCK_LEN * num_bits); + + for _ in 0..chunks { + iterator + .by_ref() + .take(U32_BLOCK_LEN) + .zip(buffer.iter_mut()) + .for_each(|(item, buf)| *buf = item); + + let mut packed = [0u8; 4 * U32_BLOCK_LEN]; + bitpacked::encode_pack::(&buffer, num_bits, packed.as_mut()); + writer.write_all(&packed[..compressed_chunk_size])?; + } + + if remainder != 0 { + let compressed_remainder_size = ceil8(remainder * num_bits); + iterator + .by_ref() + .take(remainder) + .zip(buffer.iter_mut()) + .for_each(|(item, buf)| *buf = item); + + let mut packed = [0u8; 4 * U32_BLOCK_LEN]; + bitpacked::encode_pack(&buffer, num_bits, packed.as_mut()); + writer.write_all(&packed[..compressed_remainder_size])?; + }; + Ok(()) +} + +/// the bitpacked part of the encoder. +pub fn encode_bool>( + writer: &mut W, + iterator: I, +) -> std::io::Result<()> { + // the length of the iterator. 
+ let length = iterator.size_hint().1.unwrap(); + + // write the length + indicator + let mut header = ceil8(length) as u64; + header <<= 1; + header |= 1; // it is bitpacked => first bit is set + let mut container = [0; 10]; + let used = uleb128::encode(header, &mut container); + + writer.write_all(&container[..used])?; + + // encode the iterator + bitpacked_encode(writer, iterator) +} + +#[cfg(test)] +mod tests { + use super::super::bitmap::BitmapIter; + use super::*; + + #[test] + fn bool_basics_1() -> std::io::Result<()> { + let iter = BitmapIter::new(&[0b10011101u8, 0b10011101], 0, 14); + + let mut vec = vec![]; + + encode_bool(&mut vec, iter)?; + + assert_eq!(vec, vec![(2 << 1 | 1), 0b10011101u8, 0b00011101]); + + Ok(()) + } + + #[test] + fn bool_from_iter() -> std::io::Result<()> { + let mut vec = vec![]; + + encode_bool( + &mut vec, + vec![true, true, true, true, true, true, true, true].into_iter(), + )?; + + assert_eq!(vec, vec![(1 << 1 | 1), 0b11111111]); + Ok(()) + } + + #[test] + fn test_encode_u32() -> std::io::Result<()> { + let mut vec = vec![]; + + encode_u32(&mut vec, vec![0, 1, 2, 1, 2, 1, 1, 0, 3].into_iter(), 2)?; + + assert_eq!( + vec, + vec![(2 << 1 | 1), 0b01_10_01_00, 0b00_01_01_10, 0b_00_00_00_11] + ); + Ok(()) + } + + #[test] + fn test_encode_u32_large() -> std::io::Result<()> { + let mut vec = vec![]; + + let values = (0..128).map(|x| x % 4); + + encode_u32(&mut vec, values, 2)?; + + let length = 128; + let expected = 0b11_10_01_00u8; + + let mut expected = vec![expected; length / 4]; + expected.insert(0, ((length / 8) as u8) << 1 | 1); + + assert_eq!(vec, expected); + Ok(()) + } + + #[test] + fn test_u32_other() -> std::io::Result<()> { + let values = vec![3, 3, 0, 3, 2, 3, 3, 3, 3, 1, 3, 3, 3, 0, 3].into_iter(); + + let mut vec = vec![]; + encode_u32(&mut vec, values, 2)?; + + let expected = vec![5, 207, 254, 247, 51]; + assert_eq!(expected, vec); + Ok(()) + } +} diff --git a/crates/polars-parquet/src/parquet/encoding/hybrid_rle/mod.rs b/crates/polars-parquet/src/parquet/encoding/hybrid_rle/mod.rs new file mode 100644 index 000000000000..39e3a5bd2bac --- /dev/null +++ b/crates/polars-parquet/src/parquet/encoding/hybrid_rle/mod.rs @@ -0,0 +1,263 @@ +// See https://github.com/apache/parquet-format/blob/master/Encodings.md#run-length-encoding--bit-packing-hybrid-rle--3 +mod bitmap; +mod decoder; +mod encoder; +pub use bitmap::{encode_bool as bitpacked_encode, BitmapIter}; +pub use decoder::Decoder; +pub use encoder::{encode_bool, encode_u32}; + +use super::bitpacked; +use crate::parquet::error::Error; + +/// The two possible states of an RLE-encoded run. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum HybridEncoded<'a> { + /// A bitpacked slice. The consumer must know its bit-width to unpack it. + Bitpacked(&'a [u8]), + /// A RLE-encoded slice. The first attribute corresponds to the slice (that can be interpreted) + /// the second attribute corresponds to the number of repetitions. + Rle(&'a [u8], usize), +} + +#[derive(Debug, Clone)] +enum State<'a> { + None, + Bitpacked(bitpacked::Decoder<'a, u32>), + Rle(std::iter::Take>), + // Add a special branch for a single value to + // adhere to the strong law of small numbers. 
+ Single(Option), +} + +/// [`Iterator`] of [`u32`] from a byte slice of Hybrid-RLE encoded values +#[derive(Debug, Clone)] +pub struct HybridRleDecoder<'a> { + decoder: Decoder<'a>, + state: State<'a>, + remaining: usize, +} + +#[inline] +fn read_next<'a>(decoder: &mut Decoder<'a>, remaining: usize) -> Result, Error> { + Ok(match decoder.next().transpose()? { + Some(HybridEncoded::Bitpacked(packed)) => { + let num_bits = decoder.num_bits(); + let length = std::cmp::min(packed.len() * 8 / num_bits, remaining); + let decoder = bitpacked::Decoder::::try_new(packed, num_bits, length)?; + State::Bitpacked(decoder) + }, + Some(HybridEncoded::Rle(pack, additional)) => { + let mut bytes = [0u8; std::mem::size_of::()]; + pack.iter().zip(bytes.iter_mut()).for_each(|(src, dst)| { + *dst = *src; + }); + let value = u32::from_le_bytes(bytes); + if additional == 1 { + State::Single(Some(value)) + } else { + State::Rle(std::iter::repeat(value).take(additional)) + } + }, + None => State::None, + }) +} + +impl<'a> HybridRleDecoder<'a> { + /// Returns a new [`HybridRleDecoder`] + pub fn try_new(data: &'a [u8], num_bits: u32, num_values: usize) -> Result { + let num_bits = num_bits as usize; + let mut decoder = Decoder::new(data, num_bits); + let state = read_next(&mut decoder, num_values)?; + Ok(Self { + decoder, + state, + remaining: num_values, + }) + } +} + +impl<'a> Iterator for HybridRleDecoder<'a> { + type Item = Result; + + fn next(&mut self) -> Option { + if self.remaining == 0 { + return None; + }; + let result = match &mut self.state { + State::Single(opt_val) => { + // make sure to take so that next calls will return 'None' + // indicating that the iterator is finished. + opt_val.take() + }, + State::Bitpacked(decoder) => decoder.next(), + State::Rle(iter) => iter.next(), + State::None => Some(0), + }; + if let Some(result) = result { + self.remaining -= 1; + Some(Ok(result)) + } else { + match read_next(&mut self.decoder, self.remaining) { + Ok(state) => { + self.state = state; + self.next() + }, + Err(e) => Some(Err(e)), + } + } + } + + fn size_hint(&self) -> (usize, Option) { + (self.remaining, Some(self.remaining)) + } +} + +impl<'a> ExactSizeIterator for HybridRleDecoder<'a> {} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn roundtrip() -> Result<(), Error> { + let mut buffer = vec![]; + let num_bits = 10u32; + + let data = (0..1000).collect::>(); + + encode_u32(&mut buffer, data.iter().cloned(), num_bits).unwrap(); + + let decoder = HybridRleDecoder::try_new(&buffer, num_bits, data.len())?; + + let result = decoder.collect::, _>>()?; + + assert_eq!(result, data); + Ok(()) + } + + #[test] + fn pyarrow_integration() -> Result<(), Error> { + // data encoded from pyarrow representing (0..1000) + let data = vec![ + 127, 0, 4, 32, 192, 0, 4, 20, 96, 192, 1, 8, 36, 160, 192, 2, 12, 52, 224, 192, 3, 16, + 68, 32, 193, 4, 20, 84, 96, 193, 5, 24, 100, 160, 193, 6, 28, 116, 224, 193, 7, 32, + 132, 32, 194, 8, 36, 148, 96, 194, 9, 40, 164, 160, 194, 10, 44, 180, 224, 194, 11, 48, + 196, 32, 195, 12, 52, 212, 96, 195, 13, 56, 228, 160, 195, 14, 60, 244, 224, 195, 15, + 64, 4, 33, 196, 16, 68, 20, 97, 196, 17, 72, 36, 161, 196, 18, 76, 52, 225, 196, 19, + 80, 68, 33, 197, 20, 84, 84, 97, 197, 21, 88, 100, 161, 197, 22, 92, 116, 225, 197, 23, + 96, 132, 33, 198, 24, 100, 148, 97, 198, 25, 104, 164, 161, 198, 26, 108, 180, 225, + 198, 27, 112, 196, 33, 199, 28, 116, 212, 97, 199, 29, 120, 228, 161, 199, 30, 124, + 244, 225, 199, 31, 128, 4, 34, 200, 32, 132, 20, 98, 200, 33, 136, 36, 162, 
200, 34, + 140, 52, 226, 200, 35, 144, 68, 34, 201, 36, 148, 84, 98, 201, 37, 152, 100, 162, 201, + 38, 156, 116, 226, 201, 39, 160, 132, 34, 202, 40, 164, 148, 98, 202, 41, 168, 164, + 162, 202, 42, 172, 180, 226, 202, 43, 176, 196, 34, 203, 44, 180, 212, 98, 203, 45, + 184, 228, 162, 203, 46, 188, 244, 226, 203, 47, 192, 4, 35, 204, 48, 196, 20, 99, 204, + 49, 200, 36, 163, 204, 50, 204, 52, 227, 204, 51, 208, 68, 35, 205, 52, 212, 84, 99, + 205, 53, 216, 100, 163, 205, 54, 220, 116, 227, 205, 55, 224, 132, 35, 206, 56, 228, + 148, 99, 206, 57, 232, 164, 163, 206, 58, 236, 180, 227, 206, 59, 240, 196, 35, 207, + 60, 244, 212, 99, 207, 61, 248, 228, 163, 207, 62, 252, 244, 227, 207, 63, 0, 5, 36, + 208, 64, 4, 21, 100, 208, 65, 8, 37, 164, 208, 66, 12, 53, 228, 208, 67, 16, 69, 36, + 209, 68, 20, 85, 100, 209, 69, 24, 101, 164, 209, 70, 28, 117, 228, 209, 71, 32, 133, + 36, 210, 72, 36, 149, 100, 210, 73, 40, 165, 164, 210, 74, 44, 181, 228, 210, 75, 48, + 197, 36, 211, 76, 52, 213, 100, 211, 77, 56, 229, 164, 211, 78, 60, 245, 228, 211, 79, + 64, 5, 37, 212, 80, 68, 21, 101, 212, 81, 72, 37, 165, 212, 82, 76, 53, 229, 212, 83, + 80, 69, 37, 213, 84, 84, 85, 101, 213, 85, 88, 101, 165, 213, 86, 92, 117, 229, 213, + 87, 96, 133, 37, 214, 88, 100, 149, 101, 214, 89, 104, 165, 165, 214, 90, 108, 181, + 229, 214, 91, 112, 197, 37, 215, 92, 116, 213, 101, 215, 93, 120, 229, 165, 215, 94, + 124, 245, 229, 215, 95, 128, 5, 38, 216, 96, 132, 21, 102, 216, 97, 136, 37, 166, 216, + 98, 140, 53, 230, 216, 99, 144, 69, 38, 217, 100, 148, 85, 102, 217, 101, 152, 101, + 166, 217, 102, 156, 117, 230, 217, 103, 160, 133, 38, 218, 104, 164, 149, 102, 218, + 105, 168, 165, 166, 218, 106, 172, 181, 230, 218, 107, 176, 197, 38, 219, 108, 180, + 213, 102, 219, 109, 184, 229, 166, 219, 110, 188, 245, 230, 219, 111, 192, 5, 39, 220, + 112, 196, 21, 103, 220, 113, 200, 37, 167, 220, 114, 204, 53, 231, 220, 115, 208, 69, + 39, 221, 116, 212, 85, 103, 221, 117, 216, 101, 167, 221, 118, 220, 117, 231, 221, 119, + 224, 133, 39, 222, 120, 228, 149, 103, 222, 121, 232, 165, 167, 222, 122, 236, 181, + 231, 222, 123, 240, 197, 39, 223, 124, 244, 213, 103, 223, 125, 125, 248, 229, 167, + 223, 126, 252, 245, 231, 223, 127, 0, 6, 40, 224, 128, 4, 22, 104, 224, 129, 8, 38, + 168, 224, 130, 12, 54, 232, 224, 131, 16, 70, 40, 225, 132, 20, 86, 104, 225, 133, 24, + 102, 168, 225, 134, 28, 118, 232, 225, 135, 32, 134, 40, 226, 136, 36, 150, 104, 226, + 137, 40, 166, 168, 226, 138, 44, 182, 232, 226, 139, 48, 198, 40, 227, 140, 52, 214, + 104, 227, 141, 56, 230, 168, 227, 142, 60, 246, 232, 227, 143, 64, 6, 41, 228, 144, 68, + 22, 105, 228, 145, 72, 38, 169, 228, 146, 76, 54, 233, 228, 147, 80, 70, 41, 229, 148, + 84, 86, 105, 229, 149, 88, 102, 169, 229, 150, 92, 118, 233, 229, 151, 96, 134, 41, + 230, 152, 100, 150, 105, 230, 153, 104, 166, 169, 230, 154, 108, 182, 233, 230, 155, + 112, 198, 41, 231, 156, 116, 214, 105, 231, 157, 120, 230, 169, 231, 158, 124, 246, + 233, 231, 159, 128, 6, 42, 232, 160, 132, 22, 106, 232, 161, 136, 38, 170, 232, 162, + 140, 54, 234, 232, 163, 144, 70, 42, 233, 164, 148, 86, 106, 233, 165, 152, 102, 170, + 233, 166, 156, 118, 234, 233, 167, 160, 134, 42, 234, 168, 164, 150, 106, 234, 169, + 168, 166, 170, 234, 170, 172, 182, 234, 234, 171, 176, 198, 42, 235, 172, 180, 214, + 106, 235, 173, 184, 230, 170, 235, 174, 188, 246, 234, 235, 175, 192, 6, 43, 236, 176, + 196, 22, 107, 236, 177, 200, 38, 171, 236, 178, 204, 54, 235, 236, 179, 208, 70, 43, + 237, 180, 212, 86, 107, 237, 181, 216, 102, 171, 
237, 182, 220, 118, 235, 237, 183, + 224, 134, 43, 238, 184, 228, 150, 107, 238, 185, 232, 166, 171, 238, 186, 236, 182, + 235, 238, 187, 240, 198, 43, 239, 188, 244, 214, 107, 239, 189, 248, 230, 171, 239, + 190, 252, 246, 235, 239, 191, 0, 7, 44, 240, 192, 4, 23, 108, 240, 193, 8, 39, 172, + 240, 194, 12, 55, 236, 240, 195, 16, 71, 44, 241, 196, 20, 87, 108, 241, 197, 24, 103, + 172, 241, 198, 28, 119, 236, 241, 199, 32, 135, 44, 242, 200, 36, 151, 108, 242, 201, + 40, 167, 172, 242, 202, 44, 183, 236, 242, 203, 48, 199, 44, 243, 204, 52, 215, 108, + 243, 205, 56, 231, 172, 243, 206, 60, 247, 236, 243, 207, 64, 7, 45, 244, 208, 68, 23, + 109, 244, 209, 72, 39, 173, 244, 210, 76, 55, 237, 244, 211, 80, 71, 45, 245, 212, 84, + 87, 109, 245, 213, 88, 103, 173, 245, 214, 92, 119, 237, 245, 215, 96, 135, 45, 246, + 216, 100, 151, 109, 246, 217, 104, 167, 173, 246, 218, 108, 183, 237, 246, 219, 112, + 199, 45, 247, 220, 116, 215, 109, 247, 221, 120, 231, 173, 247, 222, 124, 247, 237, + 247, 223, 128, 7, 46, 248, 224, 132, 23, 110, 248, 225, 136, 39, 174, 248, 226, 140, + 55, 238, 248, 227, 144, 71, 46, 249, 228, 148, 87, 110, 249, 229, 152, 103, 174, 249, + 230, 156, 119, 238, 249, 231, 160, 135, 46, 250, 232, 164, 151, 110, 250, 233, 168, + 167, 174, 250, 234, 172, 183, 238, 250, 235, 176, 199, 46, 251, 236, 180, 215, 110, + 251, 237, 184, 231, 174, 251, 238, 188, 247, 238, 251, 239, 192, 7, 47, 252, 240, 196, + 23, 111, 252, 241, 200, 39, 175, 252, 242, 204, 55, 239, 252, 243, 208, 71, 47, 253, + 244, 212, 87, 111, 253, 245, 216, 103, 175, 253, 246, 220, 119, 239, 253, 247, 224, + 135, 47, 254, 248, 228, 151, 111, 254, 249, + ]; + let num_bits = 10; + + let decoder = HybridRleDecoder::try_new(&data, num_bits, 1000)?; + + let result = decoder.collect::, _>>()?; + + assert_eq!(result, (0..1000).collect::>()); + Ok(()) + } + + #[test] + fn small() -> Result<(), Error> { + let data = vec![3, 2]; + + let num_bits = 3; + + let decoder = HybridRleDecoder::try_new(&data, num_bits, 1)?; + + let result = decoder.collect::, _>>()?; + + assert_eq!(result, &[2]); + Ok(()) + } + + #[test] + fn zero_bit_width() -> Result<(), Error> { + let data = vec![3]; + + let num_bits = 0; + + let decoder = HybridRleDecoder::try_new(&data, num_bits, 2)?; + + let result = decoder.collect::, _>>()?; + + assert_eq!(result, &[0, 0]); + Ok(()) + } + + #[test] + fn empty_values() -> Result<(), Error> { + let data = []; + + let num_bits = 1; + + let decoder = HybridRleDecoder::try_new(&data, num_bits, 100)?; + + let result = decoder.collect::, _>>()?; + + assert_eq!(result, vec![0; 100]); + Ok(()) + } +} diff --git a/crates/polars-parquet/src/parquet/encoding/mod.rs b/crates/polars-parquet/src/parquet/encoding/mod.rs new file mode 100644 index 000000000000..79b608ab63b7 --- /dev/null +++ b/crates/polars-parquet/src/parquet/encoding/mod.rs @@ -0,0 +1,27 @@ +use std::convert::TryInto; + +pub mod bitpacked; +pub mod delta_bitpacked; +pub mod delta_byte_array; +pub mod delta_length_byte_array; +pub mod hybrid_rle; +pub mod plain_byte_array; +pub mod uleb128; +pub mod zigzag_leb128; + +pub use crate::parquet::parquet_bridge::Encoding; + +/// # Panics +/// This function panics iff `values.len() < 4`. 
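+/// # Example
+/// ```ignore
+/// // the first 4 bytes are a little-endian u32 length prefix
+/// assert_eq!(get_length(&[5, 0, 0, 0, b'H', b'e', b'l', b'l', b'o']), Some(5));
+/// ```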
+#[inline] +pub fn get_length(values: &[u8]) -> Option { + values + .get(0..4) + .map(|x| u32::from_le_bytes(x.try_into().unwrap()) as usize) +} + +/// Returns the ceil of value / 8 +#[inline] +pub fn ceil8(value: usize) -> usize { + value / 8 + ((value % 8 != 0) as usize) +} diff --git a/crates/polars-parquet/src/parquet/encoding/plain_byte_array.rs b/crates/polars-parquet/src/parquet/encoding/plain_byte_array.rs new file mode 100644 index 000000000000..d29f8c82c6de --- /dev/null +++ b/crates/polars-parquet/src/parquet/encoding/plain_byte_array.rs @@ -0,0 +1,46 @@ +/// Decodes according to [Plain strings](https://github.com/apache/parquet-format/blob/master/Encodings.md#plain-plain--0), +/// prefixes, lengths and values +/// # Implementation +/// This struct does not allocate on the heap. +use crate::parquet::error::Error; + +#[derive(Debug)] +pub struct BinaryIter<'a> { + values: &'a [u8], + length: Option, +} + +impl<'a> BinaryIter<'a> { + pub fn new(values: &'a [u8], length: Option) -> Self { + Self { values, length } + } +} + +impl<'a> Iterator for BinaryIter<'a> { + type Item = Result<&'a [u8], Error>; + + #[inline] + fn next(&mut self) -> Option { + if self.values.len() < 4 { + return None; + } + if let Some(x) = self.length.as_mut() { + *x = x.saturating_sub(1) + } + let length = u32::from_le_bytes(self.values[0..4].try_into().unwrap()) as usize; + self.values = &self.values[4..]; + if length > self.values.len() { + return Some(Err(Error::oos( + "A string in plain encoding declares a length that is out of range", + ))); + } + let (result, remaining) = self.values.split_at(length); + self.values = remaining; + Some(Ok(result)) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + (self.length.unwrap_or_default(), self.length) + } +} diff --git a/crates/polars-parquet/src/parquet/encoding/uleb128.rs b/crates/polars-parquet/src/parquet/encoding/uleb128.rs new file mode 100644 index 000000000000..c91568e2ee86 --- /dev/null +++ b/crates/polars-parquet/src/parquet/encoding/uleb128.rs @@ -0,0 +1,97 @@ +use crate::parquet::error::Error; + +pub fn decode(values: &[u8]) -> Result<(u64, usize), Error> { + let mut result = 0; + let mut shift = 0; + + let mut consumed = 0; + for byte in values { + consumed += 1; + if shift == 63 && *byte > 1 { + panic!() + }; + + result |= u64::from(byte & 0b01111111) << shift; + + if byte & 0b10000000 == 0 { + break; + } + + shift += 7; + } + Ok((result, consumed)) +} + +/// Encodes `value` in ULEB128 into `container`. The exact number of bytes written +/// depends on `value`, and cannot be determined upfront. The maximum number of bytes +/// required are 10. +/// # Panic +/// This function may panic if `container.len() < 10` and `value` requires more bytes. 
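+/// # Example
+/// A sketch of a round-trip through [`decode`], using the value from the tests below:
+/// ```ignore
+/// let mut container = [0u8; 10];
+/// let used = encode(624_485, &mut container);
+/// assert_eq!(&container[..used], &[0xE5, 0x8E, 0x26]);
+/// assert_eq!(decode(&container[..used]).unwrap(), (624_485, 3));
+/// ```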
+pub fn encode(mut value: u64, container: &mut [u8]) -> usize { + let mut consumed = 0; + let mut iter = container.iter_mut(); + loop { + let mut byte = (value as u8) & !128; + value >>= 7; + if value != 0 { + byte |= 128; + } + *iter.next().unwrap() = byte; + consumed += 1; + if value == 0 { + break; + } + } + consumed +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn decode_1() { + let data = vec![0xe5, 0x8e, 0x26, 0xDE, 0xAD, 0xBE, 0xEF]; + let (value, len) = decode(&data).unwrap(); + assert_eq!(value, 624_485); + assert_eq!(len, 3); + } + + #[test] + fn decode_2() { + let data = vec![0b00010000, 0b00000001, 0b00000011, 0b00000011]; + let (value, len) = decode(&data).unwrap(); + assert_eq!(value, 16); + assert_eq!(len, 1); + } + + #[test] + fn round_trip() { + let original = 123124234u64; + let mut container = [0u8; 10]; + let encoded_len = encode(original, &mut container); + let (value, len) = decode(&container).unwrap(); + assert_eq!(value, original); + assert_eq!(len, encoded_len); + } + + #[test] + fn min_value() { + let original = u64::MIN; + let mut container = [0u8; 10]; + let encoded_len = encode(original, &mut container); + let (value, len) = decode(&container).unwrap(); + assert_eq!(value, original); + assert_eq!(len, encoded_len); + } + + #[test] + fn max_value() { + let original = u64::MAX; + let mut container = [0u8; 10]; + let encoded_len = encode(original, &mut container); + let (value, len) = decode(&container).unwrap(); + assert_eq!(value, original); + assert_eq!(len, encoded_len); + } +} diff --git a/crates/polars-parquet/src/parquet/encoding/zigzag_leb128.rs b/crates/polars-parquet/src/parquet/encoding/zigzag_leb128.rs new file mode 100644 index 000000000000..0a673136cc73 --- /dev/null +++ b/crates/polars-parquet/src/parquet/encoding/zigzag_leb128.rs @@ -0,0 +1,69 @@ +use super::uleb128; +use crate::parquet::error::Error; + +pub fn decode(values: &[u8]) -> Result<(i64, usize), Error> { + let (u, consumed) = uleb128::decode(values)?; + Ok(((u >> 1) as i64 ^ -((u & 1) as i64), consumed)) +} + +pub fn encode(value: i64) -> ([u8; 10], usize) { + let value = ((value << 1) ^ (value >> (64 - 1))) as u64; + let mut a = [0u8; 10]; + let produced = uleb128::encode(value, &mut a); + (a, produced) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_decode() { + // see e.g. https://stackoverflow.com/a/2211086/931303 + let cases = vec![ + (0u8, 0i64), + (1, -1), + (2, 1), + (3, -2), + (4, 2), + (5, -3), + (6, 3), + (7, -4), + (8, 4), + (9, -5), + ]; + for (data, expected) in cases { + let (result, _) = decode(&[data]).unwrap(); + assert_eq!(result, expected) + } + } + + #[test] + fn test_encode() { + let cases = vec![ + (0u8, 0i64), + (1, -1), + (2, 1), + (3, -2), + (4, 2), + (5, -3), + (6, 3), + (7, -4), + (8, 4), + (9, -5), + ]; + for (expected, data) in cases { + let (result, size) = encode(data); + assert_eq!(size, 1); + assert_eq!(result[0], expected) + } + } + + #[test] + fn test_roundtrip() { + let value = -1001212312; + let (data, size) = encode(value); + let (result, _) = decode(&data[..size]).unwrap(); + assert_eq!(value, result); + } +} diff --git a/crates/polars-parquet/src/parquet/error.rs b/crates/polars-parquet/src/parquet/error.rs new file mode 100644 index 000000000000..78022fd5d4ec --- /dev/null +++ b/crates/polars-parquet/src/parquet/error.rs @@ -0,0 +1,134 @@ +//! Contains [`Error`] + +/// List of features whose non-activation may cause a runtime error. 
+/// Used to indicate which lack of feature caused [`Error::FeatureNotActive`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[non_exhaustive] +pub enum Feature { + /// Snappy compression and decompression + Snappy, + /// Brotli compression and decompression + Brotli, + /// Gzip compression and decompression + Gzip, + /// Lz4 raw compression and decompression + Lz4, + /// Zstd compression and decompression + Zstd, +} + +/// Errors generated by this crate +#[derive(Debug, Clone)] +#[non_exhaustive] +pub enum Error { + /// When the parquet file is known to be out of spec. + OutOfSpec(String), + /// Error presented when trying to use a code branch that requires activating a feature. + FeatureNotActive(Feature, String), + /// Error presented when trying to use a feature from parquet that is not yet supported + FeatureNotSupported(String), + /// When encoding, the user passed an invalid parameter + InvalidParameter(String), + /// When decoding or decompressing, the page would allocate more memory than allowed + WouldOverAllocate, +} + +impl Error { + pub(crate) fn oos>(message: I) -> Self { + Self::OutOfSpec(message.into()) + } +} + +impl std::error::Error for Error {} + +impl std::fmt::Display for Error { + fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + Error::OutOfSpec(message) => { + write!(fmt, "File out of specification: {}", message) + }, + Error::FeatureNotActive(feature, reason) => { + write!( + fmt, + "The feature \"{:?}\" needs to be active to {}", + feature, reason + ) + }, + Error::FeatureNotSupported(reason) => { + write!(fmt, "Not yet supported: {}", reason) + }, + Error::InvalidParameter(message) => { + write!(fmt, "Invalid parameter: {}", message) + }, + Error::WouldOverAllocate => { + write!(fmt, "Operation would exceed memory use threshold") + }, + } + } +} + +#[cfg(feature = "snappy")] +impl From for Error { + fn from(e: snap::Error) -> Error { + Error::OutOfSpec(format!("underlying snap error: {}", e)) + } +} + +#[cfg(feature = "lz4_flex")] +impl From for Error { + fn from(e: lz4_flex::block::DecompressError) -> Error { + Error::OutOfSpec(format!("underlying lz4_flex error: {}", e)) + } +} + +#[cfg(feature = "lz4_flex")] +impl From for Error { + fn from(e: lz4_flex::block::CompressError) -> Error { + Error::OutOfSpec(format!("underlying lz4_flex error: {}", e)) + } +} + +impl From for Error { + fn from(e: parquet_format_safe::thrift::Error) -> Error { + Error::OutOfSpec(format!("Invalid thrift: {}", e)) + } +} + +impl From for Error { + fn from(e: std::io::Error) -> Error { + Error::OutOfSpec(format!("underlying IO error: {}", e)) + } +} + +impl From for Error { + fn from(e: std::collections::TryReserveError) -> Error { + Error::OutOfSpec(format!("OOM: {}", e)) + } +} + +impl From for Error { + fn from(e: std::num::TryFromIntError) -> Error { + Error::OutOfSpec(format!("Number must be zero or positive: {}", e)) + } +} + +impl From for Error { + fn from(e: std::array::TryFromSliceError) -> Error { + Error::OutOfSpec(format!("Can't deserialize to parquet native type: {}", e)) + } +} + +/// A specialized `Result` for Parquet errors. 
+pub type Result = std::result::Result; + +impl From for polars_error::PolarsError { + fn from(e: Error) -> polars_error::PolarsError { + polars_error::PolarsError::ComputeError(format!("parquet: {}", e).into()) + } +} + +impl From for Error { + fn from(e: polars_error::PolarsError) -> Error { + Error::OutOfSpec(format!("OOM: {}", e)) + } +} diff --git a/crates/polars-parquet/src/parquet/indexes/index.rs b/crates/polars-parquet/src/parquet/indexes/index.rs new file mode 100644 index 000000000000..08206659da3c --- /dev/null +++ b/crates/polars-parquet/src/parquet/indexes/index.rs @@ -0,0 +1,322 @@ +use std::any::Any; + +use parquet_format_safe::ColumnIndex; + +use crate::parquet::error::Error; +use crate::parquet::parquet_bridge::BoundaryOrder; +use crate::parquet::schema::types::{PhysicalType, PrimitiveType}; +use crate::parquet::types::NativeType; + +/// Trait object representing a [`ColumnIndex`] in Rust's native format. +/// +/// See [`NativeIndex`], [`ByteIndex`] and [`FixedLenByteIndex`] for concrete implementations. +pub trait Index: Send + Sync + std::fmt::Debug { + fn as_any(&self) -> &dyn Any; + + fn physical_type(&self) -> &PhysicalType; +} + +impl PartialEq for dyn Index + '_ { + fn eq(&self, that: &dyn Index) -> bool { + equal(self, that) + } +} + +impl Eq for dyn Index + '_ {} + +fn equal(lhs: &dyn Index, rhs: &dyn Index) -> bool { + if lhs.physical_type() != rhs.physical_type() { + return false; + } + + match lhs.physical_type() { + PhysicalType::Boolean => { + lhs.as_any().downcast_ref::().unwrap() + == rhs.as_any().downcast_ref::().unwrap() + }, + PhysicalType::Int32 => { + lhs.as_any().downcast_ref::>().unwrap() + == rhs.as_any().downcast_ref::>().unwrap() + }, + PhysicalType::Int64 => { + lhs.as_any().downcast_ref::>().unwrap() + == rhs.as_any().downcast_ref::>().unwrap() + }, + PhysicalType::Int96 => { + lhs.as_any() + .downcast_ref::>() + .unwrap() + == rhs + .as_any() + .downcast_ref::>() + .unwrap() + }, + PhysicalType::Float => { + lhs.as_any().downcast_ref::>().unwrap() + == rhs.as_any().downcast_ref::>().unwrap() + }, + PhysicalType::Double => { + lhs.as_any().downcast_ref::>().unwrap() + == rhs.as_any().downcast_ref::>().unwrap() + }, + PhysicalType::ByteArray => { + lhs.as_any().downcast_ref::().unwrap() + == rhs.as_any().downcast_ref::().unwrap() + }, + PhysicalType::FixedLenByteArray(_) => { + lhs.as_any().downcast_ref::().unwrap() + == rhs.as_any().downcast_ref::().unwrap() + }, + } +} + +/// An index of a column of [`NativeType`] physical representation +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct NativeIndex { + /// The primitive type + pub primitive_type: PrimitiveType, + /// The indexes, one item per page + pub indexes: Vec>, + /// the order + pub boundary_order: BoundaryOrder, +} + +impl NativeIndex { + /// Creates a new [`NativeIndex`] + pub(crate) fn try_new( + index: ColumnIndex, + primitive_type: PrimitiveType, + ) -> Result { + let len = index.min_values.len(); + + let null_counts = index + .null_counts + .map(|x| x.into_iter().map(Some).collect::>()) + .unwrap_or_else(|| vec![None; len]); + + let indexes = index + .min_values + .iter() + .zip(index.max_values.into_iter()) + .zip(index.null_pages.into_iter()) + .zip(null_counts.into_iter()) + .map(|(((min, max), is_null), null_count)| { + let (min, max) = if is_null { + (None, None) + } else { + let min = min.as_slice().try_into()?; + let max = max.as_slice().try_into()?; + (Some(T::from_le_bytes(min)), Some(T::from_le_bytes(max))) + }; + Ok(PageIndex { + min, + max, + null_count, + }) + 
}) + .collect::, Error>>()?; + + Ok(Self { + primitive_type, + indexes, + boundary_order: index.boundary_order.try_into()?, + }) + } +} + +/// The index of a page, containing the min and max values of the page. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct PageIndex { + /// The minimum value in the page. It is None when all values are null + pub min: Option, + /// The maximum value in the page. It is None when all values are null + pub max: Option, + /// The number of null values in the page + pub null_count: Option, +} + +impl Index for NativeIndex { + fn as_any(&self) -> &dyn Any { + self + } + + fn physical_type(&self) -> &PhysicalType { + &T::TYPE + } +} + +/// An index of a column of bytes physical type +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct ByteIndex { + /// The [`PrimitiveType`]. + pub primitive_type: PrimitiveType, + /// The indexes, one item per page + pub indexes: Vec>>, + pub boundary_order: BoundaryOrder, +} + +impl ByteIndex { + pub(crate) fn try_new( + index: ColumnIndex, + primitive_type: PrimitiveType, + ) -> Result { + let len = index.min_values.len(); + + let null_counts = index + .null_counts + .map(|x| x.into_iter().map(Some).collect::>()) + .unwrap_or_else(|| vec![None; len]); + + let indexes = index + .min_values + .into_iter() + .zip(index.max_values.into_iter()) + .zip(index.null_pages.into_iter()) + .zip(null_counts.into_iter()) + .map(|(((min, max), is_null), null_count)| { + let (min, max) = if is_null { + (None, None) + } else { + (Some(min), Some(max)) + }; + Ok(PageIndex { + min, + max, + null_count, + }) + }) + .collect::, Error>>()?; + + Ok(Self { + primitive_type, + indexes, + boundary_order: index.boundary_order.try_into()?, + }) + } +} + +impl Index for ByteIndex { + fn as_any(&self) -> &dyn Any { + self + } + + fn physical_type(&self) -> &PhysicalType { + &PhysicalType::ByteArray + } +} + +/// An index of a column of fixed len byte physical type +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct FixedLenByteIndex { + /// The [`PrimitiveType`]. 
+ pub primitive_type: PrimitiveType, + /// The indexes, one item per page + pub indexes: Vec>>, + pub boundary_order: BoundaryOrder, +} + +impl FixedLenByteIndex { + pub(crate) fn try_new( + index: ColumnIndex, + primitive_type: PrimitiveType, + ) -> Result { + let len = index.min_values.len(); + + let null_counts = index + .null_counts + .map(|x| x.into_iter().map(Some).collect::>()) + .unwrap_or_else(|| vec![None; len]); + + let indexes = index + .min_values + .into_iter() + .zip(index.max_values.into_iter()) + .zip(index.null_pages.into_iter()) + .zip(null_counts.into_iter()) + .map(|(((min, max), is_null), null_count)| { + let (min, max) = if is_null { + (None, None) + } else { + (Some(min), Some(max)) + }; + Ok(PageIndex { + min, + max, + null_count, + }) + }) + .collect::, Error>>()?; + + Ok(Self { + primitive_type, + indexes, + boundary_order: index.boundary_order.try_into()?, + }) + } +} + +impl Index for FixedLenByteIndex { + fn as_any(&self) -> &dyn Any { + self + } + + fn physical_type(&self) -> &PhysicalType { + &self.primitive_type.physical_type + } +} + +/// An index of a column of boolean physical type +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct BooleanIndex { + /// The indexes, one item per page + pub indexes: Vec>, + pub boundary_order: BoundaryOrder, +} + +impl BooleanIndex { + pub(crate) fn try_new(index: ColumnIndex) -> Result { + let len = index.min_values.len(); + + let null_counts = index + .null_counts + .map(|x| x.into_iter().map(Some).collect::>()) + .unwrap_or_else(|| vec![None; len]); + + let indexes = index + .min_values + .into_iter() + .zip(index.max_values.into_iter()) + .zip(index.null_pages.into_iter()) + .zip(null_counts.into_iter()) + .map(|(((min, max), is_null), null_count)| { + let (min, max) = if is_null { + (None, None) + } else { + let min = min[0] == 1; + let max = max[0] == 1; + (Some(min), Some(max)) + }; + Ok(PageIndex { + min, + max, + null_count, + }) + }) + .collect::, Error>>()?; + + Ok(Self { + indexes, + boundary_order: index.boundary_order.try_into()?, + }) + } +} + +impl Index for BooleanIndex { + fn as_any(&self) -> &dyn Any { + self + } + + fn physical_type(&self) -> &PhysicalType { + &PhysicalType::Boolean + } +} diff --git a/crates/polars-parquet/src/parquet/indexes/intervals.rs b/crates/polars-parquet/src/parquet/indexes/intervals.rs new file mode 100644 index 000000000000..f6cbdf9432a3 --- /dev/null +++ b/crates/polars-parquet/src/parquet/indexes/intervals.rs @@ -0,0 +1,137 @@ +use parquet_format_safe::PageLocation; +#[cfg(feature = "serde_types")] +use serde::{Deserialize, Serialize}; + +use crate::parquet::error::Error; + +/// An interval +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] +pub struct Interval { + /// Its start + pub start: usize, + /// Its length + pub length: usize, +} + +impl Interval { + /// Create a new interval + pub fn new(start: usize, length: usize) -> Self { + Self { start, length } + } +} + +/// Returns the set of (row) intervals of the pages. +/// # Errors +/// This function errors if the locations are not castable to `usize` or such that +/// their ranges of row are larger than `num_rows`. 
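+/// # Example
+/// A sketch assuming `locations` holds two [`PageLocation`]s with `first_row_index` 0 and 5
+/// inside a 10-row chunk:
+/// ```ignore
+/// let intervals = compute_page_row_intervals(&locations, 10)?;
+/// // the first page covers rows [0, 5) and the second rows [5, 10)
+/// assert_eq!(intervals, vec![Interval::new(0, 5), Interval::new(5, 5)]);
+/// ```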
+pub fn compute_page_row_intervals( + locations: &[PageLocation], + num_rows: usize, +) -> Result, Error> { + if locations.is_empty() { + return Ok(vec![]); + }; + + let last = (|| { + let start: usize = locations.last().unwrap().first_row_index.try_into()?; + let length = num_rows + .checked_sub(start) + .ok_or_else(|| Error::oos("Page start cannot be smaller than the number of rows"))?; + Result::<_, Error>::Ok(Interval::new(start, length)) + })(); + + let pages_lengths = locations + .windows(2) + .map(|x| { + let start = x[0].first_row_index.try_into()?; + + let length = x[1] + .first_row_index + .checked_sub(x[0].first_row_index) + .ok_or_else(|| Error::oos("Page start cannot be smaller than the number of rows"))? + .try_into()?; + + Ok(Interval::new(start, length)) + }) + .chain(std::iter::once(last)); + pages_lengths.collect() +} + +/// Returns the set of intervals `(start, len)` containing all the +/// selected rows (for a given column) +pub fn compute_rows( + selected: &[bool], + locations: &[PageLocation], + num_rows: usize, +) -> Result, Error> { + let page_intervals = compute_page_row_intervals(locations, num_rows)?; + + Ok(selected + .iter() + .zip(page_intervals.iter().copied()) + .filter_map( + |(&is_selected, page)| { + if is_selected { + Some(page) + } else { + None + } + }, + ) + .collect()) +} + +/// An enum describing a page that was either selected in a filter pushdown or skipped +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] +pub struct FilteredPage { + /// Location of the page in the file + pub start: u64, + pub length: usize, + /// rows to select from the page + pub selected_rows: Vec, + pub num_rows: usize, +} + +fn is_in(probe: Interval, intervals: &[Interval]) -> Vec { + intervals + .iter() + .filter_map(|interval| { + let interval_end = interval.start + interval.length; + let probe_end = probe.start + probe.length; + let overlaps = (probe.start < interval_end) && (probe_end > interval.start); + if overlaps { + let start = interval.start.max(probe.start); + let end = interval_end.min(probe_end); + Some(Interval::new(start - probe.start, end - start)) + } else { + None + } + }) + .collect() +} + +/// Given a set of selected [Interval]s of rows and the set of [`PageLocation`], returns the +/// a set of [`FilteredPage`] with the same number of items as `locations`. 
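+/// # Example
+/// A sketch reusing the layout of the tests below: two 20-byte pages at offsets 100 and 120
+/// covering rows 0..10 and 10..100, with rows 5..10 selected:
+/// ```ignore
+/// let pages = select_pages(&[Interval::new(5, 5)], &locations, 100)?;
+/// // the first page keeps its last 5 rows; the second page is skipped entirely
+/// assert_eq!(pages[0].selected_rows, vec![Interval::new(5, 5)]);
+/// assert!(pages[1].selected_rows.is_empty());
+/// ```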
+pub fn select_pages( + intervals: &[Interval], + locations: &[PageLocation], + num_rows: usize, +) -> Result, Error> { + let page_intervals = compute_page_row_intervals(locations, num_rows)?; + + page_intervals + .into_iter() + .zip(locations.iter()) + .map(|(interval, location)| { + let selected_rows = is_in(interval, intervals); + Ok(FilteredPage { + start: location.offset.try_into()?, + length: location.compressed_page_size.try_into()?, + selected_rows, + num_rows: interval.length, + }) + }) + .collect() +} diff --git a/crates/polars-parquet/src/parquet/indexes/mod.rs b/crates/polars-parquet/src/parquet/indexes/mod.rs new file mode 100644 index 000000000000..f652f8bb4be3 --- /dev/null +++ b/crates/polars-parquet/src/parquet/indexes/mod.rs @@ -0,0 +1,234 @@ +mod index; +mod intervals; + +pub use intervals::{compute_rows, select_pages, FilteredPage, Interval}; + +pub use self::index::{BooleanIndex, ByteIndex, FixedLenByteIndex, Index, NativeIndex, PageIndex}; +pub use crate::parquet::parquet_bridge::BoundaryOrder; +pub use crate::parquet::thrift_format::PageLocation; + +#[cfg(test)] +mod tests { + use super::*; + use crate::parquet::schema::types::{PhysicalType, PrimitiveType}; + + #[test] + fn test_basic() { + let locations = &[PageLocation { + offset: 100, + compressed_page_size: 10, + first_row_index: 0, + }]; + let num_rows = 10; + + let row_intervals = compute_rows(&[true; 1], locations, num_rows).unwrap(); + assert_eq!(row_intervals, vec![Interval::new(0, 10)]) + } + + #[test] + fn test_multiple() { + // two pages + let index = ByteIndex { + primitive_type: PrimitiveType::from_physical("c1".to_string(), PhysicalType::ByteArray), + indexes: vec![ + PageIndex { + min: Some(vec![0]), + max: Some(vec![8, 9]), + null_count: Some(0), + }, + PageIndex { + min: Some(vec![20]), + max: Some(vec![98, 99]), + null_count: Some(0), + }, + ], + boundary_order: Default::default(), + }; + let locations = &[ + PageLocation { + offset: 100, + compressed_page_size: 10, + first_row_index: 0, + }, + PageLocation { + offset: 110, + compressed_page_size: 20, + first_row_index: 5, + }, + ]; + let num_rows = 10; + + // filter of the form `x > "a"` + let selector = |page: &PageIndex>| { + page.max + .as_ref() + .map(|x| x.as_slice()[0] > 97) + .unwrap_or(false) // no max is present => all nulls => not selected + }; + let selected = index.indexes.iter().map(selector).collect::>(); + + let rows = compute_rows(&selected, locations, num_rows).unwrap(); + assert_eq!(rows, vec![Interval::new(5, 5)]); + + let pages = select_pages(&rows, locations, num_rows).unwrap(); + + assert_eq!( + pages, + vec![ + FilteredPage { + start: 100, + length: 10, + selected_rows: vec![], + num_rows: 5 + }, + FilteredPage { + start: 110, + length: 20, + selected_rows: vec![Interval::new(0, 5)], + num_rows: 5 + } + ] + ); + } + + #[test] + fn test_other_column() { + let locations = &[ + PageLocation { + offset: 100, + compressed_page_size: 20, + first_row_index: 0, + }, + PageLocation { + offset: 120, + compressed_page_size: 20, + first_row_index: 10, + }, + ]; + let num_rows = 100; + + let intervals = &[Interval::new(5, 5)]; + + let pages = select_pages(intervals, locations, num_rows).unwrap(); + + assert_eq!( + pages, + vec![ + FilteredPage { + start: 100, + length: 20, + selected_rows: vec![Interval::new(5, 5)], + num_rows: 10, + }, + FilteredPage { + start: 120, + length: 20, + selected_rows: vec![], + num_rows: 90 + }, + ] + ); + } + + #[test] + fn test_other_interval_in_middle() { + let locations = &[ + PageLocation { + offset: 
100, + compressed_page_size: 20, + first_row_index: 0, + }, + PageLocation { + offset: 120, + compressed_page_size: 20, + first_row_index: 10, + }, + PageLocation { + offset: 140, + compressed_page_size: 20, + first_row_index: 100, + }, + ]; + let num_rows = 200; + + // interval partially intersects 2 pages (0 and 1) + let intervals = &[Interval::new(5, 6)]; + + let pages = select_pages(intervals, locations, num_rows).unwrap(); + + assert_eq!( + pages, + vec![ + FilteredPage { + start: 100, + length: 20, + selected_rows: vec![Interval::new(5, 5)], + num_rows: 10, + }, + FilteredPage { + start: 120, + length: 20, + selected_rows: vec![Interval::new(0, 1)], + num_rows: 90, + }, + FilteredPage { + start: 140, + length: 20, + selected_rows: vec![], + num_rows: 100 + }, + ] + ); + } + + #[test] + fn test_other_column2() { + let locations = &[ + PageLocation { + offset: 100, + compressed_page_size: 20, + first_row_index: 0, + }, + PageLocation { + offset: 120, + compressed_page_size: 20, + first_row_index: 10, + }, + PageLocation { + offset: 140, + compressed_page_size: 20, + first_row_index: 100, + }, + ]; + let num_rows = 200; + + // interval partially intersects 1 page (0) + let intervals = &[Interval::new(0, 1)]; + + let pages = select_pages(intervals, locations, num_rows).unwrap(); + + assert_eq!( + pages, + vec![ + FilteredPage { + start: 100, + length: 20, + selected_rows: vec![Interval::new(0, 1)], + num_rows: 10, + }, + FilteredPage { + start: 120, + length: 20, + selected_rows: vec![], + num_rows: 90 + }, + FilteredPage { + start: 140, + length: 20, + selected_rows: vec![], + num_rows: 100 + }, + ] + ); + } +} diff --git a/crates/polars-parquet/src/parquet/metadata/column_chunk_metadata.rs b/crates/polars-parquet/src/parquet/metadata/column_chunk_metadata.rs new file mode 100644 index 000000000000..f5487e323fbf --- /dev/null +++ b/crates/polars-parquet/src/parquet/metadata/column_chunk_metadata.rs @@ -0,0 +1,210 @@ +use std::sync::Arc; + +use parquet_format_safe::{ColumnChunk, ColumnMetaData, Encoding}; + +use super::column_descriptor::ColumnDescriptor; +use crate::parquet::compression::Compression; +use crate::parquet::error::{Error, Result}; +use crate::parquet::schema::types::PhysicalType; +use crate::parquet::statistics::{deserialize_statistics, Statistics}; + +#[cfg(feature = "serde_types")] +mod serde_types { + pub use std::io::Cursor; + + pub use parquet_format_safe::thrift::protocol::{ + TCompactInputProtocol, TCompactOutputProtocol, + }; + pub use serde::de::Error as DeserializeError; + pub use serde::ser::Error as SerializeError; + pub use serde::{Deserialize, Deserializer, Serialize, Serializer}; +} +#[cfg(feature = "serde_types")] +use serde_types::*; + +/// Metadata for a column chunk. +// This contains the `ColumnDescriptor` associated with the chunk so that deserializers have +// access to the descriptor (e.g. physical, converted, logical). 
+#[derive(Debug, Clone)] +#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] +pub struct ColumnChunkMetaData { + #[cfg_attr( + feature = "serde_types", + serde(serialize_with = "serialize_column_chunk") + )] + #[cfg_attr( + feature = "serde_types", + serde(deserialize_with = "deserialize_column_chunk") + )] + column_chunk: ColumnChunk, + column_descr: ColumnDescriptor, +} + +#[cfg(feature = "serde_types")] +fn serialize_column_chunk( + column_chunk: &ColumnChunk, + serializer: S, +) -> std::result::Result +where + S: Serializer, +{ + let mut buf = vec![]; + let cursor = Cursor::new(&mut buf[..]); + let mut protocol = TCompactOutputProtocol::new(cursor); + column_chunk + .write_to_out_protocol(&mut protocol) + .map_err(S::Error::custom)?; + serializer.serialize_bytes(&buf) +} + +#[cfg(feature = "serde_types")] +fn deserialize_column_chunk<'de, D>(deserializer: D) -> std::result::Result +where + D: Deserializer<'de>, +{ + let buf = Vec::::deserialize(deserializer)?; + let mut cursor = Cursor::new(&buf[..]); + let mut protocol = TCompactInputProtocol::new(&mut cursor, usize::MAX); + ColumnChunk::read_from_in_protocol(&mut protocol).map_err(D::Error::custom) +} + +// Represents common operations for a column chunk. +impl ColumnChunkMetaData { + /// Returns a new [`ColumnChunkMetaData`] + pub fn new(column_chunk: ColumnChunk, column_descr: ColumnDescriptor) -> Self { + Self { + column_chunk, + column_descr, + } + } + + /// File where the column chunk is stored. + /// + /// If not set, assumed to belong to the same file as the metadata. + /// This path is relative to the current file. + pub fn file_path(&self) -> &Option { + &self.column_chunk.file_path + } + + /// Byte offset in `file_path()`. + pub fn file_offset(&self) -> i64 { + self.column_chunk.file_offset + } + + /// Returns this column's [`ColumnChunk`] + pub fn column_chunk(&self) -> &ColumnChunk { + &self.column_chunk + } + + /// The column's [`ColumnMetaData`] + pub fn metadata(&self) -> &ColumnMetaData { + self.column_chunk.meta_data.as_ref().unwrap() + } + + /// The [`ColumnDescriptor`] for this column. This descriptor contains the physical and logical type + /// of the pages. + pub fn descriptor(&self) -> &ColumnDescriptor { + &self.column_descr + } + + /// The [`PhysicalType`] of this column. + pub fn physical_type(&self) -> PhysicalType { + self.column_descr.descriptor.primitive_type.physical_type + } + + /// Decodes the raw statistics into [`Statistics`]. + pub fn statistics(&self) -> Option>> { + self.metadata() + .statistics + .as_ref() + .map(|x| deserialize_statistics(x, self.column_descr.descriptor.primitive_type.clone())) + } + + /// Total number of values in this column chunk. Note that this is not necessarily the number + /// of rows. E.g. the (nested) array `[[1, 2], [3]]` has 2 rows and 3 values. + pub fn num_values(&self) -> i64 { + self.metadata().num_values + } + + /// [`Compression`] for this column. + pub fn compression(&self) -> Compression { + self.metadata().codec.try_into().unwrap() + } + + /// Returns the total compressed data size of this column chunk. + pub fn compressed_size(&self) -> i64 { + self.metadata().total_compressed_size + } + + /// Returns the total uncompressed data size of this column chunk. + pub fn uncompressed_size(&self) -> i64 { + self.metadata().total_uncompressed_size + } + + /// Returns the offset for the column data. 
+ pub fn data_page_offset(&self) -> i64 { + self.metadata().data_page_offset + } + + /// Returns `true` if this column chunk contains a index page, `false` otherwise. + pub fn has_index_page(&self) -> bool { + self.metadata().index_page_offset.is_some() + } + + /// Returns the offset for the index page. + pub fn index_page_offset(&self) -> Option { + self.metadata().index_page_offset + } + + /// Returns the offset for the dictionary page, if any. + pub fn dictionary_page_offset(&self) -> Option { + self.metadata().dictionary_page_offset + } + + /// Returns the encoding for this column + pub fn column_encoding(&self) -> &Vec { + &self.metadata().encodings + } + + /// Returns the offset and length in bytes of the column chunk within the file + pub fn byte_range(&self) -> (u64, u64) { + let start = if let Some(dict_page_offset) = self.dictionary_page_offset() { + dict_page_offset as u64 + } else { + self.data_page_offset() as u64 + }; + let length = self.compressed_size() as u64; + // this has been validated in [`try_from_thrift`] + (start, length) + } + + /// Method to convert from Thrift. + pub(crate) fn try_from_thrift( + column_descr: ColumnDescriptor, + column_chunk: ColumnChunk, + ) -> Result { + // validate metadata + if let Some(meta) = &column_chunk.meta_data { + let _: u64 = meta.total_compressed_size.try_into()?; + + if let Some(offset) = meta.dictionary_page_offset { + let _: u64 = offset.try_into()?; + } + let _: u64 = meta.data_page_offset.try_into()?; + + let _: Compression = meta.codec.try_into()?; + } else { + return Err(Error::oos("Column chunk requires metadata")); + } + + Ok(Self { + column_chunk, + column_descr, + }) + } + + /// Method to convert to Thrift. + pub fn into_thrift(self) -> ColumnChunk { + self.column_chunk + } +} diff --git a/crates/polars-parquet/src/parquet/metadata/column_descriptor.rs b/crates/polars-parquet/src/parquet/metadata/column_descriptor.rs new file mode 100644 index 000000000000..2c9a0d1f6e48 --- /dev/null +++ b/crates/polars-parquet/src/parquet/metadata/column_descriptor.rs @@ -0,0 +1,50 @@ +#[cfg(feature = "serde_types")] +use serde::{Deserialize, Serialize}; + +use crate::parquet::schema::types::{ParquetType, PrimitiveType}; + +/// A descriptor of a parquet column. It contains the necessary information to deserialize +/// a parquet column. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] +pub struct Descriptor { + /// The [`PrimitiveType`] of this column + pub primitive_type: PrimitiveType, + + /// The maximum definition level + pub max_def_level: i16, + + /// The maximum repetition level + pub max_rep_level: i16, +} + +/// A descriptor for leaf-level primitive columns. +/// This encapsulates information such as definition and repetition levels and is used to +/// re-assemble nested data. +#[derive(Debug, PartialEq, Clone)] +#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] +pub struct ColumnDescriptor { + /// The descriptor this columns' leaf. + pub descriptor: Descriptor, + + /// The path of this column. For instance, "a.b.c.d". + pub path_in_schema: Vec, + + /// The [`ParquetType`] this descriptor is a leaf of + pub base_type: ParquetType, +} + +impl ColumnDescriptor { + /// Creates new descriptor for leaf-level column. 
+ pub fn new( + descriptor: Descriptor, + path_in_schema: Vec, + base_type: ParquetType, + ) -> Self { + Self { + descriptor, + path_in_schema, + base_type, + } + } +} diff --git a/crates/polars-parquet/src/parquet/metadata/column_order.rs b/crates/polars-parquet/src/parquet/metadata/column_order.rs new file mode 100644 index 000000000000..4d66f615bfa0 --- /dev/null +++ b/crates/polars-parquet/src/parquet/metadata/column_order.rs @@ -0,0 +1,30 @@ +#[cfg(feature = "serde_types")] +use serde::{Deserialize, Serialize}; + +use super::sort::SortOrder; + +/// Column order that specifies what method was used to aggregate min/max values for +/// statistics. +/// +/// If column order is undefined, then it is the legacy behaviour and all values should +/// be compared as signed values/bytes. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] +pub enum ColumnOrder { + /// Column uses the order defined by its logical or physical type + /// (if there is no logical type), parquet-format 2.4.0+. + TypeDefinedOrder(SortOrder), + /// Undefined column order, means legacy behaviour before parquet-format 2.4.0. + /// Sort order is always SIGNED. + Undefined, +} + +impl ColumnOrder { + /// Returns sort order associated with this column order. + pub fn sort_order(&self) -> SortOrder { + match *self { + ColumnOrder::TypeDefinedOrder(order) => order, + ColumnOrder::Undefined => SortOrder::Signed, + } + } +} diff --git a/crates/polars-parquet/src/parquet/metadata/file_metadata.rs b/crates/polars-parquet/src/parquet/metadata/file_metadata.rs new file mode 100644 index 000000000000..1c101fa9a561 --- /dev/null +++ b/crates/polars-parquet/src/parquet/metadata/file_metadata.rs @@ -0,0 +1,129 @@ +use parquet_format_safe::ColumnOrder as TColumnOrder; + +use super::column_order::ColumnOrder; +use super::schema_descriptor::SchemaDescriptor; +use super::RowGroupMetaData; +use crate::parquet::error::Error; +use crate::parquet::metadata::get_sort_order; +pub use crate::parquet::thrift_format::KeyValue; + +/// Metadata for a Parquet file. +// This is almost equal to [`parquet_format_safe::FileMetaData`] but contains the descriptors, +// which are crucial to deserialize pages. +#[derive(Debug, Clone)] +pub struct FileMetaData { + /// version of this file. + pub version: i32, + /// number of rows in the file. + pub num_rows: usize, + /// String message for application that wrote this file. + /// + /// This should have the following format: + /// ` version (build )`. + /// + /// ```shell + /// parquet-mr version 1.8.0 (build 0fda28af84b9746396014ad6a415b90592a98b3b) + /// ``` + pub created_by: Option, + /// The row groups of this file + pub row_groups: Vec, + /// key_value_metadata of this file. + pub key_value_metadata: Option>, + /// schema descriptor. + pub schema_descr: SchemaDescriptor, + /// Column (sort) order used for `min` and `max` values of each column in this file. + /// + /// Each column order corresponds to one column, determined by its position in the + /// list, matching the position of the column in the schema. + /// + /// When `None` is returned, there are no column orders available, and each column + /// should be assumed to have undefined (legacy) column order. + pub column_orders: Option>, +} + +impl FileMetaData { + /// Returns the [`SchemaDescriptor`] that describes schema of this file. 
+ pub fn schema(&self) -> &SchemaDescriptor { + &self.schema_descr + } + + /// returns the metadata + pub fn key_value_metadata(&self) -> &Option> { + &self.key_value_metadata + } + + /// Returns column order for `i`th column in this file. + /// If column orders are not available, returns undefined (legacy) column order. + pub fn column_order(&self, i: usize) -> ColumnOrder { + self.column_orders + .as_ref() + .map(|data| data[i]) + .unwrap_or(ColumnOrder::Undefined) + } + + /// Deserializes [`crate::parquet::thrift_format::FileMetaData`] into this struct + pub fn try_from_thrift(metadata: parquet_format_safe::FileMetaData) -> Result { + let schema_descr = SchemaDescriptor::try_from_thrift(&metadata.schema)?; + + let row_groups = metadata + .row_groups + .into_iter() + .map(|rg| RowGroupMetaData::try_from_thrift(&schema_descr, rg)) + .collect::>()?; + + let column_orders = metadata + .column_orders + .map(|orders| parse_column_orders(&orders, &schema_descr)); + + Ok(FileMetaData { + version: metadata.version, + num_rows: metadata.num_rows.try_into()?, + created_by: metadata.created_by, + row_groups, + key_value_metadata: metadata.key_value_metadata, + schema_descr, + column_orders, + }) + } + + /// Serializes itself to thrift's [`parquet_format_safe::FileMetaData`]. + pub fn into_thrift(self) -> parquet_format_safe::FileMetaData { + parquet_format_safe::FileMetaData { + version: self.version, + schema: self.schema_descr.into_thrift(), + num_rows: self.num_rows as i64, + row_groups: self + .row_groups + .into_iter() + .map(|v| v.into_thrift()) + .collect(), + key_value_metadata: self.key_value_metadata, + created_by: self.created_by, + column_orders: None, // todo + encryption_algorithm: None, + footer_signing_key_metadata: None, + } + } +} + +/// Parses [`ColumnOrder`] from Thrift definition. 
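// Illustrative sketch (not part of the patch): before trusting the `min`/`max`
// statistics of column `i` for filtering, a reader can check how that column is
// ordered. Assumes `SortOrder` (from `super::sort`) is in scope; the helper name
// is hypothetical.
fn stats_are_comparable(meta: &FileMetaData, i: usize) -> bool {
    !matches!(meta.column_order(i).sort_order(), SortOrder::Undefined)
}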
+fn parse_column_orders( + orders: &[TColumnOrder], + schema_descr: &SchemaDescriptor, +) -> Vec { + schema_descr + .columns() + .iter() + .zip(orders.iter()) + .map(|(column, order)| match order { + TColumnOrder::TYPEORDER(_) => { + let sort_order = get_sort_order( + &column.descriptor.primitive_type.logical_type, + &column.descriptor.primitive_type.converted_type, + &column.descriptor.primitive_type.physical_type, + ); + ColumnOrder::TypeDefinedOrder(sort_order) + }, + }) + .collect() +} diff --git a/crates/polars-parquet/src/parquet/metadata/mod.rs b/crates/polars-parquet/src/parquet/metadata/mod.rs new file mode 100644 index 000000000000..2dfe81138fdd --- /dev/null +++ b/crates/polars-parquet/src/parquet/metadata/mod.rs @@ -0,0 +1,17 @@ +mod column_chunk_metadata; +mod column_descriptor; +mod column_order; +mod file_metadata; +mod row_metadata; +mod schema_descriptor; +mod sort; + +pub use column_chunk_metadata::ColumnChunkMetaData; +pub use column_descriptor::{ColumnDescriptor, Descriptor}; +pub use column_order::ColumnOrder; +pub use file_metadata::{FileMetaData, KeyValue}; +pub use row_metadata::RowGroupMetaData; +pub use schema_descriptor::SchemaDescriptor; +pub use sort::*; + +pub use crate::parquet::thrift_format::FileMetaData as ThriftFileMetaData; diff --git a/crates/polars-parquet/src/parquet/metadata/row_metadata.rs b/crates/polars-parquet/src/parquet/metadata/row_metadata.rs new file mode 100644 index 000000000000..60137ca6167a --- /dev/null +++ b/crates/polars-parquet/src/parquet/metadata/row_metadata.rs @@ -0,0 +1,103 @@ +use parquet_format_safe::RowGroup; +#[cfg(feature = "serde_types")] +use serde::{Deserialize, Serialize}; + +use super::column_chunk_metadata::ColumnChunkMetaData; +use super::schema_descriptor::SchemaDescriptor; +use crate::parquet::error::{Error, Result}; +use crate::parquet::write::ColumnOffsetsMetadata; + +/// Metadata for a row group. +#[derive(Debug, Clone)] +#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] +pub struct RowGroupMetaData { + columns: Vec, + num_rows: usize, + total_byte_size: usize, +} + +impl RowGroupMetaData { + /// Create a new [`RowGroupMetaData`] + pub fn new( + columns: Vec, + num_rows: usize, + total_byte_size: usize, + ) -> RowGroupMetaData { + Self { + columns, + num_rows, + total_byte_size, + } + } + + /// Returns slice of column chunk metadata. + pub fn columns(&self) -> &[ColumnChunkMetaData] { + &self.columns + } + + /// Number of rows in this row group. + pub fn num_rows(&self) -> usize { + self.num_rows + } + + /// Total byte size of all uncompressed column data in this row group. + pub fn total_byte_size(&self) -> usize { + self.total_byte_size + } + + /// Total size of all compressed column data in this row group. + pub fn compressed_size(&self) -> usize { + self.columns + .iter() + .map(|c| c.compressed_size() as usize) + .sum::() + } + + /// Method to convert from Thrift. 
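// Illustrative sketch (not part of the patch): aggregating row-group counters from a
// parsed footer. Assumes `FileMetaData` is in scope; the helper name is hypothetical.
fn file_totals(meta: &FileMetaData) -> (usize, usize) {
    let rows: usize = meta.row_groups.iter().map(|rg| rg.num_rows()).sum();
    let compressed: usize = meta.row_groups.iter().map(|rg| rg.compressed_size()).sum();
    (rows, compressed)
}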
+ pub(crate) fn try_from_thrift( + schema_descr: &SchemaDescriptor, + rg: RowGroup, + ) -> Result { + if schema_descr.columns().len() != rg.columns.len() { + return Err(Error::oos(format!("The number of columns in the row group ({}) must be equal to the number of columns in the schema ({})", rg.columns.len(), schema_descr.columns().len()))); + } + let total_byte_size = rg.total_byte_size.try_into()?; + let num_rows = rg.num_rows.try_into()?; + let columns = rg + .columns + .into_iter() + .zip(schema_descr.columns()) + .map(|(column_chunk, descriptor)| { + ColumnChunkMetaData::try_from_thrift(descriptor.clone(), column_chunk) + }) + .collect::>>()?; + + Ok(RowGroupMetaData { + columns, + num_rows, + total_byte_size, + }) + } + + /// Method to convert to Thrift. + pub(crate) fn into_thrift(self) -> RowGroup { + let file_offset = self + .columns + .iter() + .map(|c| { + ColumnOffsetsMetadata::from_column_chunk_metadata(c).calc_row_group_file_offset() + }) + .next() + .unwrap_or(None); + let total_compressed_size = Some(self.compressed_size() as i64); + RowGroup { + columns: self.columns.into_iter().map(|v| v.into_thrift()).collect(), + total_byte_size: self.total_byte_size as i64, + num_rows: self.num_rows as i64, + sorting_columns: None, + file_offset, + total_compressed_size, + ordinal: None, + } + } +} diff --git a/crates/polars-parquet/src/parquet/metadata/schema_descriptor.rs b/crates/polars-parquet/src/parquet/metadata/schema_descriptor.rs new file mode 100644 index 000000000000..a5a3a7b10735 --- /dev/null +++ b/crates/polars-parquet/src/parquet/metadata/schema_descriptor.rs @@ -0,0 +1,141 @@ +use parquet_format_safe::SchemaElement; +#[cfg(feature = "serde_types")] +use serde::{Deserialize, Serialize}; + +use super::column_descriptor::{ColumnDescriptor, Descriptor}; +use crate::parquet::error::{Error, Result}; +use crate::parquet::schema::io_message::from_message; +use crate::parquet::schema::types::{FieldInfo, ParquetType}; +use crate::parquet::schema::Repetition; + +/// A schema descriptor. This encapsulates the top-level schemas for all the columns, +/// as well as all descriptors for all the primitive columns. +#[derive(Debug, Clone)] +#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] +pub struct SchemaDescriptor { + name: String, + // The top-level schema (the "message" type). + fields: Vec, + + // All the descriptors for primitive columns in this schema, constructed from + // `schema` in DFS order. + leaves: Vec, +} + +impl SchemaDescriptor { + /// Creates new schema descriptor from Parquet schema. + pub fn new(name: String, fields: Vec) -> Self { + let mut leaves = vec![]; + for f in &fields { + let mut path = vec![]; + build_tree(f, f, 0, 0, &mut leaves, &mut path); + } + + Self { + name, + fields, + leaves, + } + } + + /// The [`ColumnDescriptor`] (leafs) of this schema. + /// + /// Note that, for nested fields, this may contain more entries than the number of fields + /// in the file - e.g. a struct field may have two columns. + pub fn columns(&self) -> &[ColumnDescriptor] { + &self.leaves + } + + /// The schemas' name. + pub fn name(&self) -> &str { + &self.name + } + + /// The schemas' fields. 
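// Illustrative sketch (not part of the patch): for nested schemas the leaf columns
// (`columns()`) outnumber the top-level fields (`fields()`), and the definition /
// repetition levels are derived by `build_tree` below. The message string follows the
// textual schema format accepted by `try_from_message`; the function name is hypothetical.
fn nested_schema_example() -> Result<()> {
    let schema = SchemaDescriptor::try_from_message(
        "message spark_schema { OPTIONAL group a { OPTIONAL INT32 x; REQUIRED BYTE_ARRAY y; } }",
    )?;
    assert_eq!(schema.fields().len(), 1); // one top-level field: the group `a`
    assert_eq!(schema.columns().len(), 2); // two leaves: `a.x` and `a.y`
    assert_eq!(schema.columns()[0].descriptor.max_def_level, 2); // optional inside optional
    assert_eq!(schema.columns()[0].descriptor.max_rep_level, 0); // nothing is repeated
    Ok(())
}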
+ pub fn fields(&self) -> &[ParquetType] { + &self.fields + } + + pub(crate) fn into_thrift(self) -> Vec { + ParquetType::GroupType { + field_info: FieldInfo { + name: self.name, + repetition: Repetition::Optional, + id: None, + }, + logical_type: None, + converted_type: None, + fields: self.fields, + } + .to_thrift() + } + + fn try_from_type(type_: ParquetType) -> Result { + match type_ { + ParquetType::GroupType { + field_info, fields, .. + } => Ok(Self::new(field_info.name, fields)), + _ => Err(Error::oos("The parquet schema MUST be a group type")), + } + } + + pub(crate) fn try_from_thrift(elements: &[SchemaElement]) -> Result { + let schema = ParquetType::try_from_thrift(elements)?; + Self::try_from_type(schema) + } + + /// Creates a schema from + pub fn try_from_message(message: &str) -> Result { + let schema = from_message(message)?; + Self::try_from_type(schema) + } +} + +fn build_tree<'a>( + tp: &'a ParquetType, + base_tp: &ParquetType, + mut max_rep_level: i16, + mut max_def_level: i16, + leaves: &mut Vec, + path_so_far: &mut Vec<&'a str>, +) { + path_so_far.push(tp.name()); + match tp.get_field_info().repetition { + Repetition::Optional => { + max_def_level += 1; + }, + Repetition::Repeated => { + max_def_level += 1; + max_rep_level += 1; + }, + _ => {}, + } + + match tp { + ParquetType::PrimitiveType(p) => { + let path_in_schema = path_so_far.iter().copied().map(String::from).collect(); + leaves.push(ColumnDescriptor::new( + Descriptor { + primitive_type: p.clone(), + max_def_level, + max_rep_level, + }, + path_in_schema, + base_tp.clone(), + )); + }, + ParquetType::GroupType { ref fields, .. } => { + for f in fields { + build_tree( + f, + base_tp, + max_rep_level, + max_def_level, + leaves, + path_so_far, + ); + path_so_far.pop(); + } + }, + } +} diff --git a/crates/polars-parquet/src/parquet/metadata/sort.rs b/crates/polars-parquet/src/parquet/metadata/sort.rs new file mode 100644 index 000000000000..93aac06605b6 --- /dev/null +++ b/crates/polars-parquet/src/parquet/metadata/sort.rs @@ -0,0 +1,94 @@ +#[cfg(feature = "serde_types")] +use serde::{Deserialize, Serialize}; + +use crate::parquet::schema::types::{ + IntegerType, PhysicalType, PrimitiveConvertedType, PrimitiveLogicalType, +}; + +/// Sort order for page and column statistics. +/// +/// Types are associated with sort orders and column stats are aggregated using a sort +/// order, and a sort order should be considered when comparing values with statistics +/// min/max. +/// +/// See reference in +/// +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] +pub enum SortOrder { + /// Signed (either value or legacy byte-wise) comparison. + Signed, + /// Unsigned (depending on physical type either value or byte-wise) comparison. + Unsigned, + /// Comparison is undefined. + Undefined, +} + +/// Returns sort order for a physical/logical type. +pub fn get_sort_order( + logical_type: &Option, + converted_type: &Option, + physical_type: &PhysicalType, +) -> SortOrder { + if let Some(logical_type) = logical_type { + return get_logical_sort_order(logical_type); + }; + if let Some(converted_type) = converted_type { + return get_converted_sort_order(converted_type); + }; + get_physical_sort_order(physical_type) +} + +fn get_logical_sort_order(logical_type: &PrimitiveLogicalType) -> SortOrder { + // TODO: Should this take converted and logical type, for compatibility? 
+ use PrimitiveLogicalType::*; + match logical_type { + String | Enum | Json | Bson => SortOrder::Unsigned, + Integer(t) => match t { + IntegerType::Int8 | IntegerType::Int16 | IntegerType::Int32 | IntegerType::Int64 => { + SortOrder::Signed + }, + _ => SortOrder::Unsigned, + }, + Decimal(_, _) => SortOrder::Signed, + Date => SortOrder::Signed, + Time { .. } => SortOrder::Signed, + Timestamp { .. } => SortOrder::Signed, + Unknown => SortOrder::Undefined, + Uuid => SortOrder::Unsigned, + } +} + +fn get_converted_sort_order(converted_type: &PrimitiveConvertedType) -> SortOrder { + use PrimitiveConvertedType::*; + match converted_type { + // Unsigned byte-wise comparison. + Utf8 | Json | Bson | Enum => SortOrder::Unsigned, + Int8 | Int16 | Int32 | Int64 => SortOrder::Signed, + Uint8 | Uint16 | Uint32 | Uint64 => SortOrder::Unsigned, + // Signed comparison of the represented value. + Decimal(_, _) => SortOrder::Signed, + Date => SortOrder::Signed, + TimeMillis | TimeMicros | TimestampMillis | TimestampMicros => SortOrder::Signed, + Interval => SortOrder::Undefined, + } +} + +fn get_physical_sort_order(physical_type: &PhysicalType) -> SortOrder { + use PhysicalType::*; + match physical_type { + // Order: false, true + Boolean => SortOrder::Unsigned, + Int32 | Int64 => SortOrder::Signed, + Int96 => SortOrder::Undefined, + // Notes to remember when comparing float/double values: + // If the min is a NaN, it should be ignored. + // If the max is a NaN, it should be ignored. + // If the min is +0, the row group may contain -0 values as well. + // If the max is -0, the row group may contain +0 values as well. + // When looking for NaN values, min and max should be ignored. + Float | Double => SortOrder::Signed, + // Unsigned byte-wise comparison + ByteArray | FixedLenByteArray(_) => SortOrder::Unsigned, + } +} diff --git a/crates/polars-parquet/src/parquet/mod.rs b/crates/polars-parquet/src/parquet/mod.rs new file mode 100644 index 000000000000..05166f650e2f --- /dev/null +++ b/crates/polars-parquet/src/parquet/mod.rs @@ -0,0 +1,37 @@ +#[macro_use] +pub mod error; +#[cfg(feature = "bloom_filter")] +pub mod bloom_filter; +pub mod compression; +pub mod deserialize; +pub mod encoding; +pub mod indexes; +pub mod metadata; +pub mod page; +mod parquet_bridge; +pub mod read; +pub mod schema; +pub mod statistics; +pub mod types; +pub mod write; + +use parquet_format_safe as thrift_format; +pub use streaming_decompression::{fallible_streaming_iterator, FallibleStreamingIterator}; + +const HEADER_SIZE: u64 = PARQUET_MAGIC.len() as u64; +const FOOTER_SIZE: u64 = 8; +const PARQUET_MAGIC: [u8; 4] = [b'P', b'A', b'R', b'1']; + +/// The number of bytes read at the end of the parquet file on first read +const DEFAULT_FOOTER_READ_SIZE: u64 = 64 * 1024; + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + + pub fn get_path() -> PathBuf { + let dir = env!("CARGO_MANIFEST_DIR"); + + PathBuf::from(dir).join("testing/parquet-testing/data") + } +} diff --git a/crates/polars-parquet/src/parquet/page/mod.rs b/crates/polars-parquet/src/parquet/page/mod.rs new file mode 100644 index 000000000000..0fcf3a635469 --- /dev/null +++ b/crates/polars-parquet/src/parquet/page/mod.rs @@ -0,0 +1,428 @@ +use std::sync::Arc; + +use crate::parquet::compression::Compression; +use crate::parquet::encoding::{get_length, Encoding}; +use crate::parquet::error::{Error, Result}; +use crate::parquet::indexes::Interval; +use crate::parquet::metadata::Descriptor; +pub use crate::parquet::parquet_bridge::{DataPageHeaderExt, PageType}; +use 
crate::parquet::statistics::{deserialize_statistics, Statistics}; +pub use crate::parquet::thrift_format::{ + DataPageHeader as DataPageHeaderV1, DataPageHeaderV2, PageHeader as ParquetPageHeader, +}; + +/// A [`CompressedDataPage`] is compressed, encoded representation of a Parquet data page. +/// It holds actual data and thus cloning it is expensive. +#[derive(Debug)] +pub struct CompressedDataPage { + pub(crate) header: DataPageHeader, + pub(crate) buffer: Vec, + pub(crate) compression: Compression, + uncompressed_page_size: usize, + pub(crate) descriptor: Descriptor, + + // The offset and length in rows + pub(crate) selected_rows: Option>, +} + +impl CompressedDataPage { + /// Returns a new [`CompressedDataPage`]. + pub fn new( + header: DataPageHeader, + buffer: Vec, + compression: Compression, + uncompressed_page_size: usize, + descriptor: Descriptor, + rows: Option, + ) -> Self { + Self::new_read( + header, + buffer, + compression, + uncompressed_page_size, + descriptor, + rows.map(|x| vec![Interval::new(0, x)]), + ) + } + + /// Returns a new [`CompressedDataPage`]. + pub(crate) fn new_read( + header: DataPageHeader, + buffer: Vec, + compression: Compression, + uncompressed_page_size: usize, + descriptor: Descriptor, + selected_rows: Option>, + ) -> Self { + Self { + header, + buffer, + compression, + uncompressed_page_size, + descriptor, + selected_rows, + } + } + + pub fn header(&self) -> &DataPageHeader { + &self.header + } + + pub fn uncompressed_size(&self) -> usize { + self.uncompressed_page_size + } + + pub fn compressed_size(&self) -> usize { + self.buffer.len() + } + + /// The compression of the data in this page. + /// Note that what is compressed in a page depends on its version: + /// in V1, the whole data (`[repetition levels][definition levels][values]`) is compressed; in V2 only the values are compressed. + pub fn compression(&self) -> Compression { + self.compression + } + + /// the rows to be selected by this page. + /// When `None`, all rows are to be considered. + pub fn selected_rows(&self) -> Option<&[Interval]> { + self.selected_rows.as_deref() + } + + pub fn num_values(&self) -> usize { + self.header.num_values() + } + + /// Decodes the raw statistics into a statistics + pub fn statistics(&self) -> Option>> { + match &self.header { + DataPageHeader::V1(d) => d + .statistics + .as_ref() + .map(|x| deserialize_statistics(x, self.descriptor.primitive_type.clone())), + DataPageHeader::V2(d) => d + .statistics + .as_ref() + .map(|x| deserialize_statistics(x, self.descriptor.primitive_type.clone())), + } + } + + #[inline] + pub fn select_rows(&mut self, selected_rows: Vec) { + self.selected_rows = Some(selected_rows); + } +} + +#[derive(Debug, Clone)] +pub enum DataPageHeader { + V1(DataPageHeaderV1), + V2(DataPageHeaderV2), +} + +impl DataPageHeader { + pub fn num_values(&self) -> usize { + match &self { + DataPageHeader::V1(d) => d.num_values as usize, + DataPageHeader::V2(d) => d.num_values as usize, + } + } +} + +/// A [`DataPage`] is an uncompressed, encoded representation of a Parquet data page. It holds actual data +/// and thus cloning it is expensive. 
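// Illustrative sketch (not part of the patch): restricting a compressed page to a row
// range before it is decompressed and decoded. An `Interval` is a (start, length) pair,
// so the call below keeps rows 10..20; the helper name is hypothetical.
fn keep_rows_10_to_20(page: &mut CompressedDataPage) {
    page.select_rows(vec![Interval::new(10, 10)]);
    assert_eq!(page.selected_rows(), Some(&[Interval::new(10, 10)][..]));
}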
+#[derive(Debug, Clone)] +pub struct DataPage { + pub(super) header: DataPageHeader, + pub(super) buffer: Vec, + pub descriptor: Descriptor, + pub selected_rows: Option>, +} + +impl DataPage { + pub fn new( + header: DataPageHeader, + buffer: Vec, + descriptor: Descriptor, + rows: Option, + ) -> Self { + Self::new_read( + header, + buffer, + descriptor, + rows.map(|x| vec![Interval::new(0, x)]), + ) + } + + pub(crate) fn new_read( + header: DataPageHeader, + buffer: Vec, + descriptor: Descriptor, + selected_rows: Option>, + ) -> Self { + Self { + header, + buffer, + descriptor, + selected_rows, + } + } + + pub fn header(&self) -> &DataPageHeader { + &self.header + } + + pub fn buffer(&self) -> &[u8] { + &self.buffer + } + + /// the rows to be selected by this page. + /// When `None`, all rows are to be considered. + pub fn selected_rows(&self) -> Option<&[Interval]> { + self.selected_rows.as_deref() + } + + /// Returns a mutable reference to the internal buffer. + /// Useful to recover the buffer after the page has been decoded. + pub fn buffer_mut(&mut self) -> &mut Vec { + &mut self.buffer + } + + pub fn num_values(&self) -> usize { + self.header.num_values() + } + + pub fn encoding(&self) -> Encoding { + match &self.header { + DataPageHeader::V1(d) => d.encoding(), + DataPageHeader::V2(d) => d.encoding(), + } + } + + pub fn definition_level_encoding(&self) -> Encoding { + match &self.header { + DataPageHeader::V1(d) => d.definition_level_encoding(), + DataPageHeader::V2(_) => Encoding::Rle, + } + } + + pub fn repetition_level_encoding(&self) -> Encoding { + match &self.header { + DataPageHeader::V1(d) => d.repetition_level_encoding(), + DataPageHeader::V2(_) => Encoding::Rle, + } + } + + /// Decodes the raw statistics into a statistics + pub fn statistics(&self) -> Option>> { + match &self.header { + DataPageHeader::V1(d) => d + .statistics + .as_ref() + .map(|x| deserialize_statistics(x, self.descriptor.primitive_type.clone())), + DataPageHeader::V2(d) => d + .statistics + .as_ref() + .map(|x| deserialize_statistics(x, self.descriptor.primitive_type.clone())), + } + } +} + +/// A [`Page`] is an uncompressed, encoded representation of a Parquet page. It may hold actual data +/// and thus cloning it may be expensive. +#[derive(Debug)] +#[allow(clippy::large_enum_variant)] +pub enum Page { + /// A [`DataPage`] + Data(DataPage), + /// A [`DictPage`] + Dict(DictPage), +} + +impl Page { + pub(crate) fn buffer(&mut self) -> &mut Vec { + match self { + Self::Data(page) => &mut page.buffer, + Self::Dict(page) => &mut page.buffer, + } + } +} + +/// A [`CompressedPage`] is a compressed, encoded representation of a Parquet page. It holds actual data +/// and thus cloning it is expensive. 
+#[derive(Debug)] +#[allow(clippy::large_enum_variant)] +pub enum CompressedPage { + Data(CompressedDataPage), + Dict(CompressedDictPage), +} + +impl CompressedPage { + pub(crate) fn buffer(&mut self) -> &mut Vec { + match self { + CompressedPage::Data(page) => &mut page.buffer, + CompressedPage::Dict(page) => &mut page.buffer, + } + } + + pub(crate) fn compression(&self) -> Compression { + match self { + CompressedPage::Data(page) => page.compression(), + CompressedPage::Dict(page) => page.compression(), + } + } + + pub(crate) fn num_values(&self) -> usize { + match self { + CompressedPage::Data(page) => page.num_values(), + CompressedPage::Dict(_) => 0, + } + } + + pub(crate) fn selected_rows(&self) -> Option<&[Interval]> { + match self { + CompressedPage::Data(page) => page.selected_rows(), + CompressedPage::Dict(_) => None, + } + } + + pub(crate) fn uncompressed_size(&self) -> usize { + match self { + CompressedPage::Data(page) => page.uncompressed_page_size, + CompressedPage::Dict(page) => page.uncompressed_page_size, + } + } +} + +/// An uncompressed, encoded dictionary page. +#[derive(Debug)] +pub struct DictPage { + pub buffer: Vec, + pub num_values: usize, + pub is_sorted: bool, +} + +impl DictPage { + pub fn new(buffer: Vec, num_values: usize, is_sorted: bool) -> Self { + Self { + buffer, + num_values, + is_sorted, + } + } +} + +/// A compressed, encoded dictionary page. +#[derive(Debug)] +pub struct CompressedDictPage { + pub(crate) buffer: Vec, + compression: Compression, + pub(crate) num_values: usize, + pub(crate) uncompressed_page_size: usize, + pub is_sorted: bool, +} + +impl CompressedDictPage { + pub fn new( + buffer: Vec, + compression: Compression, + uncompressed_page_size: usize, + num_values: usize, + is_sorted: bool, + ) -> Self { + Self { + buffer, + compression, + uncompressed_page_size, + num_values, + is_sorted, + } + } + + /// The compression of the data in this page. + pub fn compression(&self) -> Compression { + self.compression + } +} + +/// Splits the page buffer into 3 slices corresponding to (encoded rep levels, encoded def levels, encoded values) for v1 pages. 
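// Illustrative worked example (not part of the patch): a v1 page buffer is laid out as
// `[4-byte LE length][rep levels][4-byte LE length][def levels][values]`, which
// `split_buffer_v1` (defined just below) slices apart. The function name is hypothetical.
fn split_v1_layout() -> Result<()> {
    let buffer: &[u8] = &[
        1, 0, 0, 0, 0xAA, // rep levels: declared length 1, one byte of data
        2, 0, 0, 0, 0xBB, 0xCC, // def levels: declared length 2, two bytes of data
        0xDD, 0xEE, // whatever remains is the encoded values
    ];
    let (rep, def, values) = split_buffer_v1(buffer, true, true)?;
    assert_eq!(rep, &[0xAA][..]);
    assert_eq!(def, &[0xBB, 0xCC][..]);
    assert_eq!(values, &[0xDD, 0xEE][..]);
    Ok(())
}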
+#[inline] +pub fn split_buffer_v1( + buffer: &[u8], + has_rep: bool, + has_def: bool, +) -> Result<(&[u8], &[u8], &[u8])> { + let (rep, buffer) = if has_rep { + let level_buffer_length = get_length(buffer).ok_or_else(|| { + Error::oos("The number of bytes declared in v1 rep levels is higher than the page size") + })?; + ( + buffer.get(4..4 + level_buffer_length).ok_or_else(|| { + Error::oos( + "The number of bytes declared in v1 rep levels is higher than the page size", + ) + })?, + buffer.get(4 + level_buffer_length..).ok_or_else(|| { + Error::oos( + "The number of bytes declared in v1 rep levels is higher than the page size", + ) + })?, + ) + } else { + (&[] as &[u8], buffer) + }; + + let (def, buffer) = if has_def { + let level_buffer_length = get_length(buffer).ok_or_else(|| { + Error::oos("The number of bytes declared in v1 rep levels is higher than the page size") + })?; + ( + buffer.get(4..4 + level_buffer_length).ok_or_else(|| { + Error::oos( + "The number of bytes declared in v1 def levels is higher than the page size", + ) + })?, + buffer.get(4 + level_buffer_length..).ok_or_else(|| { + Error::oos( + "The number of bytes declared in v1 def levels is higher than the page size", + ) + })?, + ) + } else { + (&[] as &[u8], buffer) + }; + + Ok((rep, def, buffer)) +} + +/// Splits the page buffer into 3 slices corresponding to (encoded rep levels, encoded def levels, encoded values) for v2 pages. +pub fn split_buffer_v2( + buffer: &[u8], + rep_level_buffer_length: usize, + def_level_buffer_length: usize, +) -> Result<(&[u8], &[u8], &[u8])> { + Ok(( + &buffer[..rep_level_buffer_length], + &buffer[rep_level_buffer_length..rep_level_buffer_length + def_level_buffer_length], + &buffer[rep_level_buffer_length + def_level_buffer_length..], + )) +} + +/// Splits the page buffer into 3 slices corresponding to (encoded rep levels, encoded def levels, encoded values). +pub fn split_buffer(page: &DataPage) -> Result<(&[u8], &[u8], &[u8])> { + match page.header() { + DataPageHeader::V1(_) => split_buffer_v1( + page.buffer(), + page.descriptor.max_rep_level > 0, + page.descriptor.max_def_level > 0, + ), + DataPageHeader::V2(header) => { + let def_level_buffer_length: usize = header.definition_levels_byte_length.try_into()?; + let rep_level_buffer_length: usize = header.repetition_levels_byte_length.try_into()?; + split_buffer_v2( + page.buffer(), + rep_level_buffer_length, + def_level_buffer_length, + ) + }, + } +} diff --git a/crates/polars-parquet/src/parquet/parquet_bridge.rs b/crates/polars-parquet/src/parquet/parquet_bridge.rs new file mode 100644 index 000000000000..eec75e4994ca --- /dev/null +++ b/crates/polars-parquet/src/parquet/parquet_bridge.rs @@ -0,0 +1,704 @@ +// Bridges structs from thrift-generated code to rust enums. 
+use std::convert::TryFrom; + +#[cfg(feature = "serde_types")] +use serde::{Deserialize, Serialize}; + +use super::thrift_format::{ + BoundaryOrder as ParquetBoundaryOrder, CompressionCodec, DataPageHeader, DataPageHeaderV2, + DecimalType, Encoding as ParquetEncoding, FieldRepetitionType, IntType, + LogicalType as ParquetLogicalType, PageType as ParquetPageType, TimeType, + TimeUnit as ParquetTimeUnit, TimestampType, +}; +use crate::parquet::error::Error; + +/// The repetition of a parquet field +#[derive(Debug, Eq, PartialEq, Hash, Clone, Copy)] +#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] +pub enum Repetition { + /// When the field has no null values + Required, + /// When the field may have null values + Optional, + /// When the field may be repeated (list field) + Repeated, +} + +impl TryFrom for Repetition { + type Error = Error; + + fn try_from(repetition: FieldRepetitionType) -> Result { + Ok(match repetition { + FieldRepetitionType::REQUIRED => Repetition::Required, + FieldRepetitionType::OPTIONAL => Repetition::Optional, + FieldRepetitionType::REPEATED => Repetition::Repeated, + _ => return Err(Error::oos("Thrift out of range")), + }) + } +} + +impl From for FieldRepetitionType { + fn from(repetition: Repetition) -> Self { + match repetition { + Repetition::Required => FieldRepetitionType::REQUIRED, + Repetition::Optional => FieldRepetitionType::OPTIONAL, + Repetition::Repeated => FieldRepetitionType::REPEATED, + } + } +} + +#[derive(Debug, Eq, PartialEq, Hash, Clone, Copy)] +#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] +pub enum Compression { + Uncompressed, + Snappy, + Gzip, + Lzo, + Brotli, + Lz4, + Zstd, + Lz4Raw, +} + +impl TryFrom for Compression { + type Error = Error; + + fn try_from(codec: CompressionCodec) -> Result { + Ok(match codec { + CompressionCodec::UNCOMPRESSED => Compression::Uncompressed, + CompressionCodec::SNAPPY => Compression::Snappy, + CompressionCodec::GZIP => Compression::Gzip, + CompressionCodec::LZO => Compression::Lzo, + CompressionCodec::BROTLI => Compression::Brotli, + CompressionCodec::LZ4 => Compression::Lz4, + CompressionCodec::ZSTD => Compression::Zstd, + CompressionCodec::LZ4_RAW => Compression::Lz4Raw, + _ => return Err(Error::oos("Thrift out of range")), + }) + } +} + +impl From for CompressionCodec { + fn from(codec: Compression) -> Self { + match codec { + Compression::Uncompressed => CompressionCodec::UNCOMPRESSED, + Compression::Snappy => CompressionCodec::SNAPPY, + Compression::Gzip => CompressionCodec::GZIP, + Compression::Lzo => CompressionCodec::LZO, + Compression::Brotli => CompressionCodec::BROTLI, + Compression::Lz4 => CompressionCodec::LZ4, + Compression::Zstd => CompressionCodec::ZSTD, + Compression::Lz4Raw => CompressionCodec::LZ4_RAW, + } + } +} + +/// Defines the compression settings for writing a parquet file. +/// +/// If None is provided as a compression setting, then the default compression level is used. 
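// Illustrative sketch (not part of the patch): a writer picks a `CompressionOptions`
// (defined just below); `None` keeps the codec's default level, and only the codec,
// not the level, survives the later conversion into the thrift `CompressionCodec`
// stored in the footer. The function name is hypothetical.
fn pick_write_codec(fast: bool) -> CompressionOptions {
    if fast {
        CompressionOptions::Snappy
    } else {
        // defaults to gzip level 6, per `GzipLevel::default`
        CompressionOptions::Gzip(None)
    }
}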
+#[derive(Debug, Eq, PartialEq, Hash, Clone, Copy)] +pub enum CompressionOptions { + Uncompressed, + Snappy, + Gzip(Option), + Lzo, + Brotli(Option), + Lz4, + Zstd(Option), + Lz4Raw, +} + +impl From for Compression { + fn from(value: CompressionOptions) -> Self { + match value { + CompressionOptions::Uncompressed => Compression::Uncompressed, + CompressionOptions::Snappy => Compression::Snappy, + CompressionOptions::Gzip(_) => Compression::Gzip, + CompressionOptions::Lzo => Compression::Lzo, + CompressionOptions::Brotli(_) => Compression::Brotli, + CompressionOptions::Lz4 => Compression::Lz4, + CompressionOptions::Zstd(_) => Compression::Zstd, + CompressionOptions::Lz4Raw => Compression::Lz4Raw, + } + } +} + +impl From for CompressionCodec { + fn from(codec: CompressionOptions) -> Self { + match codec { + CompressionOptions::Uncompressed => CompressionCodec::UNCOMPRESSED, + CompressionOptions::Snappy => CompressionCodec::SNAPPY, + CompressionOptions::Gzip(_) => CompressionCodec::GZIP, + CompressionOptions::Lzo => CompressionCodec::LZO, + CompressionOptions::Brotli(_) => CompressionCodec::BROTLI, + CompressionOptions::Lz4 => CompressionCodec::LZ4, + CompressionOptions::Zstd(_) => CompressionCodec::ZSTD, + CompressionOptions::Lz4Raw => CompressionCodec::LZ4_RAW, + } + } +} + +/// Defines valid compression levels. +pub(crate) trait CompressionLevel { + const MINIMUM_LEVEL: T; + const MAXIMUM_LEVEL: T; + + /// Tests if the provided compression level is valid. + fn is_valid_level(level: T) -> Result<(), Error> { + let compression_range = Self::MINIMUM_LEVEL..=Self::MAXIMUM_LEVEL; + if compression_range.contains(&level) { + Ok(()) + } else { + Err(Error::InvalidParameter(format!( + "valid compression range {}..={} exceeded.", + compression_range.start(), + compression_range.end() + ))) + } + } +} + +/// Represents a valid brotli compression level. +#[derive(Debug, Eq, PartialEq, Hash, Clone, Copy)] +pub struct BrotliLevel(u32); + +impl Default for BrotliLevel { + fn default() -> Self { + Self(1) + } +} + +impl CompressionLevel for BrotliLevel { + const MINIMUM_LEVEL: u32 = 0; + const MAXIMUM_LEVEL: u32 = 11; +} + +impl BrotliLevel { + /// Attempts to create a brotli compression level. + /// + /// Compression levels must be valid. + pub fn try_new(level: u32) -> Result { + Self::is_valid_level(level).map(|_| Self(level)) + } + + /// Returns the compression level. + pub fn compression_level(&self) -> u32 { + self.0 + } +} + +/// Represents a valid gzip compression level. +#[derive(Debug, Eq, PartialEq, Hash, Clone, Copy)] +pub struct GzipLevel(u8); + +impl Default for GzipLevel { + fn default() -> Self { + // The default as of miniz_oxide 0.5.1 is 6 for compression level + // (miniz_oxide::deflate::CompressionLevel::DefaultLevel) + Self(6) + } +} + +impl CompressionLevel for GzipLevel { + const MINIMUM_LEVEL: u8 = 0; + const MAXIMUM_LEVEL: u8 = 10; +} + +impl GzipLevel { + /// Attempts to create a gzip compression level. + /// + /// Compression levels must be valid (i.e. be acceptable for [`flate2::Compression`]). + pub fn try_new(level: u8) -> Result { + Self::is_valid_level(level).map(|_| Self(level)) + } + + /// Returns the compression level. + pub fn compression_level(&self) -> u8 { + self.0 + } +} + +#[cfg(feature = "gzip")] +impl From for flate2::Compression { + fn from(level: GzipLevel) -> Self { + Self::new(level.compression_level() as u32) + } +} + +/// Represents a valid zstd compression level. 
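// Illustrative sketch (not part of the patch): compression levels are validated at
// construction, so an out-of-range zstd level is rejected before any page is written.
// `ZstdLevel` is defined just below; the function name is hypothetical.
fn zstd_options(level: i32) -> Result<CompressionOptions, Error> {
    Ok(CompressionOptions::Zstd(Some(ZstdLevel::try_new(level)?)))
}
// `zstd_options(3)` succeeds, while `zstd_options(42)` returns `Error::InvalidParameter`
// because the accepted range is 1..=22.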
+#[derive(Debug, Eq, PartialEq, Hash, Clone, Copy)] +pub struct ZstdLevel(i32); + +impl CompressionLevel for ZstdLevel { + // zstd binds to C, and hence zstd::compression_level_range() is not const as this calls the + // underlying C library. + const MINIMUM_LEVEL: i32 = 1; + const MAXIMUM_LEVEL: i32 = 22; +} + +impl ZstdLevel { + /// Attempts to create a zstd compression level from a given compression level. + /// + /// Compression levels must be valid (i.e. be acceptable for [`zstd::compression_level_range`]). + pub fn try_new(level: i32) -> Result { + Self::is_valid_level(level).map(|_| Self(level)) + } + + /// Returns the compression level. + pub fn compression_level(&self) -> i32 { + self.0 + } +} + +#[cfg(feature = "zstd")] +impl Default for ZstdLevel { + fn default() -> Self { + Self(zstd::DEFAULT_COMPRESSION_LEVEL) + } +} + +#[derive(Debug, Eq, PartialEq, Hash, Clone, Copy)] +pub enum PageType { + DataPage, + DataPageV2, + DictionaryPage, +} + +impl TryFrom for PageType { + type Error = Error; + + fn try_from(type_: ParquetPageType) -> Result { + Ok(match type_ { + ParquetPageType::DATA_PAGE => PageType::DataPage, + ParquetPageType::DATA_PAGE_V2 => PageType::DataPageV2, + ParquetPageType::DICTIONARY_PAGE => PageType::DictionaryPage, + _ => return Err(Error::oos("Thrift out of range")), + }) + } +} + +impl From for ParquetPageType { + fn from(type_: PageType) -> Self { + match type_ { + PageType::DataPage => ParquetPageType::DATA_PAGE, + PageType::DataPageV2 => ParquetPageType::DATA_PAGE_V2, + PageType::DictionaryPage => ParquetPageType::DICTIONARY_PAGE, + } + } +} + +#[derive(Debug, Eq, PartialEq, Hash, Clone, Copy)] +pub enum Encoding { + /// Default encoding. + /// BOOLEAN - 1 bit per value. 0 is false; 1 is true. + /// INT32 - 4 bytes per value. Stored as little-endian. + /// INT64 - 8 bytes per value. Stored as little-endian. + /// FLOAT - 4 bytes per value. IEEE. Stored as little-endian. + /// DOUBLE - 8 bytes per value. IEEE. Stored as little-endian. + /// BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes. + /// FIXED_LEN_BYTE_ARRAY - Just the bytes. + Plain, + /// Deprecated: Dictionary encoding. The values in the dictionary are encoded in the + /// plain type. + /// in a data page use RLE_DICTIONARY instead. + /// in a Dictionary page use PLAIN instead + PlainDictionary, + /// Group packed run length encoding. Usable for definition/repetition levels + /// encoding and Booleans (on one bit: 0 is false; 1 is true.) + Rle, + /// Bit packed encoding. This can only be used if the data has a known max + /// width. Usable for definition/repetition levels encoding. + BitPacked, + /// Delta encoding for integers. This can be used for int columns and works best + /// on sorted data + DeltaBinaryPacked, + /// Encoding for byte arrays to separate the length values and the data. The lengths + /// are encoded using DELTA_BINARY_PACKED + DeltaLengthByteArray, + /// Incremental-encoded byte array. Prefix lengths are encoded using DELTA_BINARY_PACKED. + /// Suffixes are stored as delta length byte arrays. + DeltaByteArray, + /// Dictionary encoding: the ids are encoded using the RLE encoding + RleDictionary, + /// Encoding for floating-point data. + /// K byte-streams are created where K is the size in bytes of the data type. + /// The individual bytes of an FP value are scattered to the corresponding stream and + /// the streams are concatenated. + /// This itself does not reduce the size of the data but can lead to better compression + /// afterwards. 
+ ByteStreamSplit, +} + +impl TryFrom for Encoding { + type Error = Error; + + fn try_from(encoding: ParquetEncoding) -> Result { + Ok(match encoding { + ParquetEncoding::PLAIN => Encoding::Plain, + ParquetEncoding::PLAIN_DICTIONARY => Encoding::PlainDictionary, + ParquetEncoding::RLE => Encoding::Rle, + ParquetEncoding::BIT_PACKED => Encoding::BitPacked, + ParquetEncoding::DELTA_BINARY_PACKED => Encoding::DeltaBinaryPacked, + ParquetEncoding::DELTA_LENGTH_BYTE_ARRAY => Encoding::DeltaLengthByteArray, + ParquetEncoding::DELTA_BYTE_ARRAY => Encoding::DeltaByteArray, + ParquetEncoding::RLE_DICTIONARY => Encoding::RleDictionary, + ParquetEncoding::BYTE_STREAM_SPLIT => Encoding::ByteStreamSplit, + _ => return Err(Error::oos("Thrift out of range")), + }) + } +} + +impl From for ParquetEncoding { + fn from(encoding: Encoding) -> Self { + match encoding { + Encoding::Plain => ParquetEncoding::PLAIN, + Encoding::PlainDictionary => ParquetEncoding::PLAIN_DICTIONARY, + Encoding::Rle => ParquetEncoding::RLE, + Encoding::BitPacked => ParquetEncoding::BIT_PACKED, + Encoding::DeltaBinaryPacked => ParquetEncoding::DELTA_BINARY_PACKED, + Encoding::DeltaLengthByteArray => ParquetEncoding::DELTA_LENGTH_BYTE_ARRAY, + Encoding::DeltaByteArray => ParquetEncoding::DELTA_BYTE_ARRAY, + Encoding::RleDictionary => ParquetEncoding::RLE_DICTIONARY, + Encoding::ByteStreamSplit => ParquetEncoding::BYTE_STREAM_SPLIT, + } + } +} + +/// Enum to annotate whether lists of min/max elements inside ColumnIndex +/// are ordered and if so, in which direction. +#[derive(Debug, Eq, PartialEq, Hash, Clone, Copy)] +pub enum BoundaryOrder { + Unordered, + Ascending, + Descending, +} + +impl Default for BoundaryOrder { + fn default() -> Self { + Self::Unordered + } +} + +impl TryFrom for BoundaryOrder { + type Error = Error; + + fn try_from(encoding: ParquetBoundaryOrder) -> Result { + Ok(match encoding { + ParquetBoundaryOrder::UNORDERED => BoundaryOrder::Unordered, + ParquetBoundaryOrder::ASCENDING => BoundaryOrder::Ascending, + ParquetBoundaryOrder::DESCENDING => BoundaryOrder::Descending, + _ => return Err(Error::oos("BoundaryOrder Thrift value out of range")), + }) + } +} + +impl From for ParquetBoundaryOrder { + fn from(encoding: BoundaryOrder) -> Self { + match encoding { + BoundaryOrder::Unordered => ParquetBoundaryOrder::UNORDERED, + BoundaryOrder::Ascending => ParquetBoundaryOrder::ASCENDING, + BoundaryOrder::Descending => ParquetBoundaryOrder::DESCENDING, + } + } +} + +pub trait DataPageHeaderExt { + fn encoding(&self) -> Encoding; + fn repetition_level_encoding(&self) -> Encoding; + fn definition_level_encoding(&self) -> Encoding; +} + +impl DataPageHeaderExt for DataPageHeader { + fn encoding(&self) -> Encoding { + self.encoding.try_into().unwrap() + } + + fn repetition_level_encoding(&self) -> Encoding { + self.repetition_level_encoding.try_into().unwrap() + } + + fn definition_level_encoding(&self) -> Encoding { + self.definition_level_encoding.try_into().unwrap() + } +} + +impl DataPageHeaderExt for DataPageHeaderV2 { + fn encoding(&self) -> Encoding { + self.encoding.try_into().unwrap() + } + + fn repetition_level_encoding(&self) -> Encoding { + Encoding::Rle + } + + fn definition_level_encoding(&self) -> Encoding { + Encoding::Rle + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] +pub enum TimeUnit { + Milliseconds, + Microseconds, + Nanoseconds, +} + +impl From for TimeUnit { + fn from(encoding: ParquetTimeUnit) -> Self { + match 
encoding { + ParquetTimeUnit::MILLIS(_) => TimeUnit::Milliseconds, + ParquetTimeUnit::MICROS(_) => TimeUnit::Microseconds, + ParquetTimeUnit::NANOS(_) => TimeUnit::Nanoseconds, + } + } +} + +impl From for ParquetTimeUnit { + fn from(unit: TimeUnit) -> Self { + match unit { + TimeUnit::Milliseconds => ParquetTimeUnit::MILLIS(Default::default()), + TimeUnit::Microseconds => ParquetTimeUnit::MICROS(Default::default()), + TimeUnit::Nanoseconds => ParquetTimeUnit::NANOS(Default::default()), + } + } +} + +/// Enum of all valid logical integer types +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] +pub enum IntegerType { + Int8, + Int16, + Int32, + Int64, + UInt8, + UInt16, + UInt32, + UInt64, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] +pub enum PrimitiveLogicalType { + String, + Enum, + Decimal(usize, usize), + Date, + Time { + unit: TimeUnit, + is_adjusted_to_utc: bool, + }, + Timestamp { + unit: TimeUnit, + is_adjusted_to_utc: bool, + }, + Integer(IntegerType), + Unknown, + Json, + Bson, + Uuid, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] +pub enum GroupLogicalType { + Map, + List, +} + +impl From for ParquetLogicalType { + fn from(type_: GroupLogicalType) -> Self { + match type_ { + GroupLogicalType::Map => ParquetLogicalType::MAP(Default::default()), + GroupLogicalType::List => ParquetLogicalType::LIST(Default::default()), + } + } +} + +impl From<(i32, bool)> for IntegerType { + fn from((bit_width, is_signed): (i32, bool)) -> Self { + match (bit_width, is_signed) { + (8, true) => IntegerType::Int8, + (16, true) => IntegerType::Int16, + (32, true) => IntegerType::Int32, + (64, true) => IntegerType::Int64, + (8, false) => IntegerType::UInt8, + (16, false) => IntegerType::UInt16, + (32, false) => IntegerType::UInt32, + (64, false) => IntegerType::UInt64, + // The above are the only possible annotations for parquet's int32. 
Anything else + // is a deviation to the parquet specification and we ignore + _ => IntegerType::Int32, + } + } +} + +impl From for (usize, bool) { + fn from(type_: IntegerType) -> (usize, bool) { + match type_ { + IntegerType::Int8 => (8, true), + IntegerType::Int16 => (16, true), + IntegerType::Int32 => (32, true), + IntegerType::Int64 => (64, true), + IntegerType::UInt8 => (8, false), + IntegerType::UInt16 => (16, false), + IntegerType::UInt32 => (32, false), + IntegerType::UInt64 => (64, false), + } + } +} + +impl TryFrom for PrimitiveLogicalType { + type Error = Error; + + fn try_from(type_: ParquetLogicalType) -> Result { + Ok(match type_ { + ParquetLogicalType::STRING(_) => PrimitiveLogicalType::String, + ParquetLogicalType::ENUM(_) => PrimitiveLogicalType::Enum, + ParquetLogicalType::DECIMAL(decimal) => PrimitiveLogicalType::Decimal( + decimal.precision.try_into()?, + decimal.scale.try_into()?, + ), + ParquetLogicalType::DATE(_) => PrimitiveLogicalType::Date, + ParquetLogicalType::TIME(time) => PrimitiveLogicalType::Time { + unit: time.unit.into(), + is_adjusted_to_utc: time.is_adjusted_to_u_t_c, + }, + ParquetLogicalType::TIMESTAMP(time) => PrimitiveLogicalType::Timestamp { + unit: time.unit.into(), + is_adjusted_to_utc: time.is_adjusted_to_u_t_c, + }, + ParquetLogicalType::INTEGER(int) => { + PrimitiveLogicalType::Integer((int.bit_width as i32, int.is_signed).into()) + }, + ParquetLogicalType::UNKNOWN(_) => PrimitiveLogicalType::Unknown, + ParquetLogicalType::JSON(_) => PrimitiveLogicalType::Json, + ParquetLogicalType::BSON(_) => PrimitiveLogicalType::Bson, + ParquetLogicalType::UUID(_) => PrimitiveLogicalType::Uuid, + _ => return Err(Error::oos("LogicalType value out of range")), + }) + } +} + +impl TryFrom for GroupLogicalType { + type Error = Error; + + fn try_from(type_: ParquetLogicalType) -> Result { + Ok(match type_ { + ParquetLogicalType::LIST(_) => GroupLogicalType::List, + ParquetLogicalType::MAP(_) => GroupLogicalType::Map, + _ => return Err(Error::oos("LogicalType value out of range")), + }) + } +} + +impl From for ParquetLogicalType { + fn from(type_: PrimitiveLogicalType) -> Self { + match type_ { + PrimitiveLogicalType::String => ParquetLogicalType::STRING(Default::default()), + PrimitiveLogicalType::Enum => ParquetLogicalType::ENUM(Default::default()), + PrimitiveLogicalType::Decimal(precision, scale) => { + ParquetLogicalType::DECIMAL(DecimalType { + precision: precision as i32, + scale: scale as i32, + }) + }, + PrimitiveLogicalType::Date => ParquetLogicalType::DATE(Default::default()), + PrimitiveLogicalType::Time { + unit, + is_adjusted_to_utc, + } => ParquetLogicalType::TIME(TimeType { + unit: unit.into(), + is_adjusted_to_u_t_c: is_adjusted_to_utc, + }), + PrimitiveLogicalType::Timestamp { + unit, + is_adjusted_to_utc, + } => ParquetLogicalType::TIMESTAMP(TimestampType { + unit: unit.into(), + is_adjusted_to_u_t_c: is_adjusted_to_utc, + }), + PrimitiveLogicalType::Integer(integer) => { + let (bit_width, is_signed) = integer.into(); + ParquetLogicalType::INTEGER(IntType { + bit_width: bit_width as i8, + is_signed, + }) + }, + PrimitiveLogicalType::Unknown => ParquetLogicalType::UNKNOWN(Default::default()), + PrimitiveLogicalType::Json => ParquetLogicalType::JSON(Default::default()), + PrimitiveLogicalType::Bson => ParquetLogicalType::BSON(Default::default()), + PrimitiveLogicalType::Uuid => ParquetLogicalType::UUID(Default::default()), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn round_trip_primitive() -> Result<(), Error> { + use 
PrimitiveLogicalType::*; + let a = vec![ + String, + Enum, + Decimal(3, 1), + Date, + Time { + unit: TimeUnit::Milliseconds, + is_adjusted_to_utc: true, + }, + Timestamp { + unit: TimeUnit::Milliseconds, + is_adjusted_to_utc: true, + }, + Integer(IntegerType::Int16), + Unknown, + Json, + Bson, + Uuid, + ]; + for a in a { + let c: ParquetLogicalType = a.into(); + let e: PrimitiveLogicalType = c.try_into()?; + assert_eq!(e, a); + } + Ok(()) + } + + #[test] + fn round_trip_encoding() -> Result<(), Error> { + use Encoding::*; + let a = vec![ + Plain, + PlainDictionary, + Rle, + BitPacked, + DeltaBinaryPacked, + DeltaLengthByteArray, + DeltaByteArray, + RleDictionary, + ByteStreamSplit, + ]; + for a in a { + let c: ParquetEncoding = a.into(); + let e: Encoding = c.try_into()?; + assert_eq!(e, a); + } + Ok(()) + } + + #[test] + fn round_compression() -> Result<(), Error> { + use Compression::*; + let a = vec![Uncompressed, Snappy, Gzip, Lzo, Brotli, Lz4, Zstd, Lz4Raw]; + for a in a { + let c: CompressionCodec = a.into(); + let e: Compression = c.try_into()?; + assert_eq!(e, a); + } + Ok(()) + } +} diff --git a/crates/polars-parquet/src/parquet/read/column/mod.rs b/crates/polars-parquet/src/parquet/read/column/mod.rs new file mode 100644 index 000000000000..76e6809ac767 --- /dev/null +++ b/crates/polars-parquet/src/parquet/read/column/mod.rs @@ -0,0 +1,204 @@ +use std::io::{Read, Seek}; +use std::vec::IntoIter; + +use super::{get_field_columns, get_page_iterator, PageFilter, PageReader}; +use crate::parquet::error::Error; +use crate::parquet::metadata::{ColumnChunkMetaData, RowGroupMetaData}; +use crate::parquet::page::CompressedPage; +use crate::parquet::schema::types::ParquetType; + +#[cfg(feature = "async")] +#[cfg_attr(docsrs, doc(cfg(feature = "async")))] +mod stream; + +/// Returns a [`ColumnIterator`] of column chunks corresponding to `field`. +/// +/// Contrarily to [`get_page_iterator`] that returns a single iterator of pages, this iterator +/// iterates over columns, one by one, and returns a [`PageReader`] per column. +/// For primitive fields (e.g. `i64`), [`ColumnIterator`] yields exactly one column. +/// For complex fields, it yields multiple columns. +/// `max_page_size` is the maximum number of bytes allowed. +pub fn get_column_iterator( + reader: R, + row_group: &RowGroupMetaData, + field_name: &str, + page_filter: Option, + scratch: Vec, + max_page_size: usize, +) -> ColumnIterator { + let columns = get_field_columns(row_group.columns(), field_name) + .cloned() + .collect::>(); + + ColumnIterator::new(reader, columns, page_filter, scratch, max_page_size) +} + +/// State of [`MutStreamingIterator`]. +#[derive(Debug)] +pub enum State { + /// Iterator still has elements + Some(T), + /// Iterator finished + Finished(Vec), +} + +/// A special kind of fallible streaming iterator where `advance` consumes the iterator. +pub trait MutStreamingIterator: Sized { + type Item; + type Error; + + fn advance(self) -> std::result::Result, Self::Error>; + fn get(&mut self) -> Option<&mut Self::Item>; +} + +/// A [`MutStreamingIterator`] that reads column chunks one by one, +/// returning a [`PageReader`] per column. 
+pub struct ColumnIterator { + reader: Option, + columns: Vec, + page_filter: Option, + current: Option<(PageReader, ColumnChunkMetaData)>, + scratch: Vec, + max_page_size: usize, +} + +impl ColumnIterator { + /// Returns a new [`ColumnIterator`] + /// `max_page_size` is the maximum allowed page size + pub fn new( + reader: R, + mut columns: Vec, + page_filter: Option, + scratch: Vec, + max_page_size: usize, + ) -> Self { + columns.reverse(); + Self { + reader: Some(reader), + scratch, + columns, + page_filter, + current: None, + max_page_size, + } + } +} + +impl MutStreamingIterator for ColumnIterator { + type Item = (PageReader, ColumnChunkMetaData); + type Error = Error; + + fn advance(mut self) -> Result, Error> { + let (reader, scratch) = if let Some((iter, _)) = self.current { + iter.into_inner() + } else { + (self.reader.unwrap(), self.scratch) + }; + if self.columns.is_empty() { + return Ok(State::Finished(scratch)); + }; + let column = self.columns.pop().unwrap(); + + let iter = get_page_iterator( + &column, + reader, + self.page_filter.clone(), + scratch, + self.max_page_size, + )?; + let current = Some((iter, column)); + Ok(State::Some(Self { + reader: None, + columns: self.columns, + page_filter: self.page_filter, + current, + scratch: vec![], + max_page_size: self.max_page_size, + })) + } + + fn get(&mut self) -> Option<&mut Self::Item> { + self.current.as_mut() + } +} + +/// A [`MutStreamingIterator`] of pre-read column chunks +#[derive(Debug)] +pub struct ReadColumnIterator { + field: ParquetType, + chunks: Vec<(Vec>, ColumnChunkMetaData)>, + current: Option<(IntoIter>, ColumnChunkMetaData)>, +} + +impl ReadColumnIterator { + /// Returns a new [`ReadColumnIterator`] + pub fn new( + field: ParquetType, + chunks: Vec<(Vec>, ColumnChunkMetaData)>, + ) -> Self { + Self { + field, + chunks, + current: None, + } + } +} + +impl MutStreamingIterator for ReadColumnIterator { + type Item = (IntoIter>, ColumnChunkMetaData); + type Error = Error; + + fn advance(mut self) -> Result, Error> { + if self.chunks.is_empty() { + return Ok(State::Finished(vec![])); + } + self.current = self + .chunks + .pop() + .map(|(pages, meta)| (pages.into_iter(), meta)); + Ok(State::Some(Self { + field: self.field, + chunks: self.chunks, + current: self.current, + })) + } + + fn get(&mut self) -> Option<&mut Self::Item> { + self.current.as_mut() + } +} + +/// Reads all columns that are part of the parquet field `field_name` +/// # Implementation +/// This operation is IO-bounded `O(C)` where C is the number of columns associated to +/// the field (one for non-nested types) +/// It reads the columns sequentially. Use [`read_column`] to fork this operation to multiple +/// readers. +pub fn read_columns<'a, R: Read + Seek>( + reader: &mut R, + columns: &'a [ColumnChunkMetaData], + field_name: &'a str, +) -> Result)>, Error> { + get_field_columns(columns, field_name) + .map(|column| read_column(reader, column).map(|c| (column, c))) + .collect() +} + +/// Reads a column chunk into memory +/// This operation is IO-bounded and allocates the column's `compressed_size`. 
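// Illustrative sketch (not part of the patch): draining a `ColumnIterator`, e.g. one built
// with `get_column_iterator` above. `advance` consumes the iterator, so it is re-bound on
// every step until `State::Finished` hands back the scratch buffer. The function name is
// hypothetical; a real consumer would also drain `_pages` (a `PageReader`).
fn count_field_values<R: std::io::Read + std::io::Seek>(
    mut iter: ColumnIterator<R>,
) -> Result<i64, Error> {
    let mut values = 0;
    loop {
        match iter.advance()? {
            State::Some(mut next) => {
                if let Some((_pages, column)) = next.get() {
                    values += column.num_values();
                }
                iter = next;
            },
            State::Finished(_scratch) => return Ok(values),
        }
    }
}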
+pub fn read_column(reader: &mut R, column: &ColumnChunkMetaData) -> Result, Error> +where + R: Read + Seek, +{ + let (start, length) = column.byte_range(); + reader.seek(std::io::SeekFrom::Start(start))?; + + let mut chunk = vec![]; + chunk.try_reserve(length as usize)?; + reader.by_ref().take(length).read_to_end(&mut chunk)?; + Ok(chunk) +} + +#[cfg(feature = "async")] +#[cfg_attr(docsrs, doc(cfg(feature = "async")))] +pub use stream::{read_column_async, read_columns_async}; diff --git a/crates/polars-parquet/src/parquet/read/column/stream.rs b/crates/polars-parquet/src/parquet/read/column/stream.rs new file mode 100644 index 000000000000..eac4fd497fd6 --- /dev/null +++ b/crates/polars-parquet/src/parquet/read/column/stream.rs @@ -0,0 +1,51 @@ +use futures::future::{try_join_all, BoxFuture}; +use futures::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt}; + +use crate::parquet::error::Error; +use crate::parquet::metadata::ColumnChunkMetaData; +use crate::parquet::read::get_field_columns; + +/// Reads a single column chunk into memory asynchronously +pub async fn read_column_async<'b, R, F>( + factory: F, + meta: &ColumnChunkMetaData, +) -> Result, Error> +where + R: AsyncRead + AsyncSeek + Send + Unpin, + F: Fn() -> BoxFuture<'b, std::io::Result>, +{ + let mut reader = factory().await?; + let (start, length) = meta.byte_range(); + reader.seek(std::io::SeekFrom::Start(start)).await?; + + let mut chunk = vec![]; + chunk.try_reserve(length as usize)?; + reader.take(length).read_to_end(&mut chunk).await?; + Result::Ok(chunk) +} + +/// Reads all columns that are part of the parquet field `field_name` +/// # Implementation +/// This operation is IO-bounded `O(C)` where C is the number of columns associated to +/// the field (one for non-nested types) +/// +/// It does so asynchronously via a single `join_all` over all the necessary columns for +/// `field_name`. +pub async fn read_columns_async< + 'a, + 'b, + R: AsyncRead + AsyncSeek + Send + Unpin, + F: Fn() -> BoxFuture<'b, std::io::Result> + Clone, +>( + factory: F, + columns: &'a [ColumnChunkMetaData], + field_name: &'a str, +) -> Result)>, Error> { + let fields = get_field_columns(columns, field_name).collect::>(); + let futures = fields + .iter() + .map(|meta| async { read_column_async(factory.clone(), meta).await }); + + let columns = try_join_all(futures).await?; + Ok(fields.into_iter().zip(columns).collect()) +} diff --git a/crates/polars-parquet/src/parquet/read/compression.rs b/crates/polars-parquet/src/parquet/read/compression.rs new file mode 100644 index 000000000000..fbe2ef938f82 --- /dev/null +++ b/crates/polars-parquet/src/parquet/read/compression.rs @@ -0,0 +1,286 @@ +use parquet_format_safe::DataPageHeaderV2; +use streaming_decompression; + +use super::page::PageIterator; +use crate::parquet::compression::{self, Compression}; +use crate::parquet::error::{Error, Result}; +use crate::parquet::page::{CompressedPage, DataPage, DataPageHeader, DictPage, Page}; +use crate::parquet::FallibleStreamingIterator; + +fn decompress_v1(compressed: &[u8], compression: Compression, buffer: &mut [u8]) -> Result<()> { + compression::decompress(compression, compressed, buffer) +} + +fn decompress_v2( + compressed: &[u8], + page_header: &DataPageHeaderV2, + compression: Compression, + buffer: &mut [u8], +) -> Result<()> { + // When processing data page v2, depending on enabled compression for the + // page, we should account for uncompressed data ('offset') of + // repetition and definition levels. 
+ // + // We always use 0 offset for other pages other than v2, `true` flag means + // that compression will be applied if decompressor is defined + let offset = (page_header.definition_levels_byte_length + + page_header.repetition_levels_byte_length) as usize; + // When is_compressed flag is missing the page is considered compressed + let can_decompress = page_header.is_compressed.unwrap_or(true); + + if can_decompress { + if offset > buffer.len() || offset > compressed.len() { + return Err(Error::OutOfSpec( + "V2 Page Header reported incorrect offset to compressed data".to_string(), + )); + } + + (buffer[..offset]).copy_from_slice(&compressed[..offset]); + + compression::decompress(compression, &compressed[offset..], &mut buffer[offset..])?; + } else { + if buffer.len() != compressed.len() { + return Err(Error::OutOfSpec( + "V2 Page Header reported incorrect decompressed size".to_string(), + )); + } + buffer.copy_from_slice(compressed); + } + Ok(()) +} + +/// decompresses a [`CompressedDataPage`] into `buffer`. +/// If the page is un-compressed, `buffer` is swapped instead. +/// Returns whether the page was decompressed. +pub fn decompress_buffer( + compressed_page: &mut CompressedPage, + buffer: &mut Vec, +) -> Result { + if compressed_page.compression() != Compression::Uncompressed { + // prepare the compression buffer + let read_size = compressed_page.uncompressed_size(); + + if read_size > buffer.capacity() { + // dealloc and ignore region, replacing it by a new region. + // This won't reallocate - it frees and calls `alloc_zeroed` + *buffer = vec![0; read_size]; + } else if read_size > buffer.len() { + // fill what we need with zeros so that we can use them in `Read`. + // This won't reallocate + buffer.resize(read_size, 0); + } else { + buffer.truncate(read_size); + } + match compressed_page { + CompressedPage::Data(compressed_page) => match compressed_page.header() { + DataPageHeader::V1(_) => { + decompress_v1(&compressed_page.buffer, compressed_page.compression, buffer)? + }, + DataPageHeader::V2(header) => decompress_v2( + &compressed_page.buffer, + header, + compressed_page.compression, + buffer, + )?, + }, + CompressedPage::Dict(page) => decompress_v1(&page.buffer, page.compression(), buffer)?, + } + Ok(true) + } else { + // page.buffer is already decompressed => swap it with `buffer`, making `page.buffer` the + // decompression buffer and `buffer` the decompressed buffer + std::mem::swap(compressed_page.buffer(), buffer); + Ok(false) + } +} + +fn create_page(compressed_page: CompressedPage, buffer: Vec) -> Page { + match compressed_page { + CompressedPage::Data(page) => Page::Data(DataPage::new_read( + page.header, + buffer, + page.descriptor, + page.selected_rows, + )), + CompressedPage::Dict(page) => Page::Dict(DictPage { + buffer, + num_values: page.num_values, + is_sorted: page.is_sorted, + }), + } +} + +/// Decompresses the page, using `buffer` for decompression. +/// If `page.buffer.len() == 0`, there was no decompression and the buffer was moved. +/// Else, decompression took place. 
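+/// # Example
+/// A sketch, assuming `compressed_page` was obtained from a [`PageReader`]:
+/// ```ignore
+/// let mut scratch = vec![];
+/// let page: Page = decompress(compressed_page, &mut scratch)?;
+/// // the (possibly decompressed) bytes now live in `page`; `scratch` is taken and left empty
+/// ```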
+pub fn decompress(mut compressed_page: CompressedPage, buffer: &mut Vec) -> Result { + decompress_buffer(&mut compressed_page, buffer)?; + Ok(create_page(compressed_page, std::mem::take(buffer))) +} + +fn decompress_reuse( + mut compressed_page: CompressedPage, + iterator: &mut P, + buffer: &mut Vec, +) -> Result<(Page, bool)> { + let was_decompressed = decompress_buffer(&mut compressed_page, buffer)?; + + if was_decompressed { + iterator.swap_buffer(compressed_page.buffer()) + }; + + let new_page = create_page(compressed_page, std::mem::take(buffer)); + + Ok((new_page, was_decompressed)) +} + +/// Decompressor that allows re-using the page buffer of [`PageIterator`]. +/// # Implementation +/// The implementation depends on whether a page is compressed or not. +/// > `PageReader(a)`, `CompressedPage(b)`, `Decompressor(c)`, `DecompressedPage(d)` +/// ### un-compressed pages: +/// > page iter: `a` is swapped with `b` +/// > decompress iter: `b` is swapped with `d`, `b` is swapped with `a` +/// therefore: +/// * `PageReader` has its buffer back +/// * `Decompressor`'s buffer is un-used +/// * `DecompressedPage` has the same data as `CompressedPage` had +/// ### compressed pages: +/// > page iter: `a` is swapped with `b` +/// > decompress iter: +/// > * `b` is decompressed into `c` +/// > * `b` is swapped with `a` +/// > * `c` is moved to `d` +/// > * (next iteration): `d` is moved to `c` +/// therefore, while the page is available: +/// * `PageReader` has its buffer back +/// * `Decompressor`'s buffer empty +/// * `DecompressedPage` has the decompressed buffer +/// after the page is used: +/// * `PageReader` has its buffer back +/// * `Decompressor` has its buffer back +/// * `DecompressedPage` has an empty buffer +pub struct Decompressor { + iter: P, + buffer: Vec, + current: Option, + was_decompressed: bool, +} + +impl Decompressor

{ + /// Creates a new [`Decompressor`]. + pub fn new(iter: P, buffer: Vec) -> Self { + Self { + iter, + buffer, + current: None, + was_decompressed: false, + } + } + + /// Returns two buffers: the first buffer corresponds to the page buffer, + /// the second to the decompression buffer. + pub fn into_buffers(mut self) -> (Vec, Vec) { + let mut page_buffer = vec![]; + self.iter.swap_buffer(&mut page_buffer); + (page_buffer, self.buffer) + } +} + +impl FallibleStreamingIterator for Decompressor

{ + type Item = Page; + type Error = Error; + + fn advance(&mut self) -> Result<()> { + if let Some(page) = self.current.as_mut() { + if self.was_decompressed { + self.buffer = std::mem::take(page.buffer()); + } else { + self.iter.swap_buffer(page.buffer()); + } + } + + let next = self + .iter + .next() + .map(|x| { + x.and_then(|x| { + let (page, was_decompressed) = + decompress_reuse(x, &mut self.iter, &mut self.buffer)?; + self.was_decompressed = was_decompressed; + Ok(page) + }) + }) + .transpose()?; + self.current = next; + Ok(()) + } + + fn get(&self) -> Option<&Self::Item> { + self.current.as_ref() + } +} + +type _Decompressor = streaming_decompression::Decompressor< + CompressedPage, + Page, + fn(CompressedPage, &mut Vec) -> Result, + Error, + I, +>; + +impl streaming_decompression::Compressed for CompressedPage { + #[inline] + fn is_compressed(&self) -> bool { + self.compression() != Compression::Uncompressed + } +} + +impl streaming_decompression::Decompressed for Page { + #[inline] + fn buffer_mut(&mut self) -> &mut Vec { + self.buffer() + } +} + +/// A [`FallibleStreamingIterator`] that decompresses [`CompressedPage`] into [`DataPage`]. +/// # Implementation +/// This decompressor uses an internal [`Vec`] to perform decompressions which +/// is re-used across pages, so that a single allocation is required. +/// If the pages are not compressed, the internal buffer is not used. +pub struct BasicDecompressor>> { + iter: _Decompressor, +} + +impl BasicDecompressor +where + I: Iterator>, +{ + /// Returns a new [`BasicDecompressor`]. + pub fn new(iter: I, buffer: Vec) -> Self { + Self { + iter: _Decompressor::new(iter, buffer, decompress), + } + } + + /// Returns its internal buffer, consuming itself. + pub fn into_inner(self) -> Vec { + self.iter.into_inner() + } +} + +impl FallibleStreamingIterator for BasicDecompressor +where + I: Iterator>, +{ + type Item = Page; + type Error = Error; + + fn advance(&mut self) -> Result<()> { + self.iter.advance() + } + + fn get(&self) -> Option<&Self::Item> { + self.iter.get() + } +} diff --git a/crates/polars-parquet/src/parquet/read/indexes/deserialize.rs b/crates/polars-parquet/src/parquet/read/indexes/deserialize.rs new file mode 100644 index 000000000000..1570605f83d0 --- /dev/null +++ b/crates/polars-parquet/src/parquet/read/indexes/deserialize.rs @@ -0,0 +1,27 @@ +use parquet_format_safe::thrift::protocol::TCompactInputProtocol; +use parquet_format_safe::ColumnIndex; + +use crate::parquet::error::Error; +use crate::parquet::indexes::{BooleanIndex, ByteIndex, FixedLenByteIndex, Index, NativeIndex}; +use crate::parquet::schema::types::{PhysicalType, PrimitiveType}; + +pub fn deserialize(data: &[u8], primitive_type: PrimitiveType) -> Result, Error> { + let mut prot = TCompactInputProtocol::new(data, data.len() * 2 + 1024); + + let index = ColumnIndex::read_from_in_protocol(&mut prot)?; + + let index = match primitive_type.physical_type { + PhysicalType::Boolean => Box::new(BooleanIndex::try_new(index)?) 
as Box, + PhysicalType::Int32 => Box::new(NativeIndex::::try_new(index, primitive_type)?), + PhysicalType::Int64 => Box::new(NativeIndex::::try_new(index, primitive_type)?), + PhysicalType::Int96 => Box::new(NativeIndex::<[u32; 3]>::try_new(index, primitive_type)?), + PhysicalType::Float => Box::new(NativeIndex::::try_new(index, primitive_type)?), + PhysicalType::Double => Box::new(NativeIndex::::try_new(index, primitive_type)?), + PhysicalType::ByteArray => Box::new(ByteIndex::try_new(index, primitive_type)?), + PhysicalType::FixedLenByteArray(_) => { + Box::new(FixedLenByteIndex::try_new(index, primitive_type)?) + }, + }; + + Ok(index) +} diff --git a/crates/polars-parquet/src/parquet/read/indexes/mod.rs b/crates/polars-parquet/src/parquet/read/indexes/mod.rs new file mode 100644 index 000000000000..1e1919c84c75 --- /dev/null +++ b/crates/polars-parquet/src/parquet/read/indexes/mod.rs @@ -0,0 +1,4 @@ +mod deserialize; +mod read; + +pub use read::*; diff --git a/crates/polars-parquet/src/parquet/read/indexes/read.rs b/crates/polars-parquet/src/parquet/read/indexes/read.rs new file mode 100644 index 000000000000..379fb4150766 --- /dev/null +++ b/crates/polars-parquet/src/parquet/read/indexes/read.rs @@ -0,0 +1,131 @@ +use std::convert::TryInto; +use std::io::{Cursor, Read, Seek, SeekFrom}; + +use parquet_format_safe::thrift::protocol::TCompactInputProtocol; +use parquet_format_safe::{ColumnChunk, OffsetIndex, PageLocation}; + +use super::deserialize::deserialize; +use crate::parquet::error::Error; +use crate::parquet::indexes::Index; +use crate::parquet::metadata::ColumnChunkMetaData; + +fn prepare_read Option, G: Fn(&ColumnChunk) -> Option>( + chunks: &[ColumnChunkMetaData], + get_offset: F, + get_length: G, +) -> Result<(u64, Vec), Error> { + // c1: [start, length] + // ... + // cN: [start, length] + + let first_chunk = if let Some(chunk) = chunks.first() { + chunk + } else { + return Ok((0, vec![])); + }; + let metadata = first_chunk.column_chunk(); + + let offset: u64 = if let Some(offset) = get_offset(metadata) { + offset.try_into()? + } else { + return Ok((0, vec![])); + }; + + let lengths = chunks + .iter() + .map(|x| get_length(x.column_chunk())) + .map(|maybe_length| { + let index_length = maybe_length.ok_or_else(|| { + Error::oos("The column length must exist if column offset exists") + })?; + + Ok(index_length.try_into()?) + }) + .collect::, Error>>()?; + + Ok((offset, lengths)) +} + +fn prepare_column_index_read(chunks: &[ColumnChunkMetaData]) -> Result<(u64, Vec), Error> { + prepare_read(chunks, |x| x.column_index_offset, |x| x.column_index_length) +} + +fn prepare_offset_index_read(chunks: &[ColumnChunkMetaData]) -> Result<(u64, Vec), Error> { + prepare_read(chunks, |x| x.offset_index_offset, |x| x.offset_index_length) +} + +fn deserialize_column_indexes( + chunks: &[ColumnChunkMetaData], + data: &[u8], + lengths: Vec, +) -> Result>, Error> { + let mut start = 0; + let data = lengths.into_iter().map(|length| { + let r = &data[start..start + length]; + start += length; + r + }); + + chunks + .iter() + .zip(data) + .map(|(chunk, data)| { + let primitive_type = chunk.descriptor().descriptor.primitive_type.clone(); + deserialize(data, primitive_type) + }) + .collect() +} + +/// Reads the column indexes of all [`ColumnChunkMetaData`] and deserializes them into [`Index`]. 
+/// Returns an empty vector if indexes are not available +pub fn read_columns_indexes( + reader: &mut R, + chunks: &[ColumnChunkMetaData], +) -> Result>, Error> { + let (offset, lengths) = prepare_column_index_read(chunks)?; + + let length = lengths.iter().sum::(); + + reader.seek(SeekFrom::Start(offset))?; + + let mut data = vec![]; + data.try_reserve(length)?; + reader.by_ref().take(length as u64).read_to_end(&mut data)?; + + deserialize_column_indexes(chunks, &data, lengths) +} + +fn deserialize_page_locations( + data: &[u8], + column_number: usize, +) -> Result>, Error> { + let len = data.len() * 2 + 1024; + let mut reader = Cursor::new(data); + + (0..column_number) + .map(|_| { + let mut prot = TCompactInputProtocol::new(&mut reader, len); + let offset = OffsetIndex::read_from_in_protocol(&mut prot)?; + Ok(offset.page_locations) + }) + .collect() +} + +/// Read [`PageLocation`]s from the [`ColumnChunkMetaData`]s. +/// Returns an empty vector if indexes are not available +pub fn read_pages_locations( + reader: &mut R, + chunks: &[ColumnChunkMetaData], +) -> Result>, Error> { + let (offset, lengths) = prepare_offset_index_read(chunks)?; + + let length = lengths.iter().sum::(); + + reader.seek(SeekFrom::Start(offset))?; + + let mut data = vec![]; + data.try_reserve(length)?; + reader.by_ref().take(length as u64).read_to_end(&mut data)?; + + deserialize_page_locations(&data, chunks.len()) +} diff --git a/crates/polars-parquet/src/parquet/read/levels.rs b/crates/polars-parquet/src/parquet/read/levels.rs new file mode 100644 index 000000000000..69d12cff9194 --- /dev/null +++ b/crates/polars-parquet/src/parquet/read/levels.rs @@ -0,0 +1,27 @@ +/// Returns the number of bits needed to store the given maximum definition or repetition level. +#[inline] +pub fn get_bit_width(max_level: i16) -> u32 { + 16 - max_level.leading_zeros() +} + +#[cfg(test)] +mod tests { + use super::get_bit_width; + + #[test] + fn test_get_bit_width() { + assert_eq!(0, get_bit_width(0)); + assert_eq!(1, get_bit_width(1)); + assert_eq!(2, get_bit_width(2)); + assert_eq!(2, get_bit_width(3)); + assert_eq!(3, get_bit_width(4)); + assert_eq!(3, get_bit_width(5)); + assert_eq!(3, get_bit_width(6)); + assert_eq!(3, get_bit_width(7)); + assert_eq!(4, get_bit_width(8)); + assert_eq!(4, get_bit_width(15)); + + assert_eq!(8, get_bit_width(255)); + assert_eq!(9, get_bit_width(256)); + } +} diff --git a/crates/polars-parquet/src/parquet/read/metadata.rs b/crates/polars-parquet/src/parquet/read/metadata.rs new file mode 100644 index 000000000000..a75b939a513c --- /dev/null +++ b/crates/polars-parquet/src/parquet/read/metadata.rs @@ -0,0 +1,101 @@ +use std::cmp::min; +use std::convert::TryInto; +use std::io::{Read, Seek, SeekFrom}; + +use parquet_format_safe::thrift::protocol::TCompactInputProtocol; +use parquet_format_safe::FileMetaData as TFileMetaData; + +use super::super::metadata::FileMetaData; +use super::super::{DEFAULT_FOOTER_READ_SIZE, FOOTER_SIZE, HEADER_SIZE, PARQUET_MAGIC}; +use crate::parquet::error::{Error, Result}; + +pub(super) fn metadata_len(buffer: &[u8], len: usize) -> i32 { + i32::from_le_bytes(buffer[len - 8..len - 4].try_into().unwrap()) +} + +// see (unstable) Seek::stream_len +fn stream_len(seek: &mut impl Seek) -> std::result::Result { + let old_pos = seek.stream_position()?; + let len = seek.seek(SeekFrom::End(0))?; + + // Avoid seeking a third time when we were already at the end of the + // stream. The branch is usually way cheaper than a seek operation. 
+ if old_pos != len { + seek.seek(SeekFrom::Start(old_pos))?; + } + + Ok(len) +} + +/// Reads a [`FileMetaData`] from the reader, located at the end of the file. +pub fn read_metadata(reader: &mut R) -> Result { + // check file is large enough to hold footer + let file_size = stream_len(reader)?; + read_metadata_with_size(reader, file_size) +} + +/// Reads a [`FileMetaData`] from the reader, located at the end of the file, with known file size. +pub fn read_metadata_with_size( + reader: &mut R, + file_size: u64, +) -> Result { + if file_size < HEADER_SIZE + FOOTER_SIZE { + return Err(Error::oos( + "A parquet file must contain a header and footer with at least 12 bytes", + )); + } + + // read and cache up to DEFAULT_FOOTER_READ_SIZE bytes from the end and process the footer + let default_end_len = min(DEFAULT_FOOTER_READ_SIZE, file_size) as usize; + reader.seek(SeekFrom::End(-(default_end_len as i64)))?; + + let mut buffer = Vec::with_capacity(default_end_len); + reader + .by_ref() + .take(default_end_len as u64) + .read_to_end(&mut buffer)?; + + // check this is indeed a parquet file + if buffer[default_end_len - 4..] != PARQUET_MAGIC { + return Err(Error::oos("The file must end with PAR1")); + } + + let metadata_len = metadata_len(&buffer, default_end_len); + + let metadata_len: u64 = metadata_len.try_into()?; + + let footer_len = FOOTER_SIZE + metadata_len; + if footer_len > file_size { + return Err(Error::oos( + "The footer size must be smaller or equal to the file's size", + )); + } + + let reader: &[u8] = if (footer_len as usize) < buffer.len() { + // the whole metadata is in the bytes we already read + let remaining = buffer.len() - footer_len as usize; + &buffer[remaining..] + } else { + // the end of file read by default is not long enough, read again including the metadata. 
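+ // layout of the parquet footer, from the end of the file:
+ //   [ thrift-compact FileMetaData ][ 4-byte little-endian metadata length ][ "PAR1" ]
+ // so seeking back `footer_len` bytes (metadata length plus the fixed 8-byte tail)
+ // positions the reader at the start of the metadata.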
+ reader.seek(SeekFrom::End(-(footer_len as i64)))?; + + buffer.clear(); + buffer.try_reserve(footer_len as usize)?; + reader.take(footer_len).read_to_end(&mut buffer)?; + + &buffer + }; + + // a highly nested but sparse struct could result in many allocations + let max_size = reader.len() * 2 + 1024; + + deserialize_metadata(reader, max_size) +} + +/// Parse loaded metadata bytes +pub fn deserialize_metadata(reader: R, max_size: usize) -> Result { + let mut prot = TCompactInputProtocol::new(reader, max_size); + let metadata = TFileMetaData::read_from_in_protocol(&mut prot)?; + + FileMetaData::try_from_thrift(metadata) +} diff --git a/crates/polars-parquet/src/parquet/read/mod.rs b/crates/polars-parquet/src/parquet/read/mod.rs new file mode 100644 index 000000000000..d5790c205cb1 --- /dev/null +++ b/crates/polars-parquet/src/parquet/read/mod.rs @@ -0,0 +1,237 @@ +mod column; +mod compression; +mod indexes; +pub mod levels; +mod metadata; +mod page; +#[cfg(feature = "async")] +mod stream; + +use std::io::{Read, Seek, SeekFrom}; +use std::sync::Arc; + +pub use column::*; +pub use compression::{decompress, BasicDecompressor, Decompressor}; +pub use indexes::{read_columns_indexes, read_pages_locations}; +pub use metadata::{deserialize_metadata, read_metadata, read_metadata_with_size}; +#[cfg(feature = "async")] +pub use page::{get_page_stream, get_page_stream_from_column_start}; +pub use page::{IndexedPageReader, PageFilter, PageIterator, PageMetaData, PageReader}; +#[cfg(feature = "async")] +pub use stream::read_metadata as read_metadata_async; + +use crate::parquet::error::Result; +use crate::parquet::metadata::{ColumnChunkMetaData, FileMetaData, RowGroupMetaData}; + +/// Filters row group metadata to only those row groups, +/// for which the predicate function returns true +pub fn filter_row_groups( + metadata: &FileMetaData, + predicate: &dyn Fn(&RowGroupMetaData, usize) -> bool, +) -> FileMetaData { + let mut filtered_row_groups = Vec::::new(); + for (i, row_group_metadata) in metadata.row_groups.iter().enumerate() { + if predicate(row_group_metadata, i) { + filtered_row_groups.push(row_group_metadata.clone()); + } + } + let mut metadata = metadata.clone(); + metadata.row_groups = filtered_row_groups; + metadata +} + +/// Returns a new [`PageReader`] by seeking `reader` to the beginning of `column_chunk`. +pub fn get_page_iterator( + column_chunk: &ColumnChunkMetaData, + mut reader: R, + pages_filter: Option, + scratch: Vec, + max_page_size: usize, +) -> Result> { + let pages_filter = pages_filter.unwrap_or_else(|| Arc::new(|_, _| true)); + + let (col_start, _) = column_chunk.byte_range(); + reader.seek(SeekFrom::Start(col_start))?; + Ok(PageReader::new( + reader, + column_chunk, + pages_filter, + scratch, + max_page_size, + )) +} + +/// Returns all [`ColumnChunkMetaData`] associated to `field_name`. 
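+/// (matching is done on the first component of each column's `path_in_schema`, so all
+/// leaf columns under a nested field are returned)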
+/// For non-nested types, this returns an iterator with a single column +pub fn get_field_columns<'a>( + columns: &'a [ColumnChunkMetaData], + field_name: &'a str, +) -> impl Iterator { + columns + .iter() + .filter(move |x| x.descriptor().path_in_schema[0] == field_name) +} + +#[cfg(test)] +mod tests { + use std::fs::File; + + use super::*; + use crate::parquet::tests::get_path; + use crate::parquet::FallibleStreamingIterator; + + #[test] + fn basic() -> Result<()> { + let mut testdata = get_path(); + testdata.push("alltypes_plain.parquet"); + let mut file = File::open(testdata).unwrap(); + + let metadata = read_metadata(&mut file)?; + + let row_group = 0; + let column = 0; + let column_metadata = &metadata.row_groups[row_group].columns()[column]; + let buffer = vec![]; + let mut iter = get_page_iterator(column_metadata, &mut file, None, buffer, 1024 * 1024)?; + + let dict = iter.next().unwrap().unwrap(); + assert_eq!(dict.num_values(), 0); + let page = iter.next().unwrap().unwrap(); + assert_eq!(page.num_values(), 8); + Ok(()) + } + + #[test] + fn reuse_buffer() -> Result<()> { + let mut testdata = get_path(); + testdata.push("alltypes_plain.snappy.parquet"); + let mut file = File::open(testdata).unwrap(); + + let metadata = read_metadata(&mut file)?; + + let row_group = 0; + let column = 0; + let column_metadata = &metadata.row_groups[row_group].columns()[column]; + let buffer = vec![0]; + let iterator = get_page_iterator(column_metadata, &mut file, None, buffer, 1024 * 1024)?; + + let buffer = vec![]; + let mut iterator = Decompressor::new(iterator, buffer); + + let _dict = iterator.next()?.unwrap(); + let _page = iterator.next()?.unwrap(); + + assert!(iterator.next()?.is_none()); + let (a, b) = iterator.into_buffers(); + assert_eq!(a.len(), 11); // note: compressed is higher in this example. + assert_eq!(b.len(), 9); + + Ok(()) + } + + #[test] + fn reuse_buffer_decompress() -> Result<()> { + let mut testdata = get_path(); + testdata.push("alltypes_plain.parquet"); + let mut file = File::open(testdata).unwrap(); + + let metadata = read_metadata(&mut file)?; + + let row_group = 0; + let column = 0; + let column_metadata = &metadata.row_groups[row_group].columns()[column]; + let buffer = vec![1]; + let iterator = get_page_iterator(column_metadata, &mut file, None, buffer, 1024 * 1024)?; + + let buffer = vec![]; + let mut iterator = Decompressor::new(iterator, buffer); + + // dict + iterator.next()?.unwrap(); + // page + iterator.next()?.unwrap(); + + assert!(iterator.next()?.is_none()); + let (a, b) = iterator.into_buffers(); + + assert_eq!(a.len(), 11); + assert_eq!(b.len(), 0); // the decompressed buffer is never used because it is always swapped with the other buffer. + + Ok(()) + } + + #[test] + fn column_iter() -> Result<()> { + let mut testdata = get_path(); + testdata.push("alltypes_plain.parquet"); + let mut file = File::open(testdata).unwrap(); + + let metadata = read_metadata(&mut file)?; + + let row_group = 0; + let column = 0; + let column_metadata = &metadata.row_groups[row_group].columns()[column]; + let iter: Vec<_> = + get_page_iterator(column_metadata, &mut file, None, vec![], usize::MAX)?.collect(); + + let field = metadata.schema().fields()[0].clone(); + let mut iter = ReadColumnIterator::new(field, vec![(iter, column_metadata.clone())]); + + loop { + match iter.advance()? { + State::Some(mut new_iter) => { + if let Some((pages, _descriptor)) = new_iter.get() { + let mut iterator = BasicDecompressor::new(pages, vec![]); + while let Some(_page) = iterator.next()? 
{ + // do something with it + } + let _internal_buffer = iterator.into_inner(); + } + iter = new_iter; + }, + State::Finished(_buffer) => { + assert!(_buffer.is_empty()); // data is uncompressed => buffer is always moved + break; + }, + } + } + Ok(()) + } + + #[test] + fn basics_column_iterator() -> Result<()> { + let mut testdata = get_path(); + testdata.push("alltypes_plain.parquet"); + let mut file = File::open(testdata).unwrap(); + + let metadata = read_metadata(&mut file)?; + + let mut iter = ColumnIterator::new( + file, + metadata.row_groups[0].columns().to_vec(), + None, + vec![], + usize::MAX, // we trust the file is correct + ); + + loop { + match iter.advance()? { + State::Some(mut new_iter) => { + if let Some((pages, _descriptor)) = new_iter.get() { + let mut iterator = BasicDecompressor::new(pages, vec![]); + while let Some(_page) = iterator.next()? { + // do something with it + } + let _internal_buffer = iterator.into_inner(); + } + iter = new_iter; + }, + State::Finished(_buffer) => { + assert!(_buffer.is_empty()); // data is uncompressed => buffer is always moved + break; + }, + } + } + Ok(()) + } +} diff --git a/crates/polars-parquet/src/parquet/read/page/indexed_reader.rs b/crates/polars-parquet/src/parquet/read/page/indexed_reader.rs new file mode 100644 index 000000000000..ac11e725070c --- /dev/null +++ b/crates/polars-parquet/src/parquet/read/page/indexed_reader.rs @@ -0,0 +1,204 @@ +use std::collections::VecDeque; +use std::io::{Cursor, Read, Seek, SeekFrom}; + +use super::reader::{finish_page, read_page_header, PageMetaData}; +use crate::parquet::error::Error; +use crate::parquet::indexes::{FilteredPage, Interval}; +use crate::parquet::metadata::{ColumnChunkMetaData, Descriptor}; +use crate::parquet::page::{CompressedDictPage, CompressedPage, ParquetPageHeader}; +use crate::parquet::parquet_bridge::Compression; + +#[derive(Debug, Clone, Copy)] +enum State { + MaybeDict, + Data, +} + +/// A fallible [`Iterator`] of [`CompressedPage`]. This iterator leverages page indexes +/// to skip pages that are not needed. 
Consequently, the pages from this +/// iterator always have [`Some`] [`crate::parquet::page::CompressedDataPage::selected_rows()`] +pub struct IndexedPageReader { + // The source + reader: R, + + column_start: u64, + compression: Compression, + + // used to deserialize dictionary pages and attach the descriptor to every read page + descriptor: Descriptor, + + // buffer to read the whole page [header][data] into memory + buffer: Vec, + + // buffer to store the data [data] and re-use across pages + data_buffer: Vec, + + pages: VecDeque, + + state: State, +} + +fn read_page( + reader: &mut R, + start: u64, + length: usize, + buffer: &mut Vec, + data: &mut Vec, +) -> Result { + // seek to the page + reader.seek(SeekFrom::Start(start))?; + + // read [header][data] to buffer + buffer.clear(); + buffer.try_reserve(length)?; + reader.by_ref().take(length as u64).read_to_end(buffer)?; + + // deserialize [header] + let mut reader = Cursor::new(buffer); + let page_header = read_page_header(&mut reader, 1024 * 1024)?; + let header_size = reader.stream_position().unwrap() as usize; + let buffer = reader.into_inner(); + + // copy [data] + data.clear(); + data.extend_from_slice(&buffer[header_size..]); + Ok(page_header) +} + +fn read_dict_page( + reader: &mut R, + start: u64, + length: usize, + buffer: &mut Vec, + data: &mut Vec, + compression: Compression, + descriptor: &Descriptor, +) -> Result { + let page_header = read_page(reader, start, length, buffer, data)?; + + let page = finish_page(page_header, data, compression, descriptor, None)?; + if let CompressedPage::Dict(page) = page { + Ok(page) + } else { + Err(Error::oos( + "The first page is not a dictionary page but it should", + )) + } +} + +impl IndexedPageReader { + /// Returns a new [`IndexedPageReader`]. + pub fn new( + reader: R, + column: &ColumnChunkMetaData, + pages: Vec, + buffer: Vec, + data_buffer: Vec, + ) -> Self { + Self::new_with_page_meta(reader, column.into(), pages, buffer, data_buffer) + } + + /// Returns a new [`IndexedPageReader`] with [`PageMetaData`]. 
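+ /// # Note
+ /// The `pages` argument is typically derived from the column's offset index (see
+ /// `read_pages_locations`) intersected with the row intervals that should be read,
+ /// so only the selected pages are visited.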
+ pub fn new_with_page_meta( + reader: R, + column: PageMetaData, + pages: Vec, + buffer: Vec, + data_buffer: Vec, + ) -> Self { + let pages = pages.into_iter().collect(); + Self { + reader, + column_start: column.column_start, + compression: column.compression, + descriptor: column.descriptor, + buffer, + data_buffer, + pages, + state: State::MaybeDict, + } + } + + /// consumes self into the reader and the two internal buffers + pub fn into_inner(self) -> (R, Vec, Vec) { + (self.reader, self.buffer, self.data_buffer) + } + + fn read_page( + &mut self, + start: u64, + length: usize, + selected_rows: Vec, + ) -> Result { + // it will be read - take buffer + let mut data = std::mem::take(&mut self.data_buffer); + + let page_header = read_page(&mut self.reader, start, length, &mut self.buffer, &mut data)?; + + finish_page( + page_header, + &mut data, + self.compression, + &self.descriptor, + Some(selected_rows), + ) + } + + fn read_dict(&mut self) -> Option> { + // a dictionary page exists iff the first data page is not at the start of + // the column + let (start, length) = match self.pages.get(0) { + Some(page) => { + let length = (page.start - self.column_start) as usize; + if length > 0 { + (self.column_start, length) + } else { + return None; + } + }, + None => return None, + }; + + // it will be read - take buffer + let mut data = std::mem::take(&mut self.data_buffer); + + let maybe_page = read_dict_page( + &mut self.reader, + start, + length, + &mut self.buffer, + &mut data, + self.compression, + &self.descriptor, + ); + Some(maybe_page.map(CompressedPage::Dict)) + } +} + +impl Iterator for IndexedPageReader { + type Item = Result; + + fn next(&mut self) -> Option { + match self.state { + State::MaybeDict => { + self.state = State::Data; + if let Some(dict) = self.read_dict() { + Some(dict) + } else { + self.next() + } + }, + State::Data => { + if let Some(page) = self.pages.pop_front() { + if page.selected_rows.is_empty() { + self.next() + } else { + Some(self.read_page(page.start, page.length, page.selected_rows)) + } + } else { + None + } + }, + } + } +} diff --git a/crates/polars-parquet/src/parquet/read/page/mod.rs b/crates/polars-parquet/src/parquet/read/page/mod.rs new file mode 100644 index 000000000000..d3437ef5d14d --- /dev/null +++ b/crates/polars-parquet/src/parquet/read/page/mod.rs @@ -0,0 +1,18 @@ +mod indexed_reader; +mod reader; +#[cfg(feature = "async")] +mod stream; + +pub use indexed_reader::IndexedPageReader; +pub use reader::{PageFilter, PageMetaData, PageReader}; + +use crate::parquet::error::Error; +use crate::parquet::page::CompressedPage; + +pub trait PageIterator: Iterator> { + fn swap_buffer(&mut self, buffer: &mut Vec); +} + +#[cfg(feature = "async")] +#[cfg_attr(docsrs, doc(cfg(feature = "async")))] +pub use stream::{get_page_stream, get_page_stream_from_column_start}; diff --git a/crates/polars-parquet/src/parquet/read/page/reader.rs b/crates/polars-parquet/src/parquet/read/page/reader.rs new file mode 100644 index 000000000000..e0078f97c6d4 --- /dev/null +++ b/crates/polars-parquet/src/parquet/read/page/reader.rs @@ -0,0 +1,306 @@ +use std::convert::TryInto; +use std::io::Read; +use std::sync::Arc; + +use parquet_format_safe::thrift::protocol::TCompactInputProtocol; + +use super::PageIterator; +use crate::parquet::compression::Compression; +use crate::parquet::error::{Error, Result}; +use crate::parquet::indexes::Interval; +use crate::parquet::metadata::{ColumnChunkMetaData, Descriptor}; +use crate::parquet::page::{ + CompressedDataPage, 
CompressedDictPage, CompressedPage, DataPageHeader, PageType, + ParquetPageHeader, +}; +use crate::parquet::parquet_bridge::Encoding; + +/// This meta is a small part of [`ColumnChunkMetaData`]. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PageMetaData { + /// The start offset of this column chunk in file. + pub column_start: u64, + /// The number of values in this column chunk. + pub num_values: i64, + /// Compression type + pub compression: Compression, + /// The descriptor of this parquet column + pub descriptor: Descriptor, +} + +impl PageMetaData { + /// Returns a new [`PageMetaData`]. + pub fn new( + column_start: u64, + num_values: i64, + compression: Compression, + descriptor: Descriptor, + ) -> Self { + Self { + column_start, + num_values, + compression, + descriptor, + } + } +} + +impl From<&ColumnChunkMetaData> for PageMetaData { + fn from(column: &ColumnChunkMetaData) -> Self { + Self { + column_start: column.byte_range().0, + num_values: column.num_values(), + compression: column.compression(), + descriptor: column.descriptor().descriptor.clone(), + } + } +} + +/// Type declaration for a page filter +pub type PageFilter = Arc bool + Send + Sync>; + +/// A fallible [`Iterator`] of [`CompressedDataPage`]. This iterator reads pages back +/// to back until all pages have been consumed. +/// The pages from this iterator always have [`None`] [`crate::parquet::page::CompressedDataPage::selected_rows()`] since +/// filter pushdown is not supported without a +/// pre-computed [page index](https://github.com/apache/parquet-format/blob/master/PageIndex.md). +pub struct PageReader { + // The source + reader: R, + + compression: Compression, + + // The number of values we have seen so far. + seen_num_values: i64, + + // The number of total values in this column chunk. + total_num_values: i64, + + pages_filter: PageFilter, + + descriptor: Descriptor, + + // The currently allocated buffer. + pub(crate) scratch: Vec, + + // Maximum page size (compressed or uncompressed) to limit allocations + max_page_size: usize, +} + +impl PageReader { + /// Returns a new [`PageReader`]. + /// + /// It assumes that the reader has been `sought` (`seek`) to the beginning of `column`. + /// The parameter `max_header_size` + pub fn new( + reader: R, + column: &ColumnChunkMetaData, + pages_filter: PageFilter, + scratch: Vec, + max_page_size: usize, + ) -> Self { + Self::new_with_page_meta(reader, column.into(), pages_filter, scratch, max_page_size) + } + + /// Create a a new [`PageReader`] with [`PageMetaData`]. + /// + /// It assumes that the reader has been `sought` (`seek`) to the beginning of `column`. 
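+    /// # Example
+    /// A sketch, assuming `reader` was already positioned at `meta.column_start` and
+    /// `meta` is a [`PageMetaData`] built elsewhere; the filter below keeps every page:
+    /// ```ignore
+    /// let pages = PageReader::new_with_page_meta(reader, meta, Arc::new(|_, _| true), vec![], 1024 * 1024);
+    /// for page in pages {
+    ///     let _compressed_page = page?;
+    /// }
+    /// ```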
+ pub fn new_with_page_meta( + reader: R, + reader_meta: PageMetaData, + pages_filter: PageFilter, + scratch: Vec, + max_page_size: usize, + ) -> Self { + Self { + reader, + total_num_values: reader_meta.num_values, + compression: reader_meta.compression, + seen_num_values: 0, + descriptor: reader_meta.descriptor, + pages_filter, + scratch, + max_page_size, + } + } + + /// Returns the reader and this Readers' interval buffer + pub fn into_inner(self) -> (R, Vec) { + (self.reader, self.scratch) + } +} + +impl PageIterator for PageReader { + fn swap_buffer(&mut self, scratch: &mut Vec) { + std::mem::swap(&mut self.scratch, scratch) + } +} + +impl Iterator for PageReader { + type Item = Result; + + fn next(&mut self) -> Option { + let mut buffer = std::mem::take(&mut self.scratch); + let maybe_maybe_page = next_page(self, &mut buffer).transpose(); + if let Some(ref maybe_page) = maybe_maybe_page { + if let Ok(CompressedPage::Data(page)) = maybe_page { + // check if we should filter it (only valid for data pages) + let to_consume = (self.pages_filter)(&self.descriptor, page.header()); + if !to_consume { + self.scratch = std::mem::take(&mut buffer); + return self.next(); + } + } + } else { + // no page => we take back the buffer + self.scratch = std::mem::take(&mut buffer); + } + maybe_maybe_page + } +} + +/// Reads Page header from Thrift. +pub(super) fn read_page_header( + reader: &mut R, + max_size: usize, +) -> Result { + let mut prot = TCompactInputProtocol::new(reader, max_size); + let page_header = ParquetPageHeader::read_from_in_protocol(&mut prot)?; + Ok(page_header) +} + +/// This function is lightweight and executes a minimal amount of work so that it is IO bounded. +// Any un-necessary CPU-intensive tasks SHOULD be executed on individual pages. +fn next_page( + reader: &mut PageReader, + buffer: &mut Vec, +) -> Result> { + if reader.seen_num_values >= reader.total_num_values { + return Ok(None); + }; + build_page(reader, buffer) +} + +pub(super) fn build_page( + reader: &mut PageReader, + buffer: &mut Vec, +) -> Result> { + let page_header = read_page_header(&mut reader.reader, reader.max_page_size)?; + + reader.seen_num_values += get_page_header(&page_header)? 
+ .map(|x| x.num_values() as i64) + .unwrap_or_default(); + + let read_size: usize = page_header.compressed_page_size.try_into()?; + + if read_size > reader.max_page_size { + return Err(Error::WouldOverAllocate); + } + + buffer.clear(); + buffer.try_reserve(read_size)?; + let bytes_read = reader + .reader + .by_ref() + .take(read_size as u64) + .read_to_end(buffer)?; + + if bytes_read != read_size { + return Err(Error::oos( + "The page header reported the wrong page size".to_string(), + )); + } + + finish_page( + page_header, + buffer, + reader.compression, + &reader.descriptor, + None, + ) + .map(Some) +} + +pub(super) fn finish_page( + page_header: ParquetPageHeader, + data: &mut Vec, + compression: Compression, + descriptor: &Descriptor, + selected_rows: Option>, +) -> Result { + let type_ = page_header.type_.try_into()?; + let uncompressed_page_size = page_header.uncompressed_page_size.try_into()?; + match type_ { + PageType::DictionaryPage => { + let dict_header = page_header.dictionary_page_header.as_ref().ok_or_else(|| { + Error::oos( + "The page header type is a dictionary page but the dictionary header is empty", + ) + })?; + let is_sorted = dict_header.is_sorted.unwrap_or(false); + + // move the buffer to `dict_page` + let page = CompressedDictPage::new( + std::mem::take(data), + compression, + uncompressed_page_size, + dict_header.num_values.try_into()?, + is_sorted, + ); + + Ok(CompressedPage::Dict(page)) + }, + PageType::DataPage => { + let header = page_header.data_page_header.ok_or_else(|| { + Error::oos("The page header type is a v1 data page but the v1 data header is empty") + })?; + + Ok(CompressedPage::Data(CompressedDataPage::new_read( + DataPageHeader::V1(header), + std::mem::take(data), + compression, + uncompressed_page_size, + descriptor.clone(), + selected_rows, + ))) + }, + PageType::DataPageV2 => { + let header = page_header.data_page_header_v2.ok_or_else(|| { + Error::oos("The page header type is a v2 data page but the v2 data header is empty") + })?; + + Ok(CompressedPage::Data(CompressedDataPage::new_read( + DataPageHeader::V2(header), + std::mem::take(data), + compression, + uncompressed_page_size, + descriptor.clone(), + selected_rows, + ))) + }, + } +} + +pub(super) fn get_page_header(header: &ParquetPageHeader) -> Result> { + let type_ = header.type_.try_into()?; + Ok(match type_ { + PageType::DataPage => { + let header = header.data_page_header.clone().ok_or_else(|| { + Error::oos("The page header type is a v1 data page but the v1 header is empty") + })?; + let _: Encoding = header.encoding.try_into()?; + let _: Encoding = header.repetition_level_encoding.try_into()?; + let _: Encoding = header.definition_level_encoding.try_into()?; + + Some(DataPageHeader::V1(header)) + }, + PageType::DataPageV2 => { + let header = header.data_page_header_v2.clone().ok_or_else(|| { + Error::oos("The page header type is a v1 data page but the v1 header is empty") + })?; + let _: Encoding = header.encoding.try_into()?; + Some(DataPageHeader::V2(header)) + }, + _ => None, + }) +} diff --git a/crates/polars-parquet/src/parquet/read/page/stream.rs b/crates/polars-parquet/src/parquet/read/page/stream.rs new file mode 100644 index 000000000000..657e56a82c4a --- /dev/null +++ b/crates/polars-parquet/src/parquet/read/page/stream.rs @@ -0,0 +1,138 @@ +use std::io::SeekFrom; + +use async_stream::try_stream; +use futures::io::{copy, sink}; +use futures::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt, Stream}; +use 
parquet_format_safe::thrift::protocol::TCompactInputStreamProtocol; + +use super::reader::{finish_page, get_page_header, PageMetaData}; +use super::PageFilter; +use crate::parquet::compression::Compression; +use crate::parquet::error::{Error, Result}; +use crate::parquet::metadata::{ColumnChunkMetaData, Descriptor}; +use crate::parquet::page::{CompressedPage, ParquetPageHeader}; + +/// Returns a stream of compressed data pages +pub async fn get_page_stream<'a, RR: AsyncRead + Unpin + Send + AsyncSeek>( + column_metadata: &'a ColumnChunkMetaData, + reader: &'a mut RR, + scratch: Vec, + pages_filter: PageFilter, + max_page_size: usize, +) -> Result> + 'a> { + get_page_stream_with_page_meta( + column_metadata.into(), + reader, + scratch, + pages_filter, + max_page_size, + ) + .await +} + +/// Returns a stream of compressed data pages from a reader that begins at the start of the column +pub async fn get_page_stream_from_column_start<'a, R: AsyncRead + Unpin + Send>( + column_metadata: &'a ColumnChunkMetaData, + reader: &'a mut R, + scratch: Vec, + pages_filter: PageFilter, + max_header_size: usize, +) -> Result> + 'a> { + let page_metadata: PageMetaData = column_metadata.into(); + Ok(_get_page_stream( + reader, + page_metadata.num_values, + page_metadata.compression, + page_metadata.descriptor, + scratch, + pages_filter, + max_header_size, + )) +} + +/// Returns a stream of compressed data pages with [`PageMetaData`] +pub async fn get_page_stream_with_page_meta( + page_metadata: PageMetaData, + reader: &mut RR, + scratch: Vec, + pages_filter: PageFilter, + max_page_size: usize, +) -> Result> + '_> { + let column_start = page_metadata.column_start; + reader.seek(SeekFrom::Start(column_start)).await?; + Ok(_get_page_stream( + reader, + page_metadata.num_values, + page_metadata.compression, + page_metadata.descriptor, + scratch, + pages_filter, + max_page_size, + )) +} + +fn _get_page_stream( + reader: &mut R, + total_num_values: i64, + compression: Compression, + descriptor: Descriptor, + mut scratch: Vec, + pages_filter: PageFilter, + max_page_size: usize, +) -> impl Stream> + '_ { + let mut seen_values = 0i64; + try_stream! { + while seen_values < total_num_values { + // the header + let page_header = read_page_header(reader, max_page_size).await?; + + let data_header = get_page_header(&page_header)?; + seen_values += data_header.as_ref().map(|x| x.num_values() as i64).unwrap_or_default(); + + let read_size: usize = page_header.compressed_page_size.try_into()?; + + if let Some(data_header) = data_header { + if !pages_filter(&descriptor, &data_header) { + // page to be skipped, we sill need to seek + copy(reader.take(read_size as u64), &mut sink()).await?; + continue + } + } + + if read_size > max_page_size { + Err(Error::WouldOverAllocate)? + } + + // followed by the buffer + scratch.clear(); + scratch.try_reserve(read_size)?; + let bytes_read = reader + .take(read_size as u64) + .read_to_end(&mut scratch).await?; + + if bytes_read != read_size { + Err(Error::oos( + "The page header reported the wrong page size".to_string(), + ))? + } + + yield finish_page( + page_header, + &mut scratch, + compression, + &descriptor, + None, + )?; + } + } +} + +/// Reads Page header from Thrift. 
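+/// (async counterpart of `reader::read_page_header`; `max_page_size` bounds the
+/// allocation made while decoding the thrift header)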
+async fn read_page_header( + reader: &mut R, + max_page_size: usize, +) -> Result { + let mut prot = TCompactInputStreamProtocol::new(reader, max_page_size); + let page_header = ParquetPageHeader::stream_from_in_protocol(&mut prot).await?; + Ok(page_header) +} diff --git a/crates/polars-parquet/src/parquet/read/stream.rs b/crates/polars-parquet/src/parquet/read/stream.rs new file mode 100644 index 000000000000..e6e47e159937 --- /dev/null +++ b/crates/polars-parquet/src/parquet/read/stream.rs @@ -0,0 +1,88 @@ +use std::io::SeekFrom; + +use futures::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt}; + +use super::super::metadata::FileMetaData; +use super::super::{DEFAULT_FOOTER_READ_SIZE, FOOTER_SIZE, PARQUET_MAGIC}; +use super::metadata::{deserialize_metadata, metadata_len}; +use crate::parquet::error::{Error, Result}; +use crate::parquet::HEADER_SIZE; + +async fn stream_len( + seek: &mut (impl AsyncSeek + std::marker::Unpin), +) -> std::result::Result { + let old_pos = seek.seek(SeekFrom::Current(0)).await?; + let len = seek.seek(SeekFrom::End(0)).await?; + + // Avoid seeking a third time when we were already at the end of the + // stream. The branch is usually way cheaper than a seek operation. + if old_pos != len { + seek.seek(SeekFrom::Start(old_pos)).await?; + } + + Ok(len) +} + +/// Asynchronously reads the files' metadata +pub async fn read_metadata( + reader: &mut R, +) -> Result { + let file_size = stream_len(reader).await?; + + if file_size < HEADER_SIZE + FOOTER_SIZE { + return Err(Error::oos( + "A parquet file must contain a header and footer with at least 12 bytes", + )); + } + + // read and cache up to DEFAULT_FOOTER_READ_SIZE bytes from the end and process the footer + let default_end_len = std::cmp::min(DEFAULT_FOOTER_READ_SIZE, file_size) as usize; + reader + .seek(SeekFrom::End(-(default_end_len as i64))) + .await?; + + let mut buffer = vec![]; + buffer.try_reserve(default_end_len)?; + reader + .take(default_end_len as u64) + .read_to_end(&mut buffer) + .await?; + + // check this is indeed a parquet file + if buffer[default_end_len - 4..] != PARQUET_MAGIC { + return Err(Error::oos("Invalid Parquet file. Corrupt footer")); + } + + let metadata_len = metadata_len(&buffer, default_end_len); + let metadata_len: u64 = metadata_len.try_into()?; + + let footer_len = FOOTER_SIZE + metadata_len; + if footer_len > file_size { + return Err(Error::oos( + "The footer size must be smaller or equal to the file's size", + )); + } + + let reader = if (footer_len as usize) < buffer.len() { + // the whole metadata is in the bytes we already read + let remaining = buffer.len() - footer_len as usize; + &buffer[remaining..] + } else { + // the end of file read by default is not long enough, read again including the metadata. 
+ reader.seek(SeekFrom::End(-(footer_len as i64))).await?; + + buffer.clear(); + buffer.try_reserve(footer_len as usize)?; + reader + .take(footer_len as u64) + .read_to_end(&mut buffer) + .await?; + + &buffer + }; + + // a highly nested but sparse struct could result in many allocations + let max_size = reader.len() * 2 + 1024; + + deserialize_metadata(reader, max_size) +} diff --git a/crates/polars-parquet/src/parquet/schema/io_message/from_message.rs b/crates/polars-parquet/src/parquet/schema/io_message/from_message.rs new file mode 100644 index 000000000000..f04800516d55 --- /dev/null +++ b/crates/polars-parquet/src/parquet/schema/io_message/from_message.rs @@ -0,0 +1,1159 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Parquet schema parser. +//! Provides methods to parse and validate string message type into Parquet +//! [`ParquetType`](crate::parquet::schema::types::ParquetType). +//! +//! # Example +//! +//! ```rust +//! use crate::parquet::parquet::schema::io_message::from_message; +//! +//! let message_type = " +//! message spark_schema { +//! OPTIONAL BYTE_ARRAY a (UTF8); +//! REQUIRED INT32 b; +//! REQUIRED DOUBLE c; +//! REQUIRED BOOLEAN d; +//! OPTIONAL group e (LIST) { +//! REPEATED group list { +//! REQUIRED INT32 element; +//! } +//! } +//! } +//! "; +//! +//! let schema = from_message(message_type).expect("Expected valid schema"); +//! println!("{:?}", schema); +//! 
``` + +use parquet_format_safe::Type; +use types::PrimitiveLogicalType; + +use super::super::types::{ParquetType, TimeUnit}; +use super::super::*; +use crate::parquet::error::{Error, Result}; +use crate::parquet::schema::types::{GroupConvertedType, PrimitiveConvertedType}; + +fn is_logical_type(s: &str) -> bool { + matches!( + s, + "INTEGER" + | "MAP" + | "LIST" + | "ENUM" + | "DECIMAL" + | "DATE" + | "TIME" + | "TIMESTAMP" + | "STRING" + | "JSON" + | "BSON" + | "UUID" + | "UNKNOWN" + | "INTERVAL" + ) +} + +fn is_converted_type(s: &str) -> bool { + matches!( + s, + "UTF8" + | "ENUM" + | "DECIMAL" + | "DATE" + | "TIME_MILLIS" + | "TIME_MICROS" + | "TIMESTAMP_MILLIS" + | "TIMESTAMP_MICROS" + | "UINT_8" + | "UINT_16" + | "UINT_32" + | "UINT_64" + | "INT_8" + | "INT_16" + | "INT_32" + | "INT_64" + | "JSON" + | "BSON" + | "INTERVAL" + ) +} + +fn converted_group_from_str(s: &str) -> Result { + Ok(match s { + "MAP" => GroupConvertedType::Map, + "MAP_KEY_VALUE" => GroupConvertedType::MapKeyValue, + "LIST" => GroupConvertedType::List, + other => return Err(Error::oos(format!("Invalid converted type {}", other))), + }) +} + +fn converted_primitive_from_str(s: &str) -> Option { + use PrimitiveConvertedType::*; + Some(match s { + "UTF8" => Utf8, + "ENUM" => Enum, + "DECIMAL" => Decimal(0, 0), + "DATE" => Date, + "TIME_MILLIS" => TimeMillis, + "TIME_MICROS" => TimeMicros, + "TIMESTAMP_MILLIS" => TimestampMillis, + "TIMESTAMP_MICROS" => TimestampMicros, + "UINT_8" => Uint8, + "UINT_16" => Uint16, + "UINT_32" => Uint32, + "UINT_64" => Uint64, + "INT_8" => Int8, + "INT_16" => Int16, + "INT_32" => Int32, + "INT_64" => Int64, + "JSON" => Json, + "BSON" => Bson, + "INTERVAL" => Interval, + _ => return None, + }) +} + +fn repetition_from_str(s: &str) -> Result { + Ok(match s { + "REQUIRED" => Repetition::Required, + "OPTIONAL" => Repetition::Optional, + "REPEATED" => Repetition::Repeated, + other => return Err(Error::oos(format!("Invalid repetition {}", other))), + }) +} + +fn type_from_str(s: &str) -> Result { + match s { + "BOOLEAN" => Ok(Type::BOOLEAN), + "INT32" => Ok(Type::INT32), + "INT64" => Ok(Type::INT64), + "INT96" => Ok(Type::INT96), + "FLOAT" => Ok(Type::FLOAT), + "DOUBLE" => Ok(Type::DOUBLE), + "BYTE_ARRAY" | "BINARY" => Ok(Type::BYTE_ARRAY), + "FIXED_LEN_BYTE_ARRAY" => Ok(Type::FIXED_LEN_BYTE_ARRAY), + other => Err(Error::oos(format!("Invalid type {}", other))), + } +} + +/// Parses message type as string into a Parquet [`ParquetType`](crate::parquet::schema::types::ParquetType) +/// which, for example, could be used to extract individual columns. Returns Parquet +/// general error when parsing or validation fails. +pub fn from_message(message_type: &str) -> Result { + let mut parser = Parser { + tokenizer: &mut Tokenizer::from_str(message_type), + }; + parser.parse_message_type() +} + +/// Tokenizer to split message type string into tokens that are separated using characters +/// defined in `is_schema_delim` method. Tokenizer also preserves delimiters as tokens. +/// Tokenizer provides Iterator interface to process tokens; it also allows to step back +/// to reprocess previous tokens. 
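+/// # Example
+/// A sketch of how a schema fragment is tokenized; delimiters become their own tokens:
+/// ```ignore
+/// let tokens: Vec<&str> = Tokenizer::from_str("OPTIONAL BYTE_ARRAY a (UTF8);").collect();
+/// assert_eq!(tokens, ["OPTIONAL", "BYTE_ARRAY", "a", "(", "UTF8", ")", ";"]);
+/// ```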
+struct Tokenizer<'a> { + // List of all tokens for a string + tokens: Vec<&'a str>, + // Current index of vector + index: usize, +} + +impl<'a> Tokenizer<'a> { + // Create tokenizer from message type string + pub fn from_str(string: &'a str) -> Self { + let vec = string + .split_whitespace() + .flat_map(Self::split_token) + .collect(); + Tokenizer { + tokens: vec, + index: 0, + } + } + + // List of all special characters in schema + fn is_schema_delim(c: char) -> bool { + c == ';' || c == '{' || c == '}' || c == '(' || c == ')' || c == '=' || c == ',' + } + + /// Splits string into tokens; input string can already be token or can contain + /// delimiters, e.g. required" -> Vec("required") and + /// "(UTF8);" -> Vec("(", "UTF8", ")", ";") + fn split_token(string: &str) -> Vec<&str> { + let mut buffer: Vec<&str> = Vec::new(); + let mut tail = string; + while let Some(index) = tail.find(Self::is_schema_delim) { + let (h, t) = tail.split_at(index); + if !h.is_empty() { + buffer.push(h); + } + buffer.push(&t[0..1]); + tail = &t[1..]; + } + if !tail.is_empty() { + buffer.push(tail); + } + buffer + } + + // Move pointer to a previous element + fn backtrack(&mut self) { + self.index -= 1; + } +} + +impl<'a> Iterator for Tokenizer<'a> { + type Item = &'a str; + + fn next(&mut self) -> Option<&'a str> { + if self.index < self.tokens.len() { + self.index += 1; + Some(self.tokens[self.index - 1]) + } else { + None + } + } +} + +/// Internal Schema parser. +/// Traverses message type using tokenizer and parses each group/primitive type +/// recursively. +struct Parser<'a> { + tokenizer: &'a mut Tokenizer<'a>, +} + +// Utility function to assert token on validity. +fn assert_token(token: Option<&str>, expected: &str) -> Result<()> { + match token { + Some(value) if value == expected => Ok(()), + Some(other) => Err(Error::oos(format!( + "Expected '{}', found token '{}'", + expected, other + ))), + None => Err(Error::oos(format!( + "Expected '{}', but no token found (None)", + expected + ))), + } +} + +// Utility function to parse i32 or return general error. +fn parse_i32(value: Option<&str>, not_found_msg: &str, parse_fail_msg: &str) -> Result { + value + .ok_or_else(|| Error::oos(not_found_msg)) + .and_then(|v| v.parse::().map_err(|_| Error::oos(parse_fail_msg))) +} + +// Utility function to parse boolean or return general error. +#[inline] +fn parse_bool(value: Option<&str>, not_found_msg: &str, parse_fail_msg: &str) -> Result { + value + .ok_or_else(|| Error::oos(not_found_msg)) + .and_then(|v| { + v.to_lowercase() + .parse::() + .map_err(|_| Error::oos(parse_fail_msg)) + }) +} + +// Utility function to parse TimeUnit or return general error. +fn parse_timeunit( + value: Option<&str>, + not_found_msg: &str, + parse_fail_msg: &str, +) -> Result { + value + .ok_or_else(|| Error::oos(not_found_msg)) + .and_then(|v| match v.to_uppercase().as_str() { + "MILLIS" => Ok(TimeUnit::Milliseconds), + "MICROS" => Ok(TimeUnit::Microseconds), + "NANOS" => Ok(TimeUnit::Nanoseconds), + _ => Err(Error::oos(parse_fail_msg)), + }) +} + +impl<'a> Parser<'a> { + // Entry function to parse message type, uses internal tokenizer. + fn parse_message_type(&mut self) -> Result { + // Check that message type starts with "message". 
+ match self.tokenizer.next() { + Some("message") => { + let name = self + .tokenizer + .next() + .ok_or_else(|| Error::oos("Expected name, found None"))?; + let fields = self.parse_child_types()?; + Ok(ParquetType::new_root(name.to_string(), fields)) + }, + _ => Err(Error::oos("Message type does not start with 'message'")), + } + } + + // Parses child types for a current group type. + // This is only invoked on root and group types. + fn parse_child_types(&mut self) -> Result> { + assert_token(self.tokenizer.next(), "{")?; + let mut vec = Vec::new(); + while let Some(value) = self.tokenizer.next() { + if value == "}" { + break; + } else { + self.tokenizer.backtrack(); + vec.push(self.add_type()?); + } + } + Ok(vec) + } + + fn add_type(&mut self) -> Result { + // Parse repetition + let repetition = self + .tokenizer + .next() + .ok_or_else(|| Error::oos("Expected repetition, found None")) + .and_then(|v| repetition_from_str(&v.to_uppercase()))?; + + match self.tokenizer.next() { + Some(group) if group.to_uppercase() == "GROUP" => self.add_group_type(repetition), + Some(type_string) => { + let physical_type = type_from_str(&type_string.to_uppercase())?; + self.add_primitive_type(repetition, physical_type) + }, + None => Err(Error::oos("Invalid type, could not extract next token")), + } + } + + fn add_group_type(&mut self, repetition: Repetition) -> Result { + // Parse name of the group type + let name = self + .tokenizer + .next() + .ok_or_else(|| Error::oos("Expected name, found None"))?; + + // Parse converted type if exists + let converted_type = if let Some("(") = self.tokenizer.next() { + let converted_type = self + .tokenizer + .next() + .ok_or_else(|| Error::oos("Expected converted type, found None")) + .and_then(|v| converted_group_from_str(&v.to_uppercase()))?; + assert_token(self.tokenizer.next(), ")")?; + Some(converted_type) + } else { + self.tokenizer.backtrack(); + None + }; + + // Parse optional id + let id = if let Some("=") = self.tokenizer.next() { + self.tokenizer.next().and_then(|v| v.parse::().ok()) + } else { + self.tokenizer.backtrack(); + None + }; + + let fields = self.parse_child_types()?; + + Ok(ParquetType::from_converted( + name.to_string(), + fields, + repetition, + converted_type, + id, + )) + } + + fn add_primitive_type( + &mut self, + repetition: Repetition, + physical_type: Type, + ) -> Result { + // Read type length if the type is FIXED_LEN_BYTE_ARRAY. 
+ let length = if physical_type == Type::FIXED_LEN_BYTE_ARRAY { + assert_token(self.tokenizer.next(), "(")?; + let length = parse_i32( + self.tokenizer.next(), + "Expected length for FIXED_LEN_BYTE_ARRAY, found None", + "Failed to parse length for FIXED_LEN_BYTE_ARRAY", + )?; + assert_token(self.tokenizer.next(), ")")?; + Some(length) + } else { + None + }; + + // Parse name of the primitive type + let name = self + .tokenizer + .next() + .ok_or_else(|| Error::oos("Expected name, found None"))?; + + // Parse logical types + let (converted_type, logical_type) = if let Some("(") = self.tokenizer.next() { + let (is_logical_type, converted_type, token) = self + .tokenizer + .next() + .ok_or_else(|| Error::oos("Expected converted or logical type, found None")) + .and_then(|v| { + let string = v.to_uppercase(); + Ok(if is_logical_type(&string) { + (true, None, string) + } else if is_converted_type(&string) { + (false, converted_primitive_from_str(&string), string) + } else { + return Err(Error::oos(format!( + "Expected converted or logical type, found {}", + string + ))); + }) + })?; + + let logical_type = if is_logical_type { + Some(self.parse_logical_type(&token)?) + } else { + None + }; + + // converted type decimal + let converted_type = match converted_type { + Some(PrimitiveConvertedType::Decimal(_, _)) => { + Some(self.parse_converted_decimal()?) + }, + other => other, + }; + + assert_token(self.tokenizer.next(), ")")?; + (converted_type, logical_type) + } else { + self.tokenizer.backtrack(); + (None, None) + }; + + // Parse optional id + let id = if let Some("=") = self.tokenizer.next() { + self.tokenizer.next().and_then(|v| v.parse::().ok()) + } else { + self.tokenizer.backtrack(); + None + }; + assert_token(self.tokenizer.next(), ";")?; + + ParquetType::try_from_primitive( + name.to_string(), + (physical_type, length).try_into()?, + repetition, + converted_type, + logical_type, + id, + ) + } + + fn parse_converted_decimal(&mut self) -> Result { + assert_token(self.tokenizer.next(), "(")?; + // Parse precision + let precision = parse_i32( + self.tokenizer.next(), + "Expected precision, found None", + "Failed to parse precision for DECIMAL type", + )?; + + // Parse scale + let scale = if let Some(",") = self.tokenizer.next() { + parse_i32( + self.tokenizer.next(), + "Expected scale, found None", + "Failed to parse scale for DECIMAL type", + )? + } else { + // Scale is not provided, set it to 0. + self.tokenizer.backtrack(); + 0 + }; + + assert_token(self.tokenizer.next(), ")")?; + Ok(PrimitiveConvertedType::Decimal( + precision.try_into()?, + scale.try_into()?, + )) + } + + fn parse_logical_type(&mut self, tpe: &str) -> Result { + Ok(match tpe { + "ENUM" => PrimitiveLogicalType::Enum, + "DATE" => PrimitiveLogicalType::Date, + "DECIMAL" => { + let (precision, scale) = if let Some("(") = self.tokenizer.next() { + let precision = parse_i32( + self.tokenizer.next(), + "Expected precision, found None", + "Failed to parse precision for DECIMAL type", + )?; + let scale = if let Some(",") = self.tokenizer.next() { + parse_i32( + self.tokenizer.next(), + "Expected scale, found None", + "Failed to parse scale for DECIMAL type", + )? + } else { + self.tokenizer.backtrack(); + 0 + }; + assert_token(self.tokenizer.next(), ")")?; + (precision, scale) + } else { + self.tokenizer.backtrack(); + (0, 0) + }; + PrimitiveLogicalType::Decimal(precision.try_into()?, scale.try_into()?) 
+ }, + "TIME" => { + let (unit, is_adjusted_to_utc) = if let Some("(") = self.tokenizer.next() { + let unit = parse_timeunit( + self.tokenizer.next(), + "Invalid timeunit found", + "Failed to parse timeunit for TIME type", + )?; + let is_adjusted_to_utc = if let Some(",") = self.tokenizer.next() { + parse_bool( + self.tokenizer.next(), + "Invalid boolean found", + "Failed to parse timezone info for TIME type", + )? + } else { + self.tokenizer.backtrack(); + false + }; + assert_token(self.tokenizer.next(), ")")?; + (unit, is_adjusted_to_utc) + } else { + self.tokenizer.backtrack(); + (TimeUnit::Milliseconds, false) + }; + PrimitiveLogicalType::Time { + is_adjusted_to_utc, + unit, + } + }, + "TIMESTAMP" => { + let (unit, is_adjusted_to_utc) = if let Some("(") = self.tokenizer.next() { + let unit = parse_timeunit( + self.tokenizer.next(), + "Invalid timeunit found", + "Failed to parse timeunit for TIMESTAMP type", + )?; + let is_adjusted_to_utc = if let Some(",") = self.tokenizer.next() { + parse_bool( + self.tokenizer.next(), + "Invalid boolean found", + "Failed to parse timezone info for TIMESTAMP type", + )? + } else { + // Invalid token for unit + self.tokenizer.backtrack(); + false + }; + assert_token(self.tokenizer.next(), ")")?; + (unit, is_adjusted_to_utc) + } else { + self.tokenizer.backtrack(); + (TimeUnit::Milliseconds, false) + }; + PrimitiveLogicalType::Timestamp { + is_adjusted_to_utc, + unit, + } + }, + "INTEGER" => { + let (bit_width, is_signed) = if let Some("(") = self.tokenizer.next() { + let bit_width = parse_i32( + self.tokenizer.next(), + "Invalid bit_width found", + "Failed to parse bit_width for INTEGER type", + )?; + let is_signed = if let Some(",") = self.tokenizer.next() { + parse_bool( + self.tokenizer.next(), + "Invalid boolean found", + "Failed to parse is_signed for INTEGER type", + )? 
+ } else { + // Invalid token for unit + self.tokenizer.backtrack(); + return Err(Error::oos("INTEGER requires sign")); + }; + assert_token(self.tokenizer.next(), ")")?; + (bit_width, is_signed) + } else { + // Invalid token for unit + self.tokenizer.backtrack(); + return Err(Error::oos("INTEGER requires width and sign")); + }; + PrimitiveLogicalType::Integer((bit_width, is_signed).into()) + }, + "STRING" => PrimitiveLogicalType::String, + "JSON" => PrimitiveLogicalType::Json, + "BSON" => PrimitiveLogicalType::Bson, + "UUID" => PrimitiveLogicalType::Uuid, + "UNKNOWN" => PrimitiveLogicalType::Unknown, + "INTERVAL" => return Err(Error::oos("Interval logical type not yet supported")), + _ => unreachable!(), + }) + } +} + +#[cfg(test)] +mod tests { + use types::{IntegerType, PrimitiveLogicalType}; + + use super::*; + use crate::parquet::schema::types::{GroupConvertedType, PhysicalType, PrimitiveConvertedType}; + + #[test] + fn test_tokenize_empty_string() { + assert_eq!(Tokenizer::from_str("").next(), None); + } + + #[test] + fn test_tokenize_delimiters() { + let mut iter = Tokenizer::from_str(",;{}()="); + assert_eq!(iter.next(), Some(",")); + assert_eq!(iter.next(), Some(";")); + assert_eq!(iter.next(), Some("{")); + assert_eq!(iter.next(), Some("}")); + assert_eq!(iter.next(), Some("(")); + assert_eq!(iter.next(), Some(")")); + assert_eq!(iter.next(), Some("=")); + assert_eq!(iter.next(), None); + } + + #[test] + fn test_tokenize_delimiters_with_whitespaces() { + let mut iter = Tokenizer::from_str(" , ; { } ( ) = "); + assert_eq!(iter.next(), Some(",")); + assert_eq!(iter.next(), Some(";")); + assert_eq!(iter.next(), Some("{")); + assert_eq!(iter.next(), Some("}")); + assert_eq!(iter.next(), Some("(")); + assert_eq!(iter.next(), Some(")")); + assert_eq!(iter.next(), Some("=")); + assert_eq!(iter.next(), None); + } + + #[test] + fn test_tokenize_words() { + let mut iter = Tokenizer::from_str("abc def ghi jkl mno"); + assert_eq!(iter.next(), Some("abc")); + assert_eq!(iter.next(), Some("def")); + assert_eq!(iter.next(), Some("ghi")); + assert_eq!(iter.next(), Some("jkl")); + assert_eq!(iter.next(), Some("mno")); + assert_eq!(iter.next(), None); + } + + #[test] + fn test_tokenize_backtrack() { + let mut iter = Tokenizer::from_str("abc;"); + assert_eq!(iter.next(), Some("abc")); + assert_eq!(iter.next(), Some(";")); + iter.backtrack(); + assert_eq!(iter.next(), Some(";")); + assert_eq!(iter.next(), None); + } + + #[test] + fn test_tokenize_message_type() { + let schema = " + message schema { + required int32 a; + optional binary c (UTF8); + required group d { + required int32 a; + optional binary c (UTF8); + } + required group e (LIST) { + repeated group list { + required int32 element; + } + } + } + "; + let iter = Tokenizer::from_str(schema); + let mut res = Vec::new(); + for token in iter { + res.push(token); + } + assert_eq!( + res, + vec![ + "message", "schema", "{", "required", "int32", "a", ";", "optional", "binary", "c", + "(", "UTF8", ")", ";", "required", "group", "d", "{", "required", "int32", "a", + ";", "optional", "binary", "c", "(", "UTF8", ")", ";", "}", "required", "group", + "e", "(", "LIST", ")", "{", "repeated", "group", "list", "{", "required", "int32", + "element", ";", "}", "}", "}" + ] + ); + } + + #[test] + fn test_assert_token() { + assert!(assert_token(Some("a"), "a").is_ok()); + assert!(assert_token(Some("a"), "b").is_err()); + assert!(assert_token(None, "b").is_err()); + } + + #[test] + fn test_parse_message_type_invalid() { + let mut iter = 
Tokenizer::from_str("test"); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_err()); + assert_eq!( + result.unwrap_err().to_string(), + "File out of specification: Message type does not start with 'message'" + ); + } + + #[test] + fn test_parse_message_type_no_name() { + let mut iter = Tokenizer::from_str("message"); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_err()); + assert_eq!( + result.unwrap_err().to_string(), + "File out of specification: Expected name, found None" + ); + } + + #[test] + fn test_parse_message_type_fixed_byte_array() { + let schema = " + message schema { + REQUIRED FIXED_LEN_BYTE_ARRAY col; + } + "; + let mut iter = Tokenizer::from_str(schema); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_err()); + + let schema = " + message schema { + REQUIRED FIXED_LEN_BYTE_ARRAY(16) col; + } + "; + let mut iter = Tokenizer::from_str(schema); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_ok()); + } + + #[test] + fn test_parse_message_type_decimal() { + // It is okay for decimal to omit precision and scale with right syntax. + // Here we test wrong syntax of decimal type + + // Invalid decimal syntax + let schema = " + message root { + optional int32 f1 (DECIMAL(); + } + "; + let mut iter = Tokenizer::from_str(schema); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_err()); + + // Invalid decimal, need precision and scale + let schema = " + message root { + optional int32 f1 (DECIMAL()); + } + "; + let mut iter = Tokenizer::from_str(schema); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_err()); + + // Invalid decimal because of `,` - has precision, needs scale + let schema = " + message root { + optional int32 f1 (DECIMAL(8,)); + } + "; + let mut iter = Tokenizer::from_str(schema); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_err()); + } + + #[test] + fn test_parse_decimal_wrong() { + // Invalid decimal because, we always require either precision or scale to be + // specified as part of converted type + let schema = " + message root { + optional int32 f3 (DECIMAL); + } + "; + let mut iter = Tokenizer::from_str(schema); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_err()); + + // Valid decimal (precision, scale) + let schema = " + message root { + optional int32 f1 (DECIMAL(8, 3)); + optional int32 f2 (DECIMAL(8)); + } + "; + let mut iter = Tokenizer::from_str(schema); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_ok()); + } + + #[test] + fn test_parse_message_type_compare_1() -> Result<()> { + let schema = " + message root { + optional fixed_len_byte_array(5) f1 (DECIMAL(9, 3)); + optional fixed_len_byte_array (16) f2 (DECIMAL (38, 18)); + } + "; + let mut iter = Tokenizer::from_str(schema); + let message = Parser { + tokenizer: &mut iter, + } + .parse_message_type() + .unwrap(); + + let fields = vec![ + ParquetType::try_from_primitive( + "f1".to_string(), + PhysicalType::FixedLenByteArray(5), + Repetition::Optional, + None, + Some(PrimitiveLogicalType::Decimal(9, 3)), + None, + )?, + ParquetType::try_from_primitive( + "f2".to_string(), + PhysicalType::FixedLenByteArray(16), + Repetition::Optional, + None, + 
Some(PrimitiveLogicalType::Decimal(38, 18)), + None, + )?, + ]; + + let expected = ParquetType::new_root("root".to_string(), fields); + + assert_eq!(message, expected); + Ok(()) + } + + #[test] + fn test_parse_message_type_compare_2() -> Result<()> { + let schema = " + message root { + required group a0 { + optional group a1 (LIST) { + repeated binary a2 (UTF8); + } + + optional group b1 (LIST) { + repeated group b2 { + optional int32 b3; + optional double b4; + } + } + } + } + "; + let mut iter = Tokenizer::from_str(schema); + let message = Parser { + tokenizer: &mut iter, + } + .parse_message_type() + .unwrap(); + + let a2 = ParquetType::try_from_primitive( + "a2".to_string(), + PhysicalType::ByteArray, + Repetition::Repeated, + Some(PrimitiveConvertedType::Utf8), + None, + None, + )?; + let a1 = ParquetType::from_converted( + "a1".to_string(), + vec![a2], + Repetition::Optional, + Some(GroupConvertedType::List), + None, + ); + let b2 = ParquetType::from_converted( + "b2".to_string(), + vec![ + ParquetType::from_physical("b3".to_string(), PhysicalType::Int32), + ParquetType::from_physical("b4".to_string(), PhysicalType::Double), + ], + Repetition::Repeated, + None, + None, + ); + let b1 = ParquetType::from_converted( + "b1".to_string(), + vec![b2], + Repetition::Optional, + Some(GroupConvertedType::List), + None, + ); + let a0 = ParquetType::from_converted( + "a0".to_string(), + vec![a1, b1], + Repetition::Required, + None, + None, + ); + + let expected = ParquetType::new_root("root".to_string(), vec![a0]); + + assert_eq!(message, expected); + Ok(()) + } + + #[test] + fn test_parse_message_type_compare_3() -> Result<()> { + let schema = " + message root { + required int32 _1 (INT_8); + required int32 _2 (INT_16); + required float _3; + required double _4; + optional int32 _5 (DATE); + optional binary _6 (UTF8); + } + "; + let mut iter = Tokenizer::from_str(schema); + let message = Parser { + tokenizer: &mut iter, + } + .parse_message_type() + .unwrap(); + + let f1 = ParquetType::try_from_primitive( + "_1".to_string(), + PhysicalType::Int32, + Repetition::Required, + Some(PrimitiveConvertedType::Int8), + None, + None, + )?; + let f2 = ParquetType::try_from_primitive( + "_2".to_string(), + PhysicalType::Int32, + Repetition::Required, + Some(PrimitiveConvertedType::Int16), + None, + None, + )?; + let f3 = ParquetType::try_from_primitive( + "_3".to_string(), + PhysicalType::Float, + Repetition::Required, + None, + None, + None, + )?; + let f4 = ParquetType::try_from_primitive( + "_4".to_string(), + PhysicalType::Double, + Repetition::Required, + None, + None, + None, + )?; + let f5 = ParquetType::try_from_primitive( + "_5".to_string(), + PhysicalType::Int32, + Repetition::Optional, + None, + Some(PrimitiveLogicalType::Date), + None, + )?; + let f6 = ParquetType::try_from_primitive( + "_6".to_string(), + PhysicalType::ByteArray, + Repetition::Optional, + Some(PrimitiveConvertedType::Utf8), + None, + None, + )?; + + let fields = vec![f1, f2, f3, f4, f5, f6]; + + let expected = ParquetType::new_root("root".to_string(), fields); + assert_eq!(message, expected); + Ok(()) + } + + #[test] + fn test_parse_message_type_compare_4() -> Result<()> { + let schema = " + message root { + required int32 _1 (INTEGER(8,true)); + required int32 _2 (INTEGER(16,false)); + required float _3; + required double _4; + optional int32 _5 (DATE); + optional int32 _6 (TIME(MILLIS,false)); + optional int64 _7 (TIME(MICROS,true)); + optional int64 _8 (TIMESTAMP(MILLIS,true)); + optional int64 _9 (TIMESTAMP(NANOS,false)); 
+ optional binary _10 (STRING); + } + "; + let mut iter = Tokenizer::from_str(schema); + let message = Parser { + tokenizer: &mut iter, + } + .parse_message_type()?; + + let f1 = ParquetType::try_from_primitive( + "_1".to_string(), + PhysicalType::Int32, + Repetition::Required, + None, + Some(PrimitiveLogicalType::Integer(IntegerType::Int8)), + None, + )?; + let f2 = ParquetType::try_from_primitive( + "_2".to_string(), + PhysicalType::Int32, + Repetition::Required, + None, + Some(PrimitiveLogicalType::Integer(IntegerType::UInt16)), + None, + )?; + let f3 = ParquetType::try_from_primitive( + "_3".to_string(), + PhysicalType::Float, + Repetition::Required, + None, + None, + None, + )?; + let f4 = ParquetType::try_from_primitive( + "_4".to_string(), + PhysicalType::Double, + Repetition::Required, + None, + None, + None, + )?; + let f5 = ParquetType::try_from_primitive( + "_5".to_string(), + PhysicalType::Int32, + Repetition::Optional, + None, + Some(PrimitiveLogicalType::Date), + None, + )?; + let f6 = ParquetType::try_from_primitive( + "_6".to_string(), + PhysicalType::Int32, + Repetition::Optional, + None, + Some(PrimitiveLogicalType::Time { + is_adjusted_to_utc: false, + unit: TimeUnit::Milliseconds, + }), + None, + )?; + let f7 = ParquetType::try_from_primitive( + "_7".to_string(), + PhysicalType::Int64, + Repetition::Optional, + None, + Some(PrimitiveLogicalType::Time { + is_adjusted_to_utc: true, + unit: TimeUnit::Microseconds, + }), + None, + )?; + let f8 = ParquetType::try_from_primitive( + "_8".to_string(), + PhysicalType::Int64, + Repetition::Optional, + None, + Some(PrimitiveLogicalType::Timestamp { + is_adjusted_to_utc: true, + unit: TimeUnit::Milliseconds, + }), + None, + )?; + let f9 = ParquetType::try_from_primitive( + "_9".to_string(), + PhysicalType::Int64, + Repetition::Optional, + None, + Some(PrimitiveLogicalType::Timestamp { + is_adjusted_to_utc: false, + unit: TimeUnit::Nanoseconds, + }), + None, + )?; + + let f10 = ParquetType::try_from_primitive( + "_10".to_string(), + PhysicalType::ByteArray, + Repetition::Optional, + None, + Some(PrimitiveLogicalType::String), + None, + )?; + + let fields = vec![f1, f2, f3, f4, f5, f6, f7, f8, f9, f10]; + + let expected = ParquetType::new_root("root".to_string(), fields); + assert_eq!(message, expected); + Ok(()) + } +} diff --git a/crates/polars-parquet/src/parquet/schema/io_message/mod.rs b/crates/polars-parquet/src/parquet/schema/io_message/mod.rs new file mode 100644 index 000000000000..1e296a7f3724 --- /dev/null +++ b/crates/polars-parquet/src/parquet/schema/io_message/mod.rs @@ -0,0 +1,3 @@ +mod from_message; + +pub use from_message::from_message; diff --git a/crates/polars-parquet/src/parquet/schema/io_thrift/from_thrift.rs b/crates/polars-parquet/src/parquet/schema/io_thrift/from_thrift.rs new file mode 100644 index 000000000000..b99c0881fb89 --- /dev/null +++ b/crates/polars-parquet/src/parquet/schema/io_thrift/from_thrift.rs @@ -0,0 +1,134 @@ +use parquet_format_safe::SchemaElement; + +use super::super::types::ParquetType; +use crate::parquet::error::{Error, Result}; +use crate::parquet::schema::types::FieldInfo; + +impl ParquetType { + /// Method to convert from Thrift. 
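+    ///
+    /// Illustrative sketch (not part of the original docs): rebuilding a schema from
+    /// the flat element list produced by `to_thrift` (both methods are defined in this
+    /// crate; `schema` is a hypothetical `ParquetType`):
+    ///
+    /// ```ignore
+    /// let elements = schema.to_thrift();
+    /// let roundtripped = ParquetType::try_from_thrift(&elements)?;
+    /// assert_eq!(roundtripped, schema);
+    /// ```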
+ pub fn try_from_thrift(elements: &[SchemaElement]) -> Result { + let mut index = 0; + let mut schema_nodes = Vec::new(); + while index < elements.len() { + let t = from_thrift_helper(elements, index)?; + index = t.0; + schema_nodes.push(t.1); + } + if schema_nodes.len() != 1 { + return Err(Error::oos(format!( + "Expected exactly one root node, but found {}", + schema_nodes.len() + ))); + } + + Ok(schema_nodes.remove(0)) + } +} + +/// Constructs a new Type from the `elements`, starting at index `index`. +/// The first result is the starting index for the next Type after this one. If it is +/// equal to `elements.len()`, then this Type is the last one. +/// The second result is the result Type. +fn from_thrift_helper(elements: &[SchemaElement], index: usize) -> Result<(usize, ParquetType)> { + // Whether or not the current node is root (message type). + // There is only one message type node in the schema tree. + let is_root_node = index == 0; + + let element = elements + .get(index) + .ok_or_else(|| Error::oos(format!("index {} on SchemaElement is not valid", index)))?; + let name = element.name.clone(); + let converted_type = element.converted_type; + + let id = element.field_id; + match element.num_children { + // From parquet-format: + // The children count is used to construct the nested relationship. + // This field is not set when the element is a primitive type + // Sometimes parquet-cpp sets num_children field to 0 for primitive types, so we + // have to handle this case too. + None | Some(0) => { + // primitive type + let repetition = element + .repetition_type + .ok_or_else(|| Error::oos("Repetition level must be defined for a primitive type"))? + .try_into()?; + let physical_type = element + .type_ + .ok_or_else(|| Error::oos("Physical type must be defined for a primitive type"))?; + + let converted_type = converted_type + .map(|converted_type| { + let maybe_decimal = match (element.precision, element.scale) { + (Some(precision), Some(scale)) => Some((precision, scale)), + (None, None) => None, + _ => { + return Err(Error::oos( + "When precision or scale are defined, both must be defined", + )) + }, + }; + (converted_type, maybe_decimal).try_into() + }) + .transpose()?; + + let logical_type = element + .logical_type + .clone() + .map(|x| x.try_into()) + .transpose()?; + + let tp = ParquetType::try_from_primitive( + name, + (physical_type, element.type_length).try_into()?, + repetition, + converted_type, + logical_type, + id, + )?; + + Ok((index + 1, tp)) + }, + Some(n) => { + let mut fields = vec![]; + let mut next_index = index + 1; + for _ in 0..n { + let child_result = from_thrift_helper(elements, next_index)?; + next_index = child_result.0; + fields.push(child_result.1); + } + + let tp = if is_root_node { + ParquetType::new_root(name, fields) + } else { + let repetition = if let Some(repetition) = element.repetition_type { + repetition.try_into()? 
+ } else { + return Err(Error::oos( + "The repetition level of a non-root must be non-null", + )); + }; + + let converted_type = converted_type.map(|x| x.try_into()).transpose()?; + + let logical_type = element + .logical_type + .clone() + .map(|x| x.try_into()) + .transpose()?; + + ParquetType::GroupType { + field_info: FieldInfo { + name, + repetition, + id, + }, + fields, + converted_type, + logical_type, + } + }; + Ok((next_index, tp)) + }, + } +} diff --git a/crates/polars-parquet/src/parquet/schema/io_thrift/mod.rs b/crates/polars-parquet/src/parquet/schema/io_thrift/mod.rs new file mode 100644 index 000000000000..5176eb131ff2 --- /dev/null +++ b/crates/polars-parquet/src/parquet/schema/io_thrift/mod.rs @@ -0,0 +1,85 @@ +mod from_thrift; +pub use from_thrift::*; + +mod to_thrift; +pub use to_thrift::*; + +#[cfg(test)] +mod tests { + use crate::parquet::error::Result; + use crate::parquet::schema::io_message::from_message; + use crate::parquet::schema::types::ParquetType; + + fn test_round_trip(message: &str) -> Result<()> { + let expected_schema = from_message(message)?; + let thrift_schema = expected_schema.to_thrift(); + let thrift_schema = thrift_schema.into_iter().collect::>(); + let result_schema = ParquetType::try_from_thrift(&thrift_schema)?; + assert_eq!(result_schema, expected_schema); + Ok(()) + } + + #[test] + fn test_schema_type_thrift_conversion() { + let message_type = " + message conversions { + REQUIRED INT64 id; + OPTIONAL group int_array_Array (LIST) { + REPEATED group list { + OPTIONAL group element (LIST) { + REPEATED group list { + OPTIONAL INT32 element; + } + } + } + } + OPTIONAL group int_map (MAP) { + REPEATED group map (MAP_KEY_VALUE) { + REQUIRED BYTE_ARRAY key (UTF8); + OPTIONAL INT32 value; + } + } + OPTIONAL group int_Map_Array (LIST) { + REPEATED group list { + OPTIONAL group g (MAP) { + REPEATED group map (MAP_KEY_VALUE) { + REQUIRED BYTE_ARRAY key (UTF8); + OPTIONAL group value { + OPTIONAL group H { + OPTIONAL group i (LIST) { + REPEATED group list { + OPTIONAL DOUBLE element; + } + } + } + } + } + } + } + } + OPTIONAL group nested_struct { + OPTIONAL INT32 A; + OPTIONAL group b (LIST) { + REPEATED group list { + REQUIRED FIXED_LEN_BYTE_ARRAY (16) element; + } + } + } + } + "; + test_round_trip(message_type).unwrap(); + } + + #[test] + fn test_schema_type_thrift_conversion_decimal() { + let message_type = " + message decimals { + OPTIONAL INT32 field0; + OPTIONAL INT64 field1 (DECIMAL (18, 2)); + OPTIONAL FIXED_LEN_BYTE_ARRAY (16) field2 (DECIMAL (38, 18)); + OPTIONAL BYTE_ARRAY field3 (DECIMAL (9)); + } + "; + test_round_trip(message_type).unwrap(); + } +} diff --git a/crates/polars-parquet/src/parquet/schema/io_thrift/to_thrift.rs b/crates/polars-parquet/src/parquet/schema/io_thrift/to_thrift.rs new file mode 100644 index 000000000000..27c9d886b2ef --- /dev/null +++ b/crates/polars-parquet/src/parquet/schema/io_thrift/to_thrift.rs @@ -0,0 +1,82 @@ +use parquet_format_safe::{ConvertedType, SchemaElement}; + +use super::super::types::ParquetType; +use crate::parquet::schema::types::PrimitiveType; + +impl ParquetType { + /// Method to convert to Thrift. + pub(crate) fn to_thrift(&self) -> Vec { + let mut elements: Vec = Vec::new(); + to_thrift_helper(self, &mut elements, true); + elements + } +} + +/// Constructs list of `SchemaElement` from the schema using depth-first traversal. +/// Here we assume that schema is always valid and starts with group type. 
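+///
+/// Illustrative sketch (not part of the original docs), using `from_message` from this
+/// crate to build the input:
+///
+/// ```ignore
+/// let root = from_message(
+///     "message root { required int32 a; optional group g { optional binary b; } }",
+/// )?;
+/// let elements = root.to_thrift();
+/// // Depth-first order: root, a, g, b.
+/// assert_eq!(elements.len(), 4);
+/// assert_eq!(elements[0].num_children, Some(2)); // the root has two children
+/// assert_eq!(elements[2].num_children, Some(1)); // the group `g` has one child
+/// ```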
+fn to_thrift_helper(schema: &ParquetType, elements: &mut Vec, is_root: bool) { + match schema { + ParquetType::PrimitiveType(PrimitiveType { + field_info, + logical_type, + converted_type, + physical_type, + }) => { + let (type_, type_length) = (*physical_type).into(); + let (converted_type, maybe_decimal) = converted_type + .map(|x| x.into()) + .map(|x: (ConvertedType, Option<(i32, i32)>)| (Some(x.0), x.1)) + .unwrap_or((None, None)); + + let element = SchemaElement { + type_: Some(type_), + type_length, + repetition_type: Some(field_info.repetition.into()), + name: field_info.name.clone(), + num_children: None, + converted_type, + precision: maybe_decimal.map(|x| x.0), + scale: maybe_decimal.map(|x| x.1), + field_id: field_info.id, + logical_type: logical_type.map(|x| x.into()), + }; + + elements.push(element); + }, + ParquetType::GroupType { + field_info, + fields, + logical_type, + converted_type, + } => { + let converted_type = converted_type.map(|x| x.into()); + + let repetition_type = if is_root { + // https://github.com/apache/parquet-format/blob/7f06e838cbd1b7dbd722ff2580b9c2525e37fc46/src/main/thrift/parquet.thrift#L363 + None + } else { + Some(field_info.repetition) + }; + + let element = SchemaElement { + type_: None, + type_length: None, + repetition_type: repetition_type.map(|x| x.into()), + name: field_info.name.clone(), + num_children: Some(fields.len() as i32), + converted_type, + scale: None, + precision: None, + field_id: field_info.id, + logical_type: logical_type.map(|x| x.into()), + }; + + elements.push(element); + + // Add child elements for a group + for field in fields { + to_thrift_helper(field, elements, false); + } + }, + } +} diff --git a/crates/polars-parquet/src/parquet/schema/mod.rs b/crates/polars-parquet/src/parquet/schema/mod.rs new file mode 100644 index 000000000000..af1918afa7f9 --- /dev/null +++ b/crates/polars-parquet/src/parquet/schema/mod.rs @@ -0,0 +1,7 @@ +pub use super::thrift_format::SchemaElement; +pub use crate::parquet::parquet_bridge::Repetition; + +pub mod io_message; +pub mod io_thrift; + +pub mod types; diff --git a/crates/polars-parquet/src/parquet/schema/types/basic_type.rs b/crates/polars-parquet/src/parquet/schema/types/basic_type.rs new file mode 100644 index 000000000000..b3697fcaa1c3 --- /dev/null +++ b/crates/polars-parquet/src/parquet/schema/types/basic_type.rs @@ -0,0 +1,16 @@ +#[cfg(feature = "serde_types")] +use serde::{Deserialize, Serialize}; + +use super::super::Repetition; + +/// Common type information. +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] +pub struct FieldInfo { + /// The field name + pub name: String, + /// The repetition + pub repetition: Repetition, + /// the optional id, to select fields by id + pub id: Option, +} diff --git a/crates/polars-parquet/src/parquet/schema/types/converted_type.rs b/crates/polars-parquet/src/parquet/schema/types/converted_type.rs new file mode 100644 index 000000000000..078d2324574c --- /dev/null +++ b/crates/polars-parquet/src/parquet/schema/types/converted_type.rs @@ -0,0 +1,238 @@ +use parquet_format_safe::ConvertedType; +#[cfg(feature = "serde_types")] +use serde::{Deserialize, Serialize}; + +use crate::parquet::error::Error; + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] +pub enum PrimitiveConvertedType { + Utf8, + /// an enum is converted into a binary field + Enum, + /// A decimal value. 
+    ///
+    /// This may be used to annotate binary or fixed primitive types. The
+    /// underlying byte array stores the unscaled value encoded as two's
+    /// complement using big-endian byte order (the most significant byte is the
+    /// zeroth element). The value of the decimal is the value * 10^{-scale}.
+    ///
+    /// This must be accompanied by a (maximum) precision and a scale in the
+    /// SchemaElement. The precision specifies the number of digits in the decimal
+    /// and the scale stores the location of the decimal point. For example 1.23
+    /// would have precision 3 (3 total digits) and scale 2 (the decimal point is
+    /// 2 digits over).
+    // (precision, scale)
+    Decimal(usize, usize),
+    /// A Date
+    ///
+    /// Stored as days since Unix epoch, encoded as the INT32 physical type.
+    ///
+    Date,
+    /// A time
+    ///
+    /// The total number of milliseconds since midnight. The value is stored
+    /// as an INT32 physical type.
+    TimeMillis,
+    /// A time.
+    ///
+    /// The total number of microseconds since midnight. The value is stored as
+    /// an INT64 physical type.
+    TimeMicros,
+    /// A date/time combination
+    ///
+    /// Date and time recorded as milliseconds since the Unix epoch. Recorded as
+    /// a physical type of INT64.
+    TimestampMillis,
+    /// A date/time combination
+    ///
+    /// Date and time recorded as microseconds since the Unix epoch. The value is
+    /// stored as an INT64 physical type.
+    TimestampMicros,
+    /// An unsigned integer value.
+    ///
+    /// The number describes the maximum number of meaningful data bits in
+    /// the stored value. 8, 16 and 32 bit values are stored using the
+    /// INT32 physical type. 64 bit values are stored using the INT64
+    /// physical type.
+    ///
+    Uint8,
+    Uint16,
+    Uint32,
+    Uint64,
+    /// A signed integer value.
+    ///
+    /// The number describes the maximum number of meaningful data bits in
+    /// the stored value. 8, 16 and 32 bit values are stored using the
+    /// INT32 physical type. 64 bit values are stored using the INT64
+    /// physical type.
+    ///
+    Int8,
+    Int16,
+    Int32,
+    Int64,
+    /// An embedded JSON document
+    ///
+    /// A JSON document embedded within a single UTF8 column.
+    Json,
+    /// An embedded BSON document
+    ///
+    /// A BSON document embedded within a single BINARY column.
+    Bson,
+    /// An interval of time
+    ///
+    /// This type annotates data stored as a FIXED_LEN_BYTE_ARRAY of length 12.
+    /// This data is composed of three separate little endian unsigned
+    /// integers. Each stores a component of a duration of time. The first
+    /// integer identifies the number of months associated with the duration,
+    /// the second identifies the number of days associated with the duration
+    /// and the third identifies the number of milliseconds associated with
+    /// the provided duration. This duration of time is independent of any
+    /// particular timezone or date.
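+    ///
+    /// Illustrative sketch (not part of the original docs): decoding the three
+    /// little-endian components from such a 12-byte value (assuming `data: &[u8]`
+    /// with `data.len() == 12`):
+    ///
+    /// ```ignore
+    /// let months = u32::from_le_bytes(data[0..4].try_into().unwrap());
+    /// let days = u32::from_le_bytes(data[4..8].try_into().unwrap());
+    /// let millis = u32::from_le_bytes(data[8..12].try_into().unwrap());
+    /// ```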
+ Interval, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] +pub enum GroupConvertedType { + /// a map is converted as an optional field containing a repeated key/value pair + Map, + /// a key/value pair is converted into a group of two fields + MapKeyValue, + /// a list is converted into an optional field containing a repeated field for its + /// values + List, +} + +impl TryFrom<(ConvertedType, Option<(i32, i32)>)> for PrimitiveConvertedType { + type Error = Error; + + fn try_from( + (ty, maybe_decimal): (ConvertedType, Option<(i32, i32)>), + ) -> Result { + use PrimitiveConvertedType::*; + Ok(match ty { + ConvertedType::UTF8 => Utf8, + ConvertedType::ENUM => Enum, + ConvertedType::DECIMAL => { + if let Some((precision, scale)) = maybe_decimal { + Decimal(precision.try_into()?, scale.try_into()?) + } else { + return Err(Error::oos("Decimal requires a precision and scale")); + } + }, + ConvertedType::DATE => Date, + ConvertedType::TIME_MILLIS => TimeMillis, + ConvertedType::TIME_MICROS => TimeMicros, + ConvertedType::TIMESTAMP_MILLIS => TimestampMillis, + ConvertedType::TIMESTAMP_MICROS => TimestampMicros, + ConvertedType::UINT_8 => Uint8, + ConvertedType::UINT_16 => Uint16, + ConvertedType::UINT_32 => Uint32, + ConvertedType::UINT_64 => Uint64, + ConvertedType::INT_8 => Int8, + ConvertedType::INT_16 => Int16, + ConvertedType::INT_32 => Int32, + ConvertedType::INT_64 => Int64, + ConvertedType::JSON => Json, + ConvertedType::BSON => Bson, + ConvertedType::INTERVAL => Interval, + _ => { + return Err(Error::oos(format!( + "Converted type \"{:?}\" cannot be applied to a primitive type", + ty + ))) + }, + }) + } +} + +impl TryFrom for GroupConvertedType { + type Error = Error; + + fn try_from(type_: ConvertedType) -> Result { + Ok(match type_ { + ConvertedType::LIST => GroupConvertedType::List, + ConvertedType::MAP => GroupConvertedType::Map, + ConvertedType::MAP_KEY_VALUE => GroupConvertedType::MapKeyValue, + _ => return Err(Error::oos("LogicalType value out of range")), + }) + } +} + +impl From for ConvertedType { + fn from(type_: GroupConvertedType) -> Self { + match type_ { + GroupConvertedType::Map => ConvertedType::MAP, + GroupConvertedType::List => ConvertedType::LIST, + GroupConvertedType::MapKeyValue => ConvertedType::MAP_KEY_VALUE, + } + } +} + +impl From for (ConvertedType, Option<(i32, i32)>) { + fn from(ty: PrimitiveConvertedType) -> Self { + use PrimitiveConvertedType::*; + match ty { + Utf8 => (ConvertedType::UTF8, None), + Enum => (ConvertedType::ENUM, None), + Decimal(precision, scale) => ( + ConvertedType::DECIMAL, + Some((precision as i32, scale as i32)), + ), + Date => (ConvertedType::DATE, None), + TimeMillis => (ConvertedType::TIME_MILLIS, None), + TimeMicros => (ConvertedType::TIME_MICROS, None), + TimestampMillis => (ConvertedType::TIMESTAMP_MILLIS, None), + TimestampMicros => (ConvertedType::TIMESTAMP_MICROS, None), + Uint8 => (ConvertedType::UINT_8, None), + Uint16 => (ConvertedType::UINT_16, None), + Uint32 => (ConvertedType::UINT_32, None), + Uint64 => (ConvertedType::UINT_64, None), + Int8 => (ConvertedType::INT_8, None), + Int16 => (ConvertedType::INT_16, None), + Int32 => (ConvertedType::INT_32, None), + Int64 => (ConvertedType::INT_64, None), + Json => (ConvertedType::JSON, None), + Bson => (ConvertedType::BSON, None), + Interval => (ConvertedType::INTERVAL, None), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn round_trip() -> Result<(), Error> { + use 
PrimitiveConvertedType::*; + let a = vec![ + Utf8, + Enum, + Decimal(3, 1), + Date, + TimeMillis, + TimeMicros, + TimestampMillis, + TimestampMicros, + Uint8, + Uint16, + Uint32, + Uint64, + Int8, + Int16, + Int32, + Int64, + Json, + Bson, + Interval, + ]; + for a in a { + let (c, d): (ConvertedType, Option<(i32, i32)>) = a.into(); + let e: PrimitiveConvertedType = (c, d).try_into()?; + assert_eq!(e, a); + } + Ok(()) + } +} diff --git a/crates/polars-parquet/src/parquet/schema/types/mod.rs b/crates/polars-parquet/src/parquet/schema/types/mod.rs new file mode 100644 index 000000000000..0516d75069bb --- /dev/null +++ b/crates/polars-parquet/src/parquet/schema/types/mod.rs @@ -0,0 +1,17 @@ +mod spec; + +mod physical_type; +pub use physical_type::*; + +mod basic_type; +pub use basic_type::*; + +mod converted_type; +pub use converted_type::*; + +mod parquet_type; +pub use parquet_type::*; + +pub use crate::parquet::parquet_bridge::{ + GroupLogicalType, IntegerType, PrimitiveLogicalType, TimeUnit, +}; diff --git a/crates/polars-parquet/src/parquet/schema/types/parquet_type.rs b/crates/polars-parquet/src/parquet/schema/types/parquet_type.rs new file mode 100644 index 000000000000..010f23ccde3a --- /dev/null +++ b/crates/polars-parquet/src/parquet/schema/types/parquet_type.rs @@ -0,0 +1,206 @@ +// see https://github.com/apache/parquet-format/blob/master/LogicalTypes.md +use polars_utils::aliases::*; +#[cfg(feature = "serde_types")] +use serde::{Deserialize, Serialize}; + +use super::super::Repetition; +use super::{ + spec, FieldInfo, GroupConvertedType, GroupLogicalType, PhysicalType, PrimitiveConvertedType, + PrimitiveLogicalType, +}; +use crate::parquet::error::Result; + +/// The complete description of a parquet column +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] +pub struct PrimitiveType { + /// The fields' generic information + pub field_info: FieldInfo, + /// The optional logical type + pub logical_type: Option, + /// The optional converted type + pub converted_type: Option, + /// The physical type + pub physical_type: PhysicalType, +} + +impl PrimitiveType { + /// Helper method to create an optional field with no logical or converted types. + pub fn from_physical(name: String, physical_type: PhysicalType) -> Self { + let field_info = FieldInfo { + name, + repetition: Repetition::Optional, + id: None, + }; + Self { + field_info, + converted_type: None, + logical_type: None, + physical_type, + } + } +} + +/// Representation of a Parquet type describing primitive and nested fields, +/// including the top-level schema of the parquet file. +#[derive(Clone, Debug, PartialEq)] +#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] +pub enum ParquetType { + PrimitiveType(PrimitiveType), + GroupType { + field_info: FieldInfo, + logical_type: Option, + converted_type: Option, + fields: Vec, + }, +} + +/// Accessors +impl ParquetType { + /// Returns [`FieldInfo`] information about the type. + pub fn get_field_info(&self) -> &FieldInfo { + match self { + Self::PrimitiveType(primitive) => &primitive.field_info, + Self::GroupType { field_info, .. } => field_info, + } + } + + /// Returns this type's field name. + pub fn name(&self) -> &str { + &self.get_field_info().name + } + + /// Checks if `sub_type` schema is part of current schema. + /// This method can be used to check if projected columns are part of the root schema. 
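+    ///
+    /// Illustrative sketch (not part of the original docs), assuming `from_message`
+    /// from this crate is in scope:
+    ///
+    /// ```ignore
+    /// let full = from_message("message root { required int32 a; optional binary b (UTF8); }")?;
+    /// let projection = from_message("message root { required int32 a; }")?;
+    /// assert!(full.check_contains(&projection));
+    /// ```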
+ pub fn check_contains(&self, sub_type: &ParquetType) -> bool { + let basic_match = self.get_field_info() == sub_type.get_field_info(); + + match (self, sub_type) { + ( + Self::PrimitiveType(PrimitiveType { physical_type, .. }), + Self::PrimitiveType(PrimitiveType { + physical_type: other_physical_type, + .. + }), + ) => basic_match && physical_type == other_physical_type, + ( + Self::GroupType { fields, .. }, + Self::GroupType { + fields: other_fields, + .. + }, + ) => { + // build hashmap of name -> Type + let mut field_map = PlHashMap::new(); + for field in fields { + field_map.insert(field.name(), field); + } + + for field in other_fields { + if !field_map + .get(field.name()) + .map(|tpe| tpe.check_contains(field)) + .unwrap_or(false) + { + return false; + } + } + true + }, + _ => false, + } + } +} + +/// Constructors +impl ParquetType { + pub(crate) fn new_root(name: String, fields: Vec) -> Self { + let field_info = FieldInfo { + name, + repetition: Repetition::Optional, + id: None, + }; + ParquetType::GroupType { + field_info, + fields, + logical_type: None, + converted_type: None, + } + } + + pub fn from_converted( + name: String, + fields: Vec, + repetition: Repetition, + converted_type: Option, + id: Option, + ) -> Self { + let field_info = FieldInfo { + name, + repetition, + id, + }; + + ParquetType::GroupType { + field_info, + fields, + converted_type, + logical_type: None, + } + } + + /// # Error + /// Errors iff the combination of physical, logical and converted type is not valid. + pub fn try_from_primitive( + name: String, + physical_type: PhysicalType, + repetition: Repetition, + converted_type: Option, + logical_type: Option, + id: Option, + ) -> Result { + spec::check_converted_invariants(&physical_type, &converted_type)?; + spec::check_logical_invariants(&physical_type, &logical_type)?; + + let field_info = FieldInfo { + name, + repetition, + id, + }; + + Ok(ParquetType::PrimitiveType(PrimitiveType { + field_info, + converted_type, + logical_type, + physical_type, + })) + } + + /// Helper method to create a [`ParquetType::PrimitiveType`] optional field + /// with no logical or converted types. 
+ pub fn from_physical(name: String, physical_type: PhysicalType) -> Self { + ParquetType::PrimitiveType(PrimitiveType::from_physical(name, physical_type)) + } + + pub fn from_group( + name: String, + repetition: Repetition, + converted_type: Option, + logical_type: Option, + fields: Vec, + id: Option, + ) -> Self { + let field_info = FieldInfo { + name, + repetition, + id, + }; + + ParquetType::GroupType { + field_info, + logical_type, + converted_type, + fields, + } + } +} diff --git a/crates/polars-parquet/src/parquet/schema/types/physical_type.rs b/crates/polars-parquet/src/parquet/schema/types/physical_type.rs new file mode 100644 index 000000000000..ad576935a049 --- /dev/null +++ b/crates/polars-parquet/src/parquet/schema/types/physical_type.rs @@ -0,0 +1,58 @@ +use parquet_format_safe::Type; +#[cfg(feature = "serde_types")] +use serde::{Deserialize, Serialize}; + +use crate::parquet::error::Error; + +/// The set of all physical types representable in Parquet +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] +pub enum PhysicalType { + Boolean, + Int32, + Int64, + Int96, + Float, + Double, + ByteArray, + FixedLenByteArray(usize), +} + +impl TryFrom<(Type, Option)> for PhysicalType { + type Error = Error; + + fn try_from((type_, length): (Type, Option)) -> Result { + Ok(match type_ { + Type::BOOLEAN => PhysicalType::Boolean, + Type::INT32 => PhysicalType::Int32, + Type::INT64 => PhysicalType::Int64, + Type::INT96 => PhysicalType::Int96, + Type::FLOAT => PhysicalType::Float, + Type::DOUBLE => PhysicalType::Double, + Type::BYTE_ARRAY => PhysicalType::ByteArray, + Type::FIXED_LEN_BYTE_ARRAY => { + let length = length + .ok_or_else(|| Error::oos("Length must be defined for FixedLenByteArray"))?; + PhysicalType::FixedLenByteArray(length.try_into()?) 
+ }, + _ => return Err(Error::oos("Unknown type")), + }) + } +} + +impl From for (Type, Option) { + fn from(physical_type: PhysicalType) -> Self { + match physical_type { + PhysicalType::Boolean => (Type::BOOLEAN, None), + PhysicalType::Int32 => (Type::INT32, None), + PhysicalType::Int64 => (Type::INT64, None), + PhysicalType::Int96 => (Type::INT96, None), + PhysicalType::Float => (Type::FLOAT, None), + PhysicalType::Double => (Type::DOUBLE, None), + PhysicalType::ByteArray => (Type::BYTE_ARRAY, None), + PhysicalType::FixedLenByteArray(length) => { + (Type::FIXED_LEN_BYTE_ARRAY, Some(length as i32)) + }, + } + } +} diff --git a/crates/polars-parquet/src/parquet/schema/types/spec.rs b/crates/polars-parquet/src/parquet/schema/types/spec.rs new file mode 100644 index 000000000000..806048bb5065 --- /dev/null +++ b/crates/polars-parquet/src/parquet/schema/types/spec.rs @@ -0,0 +1,181 @@ +// see https://github.com/apache/parquet-format/blob/master/LogicalTypes.md +use super::{IntegerType, PhysicalType, PrimitiveConvertedType, PrimitiveLogicalType, TimeUnit}; +use crate::parquet::error::{Error, Result}; + +fn check_decimal_invariants( + physical_type: &PhysicalType, + precision: usize, + scale: usize, +) -> Result<()> { + if precision < 1 { + return Err(Error::oos(format!( + "DECIMAL precision must be larger than 0; It is {}", + precision, + ))); + } + if scale > precision { + return Err(Error::oos(format!( + "Invalid DECIMAL: scale ({}) cannot be greater than precision \ + ({})", + scale, precision + ))); + } + + match physical_type { + PhysicalType::Int32 => { + if !(1..=9).contains(&precision) { + return Err(Error::oos(format!( + "Cannot represent INT32 as DECIMAL with precision {}", + precision + ))); + } + }, + PhysicalType::Int64 => { + if !(1..=18).contains(&precision) { + return Err(Error::oos(format!( + "Cannot represent INT64 as DECIMAL with precision {}", + precision + ))); + } + }, + PhysicalType::FixedLenByteArray(length) => { + let oos_error = || Error::oos(format!("Byte Array length {} out of spec", length)); + let max_precision = (2f64.powi( + (*length as i32) + .checked_mul(8) + .ok_or_else(oos_error)? + .checked_sub(1) + .ok_or_else(oos_error)?, + ) - 1f64) + .log10() + .floor() as usize; + + if precision > max_precision { + return Err(Error::oos(format!( + "Cannot represent FIXED_LEN_BYTE_ARRAY as DECIMAL with length {} and \ + precision {}. 
The max precision can only be {}", + length, precision, max_precision + ))); + } + }, + PhysicalType::ByteArray => {}, + _ => { + return Err(Error::oos( + "DECIMAL can only annotate INT32, INT64, BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY" + .to_string(), + )) + }, + }; + Ok(()) +} + +pub fn check_converted_invariants( + physical_type: &PhysicalType, + converted_type: &Option, +) -> Result<()> { + if converted_type.is_none() { + return Ok(()); + }; + let converted_type = converted_type.as_ref().unwrap(); + + use PrimitiveConvertedType::*; + match converted_type { + Utf8 | Bson | Json => { + if physical_type != &PhysicalType::ByteArray { + return Err(Error::oos(format!( + "{:?} can only annotate BYTE_ARRAY fields", + converted_type + ))); + } + }, + Decimal(precision, scale) => { + check_decimal_invariants(physical_type, *precision, *scale)?; + }, + Date | TimeMillis | Uint8 | Uint16 | Uint32 | Int8 | Int16 | Int32 => { + if physical_type != &PhysicalType::Int32 { + return Err(Error::oos(format!( + "{:?} can only annotate INT32", + converted_type + ))); + } + }, + TimeMicros | TimestampMillis | TimestampMicros | Uint64 | Int64 => { + if physical_type != &PhysicalType::Int64 { + return Err(Error::oos(format!( + "{:?} can only annotate INT64", + converted_type + ))); + } + }, + Interval => { + if physical_type != &PhysicalType::FixedLenByteArray(12) { + return Err(Error::oos( + "INTERVAL can only annotate FIXED_LEN_BYTE_ARRAY(12)".to_string(), + )); + } + }, + Enum => { + if physical_type != &PhysicalType::ByteArray { + return Err(Error::oos( + "ENUM can only annotate BYTE_ARRAY fields".to_string(), + )); + } + }, + }; + Ok(()) +} + +pub fn check_logical_invariants( + physical_type: &PhysicalType, + logical_type: &Option, +) -> Result<()> { + if logical_type.is_none() { + return Ok(()); + }; + let logical_type = logical_type.unwrap(); + + // Check that logical type and physical type are compatible + use PrimitiveLogicalType::*; + match (logical_type, physical_type) { + (Enum, PhysicalType::ByteArray) => {}, + (Decimal(precision, scale), _) => { + check_decimal_invariants(physical_type, precision, scale)?; + }, + (Date, PhysicalType::Int32) => {}, + ( + Time { + unit: TimeUnit::Milliseconds, + .. + }, + PhysicalType::Int32, + ) => {}, + (Time { unit, .. }, PhysicalType::Int64) => { + if unit == TimeUnit::Milliseconds { + return Err(Error::oos( + "Cannot use millisecond unit on INT64 type".to_string(), + )); + } + }, + (Timestamp { .. 
}, PhysicalType::Int64) => {}, + (Integer(IntegerType::Int8), PhysicalType::Int32) => {}, + (Integer(IntegerType::Int16), PhysicalType::Int32) => {}, + (Integer(IntegerType::Int32), PhysicalType::Int32) => {}, + (Integer(IntegerType::UInt8), PhysicalType::Int32) => {}, + (Integer(IntegerType::UInt16), PhysicalType::Int32) => {}, + (Integer(IntegerType::UInt32), PhysicalType::Int32) => {}, + (Integer(IntegerType::UInt64), PhysicalType::Int64) => {}, + (Integer(IntegerType::Int64), PhysicalType::Int64) => {}, + // Null type + (Unknown, PhysicalType::Int32) => {}, + (String | Json | Bson, PhysicalType::ByteArray) => {}, + // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#uuid + (Uuid, PhysicalType::FixedLenByteArray(16)) => {}, + (a, b) => { + return Err(Error::oos(format!( + "Cannot annotate {:?} from {:?} fields", + a, b + ))) + }, + }; + Ok(()) +} diff --git a/crates/polars-parquet/src/parquet/statistics/binary.rs b/crates/polars-parquet/src/parquet/statistics/binary.rs new file mode 100644 index 000000000000..1f599d2fc0e1 --- /dev/null +++ b/crates/polars-parquet/src/parquet/statistics/binary.rs @@ -0,0 +1,51 @@ +use std::sync::Arc; + +use parquet_format_safe::Statistics as ParquetStatistics; + +use super::Statistics; +use crate::parquet::error::Result; +use crate::parquet::schema::types::{PhysicalType, PrimitiveType}; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct BinaryStatistics { + pub primitive_type: PrimitiveType, + pub null_count: Option, + pub distinct_count: Option, + pub max_value: Option>, + pub min_value: Option>, +} + +impl Statistics for BinaryStatistics { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn physical_type(&self) -> &PhysicalType { + &PhysicalType::ByteArray + } + + fn null_count(&self) -> Option { + self.null_count + } +} + +pub fn read(v: &ParquetStatistics, primitive_type: PrimitiveType) -> Result> { + Ok(Arc::new(BinaryStatistics { + primitive_type, + null_count: v.null_count, + distinct_count: v.distinct_count, + max_value: v.max_value.clone(), + min_value: v.min_value.clone(), + })) +} + +pub fn write(v: &BinaryStatistics) -> ParquetStatistics { + ParquetStatistics { + null_count: v.null_count, + distinct_count: v.distinct_count, + max_value: v.max_value.clone(), + min_value: v.min_value.clone(), + min: None, + max: None, + } +} diff --git a/crates/polars-parquet/src/parquet/statistics/boolean.rs b/crates/polars-parquet/src/parquet/statistics/boolean.rs new file mode 100644 index 000000000000..c167341073f2 --- /dev/null +++ b/crates/polars-parquet/src/parquet/statistics/boolean.rs @@ -0,0 +1,72 @@ +use std::sync::Arc; + +use parquet_format_safe::Statistics as ParquetStatistics; + +use super::Statistics; +use crate::parquet::error::{Error, Result}; +use crate::parquet::schema::types::PhysicalType; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct BooleanStatistics { + pub null_count: Option, + pub distinct_count: Option, + pub max_value: Option, + pub min_value: Option, +} + +impl Statistics for BooleanStatistics { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn physical_type(&self) -> &PhysicalType { + &PhysicalType::Boolean + } + + fn null_count(&self) -> Option { + self.null_count + } +} + +pub fn read(v: &ParquetStatistics) -> Result> { + if let Some(ref v) = v.max_value { + if v.len() != std::mem::size_of::() { + return Err(Error::oos( + "The max_value of statistics MUST be plain encoded", + )); + } + }; + if let Some(ref v) = v.min_value { + if v.len() != std::mem::size_of::() { + return 
Err(Error::oos( + "The min_value of statistics MUST be plain encoded", + )); + } + }; + + Ok(Arc::new(BooleanStatistics { + null_count: v.null_count, + distinct_count: v.distinct_count, + max_value: v + .max_value + .as_ref() + .and_then(|x| x.first()) + .map(|x| *x != 0), + min_value: v + .min_value + .as_ref() + .and_then(|x| x.first()) + .map(|x| *x != 0), + })) +} + +pub fn write(v: &BooleanStatistics) -> ParquetStatistics { + ParquetStatistics { + null_count: v.null_count, + distinct_count: v.distinct_count, + max_value: v.max_value.map(|x| vec![x as u8]), + min_value: v.min_value.map(|x| vec![x as u8]), + min: None, + max: None, + } +} diff --git a/crates/polars-parquet/src/parquet/statistics/fixed_len_binary.rs b/crates/polars-parquet/src/parquet/statistics/fixed_len_binary.rs new file mode 100644 index 000000000000..6def092b7edc --- /dev/null +++ b/crates/polars-parquet/src/parquet/statistics/fixed_len_binary.rs @@ -0,0 +1,76 @@ +use std::sync::Arc; + +use parquet_format_safe::Statistics as ParquetStatistics; + +use super::Statistics; +use crate::parquet::error::{Error, Result}; +use crate::parquet::schema::types::{PhysicalType, PrimitiveType}; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FixedLenStatistics { + pub primitive_type: PrimitiveType, + pub null_count: Option, + pub distinct_count: Option, + pub max_value: Option>, + pub min_value: Option>, +} + +impl Statistics for FixedLenStatistics { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn physical_type(&self) -> &PhysicalType { + &self.primitive_type.physical_type + } + + fn null_count(&self) -> Option { + self.null_count + } +} + +pub fn read( + v: &ParquetStatistics, + size: usize, + primitive_type: PrimitiveType, +) -> Result> { + if let Some(ref v) = v.max_value { + if v.len() != size { + return Err(Error::oos( + "The max_value of statistics MUST be plain encoded", + )); + } + }; + if let Some(ref v) = v.min_value { + if v.len() != size { + return Err(Error::oos( + "The min_value of statistics MUST be plain encoded", + )); + } + }; + + Ok(Arc::new(FixedLenStatistics { + primitive_type, + null_count: v.null_count, + distinct_count: v.distinct_count, + max_value: v.max_value.clone().map(|mut x| { + x.truncate(size); + x + }), + min_value: v.min_value.clone().map(|mut x| { + x.truncate(size); + x + }), + })) +} + +pub fn write(v: &FixedLenStatistics) -> ParquetStatistics { + ParquetStatistics { + null_count: v.null_count, + distinct_count: v.distinct_count, + max_value: v.max_value.clone(), + min_value: v.min_value.clone(), + min: None, + max: None, + } +} diff --git a/crates/polars-parquet/src/parquet/statistics/mod.rs b/crates/polars-parquet/src/parquet/statistics/mod.rs new file mode 100644 index 000000000000..7451ac753135 --- /dev/null +++ b/crates/polars-parquet/src/parquet/statistics/mod.rs @@ -0,0 +1,134 @@ +mod binary; +mod boolean; +mod fixed_len_binary; +mod primitive; + +use std::any::Any; +use std::sync::Arc; + +pub use binary::BinaryStatistics; +pub use boolean::BooleanStatistics; +pub use fixed_len_binary::FixedLenStatistics; +pub use primitive::PrimitiveStatistics; + +use crate::parquet::error::Result; +use crate::parquet::schema::types::{PhysicalType, PrimitiveType}; +pub use crate::parquet::thrift_format::Statistics as ParquetStatistics; + +/// A trait used to describe specific statistics. Each physical type has its own struct. +/// Match the [`Statistics::physical_type`] to each type and downcast accordingly. 
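+///
+/// Illustrative sketch (not part of the original docs): reading the minimum of an
+/// INT32 column by matching on the physical type and downcasting:
+///
+/// ```ignore
+/// fn min_i32(stats: &dyn Statistics) -> Option<i32> {
+///     match stats.physical_type() {
+///         PhysicalType::Int32 => stats
+///             .as_any()
+///             .downcast_ref::<PrimitiveStatistics<i32>>()
+///             .and_then(|s| s.min_value),
+///         _ => None,
+///     }
+/// }
+/// ```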
+pub trait Statistics: Send + Sync + std::fmt::Debug { + fn as_any(&self) -> &dyn Any; + + fn physical_type(&self) -> &PhysicalType; + + fn null_count(&self) -> Option; +} + +impl PartialEq for &dyn Statistics { + fn eq(&self, other: &Self) -> bool { + self.physical_type() == other.physical_type() && { + match self.physical_type() { + PhysicalType::Boolean => { + self.as_any().downcast_ref::().unwrap() + == other.as_any().downcast_ref::().unwrap() + }, + PhysicalType::Int32 => { + self.as_any() + .downcast_ref::>() + .unwrap() + == other + .as_any() + .downcast_ref::>() + .unwrap() + }, + PhysicalType::Int64 => { + self.as_any() + .downcast_ref::>() + .unwrap() + == other + .as_any() + .downcast_ref::>() + .unwrap() + }, + PhysicalType::Int96 => { + self.as_any() + .downcast_ref::>() + .unwrap() + == other + .as_any() + .downcast_ref::>() + .unwrap() + }, + PhysicalType::Float => { + self.as_any() + .downcast_ref::>() + .unwrap() + == other + .as_any() + .downcast_ref::>() + .unwrap() + }, + PhysicalType::Double => { + self.as_any() + .downcast_ref::>() + .unwrap() + == other + .as_any() + .downcast_ref::>() + .unwrap() + }, + PhysicalType::ByteArray => { + self.as_any().downcast_ref::().unwrap() + == other.as_any().downcast_ref::().unwrap() + }, + PhysicalType::FixedLenByteArray(_) => { + self.as_any().downcast_ref::().unwrap() + == other.as_any().downcast_ref::().unwrap() + }, + } + } + } +} + +/// Deserializes a raw parquet statistics into [`Statistics`]. +/// # Error +/// This function errors if it is not possible to read the statistics to the +/// corresponding `physical_type`. +pub fn deserialize_statistics( + statistics: &ParquetStatistics, + primitive_type: PrimitiveType, +) -> Result> { + match primitive_type.physical_type { + PhysicalType::Boolean => boolean::read(statistics), + PhysicalType::Int32 => primitive::read::(statistics, primitive_type), + PhysicalType::Int64 => primitive::read::(statistics, primitive_type), + PhysicalType::Int96 => primitive::read::<[u32; 3]>(statistics, primitive_type), + PhysicalType::Float => primitive::read::(statistics, primitive_type), + PhysicalType::Double => primitive::read::(statistics, primitive_type), + PhysicalType::ByteArray => binary::read(statistics, primitive_type), + PhysicalType::FixedLenByteArray(size) => { + fixed_len_binary::read(statistics, size, primitive_type) + }, + } +} + +/// Serializes [`Statistics`] into a raw parquet statistics. 
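+///
+/// A minimal, illustrative round trip for boolean statistics (the values are made up
+/// for the example): the function dispatches on [`Statistics::physical_type`] and, for
+/// booleans, plain-encodes min/max as a single byte each.
+///
+/// ```ignore
+/// use crate::parquet::statistics::{serialize_statistics, BooleanStatistics};
+///
+/// let stats = BooleanStatistics {
+///     null_count: Some(0),
+///     distinct_count: None,
+///     min_value: Some(false),
+///     max_value: Some(true),
+/// };
+/// let thrift = serialize_statistics(&stats);
+/// assert_eq!(thrift.min_value, Some(vec![0u8]));
+/// assert_eq!(thrift.max_value, Some(vec![1u8]));
+/// ```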
+pub fn serialize_statistics(statistics: &dyn Statistics) -> ParquetStatistics { + match statistics.physical_type() { + PhysicalType::Boolean => boolean::write(statistics.as_any().downcast_ref().unwrap()), + PhysicalType::Int32 => primitive::write::(statistics.as_any().downcast_ref().unwrap()), + PhysicalType::Int64 => primitive::write::(statistics.as_any().downcast_ref().unwrap()), + PhysicalType::Int96 => { + primitive::write::<[u32; 3]>(statistics.as_any().downcast_ref().unwrap()) + }, + PhysicalType::Float => primitive::write::(statistics.as_any().downcast_ref().unwrap()), + PhysicalType::Double => { + primitive::write::(statistics.as_any().downcast_ref().unwrap()) + }, + PhysicalType::ByteArray => binary::write(statistics.as_any().downcast_ref().unwrap()), + PhysicalType::FixedLenByteArray(_) => { + fixed_len_binary::write(statistics.as_any().downcast_ref().unwrap()) + }, + } +} diff --git a/crates/polars-parquet/src/parquet/statistics/primitive.rs b/crates/polars-parquet/src/parquet/statistics/primitive.rs new file mode 100644 index 000000000000..17a927e9a1ac --- /dev/null +++ b/crates/polars-parquet/src/parquet/statistics/primitive.rs @@ -0,0 +1,70 @@ +use std::sync::Arc; + +use parquet_format_safe::Statistics as ParquetStatistics; + +use super::Statistics; +use crate::parquet::error::{Error, Result}; +use crate::parquet::schema::types::{PhysicalType, PrimitiveType}; +use crate::parquet::types; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PrimitiveStatistics { + pub primitive_type: PrimitiveType, + pub null_count: Option, + pub distinct_count: Option, + pub min_value: Option, + pub max_value: Option, +} + +impl Statistics for PrimitiveStatistics { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn physical_type(&self) -> &PhysicalType { + &T::TYPE + } + + fn null_count(&self) -> Option { + self.null_count + } +} + +pub fn read( + v: &ParquetStatistics, + primitive_type: PrimitiveType, +) -> Result> { + if let Some(ref v) = v.max_value { + if v.len() != std::mem::size_of::() { + return Err(Error::oos( + "The max_value of statistics MUST be plain encoded", + )); + } + }; + if let Some(ref v) = v.min_value { + if v.len() != std::mem::size_of::() { + return Err(Error::oos( + "The min_value of statistics MUST be plain encoded", + )); + } + }; + + Ok(Arc::new(PrimitiveStatistics:: { + primitive_type, + null_count: v.null_count, + distinct_count: v.distinct_count, + max_value: v.max_value.as_ref().map(|x| types::decode(x)), + min_value: v.min_value.as_ref().map(|x| types::decode(x)), + })) +} + +pub fn write(v: &PrimitiveStatistics) -> ParquetStatistics { + ParquetStatistics { + null_count: v.null_count, + distinct_count: v.distinct_count, + max_value: v.max_value.map(|x| x.to_le_bytes().as_ref().to_vec()), + min_value: v.min_value.map(|x| x.to_le_bytes().as_ref().to_vec()), + min: None, + max: None, + } +} diff --git a/crates/polars-parquet/src/parquet/types.rs b/crates/polars-parquet/src/parquet/types.rs new file mode 100644 index 000000000000..59f6c71dc7ab --- /dev/null +++ b/crates/polars-parquet/src/parquet/types.rs @@ -0,0 +1,141 @@ +use std::convert::TryFrom; + +use crate::parquet::schema::types::PhysicalType; + +/// A physical native representation of a Parquet fixed-sized type. 
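+///
+/// As a rough illustration (a sketch, not part of this patch), implementors map a Rust
+/// native to its Parquet physical type and to its little-endian byte representation:
+///
+/// ```ignore
+/// use crate::parquet::schema::types::PhysicalType;
+/// use crate::parquet::types::NativeType;
+///
+/// assert_eq!(<i32 as NativeType>::TYPE, PhysicalType::Int32);
+/// assert_eq!(<i32 as NativeType>::to_le_bytes(&1i32), [1u8, 0, 0, 0]);
+/// ```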
+pub trait NativeType: std::fmt::Debug + Send + Sync + 'static + Copy + Clone { + type Bytes: AsRef<[u8]> + for<'a> TryFrom<&'a [u8], Error = std::array::TryFromSliceError>; + + fn to_le_bytes(&self) -> Self::Bytes; + + fn from_le_bytes(bytes: Self::Bytes) -> Self; + + fn ord(&self, other: &Self) -> std::cmp::Ordering; + + const TYPE: PhysicalType; +} + +macro_rules! native { + ($type:ty, $physical_type:expr) => { + impl NativeType for $type { + type Bytes = [u8; std::mem::size_of::()]; + #[inline] + fn to_le_bytes(&self) -> Self::Bytes { + Self::to_le_bytes(*self) + } + + #[inline] + fn from_le_bytes(bytes: Self::Bytes) -> Self { + Self::from_le_bytes(bytes) + } + + #[inline] + fn ord(&self, other: &Self) -> std::cmp::Ordering { + self.partial_cmp(other).unwrap_or(std::cmp::Ordering::Equal) + } + + const TYPE: PhysicalType = $physical_type; + } + }; +} + +native!(i32, PhysicalType::Int32); +native!(i64, PhysicalType::Int64); +native!(f32, PhysicalType::Float); +native!(f64, PhysicalType::Double); + +impl NativeType for [u32; 3] { + const TYPE: PhysicalType = PhysicalType::Int96; + + type Bytes = [u8; std::mem::size_of::()]; + #[inline] + fn to_le_bytes(&self) -> Self::Bytes { + let mut bytes = [0; 12]; + let first = self[0].to_le_bytes(); + bytes[0] = first[0]; + bytes[1] = first[1]; + bytes[2] = first[2]; + bytes[3] = first[3]; + let second = self[1].to_le_bytes(); + bytes[4] = second[0]; + bytes[5] = second[1]; + bytes[6] = second[2]; + bytes[7] = second[3]; + let third = self[2].to_le_bytes(); + bytes[8] = third[0]; + bytes[9] = third[1]; + bytes[10] = third[2]; + bytes[11] = third[3]; + bytes + } + + #[inline] + fn from_le_bytes(bytes: Self::Bytes) -> Self { + let mut first = [0; 4]; + first[0] = bytes[0]; + first[1] = bytes[1]; + first[2] = bytes[2]; + first[3] = bytes[3]; + let mut second = [0; 4]; + second[0] = bytes[4]; + second[1] = bytes[5]; + second[2] = bytes[6]; + second[3] = bytes[7]; + let mut third = [0; 4]; + third[0] = bytes[8]; + third[1] = bytes[9]; + third[2] = bytes[10]; + third[3] = bytes[11]; + [ + u32::from_le_bytes(first), + u32::from_le_bytes(second), + u32::from_le_bytes(third), + ] + } + + #[inline] + fn ord(&self, other: &Self) -> std::cmp::Ordering { + int96_to_i64_ns(*self).ord(&int96_to_i64_ns(*other)) + } +} + +#[inline] +pub fn int96_to_i64_ns(value: [u32; 3]) -> i64 { + const JULIAN_DAY_OF_EPOCH: i64 = 2_440_588; + const SECONDS_PER_DAY: i64 = 86_400; + const NANOS_PER_SECOND: i64 = 1_000_000_000; + + let day = value[2] as i64; + let nanoseconds = ((value[1] as i64) << 32) + value[0] as i64; + let seconds = (day - JULIAN_DAY_OF_EPOCH) * SECONDS_PER_DAY; + + seconds * NANOS_PER_SECOND + nanoseconds +} + +/// Returns the ordering of two binary values. 
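+///
+/// For example (illustrative only): an empty slice orders before any non-empty slice,
+/// otherwise bytes are compared from the left.
+///
+/// ```ignore
+/// use std::cmp::Ordering;
+///
+/// assert_eq!(ord_binary(&[], &[1]), Ordering::Less);
+/// assert_eq!(ord_binary(&[1, 2], &[1, 3]), Ordering::Less);
+/// assert_eq!(ord_binary(&[3], &[1, 2]), Ordering::Greater);
+/// ```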
+pub fn ord_binary<'a>(a: &'a [u8], b: &'a [u8]) -> std::cmp::Ordering { + use std::cmp::Ordering::*; + match (a.is_empty(), b.is_empty()) { + (true, true) => return Equal, + (true, false) => return Less, + (false, true) => return Greater, + (false, false) => {}, + } + + for (v1, v2) in a.iter().zip(b.iter()) { + match v1.cmp(v2) { + Equal => continue, + other => return other, + } + } + Equal +} + +#[inline] +pub fn decode(chunk: &[u8]) -> T { + let chunk: ::Bytes = match chunk.try_into() { + Ok(v) => v, + Err(_) => panic!(), + }; + T::from_le_bytes(chunk) +} diff --git a/crates/polars-parquet/src/parquet/write/column_chunk.rs b/crates/polars-parquet/src/parquet/write/column_chunk.rs new file mode 100644 index 000000000000..94452d2ac2d2 --- /dev/null +++ b/crates/polars-parquet/src/parquet/write/column_chunk.rs @@ -0,0 +1,208 @@ +use std::io::Write; + +#[cfg(feature = "async")] +use futures::AsyncWrite; +use parquet_format_safe::thrift::protocol::TCompactOutputProtocol; +#[cfg(feature = "async")] +use parquet_format_safe::thrift::protocol::TCompactOutputStreamProtocol; +use parquet_format_safe::{ColumnChunk, ColumnMetaData, Type}; +use polars_utils::aliases::PlHashSet; + +#[cfg(feature = "async")] +use super::page::write_page_async; +use super::page::{write_page, PageWriteSpec}; +use super::statistics::reduce; +use super::DynStreamingIterator; +use crate::parquet::compression::Compression; +use crate::parquet::encoding::Encoding; +use crate::parquet::error::{Error, Result}; +use crate::parquet::metadata::ColumnDescriptor; +use crate::parquet::page::{CompressedPage, PageType}; +use crate::parquet::statistics::serialize_statistics; +use crate::parquet::FallibleStreamingIterator; + +pub fn write_column_chunk( + writer: &mut W, + mut offset: u64, + descriptor: &ColumnDescriptor, + mut compressed_pages: DynStreamingIterator<'_, CompressedPage, E>, +) -> Result<(ColumnChunk, Vec, u64)> +where + W: Write, + Error: From, + E: std::error::Error, +{ + // write every page + + let initial = offset; + + let mut specs = vec![]; + while let Some(compressed_page) = compressed_pages.next()? { + let spec = write_page(writer, offset, compressed_page)?; + offset += spec.bytes_written; + specs.push(spec); + } + let mut bytes_written = offset - initial; + + let column_chunk = build_column_chunk(&specs, descriptor)?; + + // write metadata + let mut protocol = TCompactOutputProtocol::new(writer); + bytes_written += column_chunk + .meta_data + .as_ref() + .unwrap() + .write_to_out_protocol(&mut protocol)? as u64; + + Ok((column_chunk, specs, bytes_written)) +} + +#[cfg(feature = "async")] +#[cfg_attr(docsrs, doc(cfg(feature = "async")))] +pub async fn write_column_chunk_async( + writer: &mut W, + mut offset: u64, + descriptor: &ColumnDescriptor, + mut compressed_pages: DynStreamingIterator<'_, CompressedPage, E>, +) -> Result<(ColumnChunk, Vec, u64)> +where + W: AsyncWrite + Unpin + Send, + Error: From, + E: std::error::Error, +{ + let initial = offset; + // write every page + let mut specs = vec![]; + while let Some(compressed_page) = compressed_pages.next()? { + let spec = write_page_async(writer, offset, compressed_page).await?; + offset += spec.bytes_written; + specs.push(spec); + } + let mut bytes_written = offset - initial; + + let column_chunk = build_column_chunk(&specs, descriptor)?; + + // write metadata + let mut protocol = TCompactOutputStreamProtocol::new(writer); + bytes_written += column_chunk + .meta_data + .as_ref() + .unwrap() + .write_to_out_stream_protocol(&mut protocol) + .await? 
as u64; + + Ok((column_chunk, specs, bytes_written)) +} + +fn build_column_chunk( + specs: &[PageWriteSpec], + descriptor: &ColumnDescriptor, +) -> Result { + // compute stats to build header at the end of the chunk + + let compression = specs + .iter() + .map(|spec| spec.compression) + .collect::>(); + if compression.len() > 1 { + return Err(crate::parquet::error::Error::oos( + "All pages within a column chunk must be compressed with the same codec", + )); + } + let compression = compression + .into_iter() + .next() + .unwrap_or(Compression::Uncompressed); + + // SPEC: the total compressed size is the total compressed size of each page + the header size + let total_compressed_size = specs + .iter() + .map(|x| x.header_size as i64 + x.header.compressed_page_size as i64) + .sum(); + // SPEC: the total compressed size is the total compressed size of each page + the header size + let total_uncompressed_size = specs + .iter() + .map(|x| x.header_size as i64 + x.header.uncompressed_page_size as i64) + .sum(); + let data_page_offset = specs.first().map(|spec| spec.offset).unwrap_or(0) as i64; + let num_values = specs + .iter() + .map(|spec| { + let type_ = spec.header.type_.try_into().unwrap(); + match type_ { + PageType::DataPage => { + spec.header.data_page_header.as_ref().unwrap().num_values as i64 + }, + PageType::DataPageV2 => { + spec.header.data_page_header_v2.as_ref().unwrap().num_values as i64 + }, + _ => 0, // only data pages contribute + } + }) + .sum(); + let mut encodings = specs + .iter() + .flat_map(|spec| { + let type_ = spec.header.type_.try_into().unwrap(); + match type_ { + PageType::DataPage => vec![ + spec.header.data_page_header.as_ref().unwrap().encoding, + Encoding::Rle.into(), + ], + PageType::DataPageV2 => { + vec![ + spec.header.data_page_header_v2.as_ref().unwrap().encoding, + Encoding::Rle.into(), + ] + }, + PageType::DictionaryPage => vec![ + spec.header + .dictionary_page_header + .as_ref() + .unwrap() + .encoding, + ], + } + }) + .collect::>() // unique + .into_iter() // to vec + .collect::>(); + + // Sort the encodings to have deterministic metadata + encodings.sort(); + + let statistics = specs.iter().map(|x| &x.statistics).collect::>(); + let statistics = reduce(&statistics)?; + let statistics = statistics.map(|x| serialize_statistics(x.as_ref())); + + let (type_, _): (Type, Option) = descriptor.descriptor.primitive_type.physical_type.into(); + + let metadata = ColumnMetaData { + type_, + encodings, + path_in_schema: descriptor.path_in_schema.clone(), + codec: compression.into(), + num_values, + total_uncompressed_size, + total_compressed_size, + key_value_metadata: None, + data_page_offset, + index_page_offset: None, + dictionary_page_offset: None, + statistics, + encoding_stats: None, + bloom_filter_offset: None, + }; + + Ok(ColumnChunk { + file_path: None, // same file for now. 
+ file_offset: data_page_offset + total_compressed_size, + meta_data: Some(metadata), + offset_index_offset: None, + offset_index_length: None, + column_index_offset: None, + column_index_length: None, + crypto_metadata: None, + encrypted_column_metadata: None, + }) +} diff --git a/crates/polars-parquet/src/parquet/write/compression.rs b/crates/polars-parquet/src/parquet/write/compression.rs new file mode 100644 index 000000000000..4451811982d4 --- /dev/null +++ b/crates/polars-parquet/src/parquet/write/compression.rs @@ -0,0 +1,160 @@ +use crate::parquet::compression::CompressionOptions; +use crate::parquet::error::{Error, Result}; +use crate::parquet::page::{ + CompressedDataPage, CompressedDictPage, CompressedPage, DataPage, DataPageHeader, DictPage, + Page, +}; +use crate::parquet::{compression, FallibleStreamingIterator}; + +/// Compresses a [`DataPage`] into a [`CompressedDataPage`]. +fn compress_data( + page: DataPage, + mut compressed_buffer: Vec, + compression: CompressionOptions, +) -> Result { + let DataPage { + mut buffer, + header, + descriptor, + selected_rows, + } = page; + let uncompressed_page_size = buffer.len(); + if compression != CompressionOptions::Uncompressed { + match &header { + DataPageHeader::V1(_) => { + compression::compress(compression, &buffer, &mut compressed_buffer)?; + }, + DataPageHeader::V2(header) => { + let levels_byte_length = (header.repetition_levels_byte_length + + header.definition_levels_byte_length) + as usize; + compressed_buffer.extend_from_slice(&buffer[..levels_byte_length]); + compression::compress( + compression, + &buffer[levels_byte_length..], + &mut compressed_buffer, + )?; + }, + }; + } else { + std::mem::swap(&mut buffer, &mut compressed_buffer); + }; + Ok(CompressedDataPage::new_read( + header, + compressed_buffer, + compression.into(), + uncompressed_page_size, + descriptor, + selected_rows, + )) +} + +fn compress_dict( + page: DictPage, + mut compressed_buffer: Vec, + compression: CompressionOptions, +) -> Result { + let DictPage { + mut buffer, + num_values, + is_sorted, + } = page; + let uncompressed_page_size = buffer.len(); + if compression != CompressionOptions::Uncompressed { + compression::compress(compression, &buffer, &mut compressed_buffer)?; + } else { + std::mem::swap(&mut buffer, &mut compressed_buffer); + } + Ok(CompressedDictPage::new( + compressed_buffer, + compression.into(), + uncompressed_page_size, + num_values, + is_sorted, + )) +} + +/// Compresses an [`EncodedPage`] into a [`CompressedPage`] using `compressed_buffer` as the +/// intermediary buffer. +/// +/// `compressed_buffer` is taken by value because it becomes owned by [`CompressedPage`] +/// +/// # Errors +/// Errors if the compressor fails +pub fn compress( + page: Page, + compressed_buffer: Vec, + compression: CompressionOptions, +) -> Result { + match page { + Page::Data(page) => { + compress_data(page, compressed_buffer, compression).map(CompressedPage::Data) + }, + Page::Dict(page) => { + compress_dict(page, compressed_buffer, compression).map(CompressedPage::Dict) + }, + } +} + +/// A [`FallibleStreamingIterator`] that consumes [`Page`] and yields [`CompressedPage`] +/// holding a reusable buffer ([`Vec`]) for compression. 
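+///
+/// A hedged usage sketch (the `pages` iterator is assumed, not defined in this patch):
+/// wrap a fallible iterator of [`Page`]s and drive it through
+/// [`FallibleStreamingIterator`], reusing a single scratch buffer across pages.
+///
+/// ```ignore
+/// use crate::parquet::compression::CompressionOptions;
+/// use crate::parquet::FallibleStreamingIterator;
+///
+/// let mut compressor = Compressor::new(pages, CompressionOptions::Uncompressed, vec![]);
+/// while let Some(compressed_page) = compressor.next()? {
+///     // hand `compressed_page` to a column writer, e.g. `write_page`
+/// }
+/// ```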
+pub struct Compressor>> { + iter: I, + compression: CompressionOptions, + buffer: Vec, + current: Option, +} + +impl>> Compressor { + /// Creates a new [`Compressor`] + pub fn new(iter: I, compression: CompressionOptions, buffer: Vec) -> Self { + Self { + iter, + compression, + buffer, + current: None, + } + } + + /// Creates a new [`Compressor`] (same as `new`) + pub fn new_from_vec(iter: I, compression: CompressionOptions, buffer: Vec) -> Self { + Self::new(iter, compression, buffer) + } + + /// Deconstructs itself into its iterator and scratch buffer. + pub fn into_inner(mut self) -> (I, Vec) { + let mut buffer = if let Some(page) = self.current.as_mut() { + std::mem::take(page.buffer()) + } else { + std::mem::take(&mut self.buffer) + }; + buffer.clear(); + (self.iter, buffer) + } +} + +impl>> FallibleStreamingIterator for Compressor { + type Item = CompressedPage; + type Error = Error; + + fn advance(&mut self) -> std::result::Result<(), Self::Error> { + let mut compressed_buffer = if let Some(page) = self.current.as_mut() { + std::mem::take(page.buffer()) + } else { + std::mem::take(&mut self.buffer) + }; + compressed_buffer.clear(); + + let next = self + .iter + .next() + .map(|x| x.and_then(|page| compress(page, compressed_buffer, self.compression))) + .transpose()?; + self.current = next; + Ok(()) + } + + fn get(&self) -> Option<&Self::Item> { + self.current.as_ref() + } +} diff --git a/crates/polars-parquet/src/parquet/write/dyn_iter.rs b/crates/polars-parquet/src/parquet/write/dyn_iter.rs new file mode 100644 index 000000000000..f47710b56b22 --- /dev/null +++ b/crates/polars-parquet/src/parquet/write/dyn_iter.rs @@ -0,0 +1,65 @@ +use crate::parquet::FallibleStreamingIterator; + +/// [`DynIter`] is an implementation of a single-threaded, dynamically-typed iterator. +/// +/// This implementation is object safe. +pub struct DynIter<'a, V> { + iter: Box + 'a + Send + Sync>, +} + +impl<'a, V> Iterator for DynIter<'a, V> { + type Item = V; + fn next(&mut self) -> Option { + self.iter.next() + } + + fn size_hint(&self) -> (usize, Option) { + self.iter.size_hint() + } +} + +impl<'a, V> DynIter<'a, V> { + /// Returns a new [`DynIter`], boxing the incoming iterator + pub fn new(iter: I) -> Self + where + I: Iterator + 'a + Send + Sync, + { + Self { + iter: Box::new(iter), + } + } +} + +/// Dynamically-typed [`FallibleStreamingIterator`]. 
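+///
+/// Illustrative only: boxing a [`Compressor`] erases its concrete iterator type, so a
+/// row-group writer can accept any page source behind a single trait object.
+///
+/// ```ignore
+/// let pages = DynStreamingIterator::new(compressor);
+/// ```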
+pub struct DynStreamingIterator<'a, V, E> { + iter: Box + 'a + Send + Sync>, +} + +impl<'a, V, E> FallibleStreamingIterator for DynStreamingIterator<'a, V, E> { + type Item = V; + type Error = E; + + fn advance(&mut self) -> Result<(), Self::Error> { + self.iter.advance() + } + + fn get(&self) -> Option<&Self::Item> { + self.iter.get() + } + + fn size_hint(&self) -> (usize, Option) { + self.iter.size_hint() + } +} + +impl<'a, V, E> DynStreamingIterator<'a, V, E> { + /// Returns a new [`DynStreamingIterator`], boxing the incoming iterator + pub fn new(iter: I) -> Self + where + I: FallibleStreamingIterator + 'a + Send + Sync, + { + Self { + iter: Box::new(iter), + } + } +} diff --git a/crates/polars-parquet/src/parquet/write/file.rs b/crates/polars-parquet/src/parquet/write/file.rs new file mode 100644 index 000000000000..43fc81dbfdc1 --- /dev/null +++ b/crates/polars-parquet/src/parquet/write/file.rs @@ -0,0 +1,279 @@ +use std::io::Write; + +use parquet_format_safe::thrift::protocol::TCompactOutputProtocol; +use parquet_format_safe::RowGroup; + +use super::indexes::{write_column_index, write_offset_index}; +use super::page::PageWriteSpec; +use super::row_group::write_row_group; +use super::{RowGroupIter, WriteOptions}; +use crate::parquet::error::{Error, Result}; +pub use crate::parquet::metadata::KeyValue; +use crate::parquet::metadata::{SchemaDescriptor, ThriftFileMetaData}; +use crate::parquet::write::State; +use crate::parquet::{FOOTER_SIZE, PARQUET_MAGIC}; + +pub(super) fn start_file(writer: &mut W) -> Result { + writer.write_all(&PARQUET_MAGIC)?; + Ok(PARQUET_MAGIC.len() as u64) +} + +pub(super) fn end_file(mut writer: &mut W, metadata: &ThriftFileMetaData) -> Result { + // Write metadata + let mut protocol = TCompactOutputProtocol::new(&mut writer); + let metadata_len = metadata.write_to_out_protocol(&mut protocol)? as i32; + + // Write footer + let metadata_bytes = metadata_len.to_le_bytes(); + let mut footer_buffer = [0u8; FOOTER_SIZE as usize]; + (0..4).for_each(|i| { + footer_buffer[i] = metadata_bytes[i]; + }); + + (&mut footer_buffer[4..]).write_all(&PARQUET_MAGIC)?; + writer.write_all(&footer_buffer)?; + writer.flush()?; + Ok(metadata_len as u64 + FOOTER_SIZE) +} + +/// An interface to write a parquet file. +/// Use `start` to write the header, `write` to write a row group, +/// and `end` to write the footer. +pub struct FileWriter { + writer: W, + schema: SchemaDescriptor, + options: WriteOptions, + created_by: Option, + + offset: u64, + row_groups: Vec, + page_specs: Vec>>, + /// Used to store the current state for writing the file + state: State, + // when the file is written, metadata becomes available + metadata: Option, +} + +/// Writes a parquet file containing only the header and footer +/// +/// This is used to write the metadata as a separate Parquet file, usually when data +/// is partitioned across multiple files. +/// +/// Note: Recall that when combining row groups from [`ThriftFileMetaData`], the `file_path` on each +/// of their column chunks must be updated with their path relative to where they are written to. 
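+///
+/// A minimal sketch (the file name and the `metadata` value are assumptions for the
+/// example; `metadata` would be a [`ThriftFileMetaData`] assembled from already-written
+/// data files):
+///
+/// ```ignore
+/// use std::fs::File;
+///
+/// let mut sidecar = File::create("_metadata")?;
+/// let written = write_metadata_sidecar(&mut sidecar, &metadata)?;
+/// ```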
+pub fn write_metadata_sidecar( + writer: &mut W, + metadata: &ThriftFileMetaData, +) -> Result { + let mut len = start_file(writer)?; + len += end_file(writer, metadata)?; + Ok(len) +} + +// Accessors +impl FileWriter { + /// The options assigned to the file + pub fn options(&self) -> &WriteOptions { + &self.options + } + + /// The [`SchemaDescriptor`] assigned to this file + pub fn schema(&self) -> &SchemaDescriptor { + &self.schema + } + + /// Returns the [`ThriftFileMetaData`]. This is Some iff the [`Self::end`] has been called. + /// + /// This is used to write the metadata as a separate Parquet file, usually when data + /// is partitioned across multiple files + pub fn metadata(&self) -> Option<&ThriftFileMetaData> { + self.metadata.as_ref() + } +} + +impl FileWriter { + /// Returns a new [`FileWriter`]. + pub fn new( + writer: W, + schema: SchemaDescriptor, + options: WriteOptions, + created_by: Option, + ) -> Self { + Self { + writer, + schema, + options, + created_by, + offset: 0, + row_groups: vec![], + page_specs: vec![], + state: State::Initialised, + metadata: None, + } + } + + /// Writes the header of the file. + /// + /// This is automatically called by [`Self::write`] if not called following [`Self::new`]. + /// + /// # Errors + /// Returns an error if data has been written to the file. + fn start(&mut self) -> Result<()> { + if self.offset == 0 { + self.offset = start_file(&mut self.writer)?; + self.state = State::Started; + Ok(()) + } else { + Err(Error::InvalidParameter( + "Start cannot be called twice".to_string(), + )) + } + } + + /// Writes a row group to the file. + /// + /// This call is IO-bounded + pub fn write(&mut self, row_group: RowGroupIter<'_, E>) -> Result<()> + where + Error: From, + E: std::error::Error, + { + if self.offset == 0 { + self.start()?; + } + let ordinal = self.row_groups.len(); + let (group, specs, size) = write_row_group( + &mut self.writer, + self.offset, + self.schema.columns(), + row_group, + ordinal, + )?; + self.offset += size; + self.row_groups.push(group); + self.page_specs.push(specs); + Ok(()) + } + + /// Writes the footer of the parquet file. Returns the total size of the file and the + /// underlying writer. 
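+    ///
+    /// Sketch of the full lifecycle (the `file`, `schema`, `options` and `row_group`
+    /// values are assumed for the example):
+    ///
+    /// ```ignore
+    /// let mut writer = FileWriter::new(file, schema, options, Some("created-by".to_string()));
+    /// writer.write(row_group)?;
+    /// let file_size = writer.end(None)?;
+    /// ```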
+ pub fn end(&mut self, key_value_metadata: Option>) -> Result { + if self.offset == 0 { + self.start()?; + } + + if self.state != State::Started { + return Err(Error::InvalidParameter( + "End cannot be called twice".to_string(), + )); + } + // compute file stats + let num_rows = self.row_groups.iter().map(|group| group.num_rows).sum(); + + if self.options.write_statistics { + // write column indexes (require page statistics) + self.row_groups + .iter_mut() + .zip(self.page_specs.iter()) + .try_for_each(|(group, pages)| { + group.columns.iter_mut().zip(pages.iter()).try_for_each( + |(column, pages)| { + let offset = self.offset; + column.column_index_offset = Some(offset as i64); + self.offset += write_column_index(&mut self.writer, pages)?; + let length = self.offset - offset; + column.column_index_length = Some(length as i32); + Result::Ok(()) + }, + )?; + Result::Ok(()) + })?; + }; + + // write offset index + self.row_groups + .iter_mut() + .zip(self.page_specs.iter()) + .try_for_each(|(group, pages)| { + group + .columns + .iter_mut() + .zip(pages.iter()) + .try_for_each(|(column, pages)| { + let offset = self.offset; + column.offset_index_offset = Some(offset as i64); + self.offset += write_offset_index(&mut self.writer, pages)?; + column.offset_index_length = Some((self.offset - offset) as i32); + Result::Ok(()) + })?; + Result::Ok(()) + })?; + + let metadata = ThriftFileMetaData::new( + self.options.version.into(), + self.schema.clone().into_thrift(), + num_rows, + self.row_groups.clone(), + key_value_metadata, + self.created_by.clone(), + None, + None, + None, + ); + + let len = end_file(&mut self.writer, &metadata)?; + self.state = State::Finished; + self.metadata = Some(metadata); + Ok(self.offset + len) + } + + /// Returns the underlying writer. 
+ pub fn into_inner(self) -> W { + self.writer + } + + /// Returns the underlying writer and [`ThriftFileMetaData`] + /// # Panics + /// This function panics if [`Self::end`] has not yet been called + pub fn into_inner_and_metadata(self) -> (W, ThriftFileMetaData) { + (self.writer, self.metadata.expect("File to have ended")) + } +} + +#[cfg(test)] +mod tests { + use std::fs::File; + use std::io::Cursor; + + use super::*; + use crate::parquet::error::Result; + use crate::parquet::read::read_metadata; + use crate::parquet::tests::get_path; + + #[test] + fn empty_file() -> Result<()> { + let mut testdata = get_path(); + testdata.push("alltypes_plain.parquet"); + let mut file = File::open(testdata).unwrap(); + + let mut metadata = read_metadata(&mut file)?; + + // take away all groups and rows + metadata.row_groups = vec![]; + metadata.num_rows = 0; + + let mut writer = Cursor::new(vec![]); + + // write the file + start_file(&mut writer)?; + end_file(&mut writer, &metadata.into_thrift())?; + + let a = writer.into_inner(); + + // read it again: + let result = read_metadata(&mut Cursor::new(a)); + assert!(result.is_ok()); + + Ok(()) + } +} diff --git a/crates/polars-parquet/src/parquet/write/indexes/mod.rs b/crates/polars-parquet/src/parquet/write/indexes/mod.rs new file mode 100644 index 000000000000..9f413a15d26a --- /dev/null +++ b/crates/polars-parquet/src/parquet/write/indexes/mod.rs @@ -0,0 +1,4 @@ +mod serialize; +mod write; + +pub use write::*; diff --git a/crates/polars-parquet/src/parquet/write/indexes/serialize.rs b/crates/polars-parquet/src/parquet/write/indexes/serialize.rs new file mode 100644 index 000000000000..002ff2059371 --- /dev/null +++ b/crates/polars-parquet/src/parquet/write/indexes/serialize.rs @@ -0,0 +1,78 @@ +use parquet_format_safe::{BoundaryOrder, ColumnIndex, OffsetIndex, PageLocation}; + +use crate::parquet::error::{Error, Result}; +pub use crate::parquet::metadata::KeyValue; +use crate::parquet::statistics::serialize_statistics; +use crate::parquet::write::page::{is_data_page, PageWriteSpec}; + +pub fn serialize_column_index(pages: &[PageWriteSpec]) -> Result { + let mut null_pages = Vec::with_capacity(pages.len()); + let mut min_values = Vec::with_capacity(pages.len()); + let mut max_values = Vec::with_capacity(pages.len()); + let mut null_counts = Vec::with_capacity(pages.len()); + + pages + .iter() + .filter(|x| is_data_page(x)) + .try_for_each(|spec| { + if let Some(stats) = &spec.statistics { + let stats = serialize_statistics(stats.as_ref()); + + let null_count = stats + .null_count + .ok_or_else(|| Error::oos("null count of a page is required"))?; + null_counts.push(null_count); + + if let Some(min_value) = stats.min_value { + min_values.push(min_value); + max_values.push( + stats + .max_value + .ok_or_else(|| Error::oos("max value of a page is required"))?, + ); + null_pages.push(false) + } else { + min_values.push(vec![0]); + max_values.push(vec![0]); + null_pages.push(true) + } + + Result::Ok(()) + } else { + Err(Error::oos( + "options were set to write statistics but some pages miss them", + )) + } + })?; + Ok(ColumnIndex { + null_pages, + min_values, + max_values, + boundary_order: BoundaryOrder::UNORDERED, + null_counts: Some(null_counts), + }) +} + +pub fn serialize_offset_index(pages: &[PageWriteSpec]) -> Result { + let mut first_row_index = 0; + let page_locations = pages + .iter() + .filter(|x| is_data_page(x)) + .map(|spec| { + let location = PageLocation { + offset: spec.offset.try_into()?, + compressed_page_size: 
spec.bytes_written.try_into()?, + first_row_index, + }; + let num_rows = spec.num_rows.ok_or_else(|| { + Error::oos( + "options were set to write statistics but some data pages miss number of rows", + ) + })?; + first_row_index += num_rows as i64; + Ok(location) + }) + .collect::>>()?; + + Ok(OffsetIndex { page_locations }) +} diff --git a/crates/polars-parquet/src/parquet/write/indexes/write.rs b/crates/polars-parquet/src/parquet/write/indexes/write.rs new file mode 100644 index 000000000000..5aab227b7bac --- /dev/null +++ b/crates/polars-parquet/src/parquet/write/indexes/write.rs @@ -0,0 +1,46 @@ +use std::io::Write; + +#[cfg(feature = "async")] +use futures::AsyncWrite; +use parquet_format_safe::thrift::protocol::TCompactOutputProtocol; +#[cfg(feature = "async")] +use parquet_format_safe::thrift::protocol::TCompactOutputStreamProtocol; + +use super::serialize::{serialize_column_index, serialize_offset_index}; +use crate::parquet::error::Result; +pub use crate::parquet::metadata::KeyValue; +use crate::parquet::write::page::PageWriteSpec; + +pub fn write_column_index(writer: &mut W, pages: &[PageWriteSpec]) -> Result { + let index = serialize_column_index(pages)?; + let mut protocol = TCompactOutputProtocol::new(writer); + Ok(index.write_to_out_protocol(&mut protocol)? as u64) +} + +#[cfg(feature = "async")] +#[cfg_attr(docsrs, doc(cfg(feature = "async")))] +pub async fn write_column_index_async( + writer: &mut W, + pages: &[PageWriteSpec], +) -> Result { + let index = serialize_column_index(pages)?; + let mut protocol = TCompactOutputStreamProtocol::new(writer); + Ok(index.write_to_out_stream_protocol(&mut protocol).await? as u64) +} + +pub fn write_offset_index(writer: &mut W, pages: &[PageWriteSpec]) -> Result { + let index = serialize_offset_index(pages)?; + let mut protocol = TCompactOutputProtocol::new(&mut *writer); + Ok(index.write_to_out_protocol(&mut protocol)? as u64) +} + +#[cfg(feature = "async")] +#[cfg_attr(docsrs, doc(cfg(feature = "async")))] +pub async fn write_offset_index_async( + writer: &mut W, + pages: &[PageWriteSpec], +) -> Result { + let index = serialize_offset_index(pages)?; + let mut protocol = TCompactOutputStreamProtocol::new(&mut *writer); + Ok(index.write_to_out_stream_protocol(&mut protocol).await? 
as u64) +} diff --git a/crates/polars-parquet/src/parquet/write/mod.rs b/crates/polars-parquet/src/parquet/write/mod.rs new file mode 100644 index 000000000000..251d37472db4 --- /dev/null +++ b/crates/polars-parquet/src/parquet/write/mod.rs @@ -0,0 +1,57 @@ +mod column_chunk; +mod compression; +mod file; +mod indexes; +pub(crate) mod page; +mod row_group; +mod statistics; + +#[cfg(feature = "async")] +mod stream; +#[cfg(feature = "async")] +#[cfg_attr(docsrs, doc(cfg(feature = "async")))] +pub use stream::FileStreamer; + +mod dyn_iter; +pub use compression::{compress, Compressor}; +pub use dyn_iter::{DynIter, DynStreamingIterator}; +pub use file::{write_metadata_sidecar, FileWriter}; +pub use row_group::ColumnOffsetsMetadata; + +use crate::parquet::page::CompressedPage; + +pub type RowGroupIter<'a, E> = + DynIter<'a, std::result::Result, E>>; + +/// Write options of different interfaces on this crate +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +pub struct WriteOptions { + /// Whether to write statistics, including indexes + pub write_statistics: bool, + /// Which Parquet version to use + pub version: Version, +} + +/// The parquet version to use +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +pub enum Version { + V1, + V2, +} + +/// Used to recall the state of the parquet writer - whether sync or async. +#[derive(PartialEq)] +enum State { + Initialised, + Started, + Finished, +} + +impl From for i32 { + fn from(version: Version) -> Self { + match version { + Version::V1 => 1, + Version::V2 => 2, + } + } +} diff --git a/crates/polars-parquet/src/parquet/write/page.rs b/crates/polars-parquet/src/parquet/write/page.rs new file mode 100644 index 000000000000..1f024b629f07 --- /dev/null +++ b/crates/polars-parquet/src/parquet/write/page.rs @@ -0,0 +1,243 @@ +use std::convert::TryInto; +use std::io::Write; +use std::sync::Arc; + +#[cfg(feature = "async")] +use futures::{AsyncWrite, AsyncWriteExt}; +use parquet_format_safe::thrift::protocol::TCompactOutputProtocol; +#[cfg(feature = "async")] +use parquet_format_safe::thrift::protocol::TCompactOutputStreamProtocol; +use parquet_format_safe::{DictionaryPageHeader, Encoding, PageType}; + +use crate::parquet::compression::Compression; +use crate::parquet::error::{Error, Result}; +use crate::parquet::page::{ + CompressedDataPage, CompressedDictPage, CompressedPage, DataPageHeader, ParquetPageHeader, +}; +use crate::parquet::statistics::Statistics; + +pub(crate) fn is_data_page(page: &PageWriteSpec) -> bool { + page.header.type_ == PageType::DATA_PAGE || page.header.type_ == PageType::DATA_PAGE_V2 +} + +fn maybe_bytes(uncompressed: usize, compressed: usize) -> Result<(i32, i32)> { + let uncompressed_page_size: i32 = uncompressed.try_into().map_err(|_| { + Error::oos(format!( + "A page can only contain i32::MAX uncompressed bytes. This one contains {}", + uncompressed + )) + })?; + + let compressed_page_size: i32 = compressed.try_into().map_err(|_| { + Error::oos(format!( + "A page can only contain i32::MAX compressed bytes. This one contains {}", + compressed + )) + })?; + + Ok((uncompressed_page_size, compressed_page_size)) +} + +/// Contains page write metrics. 
+pub struct PageWriteSpec { + pub header: ParquetPageHeader, + pub num_values: usize, + pub num_rows: Option, + pub header_size: u64, + pub offset: u64, + pub bytes_written: u64, + pub compression: Compression, + pub statistics: Option>, +} + +pub fn write_page( + writer: &mut W, + offset: u64, + compressed_page: &CompressedPage, +) -> Result { + let num_values = compressed_page.num_values(); + let selected_rows = compressed_page.selected_rows(); + + let header = match &compressed_page { + CompressedPage::Data(compressed_page) => assemble_data_page_header(compressed_page), + CompressedPage::Dict(compressed_page) => assemble_dict_page_header(compressed_page), + }?; + + let header_size = write_page_header(writer, &header)?; + let mut bytes_written = header_size; + + bytes_written += match &compressed_page { + CompressedPage::Data(compressed_page) => { + writer.write_all(&compressed_page.buffer)?; + compressed_page.buffer.len() as u64 + }, + CompressedPage::Dict(compressed_page) => { + writer.write_all(&compressed_page.buffer)?; + compressed_page.buffer.len() as u64 + }, + }; + + let statistics = match &compressed_page { + CompressedPage::Data(compressed_page) => compressed_page.statistics().transpose()?, + CompressedPage::Dict(_) => None, + }; + + Ok(PageWriteSpec { + header, + header_size, + offset, + bytes_written, + compression: compressed_page.compression(), + statistics, + num_rows: selected_rows.map(|x| x.last().unwrap().length), + num_values, + }) +} + +#[cfg(feature = "async")] +#[cfg_attr(docsrs, doc(cfg(feature = "async")))] +pub async fn write_page_async( + writer: &mut W, + offset: u64, + compressed_page: &CompressedPage, +) -> Result { + let num_values = compressed_page.num_values(); + let selected_rows = compressed_page.selected_rows(); + + let header = match &compressed_page { + CompressedPage::Data(compressed_page) => assemble_data_page_header(compressed_page), + CompressedPage::Dict(compressed_page) => assemble_dict_page_header(compressed_page), + }?; + + let header_size = write_page_header_async(writer, &header).await?; + let mut bytes_written = header_size as u64; + + bytes_written += match &compressed_page { + CompressedPage::Data(compressed_page) => { + writer.write_all(&compressed_page.buffer).await?; + compressed_page.buffer.len() as u64 + }, + CompressedPage::Dict(compressed_page) => { + writer.write_all(&compressed_page.buffer).await?; + compressed_page.buffer.len() as u64 + }, + }; + + let statistics = match &compressed_page { + CompressedPage::Data(compressed_page) => compressed_page.statistics().transpose()?, + CompressedPage::Dict(_) => None, + }; + + Ok(PageWriteSpec { + header, + header_size, + offset, + bytes_written, + compression: compressed_page.compression(), + statistics, + num_rows: selected_rows.map(|x| x.last().unwrap().length), + num_values, + }) +} + +fn assemble_data_page_header(page: &CompressedDataPage) -> Result { + let (uncompressed_page_size, compressed_page_size) = + maybe_bytes(page.uncompressed_size(), page.compressed_size())?; + + let mut page_header = ParquetPageHeader { + type_: match page.header() { + DataPageHeader::V1(_) => PageType::DATA_PAGE, + DataPageHeader::V2(_) => PageType::DATA_PAGE_V2, + }, + uncompressed_page_size, + compressed_page_size, + crc: None, + data_page_header: None, + index_page_header: None, + dictionary_page_header: None, + data_page_header_v2: None, + }; + + match page.header() { + DataPageHeader::V1(header) => { + page_header.data_page_header = Some(header.clone()); + }, + DataPageHeader::V2(header) => { + 
page_header.data_page_header_v2 = Some(header.clone()); + }, + } + Ok(page_header) +} + +fn assemble_dict_page_header(page: &CompressedDictPage) -> Result { + let (uncompressed_page_size, compressed_page_size) = + maybe_bytes(page.uncompressed_page_size, page.buffer.len())?; + + let num_values: i32 = page.num_values.try_into().map_err(|_| { + Error::oos(format!( + "A dictionary page can only contain i32::MAX items. This one contains {}", + page.num_values + )) + })?; + + Ok(ParquetPageHeader { + type_: PageType::DICTIONARY_PAGE, + uncompressed_page_size, + compressed_page_size, + crc: None, + data_page_header: None, + index_page_header: None, + dictionary_page_header: Some(DictionaryPageHeader { + num_values, + encoding: Encoding::PLAIN, + is_sorted: None, + }), + data_page_header_v2: None, + }) +} + +/// writes the page header into `writer`, returning the number of bytes used in the process. +fn write_page_header(mut writer: &mut W, header: &ParquetPageHeader) -> Result { + let mut protocol = TCompactOutputProtocol::new(&mut writer); + Ok(header.write_to_out_protocol(&mut protocol)? as u64) +} + +#[cfg(feature = "async")] +#[cfg_attr(docsrs, doc(cfg(feature = "async")))] +/// writes the page header into `writer`, returning the number of bytes used in the process. +async fn write_page_header_async( + mut writer: &mut W, + header: &ParquetPageHeader, +) -> Result { + let mut protocol = TCompactOutputStreamProtocol::new(&mut writer); + Ok(header.write_to_out_stream_protocol(&mut protocol).await? as u64) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn dict_too_large() { + let page = CompressedDictPage::new( + vec![], + Compression::Uncompressed, + i32::MAX as usize + 1, + 100, + false, + ); + assert!(assemble_dict_page_header(&page).is_err()); + } + + #[test] + fn dict_too_many_values() { + let page = CompressedDictPage::new( + vec![], + Compression::Uncompressed, + 0, + i32::MAX as usize + 1, + false, + ); + assert!(assemble_dict_page_header(&page).is_err()); + } +} diff --git a/crates/polars-parquet/src/parquet/write/row_group.rs b/crates/polars-parquet/src/parquet/write/row_group.rs new file mode 100644 index 000000000000..943079430aaf --- /dev/null +++ b/crates/polars-parquet/src/parquet/write/row_group.rs @@ -0,0 +1,200 @@ +use std::io::Write; + +#[cfg(feature = "async")] +use futures::AsyncWrite; +use parquet_format_safe::{ColumnChunk, RowGroup}; + +use super::column_chunk::write_column_chunk; +#[cfg(feature = "async")] +use super::column_chunk::write_column_chunk_async; +use super::page::{is_data_page, PageWriteSpec}; +use super::{DynIter, DynStreamingIterator}; +use crate::parquet::error::{Error, Result}; +use crate::parquet::metadata::{ColumnChunkMetaData, ColumnDescriptor}; +use crate::parquet::page::CompressedPage; + +pub struct ColumnOffsetsMetadata { + pub dictionary_page_offset: Option, + pub data_page_offset: Option, +} + +impl ColumnOffsetsMetadata { + pub fn from_column_chunk(column_chunk: &ColumnChunk) -> ColumnOffsetsMetadata { + ColumnOffsetsMetadata { + dictionary_page_offset: column_chunk + .meta_data + .as_ref() + .map(|meta| meta.dictionary_page_offset) + .unwrap_or(None), + data_page_offset: column_chunk + .meta_data + .as_ref() + .map(|meta| meta.data_page_offset), + } + } + + pub fn from_column_chunk_metadata( + column_chunk_metadata: &ColumnChunkMetaData, + ) -> ColumnOffsetsMetadata { + ColumnOffsetsMetadata { + dictionary_page_offset: column_chunk_metadata.dictionary_page_offset(), + data_page_offset: 
Some(column_chunk_metadata.data_page_offset()), + } + } + + pub fn calc_row_group_file_offset(&self) -> Option { + self.dictionary_page_offset + .filter(|x| *x > 0_i64) + .or(self.data_page_offset) + } +} + +fn compute_num_rows(columns: &[(ColumnChunk, Vec)]) -> Result { + columns + .get(0) + .map(|(_, specs)| { + let mut num_rows = 0; + specs + .iter() + .filter(|x| is_data_page(x)) + .try_for_each(|spec| { + num_rows += spec.num_rows.ok_or_else(|| { + Error::oos("All data pages must declare the number of rows on it") + })? as i64; + Result::Ok(()) + })?; + Result::Ok(num_rows) + }) + .unwrap_or(Ok(0)) +} + +pub fn write_row_group< + 'a, + W, + E, // external error any of the iterators may emit +>( + writer: &mut W, + mut offset: u64, + descriptors: &[ColumnDescriptor], + columns: DynIter<'a, std::result::Result, E>>, + ordinal: usize, +) -> Result<(RowGroup, Vec>, u64)> +where + W: Write, + Error: From, + E: std::error::Error, +{ + let column_iter = descriptors.iter().zip(columns); + + let initial = offset; + let columns = column_iter + .map(|(descriptor, page_iter)| { + let (column, page_specs, size) = + write_column_chunk(writer, offset, descriptor, page_iter?)?; + offset += size; + Ok((column, page_specs)) + }) + .collect::>>()?; + let bytes_written = offset - initial; + + let num_rows = compute_num_rows(&columns)?; + + // compute row group stats + let file_offset = columns + .get(0) + .map(|(column_chunk, _)| { + ColumnOffsetsMetadata::from_column_chunk(column_chunk).calc_row_group_file_offset() + }) + .unwrap_or(None); + + let total_byte_size = columns + .iter() + .map(|(c, _)| c.meta_data.as_ref().unwrap().total_uncompressed_size) + .sum(); + let total_compressed_size = columns + .iter() + .map(|(c, _)| c.meta_data.as_ref().unwrap().total_compressed_size) + .sum(); + + let (columns, specs) = columns.into_iter().unzip(); + + Ok(( + RowGroup { + columns, + total_byte_size, + num_rows, + sorting_columns: None, + file_offset, + total_compressed_size: Some(total_compressed_size), + ordinal: ordinal.try_into().ok(), + }, + specs, + bytes_written, + )) +} + +#[cfg(feature = "async")] +#[cfg_attr(docsrs, doc(cfg(feature = "async")))] +pub async fn write_row_group_async< + 'a, + W, + E, // external error any of the iterators may emit +>( + writer: &mut W, + mut offset: u64, + descriptors: &[ColumnDescriptor], + columns: DynIter<'a, std::result::Result, E>>, + ordinal: usize, +) -> Result<(RowGroup, Vec>, u64)> +where + W: AsyncWrite + Unpin + Send, + Error: From, + E: std::error::Error, +{ + let column_iter = descriptors.iter().zip(columns); + + let initial = offset; + let mut columns = vec![]; + for (descriptor, page_iter) in column_iter { + let (column, page_specs, size) = + write_column_chunk_async(writer, offset, descriptor, page_iter?).await?; + offset += size; + columns.push((column, page_specs)); + } + let bytes_written = offset - initial; + + let num_rows = compute_num_rows(&columns)?; + + // compute row group stats + let file_offset = columns + .get(0) + .map(|(column_chunk, _)| { + ColumnOffsetsMetadata::from_column_chunk(column_chunk).calc_row_group_file_offset() + }) + .unwrap_or(None); + + let total_byte_size = columns + .iter() + .map(|(c, _)| c.meta_data.as_ref().unwrap().total_uncompressed_size) + .sum(); + let total_compressed_size = columns + .iter() + .map(|(c, _)| c.meta_data.as_ref().unwrap().total_compressed_size) + .sum(); + + let (columns, specs) = columns.into_iter().unzip(); + + Ok(( + RowGroup { + columns, + total_byte_size, + num_rows: num_rows as i64, + 
sorting_columns: None, + file_offset, + total_compressed_size: Some(total_compressed_size), + ordinal: ordinal.try_into().ok(), + }, + specs, + bytes_written, + )) +} diff --git a/crates/polars-parquet/src/parquet/write/statistics.rs b/crates/polars-parquet/src/parquet/write/statistics.rs new file mode 100644 index 000000000000..f0aa9cc2011a --- /dev/null +++ b/crates/polars-parquet/src/parquet/write/statistics.rs @@ -0,0 +1,323 @@ +use std::sync::Arc; + +use crate::parquet::error::{Error, Result}; +use crate::parquet::schema::types::PhysicalType; +use crate::parquet::statistics::*; +use crate::parquet::types::NativeType; + +#[inline] +fn reduce_single T>(lhs: Option, rhs: Option, op: F) -> Option { + match (lhs, rhs) { + (None, None) => None, + (Some(x), None) => Some(x), + (None, Some(x)) => Some(x), + (Some(x), Some(y)) => Some(op(x, y)), + } +} + +#[inline] +fn reduce_vec8(lhs: Option>, rhs: &Option>, max: bool) -> Option> { + match (lhs, rhs) { + (None, None) => None, + (Some(x), None) => Some(x), + (None, Some(x)) => Some(x.clone()), + (Some(x), Some(y)) => Some(ord_binary(x, y.clone(), max)), + } +} + +pub fn reduce(stats: &[&Option>]) -> Result>> { + if stats.is_empty() { + return Ok(None); + } + let stats = stats + .iter() + .filter_map(|x| x.as_ref()) + .map(|x| x.as_ref()) + .collect::>(); + if stats.is_empty() { + return Ok(None); + }; + + let same_type = stats + .iter() + .skip(1) + .all(|x| x.physical_type() == stats[0].physical_type()); + if !same_type { + return Err(Error::oos("The statistics do not have the same data_type")); + }; + Ok(match stats[0].physical_type() { + PhysicalType::Boolean => { + let stats = stats.iter().map(|x| x.as_any().downcast_ref().unwrap()); + Some(Arc::new(reduce_boolean(stats))) + }, + PhysicalType::Int32 => { + let stats = stats.iter().map(|x| x.as_any().downcast_ref().unwrap()); + Some(Arc::new(reduce_primitive::(stats))) + }, + PhysicalType::Int64 => { + let stats = stats.iter().map(|x| x.as_any().downcast_ref().unwrap()); + Some(Arc::new(reduce_primitive::(stats))) + }, + PhysicalType::Float => { + let stats = stats.iter().map(|x| x.as_any().downcast_ref().unwrap()); + Some(Arc::new(reduce_primitive::(stats))) + }, + PhysicalType::Double => { + let stats = stats.iter().map(|x| x.as_any().downcast_ref().unwrap()); + Some(Arc::new(reduce_primitive::(stats))) + }, + PhysicalType::ByteArray => { + let stats = stats.iter().map(|x| x.as_any().downcast_ref().unwrap()); + Some(Arc::new(reduce_binary(stats))) + }, + PhysicalType::FixedLenByteArray(_) => { + let stats = stats.iter().map(|x| x.as_any().downcast_ref().unwrap()); + Some(Arc::new(reduce_fix_len_binary(stats))) + }, + _ => todo!(), + }) +} + +fn reduce_binary<'a, I: Iterator>(mut stats: I) -> BinaryStatistics { + let initial = stats.next().unwrap().clone(); + stats.fold(initial, |mut acc, new| { + acc.min_value = reduce_vec8(acc.min_value, &new.min_value, false); + acc.max_value = reduce_vec8(acc.max_value, &new.max_value, true); + acc.null_count = reduce_single(acc.null_count, new.null_count, |x, y| x + y); + acc.distinct_count = None; + acc + }) +} + +fn reduce_fix_len_binary<'a, I: Iterator>( + mut stats: I, +) -> FixedLenStatistics { + let initial = stats.next().unwrap().clone(); + stats.fold(initial, |mut acc, new| { + acc.min_value = reduce_vec8(acc.min_value, &new.min_value, false); + acc.max_value = reduce_vec8(acc.max_value, &new.max_value, true); + acc.null_count = reduce_single(acc.null_count, new.null_count, |x, y| x + y); + acc.distinct_count = None; + acc + }) +} + +fn 
ord_binary(a: Vec, b: Vec, max: bool) -> Vec { + for (v1, v2) in a.iter().zip(b.iter()) { + match v1.cmp(v2) { + std::cmp::Ordering::Greater => { + if max { + return a; + } else { + return b; + } + }, + std::cmp::Ordering::Less => { + if max { + return b; + } else { + return a; + } + }, + _ => {}, + } + } + a +} + +fn reduce_boolean<'a, I: Iterator>( + mut stats: I, +) -> BooleanStatistics { + let initial = stats.next().unwrap().clone(); + stats.fold(initial, |mut acc, new| { + acc.min_value = reduce_single( + acc.min_value, + new.min_value, + |x, y| if x & !(y) { y } else { x }, + ); + acc.max_value = reduce_single( + acc.max_value, + new.max_value, + |x, y| if x & !(y) { x } else { y }, + ); + acc.null_count = reduce_single(acc.null_count, new.null_count, |x, y| x + y); + acc.distinct_count = None; + acc + }) +} + +fn reduce_primitive< + 'a, + T: NativeType + std::cmp::PartialOrd, + I: Iterator>, +>( + mut stats: I, +) -> PrimitiveStatistics { + let initial = stats.next().unwrap().clone(); + stats.fold(initial, |mut acc, new| { + acc.min_value = reduce_single( + acc.min_value, + new.min_value, + |x, y| if x > y { y } else { x }, + ); + acc.max_value = reduce_single( + acc.max_value, + new.max_value, + |x, y| if x > y { x } else { y }, + ); + acc.null_count = reduce_single(acc.null_count, new.null_count, |x, y| x + y); + acc.distinct_count = None; + acc + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::parquet::schema::types::PrimitiveType; + + #[test] + fn binary() -> Result<()> { + let iter = vec![ + BinaryStatistics { + primitive_type: PrimitiveType::from_physical( + "bla".to_string(), + PhysicalType::ByteArray, + ), + null_count: Some(0), + distinct_count: None, + min_value: Some(vec![1, 2]), + max_value: Some(vec![3, 4]), + }, + BinaryStatistics { + primitive_type: PrimitiveType::from_physical( + "bla".to_string(), + PhysicalType::ByteArray, + ), + null_count: Some(0), + distinct_count: None, + min_value: Some(vec![4, 5]), + max_value: None, + }, + ]; + let a = reduce_binary(iter.iter()); + + assert_eq!( + a, + BinaryStatistics { + primitive_type: PrimitiveType::from_physical( + "bla".to_string(), + PhysicalType::ByteArray, + ), + null_count: Some(0), + distinct_count: None, + min_value: Some(vec![1, 2]), + max_value: Some(vec![3, 4]), + }, + ); + + Ok(()) + } + + #[test] + fn fixed_len_binary() -> Result<()> { + let iter = vec![ + FixedLenStatistics { + primitive_type: PrimitiveType::from_physical( + "bla".to_string(), + PhysicalType::FixedLenByteArray(2), + ), + null_count: Some(0), + distinct_count: None, + min_value: Some(vec![1, 2]), + max_value: Some(vec![3, 4]), + }, + FixedLenStatistics { + primitive_type: PrimitiveType::from_physical( + "bla".to_string(), + PhysicalType::FixedLenByteArray(2), + ), + null_count: Some(0), + distinct_count: None, + min_value: Some(vec![4, 5]), + max_value: None, + }, + ]; + let a = reduce_fix_len_binary(iter.iter()); + + assert_eq!( + a, + FixedLenStatistics { + primitive_type: PrimitiveType::from_physical( + "bla".to_string(), + PhysicalType::FixedLenByteArray(2), + ), + null_count: Some(0), + distinct_count: None, + min_value: Some(vec![1, 2]), + max_value: Some(vec![3, 4]), + }, + ); + + Ok(()) + } + + #[test] + fn boolean() -> Result<()> { + let iter = vec![ + BooleanStatistics { + null_count: Some(0), + distinct_count: None, + min_value: Some(false), + max_value: Some(false), + }, + BooleanStatistics { + null_count: Some(0), + distinct_count: None, + min_value: Some(true), + max_value: Some(true), + }, + ]; + let a = 
reduce_boolean(iter.iter()); + + assert_eq!( + a, + BooleanStatistics { + null_count: Some(0), + distinct_count: None, + min_value: Some(false), + max_value: Some(true), + }, + ); + + Ok(()) + } + + #[test] + fn primitive() -> Result<()> { + let iter = vec![PrimitiveStatistics { + null_count: Some(2), + distinct_count: None, + min_value: Some(30), + max_value: Some(70), + primitive_type: PrimitiveType::from_physical("bla".to_string(), PhysicalType::Int32), + }]; + let a = reduce_primitive(iter.iter()); + + assert_eq!( + a, + PrimitiveStatistics { + null_count: Some(2), + distinct_count: None, + min_value: Some(30), + max_value: Some(70), + primitive_type: PrimitiveType::from_physical( + "bla".to_string(), + PhysicalType::Int32, + ), + }, + ); + + Ok(()) + } +} diff --git a/crates/polars-parquet/src/parquet/write/stream.rs b/crates/polars-parquet/src/parquet/write/stream.rs new file mode 100644 index 000000000000..5ef3b32d2844 --- /dev/null +++ b/crates/polars-parquet/src/parquet/write/stream.rs @@ -0,0 +1,192 @@ +use std::io::Write; + +use futures::{AsyncWrite, AsyncWriteExt}; +use parquet_format_safe::thrift::protocol::TCompactOutputStreamProtocol; +use parquet_format_safe::{FileMetaData, RowGroup}; + +use super::row_group::write_row_group_async; +use super::{RowGroupIter, WriteOptions}; +use crate::parquet::error::{Error, Result}; +use crate::parquet::metadata::{KeyValue, SchemaDescriptor}; +use crate::parquet::write::indexes::{write_column_index_async, write_offset_index_async}; +use crate::parquet::write::page::PageWriteSpec; +use crate::parquet::write::State; +use crate::parquet::{FOOTER_SIZE, PARQUET_MAGIC}; + +async fn start_file(writer: &mut W) -> Result { + writer.write_all(&PARQUET_MAGIC).await?; + Ok(PARQUET_MAGIC.len() as u64) +} + +async fn end_file( + mut writer: &mut W, + metadata: FileMetaData, +) -> Result { + // Write file metadata + let mut protocol = TCompactOutputStreamProtocol::new(&mut writer); + let metadata_len = metadata.write_to_out_stream_protocol(&mut protocol).await? as i32; + + // Write footer + let metadata_bytes = metadata_len.to_le_bytes(); + let mut footer_buffer = [0u8; FOOTER_SIZE as usize]; + (0..4).for_each(|i| { + footer_buffer[i] = metadata_bytes[i]; + }); + + (&mut footer_buffer[4..]).write_all(&PARQUET_MAGIC)?; + writer.write_all(&footer_buffer).await?; + writer.flush().await?; + Ok(metadata_len as u64 + FOOTER_SIZE) +} + +/// An interface to write a parquet file asynchronously. +/// Use `start` to write the header, `write` to write a row group, +/// and `end` to write the footer. +pub struct FileStreamer { + writer: W, + schema: SchemaDescriptor, + options: WriteOptions, + created_by: Option, + + offset: u64, + row_groups: Vec, + page_specs: Vec>>, + /// Used to store the current state for writing the file + state: State, +} + +// Accessors +impl FileStreamer { + /// The options assigned to the file + pub fn options(&self) -> &WriteOptions { + &self.options + } + + /// The [`SchemaDescriptor`] assigned to this file + pub fn schema(&self) -> &SchemaDescriptor { + &self.schema + } +} + +impl FileStreamer { + /// Returns a new [`FileStreamer`]. + pub fn new( + writer: W, + schema: SchemaDescriptor, + options: WriteOptions, + created_by: Option, + ) -> Self { + Self { + writer, + schema, + options, + created_by, + offset: 0, + row_groups: vec![], + page_specs: vec![], + state: State::Initialised, + } + } + + /// Writes the header of the file. + /// + /// This is automatically called by [`Self::write`] if not called following [`Self::new`]. 
+    ///
+    /// # Errors
+    /// Returns an error if data has been written to the file.
+    async fn start(&mut self) -> Result<()> {
+        if self.offset == 0 {
+            self.offset = start_file(&mut self.writer).await? as u64;
+            self.state = State::Started;
+            Ok(())
+        } else {
+            Err(Error::InvalidParameter(
+                "Start cannot be called twice".to_string(),
+            ))
+        }
+    }
+
+    /// Writes a row group to the file.
+    pub async fn write<E>(&mut self, row_group: RowGroupIter<'_, E>) -> Result<()>
+    where
+        Error: From<E>,
+        E: std::error::Error,
+    {
+        if self.offset == 0 {
+            self.start().await?;
+        }
+
+        let ordinal = self.row_groups.len();
+        let (group, specs, size) = write_row_group_async(
+            &mut self.writer,
+            self.offset,
+            self.schema.columns(),
+            row_group,
+            ordinal,
+        )
+        .await?;
+        self.offset += size;
+        self.row_groups.push(group);
+        self.page_specs.push(specs);
+        Ok(())
+    }
+
+    /// Writes the footer of the parquet file. Returns the total size of the file and the
+    /// underlying writer.
+    pub async fn end(&mut self, key_value_metadata: Option<Vec<KeyValue>>) -> Result<u64> {
+        if self.offset == 0 {
+            self.start().await?;
+        }
+
+        if self.state != State::Started {
+            return Err(Error::InvalidParameter(
+                "End cannot be called twice".to_string(),
+            ));
+        }
+        // compute file stats
+        let num_rows = self.row_groups.iter().map(|group| group.num_rows).sum();
+
+        if self.options.write_statistics {
+            // write column indexes (require page statistics)
+            for (group, pages) in self.row_groups.iter_mut().zip(self.page_specs.iter()) {
+                for (column, pages) in group.columns.iter_mut().zip(pages.iter()) {
+                    let offset = self.offset;
+                    column.column_index_offset = Some(offset as i64);
+                    self.offset += write_column_index_async(&mut self.writer, pages).await?;
+                    let length = self.offset - offset;
+                    column.column_index_length = Some(length as i32);
+                }
+            }
+        };
+
+        // write offset index
+        for (group, pages) in self.row_groups.iter_mut().zip(self.page_specs.iter()) {
+            for (column, pages) in group.columns.iter_mut().zip(pages.iter()) {
+                let offset = self.offset;
+                column.offset_index_offset = Some(offset as i64);
+                self.offset += write_offset_index_async(&mut self.writer, pages).await?;
+                column.offset_index_length = Some((self.offset - offset) as i32);
+            }
+        }
+
+        let metadata = FileMetaData::new(
+            self.options.version.into(),
+            self.schema.clone().into_thrift(),
+            num_rows,
+            self.row_groups.clone(),
+            key_value_metadata,
+            self.created_by.clone(),
+            None,
+            None,
+            None,
+        );
+
+        let len = end_file(&mut self.writer, metadata).await?;
+        Ok(self.offset + len)
+    }
+
+    /// Returns the underlying writer.
+    pub fn into_inner(self) -> W {
+        self.writer
+    }
+}
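The `FileStreamer` above is driven as `new`, then `write` once per row group (`start` is invoked lazily on the first write), then `end`, then `into_inner`. The sketch below is not part of the patch; it only illustrates the 8-byte footer layout that `end_file` assembles, assuming the crate's `PARQUET_MAGIC` and `FOOTER_SIZE` constants carry the standard parquet values `b"PAR1"` and 8.

// Standalone sketch of the parquet footer: little-endian i32 metadata length, then the magic.
const PARQUET_MAGIC: [u8; 4] = *b"PAR1";
const FOOTER_SIZE: usize = 8;

fn footer(metadata_len: i32) -> [u8; FOOTER_SIZE] {
    let mut buf = [0u8; FOOTER_SIZE];
    // length of the thrift-encoded FileMetaData that precedes the footer
    buf[..4].copy_from_slice(&metadata_len.to_le_bytes());
    buf[4..].copy_from_slice(&PARQUET_MAGIC);
    buf
}

fn main() {
    assert_eq!(footer(1024), [0, 4, 0, 0, b'P', b'A', b'R', b'1']);
}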
diff --git a/crates/polars-utils/Cargo.toml b/crates/polars-utils/Cargo.toml
index c017f731739c..56a56493dce3 100644
--- a/crates/polars-utils/Cargo.toml
+++ b/crates/polars-utils/Cargo.toml
@@ -14,6 +14,7 @@ polars-error = { workspace = true }
 ahash = { workspace = true }
 bytemuck = { workspace = true }
 hashbrown = { workspace = true }
+indexmap = { workspace = true }
 num-traits = { workspace = true }
 once_cell = { workspace = true }
 rayon = { workspace = true }
diff --git a/crates/polars-utils/src/aliases.rs b/crates/polars-utils/src/aliases.rs
index a2ca71d1ff47..5ecb1b0033d9 100644
--- a/crates/polars-utils/src/aliases.rs
+++ b/crates/polars-utils/src/aliases.rs
@@ -2,3 +2,60 @@ use ahash::RandomState;
 
 pub type PlHashMap<K, V> = hashbrown::HashMap<K, V, RandomState>;
 pub type PlHashSet<V> = hashbrown::HashSet<V, RandomState>;
+pub type PlIndexMap<K, V> = indexmap::IndexMap<K, V, RandomState>;
+pub type PlIndexSet<K> = indexmap::IndexSet<K, RandomState>;
+
+pub trait InitHashMaps {
+    type HashMap;
+
+    fn new() -> Self::HashMap;
+
+    fn with_capacity(capacity: usize) -> Self::HashMap;
+}
+
+impl<K, V> InitHashMaps for PlHashMap<K, V> {
+    type HashMap = Self;
+
+    fn new() -> Self::HashMap {
+        Self::with_capacity_and_hasher(0, Default::default())
+    }
+
+    fn with_capacity(capacity: usize) -> Self {
+        Self::with_capacity_and_hasher(capacity, Default::default())
+    }
+}
+impl<K> InitHashMaps for PlHashSet<K> {
+    type HashMap = Self;
+
+    fn new() -> Self::HashMap {
+        Self::with_capacity_and_hasher(0, Default::default())
+    }
+
+    fn with_capacity(capacity: usize) -> Self {
+        Self::with_capacity_and_hasher(capacity, Default::default())
+    }
+}
+
+impl<K> InitHashMaps for PlIndexSet<K> {
+    type HashMap = Self;
+
+    fn new() -> Self::HashMap {
+        Self::with_capacity_and_hasher(0, Default::default())
+    }
+
+    fn with_capacity(capacity: usize) -> Self::HashMap {
+        Self::with_capacity_and_hasher(capacity, Default::default())
+    }
+}
+
+impl<K, V> InitHashMaps for PlIndexMap<K, V> {
+    type HashMap = Self;
+
+    fn new() -> Self::HashMap {
+        Self::with_capacity_and_hasher(0, Default::default())
+    }
+
+    fn with_capacity(capacity: usize) -> Self::HashMap {
+        Self::with_capacity_and_hasher(capacity, Default::default())
+    }
+}
diff --git a/py-polars/Cargo.lock b/py-polars/Cargo.lock
index 1c5c1d9140c6..08f6e965944a 100644
--- a/py-polars/Cargo.lock
+++ b/py-polars/Cargo.lock
@@ -1475,25 +1475,6 @@ dependencies = [
  "futures",
 ]
 
-[[package]]
-name = "parquet2"
-version = "0.17.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "579fe5745f02cef3d5f236bfed216fd4693e49e4e920a13475c6132233283bce"
-dependencies = [
- "async-stream",
- "brotli",
- "flate2",
- "futures",
- "lz4",
- "parquet-format-safe",
- "seq-macro",
- "snap",
- "streaming-decompression",
- "xxhash-rust",
- "zstd 0.12.4",
-]
-
 [[package]]
 name = "parse-zoneinfo"
 version = "0.3.0"
@@ -1668,7 +1649,6 @@ dependencies = [
  "arrow-format",
  "avro-schema",
  "object_store",
- "parquet2",
  "regex",
  "simdutf8",
  "thiserror",
@@ -1806,14 +1786,23 @@ name = "polars-parquet"
 version = "0.34.2"
 dependencies = [
  "ahash",
+ "async-stream",
  "base64",
+ "brotli",
  "ethnum",
+ "flate2",
  "futures",
+ "lz4",
  "num-traits",
- "parquet2",
+ "parquet-format-safe",
  "polars-arrow",
  "polars-error",
+ "polars-utils",
+ "seq-macro",
  "simdutf8",
+ "snap",
+ "streaming-decompression",
+ "zstd 0.12.4",
 ]
 
 [[package]]
@@ -1917,6 +1906,7 @@ dependencies = [
  "ahash",
  "bytemuck",
  "hashbrown 0.14.0",
+ "indexmap 2.0.0",
  "num-traits",
  "once_cell",
  "polars-error",
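The new `PlIndexMap`/`PlIndexSet` aliases added above mirror `PlHashMap`/`PlHashSet` but preserve insertion order, and `InitHashMaps` supplies `new`/`with_capacity` constructors that seed the ahash `RandomState`, which indexmap's own constructors (tied to its default hasher) do not. A small usage sketch, not part of the patch, assuming the aliases are exposed at `polars_utils::aliases` and a dependency on the polars-utils crate:

use polars_utils::aliases::{InitHashMaps, PlIndexMap};

fn main() {
    // `with_capacity` resolves to the new `InitHashMaps` trait and builds the map
    // with the ahash-based `RandomState` used across Polars.
    let mut columns: PlIndexMap<String, usize> = PlIndexMap::with_capacity(2);
    columns.insert("a".to_string(), 0);
    columns.insert("b".to_string(), 1);
    // Unlike `PlHashMap`, iteration follows insertion order.
    let names: Vec<String> = columns.keys().cloned().collect();
    assert_eq!(names, ["a", "b"]);
}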