From b8f5b6f08c54c6632496e8efcd822b3afe00ce6b Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 23 Jan 2025 18:32:17 +0000 Subject: [PATCH 1/3] feat: allow setting custom value data type in `PrimitiveDictionaryBuilder` Fixes #7011 --- .../builder/primitive_dictionary_builder.rs | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index 282f0ae9d5b1..d08f76723ca6 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -168,6 +168,22 @@ where map: HashMap::with_capacity(values_capacity), } } + + /// By default the [`PrimitiveBuilder`] used in the `values_builder` uses [`ArrowPrimitiveType::DATA_TYPE`] as the + /// data type of the generated array. + /// + /// This method allows overriding the data type, to allow specifying timezones + /// for [`DataType::Timestamp`] or precision and scale for [`DataType::Decimal128`] and [`DataType::Decimal256`] + /// + /// # Panics + /// + /// This method panics if `value_data_type` is not [PrimitiveArray::is_compatible] + pub fn with_value_data_type(self, value_data_type: DataType) -> Self { + Self { + values_builder: self.values_builder.with_data_type(value_data_type), + ..self + } + } } impl ArrayBuilder for PrimitiveDictionaryBuilder @@ -633,4 +649,17 @@ mod tests { assert_eq!(values, [None, None]); } + + #[test] + fn creating_dictionary_with_custom_value_type_should_work() { + let value_data_type = DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, Some("+08:00".into())); + let data_type = + DataType::Dictionary(Box::new(DataType::Int32), Box::new(value_data_type.clone())); + + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(1, 2) + .with_value_data_type(value_data_type); + + assert_eq!(builder.finish().data_type(), &data_type); + } } From f26e63642f2437c9ee67ae9cb12c7f0cf8262a0a Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 23 Jan 2025 18:49:00 +0000 Subject: [PATCH 2/3] use the values capacity for the hash map --- .../builder/primitive_dictionary_builder.rs | 40 +++++++------------ 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index d08f76723ca6..26f2b24bccd1 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -144,7 +144,7 @@ where ) -> Self { let keys = keys_builder.values_slice(); let values = values_builder.values_slice(); - let mut map = HashMap::with_capacity(values.len()); + let mut map = HashMap::with_capacity(values_builder.capacity()); keys.iter().zip(values.iter()).for_each(|(key, value)| { map.insert(Value(*value), K::Native::to_usize(*key).unwrap()); @@ -168,22 +168,6 @@ where map: HashMap::with_capacity(values_capacity), } } - - /// By default the [`PrimitiveBuilder`] used in the `values_builder` uses [`ArrowPrimitiveType::DATA_TYPE`] as the - /// data type of the generated array. - /// - /// This method allows overriding the data type, to allow specifying timezones - /// for [`DataType::Timestamp`] or precision and scale for [`DataType::Decimal128`] and [`DataType::Decimal256`] - /// - /// # Panics - /// - /// This method panics if `value_data_type` is not [PrimitiveArray::is_compatible] - pub fn with_value_data_type(self, value_data_type: DataType) -> Self { - Self { - values_builder: self.values_builder.with_data_type(value_data_type), - ..self - } - } } impl ArrayBuilder for PrimitiveDictionaryBuilder @@ -651,15 +635,19 @@ mod tests { } #[test] - fn creating_dictionary_with_custom_value_type_should_work() { - let value_data_type = DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, Some("+08:00".into())); - let data_type = - DataType::Dictionary(Box::new(DataType::Int32), Box::new(value_data_type.clone())); - - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(1, 2) - .with_value_data_type(value_data_type); + fn creating_dictionary_from_builders_should_use_values_capacity_for_the_map() { + let builder = unsafe { + PrimitiveDictionaryBuilder::::new_from_builders( + PrimitiveBuilder::with_capacity(1).with_data_type(DataType::Int32), + PrimitiveBuilder::with_capacity(2).with_data_type(DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, Some("+08:00".into()))), + ) + }; - assert_eq!(builder.finish().data_type(), &data_type); + assert!( + builder.map.capacity() >= builder.values_builder.capacity(), + "map capacity {} should be at least the values capacity {}", + builder.map.capacity(), + builder.values_builder.capacity() + ) } } From a52f533dfc7e0e44cfbe73fc693640bdbb0d31e5 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 23 Jan 2025 19:00:00 +0000 Subject: [PATCH 3/3] update new_from_empty_builders and not new_from_builders --- .../src/builder/primitive_dictionary_builder.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index 26f2b24bccd1..f4a6662462e0 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -126,10 +126,11 @@ where keys_builder.is_empty() && values_builder.is_empty(), "keys and values builders must be empty" ); + let values_capacity = values_builder.capacity(); Self { keys_builder, values_builder, - map: HashMap::new(), + map: HashMap::with_capacity(values_capacity), } } @@ -144,7 +145,7 @@ where ) -> Self { let keys = keys_builder.values_slice(); let values = values_builder.values_slice(); - let mut map = HashMap::with_capacity(values_builder.capacity()); + let mut map = HashMap::with_capacity(values.len()); keys.iter().zip(values.iter()).for_each(|(key, value)| { map.insert(Value(*value), K::Native::to_usize(*key).unwrap()); @@ -636,12 +637,10 @@ mod tests { #[test] fn creating_dictionary_from_builders_should_use_values_capacity_for_the_map() { - let builder = unsafe { - PrimitiveDictionaryBuilder::::new_from_builders( + let builder = PrimitiveDictionaryBuilder::::new_from_empty_builders( PrimitiveBuilder::with_capacity(1).with_data_type(DataType::Int32), PrimitiveBuilder::with_capacity(2).with_data_type(DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, Some("+08:00".into()))), - ) - }; + ); assert!( builder.map.capacity() >= builder.values_builder.capacity(),