Skip to content

Commit

Permalink
Add array_frequency Presto function (facebookincubator#3807)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: facebookincubator#3807

Implement `array_frequency` in Velox with the same signature [as Presto](https://prestodb.io/docs/current/functions/array.html#array_frequency).

Fixes facebookincubator#3752

Reviewed By: Yuhta

Differential Revision: D42712223

fbshipit-source-id: cf29ac047b6988b5fd72d320887359064ab7573a
  • Loading branch information
vermapratyush authored and facebook-github-bot committed Jan 26, 2023
1 parent 534149a commit cf55fb5
Show file tree
Hide file tree
Showing 4 changed files with 140 additions and 0 deletions.
10 changes: 10 additions & 0 deletions velox/docs/functions/array.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,16 @@ Array Functions
SELECT array_except(ARRAY [1, 2, 2], ARRAY [1, 3, 4]); -- [2]
SELECT array_except(ARRAY [1, NULL, NULL], ARRAY [1, 1, NULL]); -- []

.. function:: array_frequency(array(E) x) -> map(E, int)

Returns a map whose keys are the unique elements of the array and whose values are the number of times each key appears.
Null elements are ignored. An empty array returns an empty map. E must be bigint or varchar. ::

SELECT array_frequency(ARRAY [1, 1, 2, 2, 2, 2]); -- {1 -> 2, 2 -> 4}
SELECT array_frequency(ARRAY [1, 1, NULL, NULL, NULL]); -- {1 -> 2}
SELECT array_frequency(ARRAY ['knock', 'knock', 'who', '?']); -- {"knock" -> 2, "who" -> 1, "?" -> 1}
SELECT array_frequency(ARRAY []); -- {}

.. function:: array_has_duplicates(array(E)) -> boolean

Returns a boolean: whether array has any elements that occur more than once.
Expand Down
26 changes: 26 additions & 0 deletions velox/functions/prestosql/ArrayFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -399,4 +399,30 @@ struct ArrayHasDuplicatesFunction {
}
};

// array_frequency: array<T> -> map<T, int>, where T is {"bigint", "varchar"}.
// Maps every distinct non-null element of the input array to the number of
// times that element occurs in the array.
template <typename TExecParams, typename T>
struct ArrayFrequencyFunction {
  VELOX_DEFINE_FUNCTION_TYPES(TExecParams);

  // out: receives one (element, count) entry per distinct non-null element.
  // inputArray: the array whose elements are counted.
  FOLLY_ALWAYS_INLINE void call(
      out_type<velox::Map<T, int>>& out,
      arg_type<velox::Array<T>> inputArray) {
    // Running occurrence count keyed by element.
    folly::F14FastMap<arg_type<T>, int> counts;

    if (!inputArray.mayHaveNulls()) {
      // Fast path: the array reports no nulls, so each element is unwrapped
      // directly without a per-element null check.
      for (const auto& element : inputArray) {
        ++counts[element.value()];
      }
    } else {
      // Nulls never contribute to the result; iterate over the non-null
      // elements only.
      for (const auto& element : inputArray.skipNulls()) {
        ++counts[element];
      }
    }

    out.copy_from(counts);
  }
};

} // namespace facebook::velox::functions
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,14 @@ inline void registerArrayHasDuplicatesFunctions() {
Array<T>>({"array_has_duplicates"});
}

template <typename T>
inline void registerArrayFrequencyFunctions() {
registerFunction<
ParameterBinder<ArrayFrequencyFunction, T>,
Map<T, int>,
Array<T>>({"array_frequency"});
}

void registerArrayFunctions() {
registerArrayConstructor("array_constructor");
VELOX_REGISTER_VECTOR_FUNCTION(udf_array_distinct, "array_distinct");
Expand Down Expand Up @@ -120,5 +128,8 @@ void registerArrayFunctions() {
registerArrayHasDuplicatesFunctions<int32_t>();
registerArrayHasDuplicatesFunctions<int64_t>();
registerArrayHasDuplicatesFunctions<Varchar>();

registerArrayFrequencyFunctions<int64_t>();
registerArrayFrequencyFunctions<Varchar>();
}
}; // namespace facebook::velox::functions
93 changes: 93 additions & 0 deletions velox/functions/prestosql/tests/ArrayFrequencyTest.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "velox/functions/prestosql/tests/utils/FunctionBaseTest.h"

using namespace facebook::velox;
using namespace facebook::velox::test;

namespace {
// Test fixture for the array_frequency function.
class ArrayFrequencyTest : public functions::test::FunctionBaseTest {
 protected:
  // Evaluates `expression` over a row vector built from `input` and asserts
  // that the resulting vector equals `expected`.
  void testExpr(
      const VectorPtr& expected,
      const std::string& expression,
      const std::vector<VectorPtr>& input) {
    assertEqualVectors(
        expected, evaluate<BaseVector>(expression, makeRowVector(input)));
  }
};
} // namespace

// Checks array_frequency over bigint arrays, covering an empty array, arrays
// containing nulls, negative values and the maximum int64 value.
TEST_F(ArrayFrequencyTest, integerArray) {
  constexpr int64_t kMax = std::numeric_limits<int64_t>::max();

  auto input = makeNullableArrayVector<int64_t>({
      {2, 1, 1, -2},
      {},
      {1, 2, 1, 1, 1, 1},
      {-1, std::nullopt, -1, -1},
      {kMax, kMax, 1, std::nullopt, 0, 1, std::nullopt, 0},
  });

  // Keys and counts are laid out row by row; the second input row is empty
  // and therefore contributes no entries.
  auto keys = makeFlatVector<int64_t>({
      // Row 0.
      1,
      2,
      -2,
      // Row 1: no keys.
      // Row 2.
      1,
      2,
      // Row 3.
      -1,
      // Row 4.
      kMax,
      1,
      0,
  });
  auto counts = makeFlatVector<int>({
      // Row 0.
      2,
      1,
      1,
      // Row 1: no values.
      // Row 2.
      5,
      1,
      // Row 3.
      3,
      // Row 4.
      2,
      2,
      2,
  });

  // One starting offset into keys/counts per input row.
  auto expectedMap = makeMapVector({0, 3, 3, 5, 6}, keys, counts);

  testExpr(expectedMap, "array_frequency(C0)", {input});
}

// Checks array_frequency over varchar arrays, covering an empty array and an
// array containing a null element.
TEST_F(ArrayFrequencyTest, varcharArray) {
  using SV = StringView;

  auto input = makeNullableArrayVector<StringView>({
      {SV("hello"), SV("world"), SV("!"), SV("!"), SV("!")},
      {},
      {SV("hello"), SV("world"), std::nullopt, SV("!"), SV("!")},
  });

  // Three entries for the first row, none for the empty second row, and
  // three for the last row.
  auto keys = makeFlatVector<StringView>({
      SV("hello"),
      SV("world"),
      SV("!"),
      SV("hello"),
      SV("world"),
      SV("!"),
  });
  auto counts = makeFlatVector<int>({1, 1, 3, 1, 1, 2});

  // One starting offset into keys/counts per input row.
  auto expectedMap = makeMapVector({0, 3, 3}, keys, counts);

  testExpr(expectedMap, "array_frequency(C0)", {input});
}

0 comments on commit cf55fb5

Please sign in to comment.