Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

rdataframe to awkward #1474

Merged
merged 39 commits into from
Jun 16, 2022
Merged
Show file tree
Hide file tree
Changes from 36 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
30c9749
move PR1448 to a clean branch
ianna May 17, 2022
1c8b62f
fix pylint warning
ianna May 17, 2022
a267dd5
use array builder - 5x times faster than filling std containers
ianna May 19, 2022
c1af59d
specializations of build array
ianna May 19, 2022
80a21e3
disable failing test to debug
ianna May 19, 2022
b16aed4
cleanup
ianna May 19, 2022
c842fef
fix issue with entry type
ianna May 20, 2022
eab79dc
nested iterable and more tests
ianna May 24, 2022
b112879
move pure c++ code out of python string to a separate header file
ianna May 25, 2022
4c72eb4
add more tests
ianna May 30, 2022
f053710
introduce layout builder and fix its handling of complex numbers
ianna May 31, 2022
f08348c
NumpyForm and from_buffers
ianna Jun 9, 2022
26b9589
remove dependecy on layout builder
ianna Jun 9, 2022
1406013
return nested forms for nested data
ianna Jun 9, 2022
7377802
pythonize buffers
ianna Jun 13, 2022
99e23f7
copy data and offsets to ndarrays
ianna Jun 14, 2022
41ac1a2
enable all tests
ianna Jun 14, 2022
be1136c
make layouts directly, do not use from_buffers
ianna Jun 15, 2022
fe3934f
revert changes to Forth
ianna Jun 15, 2022
381acc5
revert changes to LayoutBuilder
ianna Jun 15, 2022
eb2f919
fix hash in to_rdataframe
ianna Jun 15, 2022
1b1b56b
nested iterable and more tests
ianna May 24, 2022
da84125
introduce layout builder and fix its handling of complex numbers
ianna May 31, 2022
c96a19b
remove dependecy on layout builder
ianna Jun 9, 2022
076656d
return nested forms for nested data
ianna Jun 9, 2022
fb3fc13
pythonize buffers
ianna Jun 13, 2022
8ac2ceb
copy data and offsets to ndarrays
ianna Jun 14, 2022
223748a
make layouts directly, do not use from_buffers
ianna Jun 15, 2022
4a65063
revert changes to Forth
ianna Jun 15, 2022
aa34998
revert changes to LayoutBuilder
ianna Jun 15, 2022
da2e8b7
correct merge
ianna Jun 15, 2022
72bec58
introduce local cache
ianna Jun 15, 2022
7d783dc
enable large scale test
ianna Jun 15, 2022
4df29e1
fix typo
ianna Jun 15, 2022
c390219
move hash function to _util, final cleanup
ianna Jun 16, 2022
eb6f36f
fix pylint warning
ianna Jun 16, 2022
a9188a5
address Jim's comments
ianna Jun 16, 2022
a6c8c93
remove column_as_records flag
ianna Jun 16, 2022
6c270d5
move the header to its own directory
ianna Jun 16, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 1 addition & 9 deletions src/awkward/_v2/_connect/cling.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE

import base64
import ctypes
import struct
import json
import re

Expand Down Expand Up @@ -466,13 +464,7 @@ def form_from_identifier(cls, form):
raise ak._v2._util.error(NotImplementedError("TODO: identifiers in C++"))

def class_type_suffix(self, key):
# NOTE(review): this is a diff view whose +/- markers were lost. Judging by the
# "1 addition & 9 deletions" header, the multi-line `return (...)` right below
# appears to be the REMOVED implementation and the final `return` the ADDED one.
#
# Removed variant: packs hash(key) with struct format "q", base64-encodes the
# bytes, strips trailing "=" / newline characters, drops "+" and "/", and
# decodes the remainder as ASCII.
return (
base64.encodebytes(struct.pack("q", hash(key)))
.rstrip(b"=\n")
.replace(b"+", b"")
.replace(b"/", b"")
.decode("ascii")
)
# Added variant: delegates the computation to ak._v2._util.string_hash(key).
return ak._v2._util.string_hash(key)

def _generate_common(self, key):
params = [
Expand Down
281 changes: 281 additions & 0 deletions src/awkward/_v2/_connect/rdataframe/ak_array_builders.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,281 @@
// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE

#ifndef AWKWARD_ARRAY_BUILDERS_H_
#define AWKWARD_ARRAY_BUILDERS_H_

#include <stdlib.h>

#include <complex>
#include <cstdint>
#include <iterator>
#include <sstream>
#include <string>
#include <type_traits>
#include <typeinfo>
#include <utility>
#include <vector>


namespace awkward {

/// @brief Name of the type `T` as a string.
///
/// The primary template falls back to the implementation-defined
/// `typeid(T).name()`. The explicit specializations that follow return fixed
/// names for `bool`, the fixed-width integer types, `float`, `double` and
/// `char`.
///
/// @tparam T Type whose name is requested.
/// @return The name of `T`.
template <typename T>
const std::string
type_to_name() {
  return typeid(T).name();
}

// A full specialization of a function template is an ordinary function, not a
// template. Defined in a header it must therefore be `inline`, otherwise
// including this header from more than one translation unit violates the
// one-definition rule (duplicate symbols at link time).

/// Specialization for `bool`.
template <>
inline const std::string
type_to_name<bool>() {
  return "bool";
}

/// Specialization for `int8_t`.
template <>
inline const std::string
type_to_name<int8_t>() {
  return "int8";
}

/// Specialization for `int16_t`.
template <>
inline const std::string
type_to_name<int16_t>() {
  return "int16";
}

/// Specialization for `int32_t`.
template <>
inline const std::string
type_to_name<int32_t>() {
  return "int32";
}

/// Specialization for `int64_t`.
template <>
inline const std::string
type_to_name<int64_t>() {
  return "int64";
}

/// Specialization for `uint8_t`.
template <>
inline const std::string
type_to_name<uint8_t>() {
  return "uint8";
}

/// Specialization for `uint16_t`.
template <>
inline const std::string
type_to_name<uint16_t>() {
  return "uint16";
}

/// Specialization for `uint32_t`.
template <>
inline const std::string
type_to_name<uint32_t>() {
  return "uint32";
}

/// Specialization for `uint64_t`.
template <>
inline const std::string
type_to_name<uint64_t>() {
  return "uint64";
}

/// Specialization for `float`.
template <>
inline const std::string
type_to_name<float>() {
  return "float32";
}

/// Specialization for `double`.
template <>
inline const std::string
type_to_name<double>() {
  return "float64";
}

/// Specialization for `char` (note the plural: the name is "chars").
template <>
inline const std::string
type_to_name<char>() {
  return "chars";
}

/// Detection trait, primary case: a type is not considered iterable unless
/// the partial specialization below matches it.
template <typename, typename = void>
constexpr bool is_iterable = false;

/// Matches every `Container` for which both `std::declval<Container>().begin()`
/// and `std::declval<Container>().end()` are well-formed expressions.
template <typename Container>
constexpr bool is_iterable<
    Container,
    std::void_t<decltype(std::declval<Container>().begin()),
                decltype(std::declval<Container>().end())>> = true;

/// @brief Flattens the contents of a `ROOT::RDF::RResultPtr<std::vector<T>>`
///        into one contiguous data buffer plus one offsets buffer per level
///        of nesting.
///
/// The `offsets_and_flatten_N` members walk the result and fill the internal
/// buffers; `copy_offsets` / `copy_data` then copy those buffers into memory
/// supplied by the caller.
///
/// @tparam T    Element type of the result vector (possibly itself a range).
/// @tparam DATA Type of the flattened leaf values.
template <typename T, typename DATA>
class CppBuffers {
public:
  /// Stores a reference to @p result; the referenced object must outlive
  /// this instance.
  CppBuffers(ROOT::RDF::RResultPtr<std::vector<T>>& result)
    : result_(result) {
    offsets_.reserve(3);
    data_.reserve(1024);
  }

  ~CppBuffers() {
  }

  /// @return Number of entries in the offsets buffer of nesting @p level.
  ///         @p level is not range-checked.
  int64_t
  offsets_length(int64_t level) {
    return static_cast<int64_t>(offsets_[level].size());
  }

  /// @return Number of flattened leaf values collected so far.
  int64_t
  data_length() {
    return static_cast<int64_t>(data_.size());
  }

  /// Copies the offsets of nesting @p level into @p to_buffer.
  ///
  /// @param to_buffer Destination, interpreted as an array of `int64_t`.
  /// @param length    Capacity of @p to_buffer in elements; at most this many
  ///                  values are written, so a short buffer is never overrun.
  /// @param level     Index of the offsets buffer to copy (not range-checked).
  void copy_offsets(void* to_buffer, int64_t length, int64_t level) {
    auto ptr = reinterpret_cast<int64_t*>(to_buffer);
    int64_t i = 0;
    for (auto const& it : offsets_[level]) {
      if (i >= length) {
        break;
      }
      ptr[i++] = it;
    }
  }

  /// Copies the flattened leaf values into @p to_buffer.
  ///
  /// @param to_buffer Destination, interpreted as an array of `DATA`.
  /// @param length    Capacity of @p to_buffer in elements; at most this many
  ///                  values are written, so a short buffer is never overrun.
  void copy_data(void* to_buffer, int64_t length) {
    auto ptr = reinterpret_cast<DATA*>(to_buffer);
    int64_t i = 0;
    for (auto const& it : data_) {
      if (i >= length) {
        break;
      }
      ptr[i++] = it;
    }
  }

  /// Flattens a result with one level of nesting (a range of ranges).
  ///
  /// Appends one offsets buffer and the leaf values to the internal buffers.
  ///
  /// @return {number of offsets buffers stored, length of the first one}.
  std::pair<int64_t, int64_t>
  offsets_and_flatten_2() {
    int64_t i = 0;
    std::vector<int64_t> offsets;
    offsets.reserve(1024);
    for (auto const& vec : result_) {
      offsets.emplace_back(i);
      i += vec.size();
      data_.insert(data_.end(), vec.begin(), vec.end());
    }
    offsets.emplace_back(i);

    offsets_.emplace_back(std::move(offsets));

    return {static_cast<int64_t>(offsets_.size()), static_cast<int64_t>(offsets_[0].size())};
  }

  /// Flattens a result with two levels of nesting.
  ///
  /// Appends two offsets buffers (outer first) and the leaf values to the
  /// internal buffers.
  ///
  /// @return {number of offsets buffers stored, length of the first one}.
  std::pair<int64_t, int64_t>
  offsets_and_flatten_3() {
    int64_t i = 0;
    int64_t j = 0;
    std::vector<int64_t> offsets;
    offsets.reserve(1024);
    std::vector<int64_t> inner_offsets;
    inner_offsets.reserve(1024);
    for (auto const& vec_of_vecs : result_) {
      offsets.emplace_back(i);
      i += vec_of_vecs.size();

      for (auto const& vec : vec_of_vecs) {
        inner_offsets.emplace_back(j);
        j += vec.size();
        data_.insert(data_.end(), vec.begin(), vec.end());
      }
    }
    // Each buffer holds one start per list plus exactly ONE closing offset.
    // Closing the inner buffer inside the outer loop would add a duplicate
    // (an extra empty list) for every outer entry after the first.
    offsets.emplace_back(i);
    inner_offsets.emplace_back(j);

    offsets_.emplace_back(std::move(offsets));
    offsets_.emplace_back(std::move(inner_offsets));

    return {static_cast<int64_t>(offsets_.size()), static_cast<int64_t>(offsets_[0].size())};
  }

  /// Flattens a result with three levels of nesting.
  ///
  /// Appends three offsets buffers (outermost first) and the leaf values to
  /// the internal buffers.
  ///
  /// @return {number of offsets buffers stored, length of the first one}.
  std::pair<int64_t, int64_t>
  offsets_and_flatten_4() {
    int64_t i = 0;
    int64_t j = 0;
    int64_t k = 0;
    std::vector<int64_t> offsets;
    std::vector<int64_t> inner_offsets;
    std::vector<int64_t> inner_inner_offsets;
    for (auto const& vec_of_vecs_of_vecs : result_) {
      offsets.emplace_back(i);
      i += vec_of_vecs_of_vecs.size();

      for (auto const& vec_of_vecs : vec_of_vecs_of_vecs) {
        inner_offsets.emplace_back(j);
        j += vec_of_vecs.size();

        for (auto const& vec : vec_of_vecs) {
          inner_inner_offsets.emplace_back(k);
          k += vec.size();
          data_.insert(data_.end(), vec.begin(), vec.end());
        }
      }
    }
    // One closing offset per buffer, written after all loops have finished
    // (see offsets_and_flatten_3 for why it must not be inside a loop).
    offsets.emplace_back(i);
    inner_offsets.emplace_back(j);
    inner_inner_offsets.emplace_back(k);

    offsets_.emplace_back(std::move(offsets));
    offsets_.emplace_back(std::move(inner_offsets));
    offsets_.emplace_back(std::move(inner_inner_offsets));

    return {static_cast<int64_t>(offsets_.size()), static_cast<int64_t>(offsets_[0].size())};
  }

  /// Copies a flat (non-nested) result into a freshly allocated array.
  ///
  /// @return {number of elements, pointer to a `new DATA[size]` array}.
  ///         This class never frees the array, so the caller is responsible
  ///         for releasing it with `delete[]`.
  std::pair<int64_t, void*>
  create_array() {
    int64_t size = result_->size();
    DATA* ptr = new DATA[size];
    int64_t i = 0;
    for (auto const& it : result_) {
      ptr[i++] = it;
    }
    return {size, ptr};
  }

private:
  ROOT::RDF::RResultPtr<std::vector<T>>& result_;  ///< Source of the values (not owned).
  std::vector<std::vector<int64_t>> offsets_;      ///< One offsets buffer per nesting level.
  std::vector<DATA> data_;                         ///< Flattened leaf values.
};

/// Type trait reporting whether `Candidate` is an instantiation of the class
/// template `Template`. Primary case: it is not.
template <typename Candidate, template <typename...> class Template>
struct is_specialization : std::false_type {};

/// Selected when the first argument has the form `Template<Args...>`.
template <template <typename...> class Template, typename... Args>
struct is_specialization<Template<Args...>, Template> : std::true_type {};

/// @brief Builds a JSON description ("form") of the array layout that
///        corresponds to the C++ type `T`.
///
/// - If the `typeid` name of `T` contains "awkward", returns "awkward type".
/// - Arithmetic types yield a "NumpyArray" entry whose primitive comes from
///   `type_to_name<T>()`; `char` is emitted as "uint8" with the
///   `__array__: char` parameter.
/// - `std::complex<...>` yields a "NumpyArray" entry with primitive
///   "complex128".
/// - Types with `begin()` / `end()` yield a "ListOffsetArray" entry with
///   64-bit offsets whose content is the form of `T::value_type`; a
///   `value_type` of `char` adds the `__array__: string` parameter.
/// - Anything else returns "unsupported type".
///
/// @tparam T           Type to describe.
/// @param  form_key_id Number used in this node's "form_key" ("node<N>");
///                     the nested content receives `form_key_id + 1`.
/// @return The JSON text, or one of the two marker strings listed above.
template <typename T>
std::string
type_to_form(int64_t form_key_id) {
  if (std::string(typeid(T).name()).find("awkward") != std::string::npos) {
    return std::string("awkward type");
  }

  const std::string form_key = "node" + std::to_string(form_key_id);

  // `if constexpr` discards the branches that do not apply to T. With a plain
  // `if`, `typename T::value_type` would be instantiated for arithmetic types
  // too and the function would not compile for e.g. T = double.
  if constexpr (std::is_arithmetic<T>::value) {
    std::string parameters(type_to_name<T>() + "\",");
    if (std::is_same<T, char>::value) {
      parameters = std::string("uint8\", \"parameters\": { \"__array__\": \"char\" }, ");
    }
    return "{\"class\": \"NumpyArray\", \"primitive\": \""
      + parameters + "\"form_key\": \"" + form_key + "\"}";
  }
  else if constexpr (is_specialization<T, std::complex>::value) {
    return "{\"class\": \"NumpyArray\", \"primitive\": \"complex128\", \"form_key\": \""
      + form_key + "\"}";
  }
  else if constexpr (is_iterable<T>) {
    using value_type = typename T::value_type;

    std::string parameters("");
    if (std::is_same<value_type, char>::value) {
      parameters = std::string(" \"parameters\": { \"__array__\": \"string\" }, ");
    }
    return "{\"class\": \"ListOffsetArray\", \"offsets\": \"i64\", \"content\":"
      + type_to_form<value_type>(form_key_id + 1)
      + ", " + parameters + "\"form_key\": \"" + form_key + "\"}";
  }
  else {
    return "unsupported type";
  }
}

}

#endif // AWKWARD_ARRAY_BUILDERS_H_
Loading