Skip to content

Commit

Permalink
[cpp] Non-const StringReader methods
Browse files Browse the repository at this point in the history
- Make StringReader methods non-const so that data can be lazily read
  on first request.
- Also simplifies the ColwiseRank wrapper setup
- Ultimately this whole interface should be overhauled, but this is an
  okay stopgap
  • Loading branch information
bnprks committed Oct 19, 2023
1 parent f4ac273 commit 3711a40
Show file tree
Hide file tree
Showing 16 changed files with 104 additions and 88 deletions.
4 changes: 2 additions & 2 deletions src/R_array_io.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,11 @@ NumericVector convert_ulong_to_numeric(const NumericVector &ulong_vec) {
// and longs represented as doubles

RcppStringReader::RcppStringReader(const StringVector &data) : data(data) {}
const char *RcppStringReader::get(uint64_t idx) const {
// Return the string at position `idx`, or NULL when `idx` is out of range.
//
// The bounds check is done in unsigned space: casting `idx` to a signed type
// would turn any value >= 2^63 into a negative number that compares less than
// the size and is then used as an index.
const char *RcppStringReader::get(uint64_t idx) {
    if (idx < static_cast<uint64_t>(data.size())) return data[idx];
    return NULL;
}
uint64_t RcppStringReader::size() const { return data.size(); }
uint64_t RcppStringReader::size() { return data.size(); }

S4ReaderBuilder::S4ReaderBuilder(S4 s4, uint32_t load_size) : s4(s4), load_size(load_size) {}
UIntReader S4ReaderBuilder::openUIntReader(std::string name) {
Expand Down
4 changes: 2 additions & 2 deletions src/R_array_io.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ class RcppStringReader : public BPCells::StringReader {

public:
RcppStringReader(const Rcpp::StringVector &data);
const char *get(uint64_t idx) const override;
uint64_t size() const override;
const char *get(uint64_t idx) override;
uint64_t size() override;
};

class S4ReaderBuilder : public BPCells::ReaderBuilder {
Expand Down
4 changes: 2 additions & 2 deletions src/arrayIO/array_interfaces.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
namespace BPCells {

VecStringReader::VecStringReader(std::vector<std::string> data) : data(data) {}
const char *VecStringReader::get(uint64_t idx) const {
// Return the string stored at position `idx` as a C string, or NULL when
// `idx` is at or beyond the number of stored strings.
const char *VecStringReader::get(uint64_t idx) {
    const bool in_range = idx < data.size();
    return in_range ? data[idx].c_str() : NULL;
}
uint64_t VecStringReader::size() const { return data.size(); }
uint64_t VecStringReader::size() { return data.size(); }

} // end namespace BPCells
13 changes: 7 additions & 6 deletions src/arrayIO/array_interfaces.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,15 @@ namespace BPCells {
class StringReader {
public:
virtual ~StringReader() = default;
virtual const char *get(uint64_t idx) const = 0;
virtual uint64_t size() const = 0;
virtual const char *get(uint64_t idx) = 0;
virtual uint64_t size() = 0;
};

class StringWriter {
public:
virtual ~StringWriter() = default;
virtual void write(const StringReader &reader) = 0;
virtual void write(StringReader &reader) = 0;
void write(StringReader &&reader) { write(reader); }
};

// Simple generic StringReader designed to allow for transparent reading
Expand All @@ -30,13 +31,13 @@ class VecStringReader : public StringReader {

public:
VecStringReader(std::vector<std::string> data);
const char *get(uint64_t idx) const override;
uint64_t size() const override;
const char *get(uint64_t idx) override;
uint64_t size() override;
};

class NullStringWriter : public StringWriter {
public:
void write(const StringReader &reader) override {}
void write(StringReader &reader) override {}
};

template <class T> class BulkNumReader {
Expand Down
25 changes: 17 additions & 8 deletions src/arrayIO/binaryfile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,34 @@

namespace BPCells {

FileStringReader::FileStringReader(std_fs::path path) : data(readLines(path)) {}
const char *FileStringReader::get(uint64_t idx) const {
FileStringReader::FileStringReader(std_fs::path path) : path(path) {}

// Populate `data` from the file at `path` the first time it is needed.
// Once `data_ready` is set, later calls return immediately.
inline void FileStringReader::ensureDataReady() {
    if (data_ready) return;
    data = readLines(path);
    data_ready = true;
}
// Return line `idx` as a C string, or NULL when `idx` is past the last line.
// Loads the file contents first if they have not been loaded yet.
const char *FileStringReader::get(uint64_t idx) {
    ensureDataReady();
    const bool in_range = idx < data.size();
    return in_range ? data[idx].c_str() : NULL;
}

uint64_t FileStringReader::size() const { return data.size(); }
// Number of lines available from this reader.
// Loads the file contents first if they have not been loaded yet.
uint64_t FileStringReader::size() {
    ensureDataReady();
    const uint64_t line_count = data.size();
    return line_count;
}

FileStringWriter::FileStringWriter(std_fs::path path) : path(path) {}
void FileStringWriter::write(const StringReader &reader) {
void FileStringWriter::write(StringReader &reader) {
std::ofstream f(path.c_str());
uint64_t i = 0;
while (true) {
const char *s = reader.get(i);
if (s == NULL) break;
while (*s != '\0') {
f.put(*s);
s++;
}

f.write(s, strlen(s));
f.put('\n');
i += 1;
}
Expand Down
9 changes: 6 additions & 3 deletions src/arrayIO/binaryfile.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,12 +125,15 @@ std::vector<std::string> readLines(std_fs::path path);

class FileStringReader final : public StringReader {
private:
bool data_ready = false;
std_fs::path path;
std::vector<std::string> data;

inline void ensureDataReady();
public:
FileStringReader(std_fs::path path);
const char *get(uint64_t idx) const override;
uint64_t size() const override;
const char *get(uint64_t idx) override;
uint64_t size() override;
};

class FileStringWriter final : public StringWriter {
Expand All @@ -139,7 +142,7 @@ class FileStringWriter final : public StringWriter {

public:
FileStringWriter(std_fs::path path);
void write(const StringReader &reader) override;
void write(StringReader &reader) override;
};

class FileWriterBuilder final : public WriterBuilder {
Expand Down
26 changes: 15 additions & 11 deletions src/arrayIO/hdf5.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,41 +3,45 @@

namespace BPCells {

H5StringReader::H5StringReader(const HighFive::Group &group, std::string path) {
HighFive::SilenceHDF5 s;
H5StringReader::H5StringReader(const HighFive::Group &group, std::string path) : dataset(group.getDataSet(path)) {}

HighFive::DataSet d(group.getDataSet(path));
HighFive::DataType type = d.getDataType();
// Load the dataset's strings into `data` on first use; later calls are no-ops.
//
// Variable-length string datasets are read directly into `data`. Other string
// datasets are read as one flat char buffer and split into records of
// `type.getSize()` bytes, one per element along the first dimension.
inline void H5StringReader::ensureDataReady() {
    if (data_ready) return;
    // NOTE(review): the eager constructor this replaces held a
    // `HighFive::SilenceHDF5` guard while reading; confirm whether the lazy
    // path should do the same.
    HighFive::DataType type = dataset.getDataType();
    if (type.isVariableStr()) {
        // Workaround for HighFive bug: don't try reading an empty string vector
        if (dataset.getDimensions()[0] > 0) {
            dataset.read(data);
        } else {
            data.resize(0);
        }
    } else {
        uint64_t bytes = type.getSize();
        uint64_t elements = dataset.getDimensions()[0];
        std::vector<char> char_data(bytes * elements);
        dataset.read(char_data.data(), type);
        data.resize(elements);
        for (uint64_t i = 0; i < elements; i++) {
            data[i] = std::string(char_data.data() + bytes * i, char_data.data() + bytes * (i + 1));
        }
    }
    // Mark the cache as filled only after a successful read. Without this,
    // every get()/size() call would re-read the dataset and reassign `data`,
    // which can invalidate c_str() pointers previously handed out by get().
    data_ready = true;
}
const char *H5StringReader::get(uint64_t idx) const {
// Return the string at position `idx` as a C string, or NULL when `idx` is
// at or beyond the number of loaded strings.
// Loads the dataset contents first if they have not been loaded yet.
const char *H5StringReader::get(uint64_t idx) {
    ensureDataReady();
    const bool in_range = idx < data.size();
    return in_range ? data[idx].c_str() : NULL;
}
uint64_t H5StringReader::size() const { return data.size(); }
uint64_t H5StringReader::size() {
ensureDataReady();
return dataset.getElementCount();
}

H5StringWriter::H5StringWriter(const HighFive::Group &group, std::string path, uint32_t gzip_level)
: group(group)
, path(path)
, gzip_level(gzip_level) {}

void H5StringWriter::write(const StringReader &reader) {
void H5StringWriter::write(StringReader &reader) {
std::vector<std::string> data;
uint64_t i = 0;
while (true) {
Expand Down
9 changes: 6 additions & 3 deletions src/arrayIO/hdf5.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,12 +90,15 @@ using H5UIntReader = H5NumReader<uint32_t>;

class H5StringReader : public StringReader {
private:
bool data_ready = false;
HighFive::DataSet dataset;
std::vector<std::string> data;

inline void ensureDataReady();
public:
H5StringReader(const HighFive::Group &group, std::string path);
const char *get(uint64_t idx) const override;
uint64_t size() const override;
const char *get(uint64_t idx) override;
uint64_t size() override;
};

class H5StringWriter : public StringWriter {
Expand All @@ -106,7 +109,7 @@ class H5StringWriter : public StringWriter {

public:
H5StringWriter(const HighFive::Group &group, std::string path, uint32_t gzip_level = 0);
void write(const StringReader &reader) override;
void write(StringReader &reader) override;
};

class H5WriterBuilder final : public WriterBuilder {
Expand Down
2 changes: 1 addition & 1 deletion src/arrayIO/vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
namespace BPCells {

VecStringWriter::VecStringWriter(std::vector<std::string> &data) : data(data) {}
void VecStringWriter::write(const StringReader &reader) {
void VecStringWriter::write(StringReader &reader) {
uint64_t i = 0;
data.resize(0);
while (true) {
Expand Down
2 changes: 1 addition & 1 deletion src/arrayIO/vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ class VecStringWriter : public StringWriter {

public:
VecStringWriter(std::vector<std::string> &data);
void write(const StringReader &reader) override;
void write(StringReader &reader) override;
};

class VecReaderWriterBuilder : public WriterBuilder, public ReaderBuilder {
Expand Down
23 changes: 3 additions & 20 deletions src/matrixIterators/ColwiseRank.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,9 @@ namespace BPCells {

// Rank-transform each column of a matrix. Offset the ranking such that 0 values
// are assigned a rank of 0. Tied ranks are averaged together.
template <class T> class ColwiseRank : public MatrixLoader<double> {
template <class T> class ColwiseRank : public MatrixConverterLoaderWrapper<T, double> {
private:
std::unique_ptr<MatrixLoader<T>> loader;
uint32_t current_col = UINT32_MAX - 1;
bool take_ownership = true;

std::vector<uint32_t> row_data, row_buf;
std::vector<T> val_data, val_buf;
Expand All @@ -23,7 +21,7 @@ template <class T> class ColwiseRank : public MatrixLoader<double> {

public:
ColwiseRank(std::unique_ptr<MatrixLoader<T>> &&loader, uint32_t load_size = 1024)
: loader(std::move(loader))
: MatrixConverterLoaderWrapper<T, double>(std::move(loader))
, load_size(load_size) {
row_data.resize(this->loader->rows());
row_buf.resize(this->loader->rows());
Expand All @@ -32,27 +30,14 @@ template <class T> class ColwiseRank : public MatrixLoader<double> {
ranks.resize(this->loader->rows());
}

~ColwiseRank() {
if (!take_ownership) loader.release();
}
ColwiseRank() = default;
ColwiseRank(ColwiseRank&&) = default;
ColwiseRank& operator=(ColwiseRank&&) = default;

// Set the object so that the inner loader will be preserved
// rather than calling the destructor when this loader is destructed
void preserve_input_loader() {take_ownership = false;}

// Get this column's sum of t^3 - t
// where t is the number of tied entries at a given rank
double tieStatistic() {return tie_statistic;}

uint32_t rows() const override { return loader->rows(); }
uint32_t cols() const override { return loader->cols(); }

const char *rowNames(uint32_t row) override { return loader->rowNames(row); }
const char *colNames(uint32_t col) override { return loader->colNames(col); }

void restart() override {
idx = 0;
cap = 0;
Expand All @@ -71,8 +56,6 @@ template <class T> class ColwiseRank : public MatrixLoader<double> {
return this->loader->nextCol();
}

uint32_t currentCol() const override { return loader->currentCol(); }

bool load() override {
if (idx == 0 && cap == 0) {
tie_statistic = 0;
Expand All @@ -95,7 +78,7 @@ template <class T> class ColwiseRank : public MatrixLoader<double> {
}

// Calculate the offset to apply to make rank of 0 map to 0
uint32_t implicit_zeros_count = rows() - cap;
uint32_t implicit_zeros_count = this->rows() - cap;
double zero_rank =
negative_count + (1 + explicit_zeros_count + implicit_zeros_count) / 2.0;
if (explicit_zeros_count == 0 && implicit_zeros_count == 0) zero_rank = 0;
Expand Down
8 changes: 4 additions & 4 deletions src/matrixIterators/ImportMatrixHDF5.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,10 @@ open10xFeatureMatrix(std::string file, uint32_t buffer_size, uint32_t read_size)

StoredMatrixWriter<uint32_t> create10xFeatureMatrix(
std::string file_path,
const StringReader &barcodes,
const StringReader &feature_ids,
const StringReader &feature_names,
const StringReader &feature_types,
StringReader &&barcodes,
StringReader &&feature_ids,
StringReader &&feature_names,
StringReader &&feature_types,
const std::map<std::string, std::unique_ptr<StringReader>> &feature_metadata,
uint32_t buffer_size,
uint32_t chunk_size,
Expand Down
8 changes: 4 additions & 4 deletions src/matrixIterators/ImportMatrixHDF5.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,10 @@ StoredMatrix<uint32_t> open10xFeatureMatrix(

StoredMatrixWriter<uint32_t> create10xFeatureMatrix(
std::string file,
const StringReader &barcodes,
const StringReader &feature_ids,
const StringReader &feature_names,
const StringReader &feature_types,
StringReader &&barcodes,
StringReader &&feature_ids,
StringReader &&feature_names,
StringReader &&feature_types,
const std::map<std::string, std::unique_ptr<StringReader>> &feature_metadata,
uint32_t buffer_size,
uint32_t chunk_size,
Expand Down
Loading

0 comments on commit 3711a40

Please sign in to comment.