Skip to content

Commit

Permalink
fix #141 (#161)
Browse files Browse the repository at this point in the history
* rewrite append in cpp

Correct splicing

Splicer

a

* Update benchmark [no ci]

* update also works

* Remove legacy xml2 code; remove xml2 dependency

YES!

* improve header hygiene

* Clean up the C++ style

* Add more tests

* Update all docs [no ci]

* `xml2` in NEWS [no ci]
  • Loading branch information
chainsawriot authored Aug 27, 2023
1 parent d546721 commit a81f54a
Show file tree
Hide file tree
Showing 15 changed files with 963 additions and 64 deletions.
3 changes: 1 addition & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: readODS
Type: Package
Title: Read and Write ODS Files
Version: 2.0.6
Version: 2.0.7
Authors@R:
c(person("Gerrit-Jan", "Schutten", role = c("aut"), email = "[email protected]"),
person("Chung-hong", "Chan", role = c("aut", "cre"), email = "[email protected]", comment = c(ORCID = "0000-0002-6232-7530")),
Expand All @@ -24,7 +24,6 @@ Description: Read ODS (OpenDocument Spreadsheet) into R as data frame. Also supp
URL: https://github.com/ropensci/readODS
BugReports: https://github.com/ropensci/readODS/issues
Imports:
xml2 (>= 1.3.2),
cellranger,
readr (>= 1.2.1),
stringi,
Expand Down
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# readODS 2.07

## `append` and `update` of `write_ods` in C++

Significant speed improvement; also `xml2` is no longer a dependency.

# readODS 2.06

## `write_ods` and `write_fods` allow list of data frames
Expand Down
8 changes: 8 additions & 0 deletions R/cpp11.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@ read_ods_ <- function(file, start_row, stop_row, start_col, stop_col, sheet, for
.Call(`_readODS_read_ods_`, file, start_row, stop_row, start_col, stop_col, sheet, formula_as_formula)
}

splice_sheet_ <- function(original_xml, sheet_xml, flat) {
.Call(`_readODS_splice_sheet_`, original_xml, sheet_xml, flat)
}

update_sheet_ <- function(original_xml, sheet_xml, flat, sheet) {
.Call(`_readODS_update_sheet_`, original_xml, sheet_xml, flat, sheet)
}

write_sheet_ <- function(filename, x, sheet, row_names, col_names, na_as_string, padding, header, footer) {
.Call(`_readODS_write_sheet_`, filename, x, sheet, row_names, col_names, na_as_string, padding, header, footer)
}
Expand Down
53 changes: 13 additions & 40 deletions R/write_ods.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,27 +18,6 @@
return(path)
}

.find_sheet_node_by_sheet <- function(spreadsheet_node, sheet) {
sheet_node <- NULL
for (i in seq(2, length(xml2::xml_children(spreadsheet_node)))) {
if (!is.na(xml2::xml_attr(xml2::xml_children(spreadsheet_node)[[i]], "name") == sheet) &&
xml2::xml_attr(xml2::xml_children(spreadsheet_node)[[i]], "name") == sheet) {
sheet_node <- xml2::xml_children(spreadsheet_node)[[i]]
}
}
return(sheet_node)
}

.silent_read_xml <- function(x) {
suppressWarnings({
return(xml2::read_xml(x))
})
}

.silent_add_sheet_node <- function(sheet) {
.silent_read_xml(sprintf('<table:table table:name="%s" table:style-name="ta1"><table:table-column table:style-name="co1" table:number-columns-repeated="16384" table:default-cell-style-name="ce1"/></table:table>', sheet))
}

.convert_df_to_sheet <- function(x, sheet = "Sheet1", row_names = FALSE, col_names = FALSE, na_as_string = FALSE, padding = FALSE, xml_file = file.path(tempfile(fileext = ".xml"))) {
write_sheet_(x = x, filename = xml_file, sheet = sheet, row_names = row_names, col_names = col_names,
na_as_string = na_as_string, padding = padding,
Expand Down Expand Up @@ -77,34 +56,28 @@
if (isFALSE(flat)) {
zip::unzip(path, exdir = temp_ods_dir)
contentfile <- file.path(temp_ods_dir, "content.xml")
sheet_exist <- sheet %in% list_ods_sheets(path, include_external_data = TRUE)
sheets <- list_ods_sheets(path, include_external_data = TRUE)
} else {
contentfile <- path
sheet_exist <- sheet %in% list_fods_sheets(path, include_external_data = TRUE)
sheets <- list_fods_sheets(path, include_external_data = TRUE)
}
is_in_sheet_names <- stringi::stri_cmp(e1 = sheet, e2 = sheets) == 0
sheet_exist <- any(is_in_sheet_names)
if ((sheet_exist && append && !update) || (sheet_exist && !update)) {
## Sheet exists so we cannot append
stop(paste0("Sheet ", sheet, " exists. Set update to TRUE is you want to update this sheet."), call. = FALSE)
}
if (!sheet_exist && update) {
stop(paste0("Sheet ", sheet, " does not exist. Cannot update."), call. = FALSE)
}
content <- xml2::read_xml(contentfile)
spreadsheet_node <- xml2::xml_children(xml2::xml_children(content)[[which(!is.na(xml2::xml_find_first(xml2::xml_children(content),"office:spreadsheet")))]])[[1]]
if (update) {
## clean up the sheet
sheet_node <- .find_sheet_node_by_sheet(spreadsheet_node, sheet)
xml2::xml_remove(xml2::xml_children(sheet_node)[2:length(xml2::xml_children(sheet_node))])
}
if (append) {
## Add a new sheet
sheet_node <- xml2::xml_add_child(spreadsheet_node, .silent_add_sheet_node(sheet))
}
## numeric
normalized_sheet <- which(is_in_sheet_names)
throwaway_xml_file <- .convert_df_to_sheet(x = x, sheet = sheet, row_names = row_names, col_names = col_names,
na_as_string = na_as_string, padding = padding)
xml2::xml_replace(sheet_node, .silent_read_xml(throwaway_xml_file))
## write xml to contentfile
xml2::write_xml(content, contentfile)
if (append) {
return(splice_sheet_(contentfile, throwaway_xml_file, flat))
}
return(update_sheet_(contentfile, throwaway_xml_file, flat, normalized_sheet))
}

.write_ods <- function(x, path = tempfile(fileext = ".ods"), sheet = "Sheet1", append = FALSE, update = FALSE, row_names = FALSE, col_names = TRUE, na_as_string = FALSE, padding = FALSE, flat = FALSE) {
Expand All @@ -116,7 +89,7 @@
}
temp_ods_dir <- NULL
if (isFALSE(flat)) {
temp_ods_dir <- file.path(tempdir(), stringi::stri_rand_strings(1, 20, pattern = "[A-Za-z0-9]"))
temp_ods_dir <- file.path(tempdir(), stringi::stri_rand_strings(1, 30, pattern = "[A-Za-z0-9]"))
dir.create(temp_ods_dir)
on.exit(unlink(temp_ods_dir))
}
Expand Down Expand Up @@ -160,8 +133,8 @@
#' @param x data frame or list of data frames that will be sheets in the (f)ods. If the list is named, the names are used as sheet names
#' @param path Path to the (f)ods file to write
#' @param sheet Name of the sheet; ignore if `x` is a list of data frames
#' @param append logical, TRUE indicates that x should be appended to the existing file (path) as a new sheet. If a sheet with the same sheet_name exists, an exception is thrown. See update. Please also note that writing is slower if TRUE. Default is FALSE. Ignore if `x` is a list of data frames
#' @param update logical, TRUE indicates that the sheet with sheet_name in the existing file (path) should be updated with the content of x. If a sheet with sheet_name does not exist, an exception is thrown. Please also note that writing is slower if TRUE. Default is FALSE. Ignore if `x` is a list of data frames
#' @param append logical, TRUE indicates that x should be appended to the existing file (path) as a new sheet. If a sheet with the same sheet_name exists, an exception is thrown. See update. Please also note that writing is slightly slower if TRUE. Default is FALSE. Ignore if `x` is a list of data frames
#' @param update logical, TRUE indicates that the sheet with sheet_name in the existing file (path) should be updated with the content of x. If a sheet with sheet_name does not exist, an exception is thrown. Please also note that writing is slightly slower if TRUE. Default is FALSE. Ignore if `x` is a list of data frames
#' @param row_names logical, TRUE indicates that row names of x are to be included in the sheet. Default is FALSE
#' @param col_names logical, TRUE indicates that column names of x are to be included in the sheet. Default is TRUE
#' @param na_as_string logical, TRUE indicates that NAs are written as string; FALSE indicates that NAs are written as empty cells
Expand Down
47 changes: 27 additions & 20 deletions benchmark/write_ods_apend.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
date()
```

[1] "Sat Aug 19 00:00:24 2023"
[1] "Sun Aug 27 17:00:14 2023"

``` r
devtools::load_all()
Expand All @@ -29,14 +29,21 @@ system.time(write_ods(df1, path = path, sheet = "aaaa", append = TRUE))
```

user system elapsed
2.625 0.028 2.657
0.259 0.020 0.279

``` r
system.time(write_ods(df1, path = path, sheet = "aaaa", update = TRUE))
```

user system elapsed
2.715 0.013 2.735
0.315 0.040 0.359

``` r
system.time(write_ods(mtcars, path = path, sheet = "aaaa", update = TRUE))
```

user system elapsed
0.213 0.027 0.241

``` r
sessionInfo()
Expand Down Expand Up @@ -65,22 +72,22 @@ sessionInfo()
[1] stats graphics grDevices utils datasets methods base

other attached packages:
[1] readODS_2.0.1 testthat_3.1.10
[1] readODS_2.0.7 testthat_3.1.10

loaded via a namespace (and not attached):
[1] utf8_1.2.3 xml2_1.3.5 stringi_1.7.12 hms_1.1.3
[5] digest_0.6.33 magrittr_2.0.3 evaluate_0.21 pkgload_1.3.2.1
[9] fastmap_1.1.1 cellranger_1.1.0 rprojroot_2.0.3 jsonlite_1.8.7
[13] zip_2.3.0 processx_3.8.2 pkgbuild_1.4.2 sessioninfo_1.2.2
[17] brio_1.1.3 urlchecker_1.0.1 ps_1.7.5 promises_1.2.1
[21] purrr_1.0.2 fansi_1.0.4 cli_3.6.1 shiny_1.7.5
[25] rlang_1.1.1 crayon_1.5.2 ellipsis_0.3.2 remotes_2.4.2.1
[29] withr_2.5.0 cachem_1.0.8 yaml_2.3.7 devtools_2.4.5
[33] tools_4.3.1 tzdb_0.4.0 memoise_2.0.1 httpuv_1.6.11
[37] vctrs_0.6.3 R6_2.5.1 mime_0.12 lifecycle_1.0.3
[41] stringr_1.5.0 fs_1.6.3 htmlwidgets_1.6.2 usethis_2.2.2
[45] miniUI_0.1.1.1 pkgconfig_2.0.3 desc_1.4.2 callr_3.7.3
[49] pillar_1.9.0 later_1.3.1 glue_1.6.2 profvis_0.3.8
[53] Rcpp_1.0.11 xfun_0.40 tibble_3.2.1 rstudioapi_0.15.0
[57] knitr_1.43 xtable_1.8-4 htmltools_0.5.6 rmarkdown_2.24
[61] readr_2.1.4 compiler_4.3.1 prettyunits_1.1.1
[1] utf8_1.2.3 stringi_1.7.12 hms_1.1.3 digest_0.6.33
[5] magrittr_2.0.3 evaluate_0.21 pkgload_1.3.2.1 fastmap_1.1.1
[9] cellranger_1.1.0 rprojroot_2.0.3 jsonlite_1.8.7 zip_2.3.0
[13] processx_3.8.2 pkgbuild_1.4.2 sessioninfo_1.2.2 brio_1.1.3
[17] urlchecker_1.0.1 ps_1.7.5 promises_1.2.1 purrr_1.0.2
[21] fansi_1.0.4 cli_3.6.1 shiny_1.7.5 rlang_1.1.1
[25] crayon_1.5.2 ellipsis_0.3.2 remotes_2.4.2.1 withr_2.5.0
[29] cachem_1.0.8 yaml_2.3.7 devtools_2.4.5 tools_4.3.1
[33] tzdb_0.4.0 memoise_2.0.1 httpuv_1.6.11 vctrs_0.6.3
[37] R6_2.5.1 mime_0.12 lifecycle_1.0.3 stringr_1.5.0
[41] fs_1.6.3 htmlwidgets_1.6.2 usethis_2.2.2 miniUI_0.1.1.1
[45] pkgconfig_2.0.3 desc_1.4.2 callr_3.7.3 pillar_1.9.0
[49] later_1.3.1 glue_1.6.2 profvis_0.3.8 Rcpp_1.0.11
[53] xfun_0.40 tibble_3.2.1 rstudioapi_0.15.0 knitr_1.43
[57] xtable_1.8-4 htmltools_0.5.6 rmarkdown_2.24 readr_2.1.4
[61] compiler_4.3.1 prettyunits_1.1.1
1 change: 1 addition & 0 deletions benchmark/write_ods_apend.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ path <- tempfile(fileext = ".ods")
write_ods(df1, path = path)
system.time(write_ods(df1, path = path, sheet = "aaaa", append = TRUE))
system.time(write_ods(df1, path = path, sheet = "aaaa", update = TRUE))
system.time(write_ods(mtcars, path = path, sheet = "aaaa", update = TRUE))
```

```{r}
Expand Down
4 changes: 2 additions & 2 deletions man/write_ods.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 16 additions & 0 deletions src/cpp11.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,20 @@ extern "C" SEXP _readODS_read_ods_(SEXP file, SEXP start_row, SEXP stop_row, SEX
return cpp11::as_sexp(read_ods_(cpp11::as_cpp<cpp11::decay_t<const std::string>>(file), cpp11::as_cpp<cpp11::decay_t<int>>(start_row), cpp11::as_cpp<cpp11::decay_t<int>>(stop_row), cpp11::as_cpp<cpp11::decay_t<int>>(start_col), cpp11::as_cpp<cpp11::decay_t<int>>(stop_col), cpp11::as_cpp<cpp11::decay_t<const int>>(sheet), cpp11::as_cpp<cpp11::decay_t<const bool>>(formula_as_formula)));
END_CPP11
}
// splice.cpp
std::string splice_sheet_(const std::string original_xml, const std::string sheet_xml, const bool flat);
extern "C" SEXP _readODS_splice_sheet_(SEXP original_xml, SEXP sheet_xml, SEXP flat) {
BEGIN_CPP11
return cpp11::as_sexp(splice_sheet_(cpp11::as_cpp<cpp11::decay_t<const std::string>>(original_xml), cpp11::as_cpp<cpp11::decay_t<const std::string>>(sheet_xml), cpp11::as_cpp<cpp11::decay_t<const bool>>(flat)));
END_CPP11
}
// splice.cpp
std::string update_sheet_(const std::string original_xml, const std::string sheet_xml, const bool flat, const int sheet);
extern "C" SEXP _readODS_update_sheet_(SEXP original_xml, SEXP sheet_xml, SEXP flat, SEXP sheet) {
BEGIN_CPP11
return cpp11::as_sexp(update_sheet_(cpp11::as_cpp<cpp11::decay_t<const std::string>>(original_xml), cpp11::as_cpp<cpp11::decay_t<const std::string>>(sheet_xml), cpp11::as_cpp<cpp11::decay_t<const bool>>(flat), cpp11::as_cpp<cpp11::decay_t<const int>>(sheet)));
END_CPP11
}
// write_sheet_.cpp
cpp11::r_string write_sheet_(const std::string& filename, const cpp11::data_frame& x, const std::string& sheet, const bool row_names, const bool col_names, const bool na_as_string, const bool padding, const std::string& header, const std::string& footer);
extern "C" SEXP _readODS_write_sheet_(SEXP filename, SEXP x, SEXP sheet, SEXP row_names, SEXP col_names, SEXP na_as_string, SEXP padding, SEXP header, SEXP footer) {
Expand All @@ -54,6 +68,8 @@ static const R_CallMethodDef CallEntries[] = {
{"_readODS_get_sheet_names_", (DL_FUNC) &_readODS_get_sheet_names_, 2},
{"_readODS_read_flat_ods_", (DL_FUNC) &_readODS_read_flat_ods_, 7},
{"_readODS_read_ods_", (DL_FUNC) &_readODS_read_ods_, 7},
{"_readODS_splice_sheet_", (DL_FUNC) &_readODS_splice_sheet_, 3},
{"_readODS_update_sheet_", (DL_FUNC) &_readODS_update_sheet_, 4},
{"_readODS_write_sheet_", (DL_FUNC) &_readODS_write_sheet_, 9},
{"_readODS_write_sheet_list_", (DL_FUNC) &_readODS_write_sheet_list_, 9},
{NULL, NULL, 0}
Expand Down
37 changes: 37 additions & 0 deletions src/rapidxml/rapidxml_ext.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#ifndef RAPIDXML_EXT_H_
#define RAPIDXML_EXT_H_
#include "rapidxml.hpp"
/* Adding declarations to make it compatible with gcc 4.7 and greater */
namespace rapidxml {
namespace internal {
template <class OutIt, class Ch>
inline OutIt print_children(OutIt out, const xml_node<Ch>* node, int flags, int indent);

template <class OutIt, class Ch>
inline OutIt print_attributes(OutIt out, const xml_node<Ch>* node, int flags);

template <class OutIt, class Ch>
inline OutIt print_data_node(OutIt out, const xml_node<Ch>* node, int flags, int indent);

template <class OutIt, class Ch>
inline OutIt print_cdata_node(OutIt out, const xml_node<Ch>* node, int flags, int indent);

template <class OutIt, class Ch>
inline OutIt print_element_node(OutIt out, const xml_node<Ch>* node, int flags, int indent);

template <class OutIt, class Ch>
inline OutIt print_declaration_node(OutIt out, const xml_node<Ch>* node, int flags, int indent);

template <class OutIt, class Ch>
inline OutIt print_comment_node(OutIt out, const xml_node<Ch>* node, int flags, int indent);

template <class OutIt, class Ch>
inline OutIt print_doctype_node(OutIt out, const xml_node<Ch>* node, int flags, int indent);

template <class OutIt, class Ch>
inline OutIt print_pi_node(OutIt out, const xml_node<Ch>* node, int flags, int indent);
}
}
#include "rapidxml_print.hpp"

#endif /* RAPIDXML_EXT_H_ */
Loading

0 comments on commit a81f54a

Please sign in to comment.