-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #8 from b-cubed-eu/add-bootstrapping
Add bootstrapping functions
- Loading branch information
Showing
21 changed files
with
2,223 additions
and
25 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
Package: dubicube | ||
Title: Calculation and Interpretation of Data Cube Indicator Uncertainty | ||
Version: 0.0.2 | ||
Version: 0.1.0 | ||
Authors@R: c( | ||
person("Ward", "Langeraert", , "[email protected]", role = c("aut", "cre"), | ||
comment = c(ORCID = "0000-0002-5900-8109", affiliation = "Research Institute for Nature and Forest (INBO)")), | ||
|
@@ -16,8 +16,19 @@ License: MIT + file LICENSE | |
URL: https://github.com/b-cubed-eu/dubicube, | ||
https://b-cubed-eu.github.io/dubicube/ | ||
BugReports: https://github.com/b-cubed-eu/dubicube/issues | ||
Suggests: | ||
Imports: | ||
assertthat, | ||
boot, | ||
dplyr, | ||
modelr, | ||
purrr, | ||
rlang, | ||
tidyr | ||
Suggests: | ||
b3gbi, | ||
testthat (>= 3.0.0) | ||
Remotes: | ||
b-cubed-eu/b3gbi | ||
Config/checklist/communities: b3; inbo | ||
Config/checklist/keywords: uncertainty quantification; uncertainty | ||
visualisation; biodiversity indicators; data cubes | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,16 @@ | ||
# Generated by roxygen2: do not edit by hand | ||
|
||
export(bootstrap_cube) | ||
export(calculate_bootstrap_ci) | ||
import(assertthat) | ||
import(boot) | ||
import(dplyr) | ||
importFrom(modelr,bootstrap) | ||
importFrom(purrr,map) | ||
importFrom(rlang,.data) | ||
importFrom(rlang,inherits_any) | ||
importFrom(stats,pnorm) | ||
importFrom(stats,qnorm) | ||
importFrom(stats,sd) | ||
importFrom(stats,setNames) | ||
importFrom(tidyr,expand_grid) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,7 @@ | ||
# dubicube 0.1.0 | ||
|
||
* Add bootstrapping functions | ||
|
||
# dubicube 0.0.2 | ||
|
||
* Add pkgdown website. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,317 @@ | ||
#' Perform bootstrapping over a data cube for a calculated statistic | ||
#' | ||
#' This function generate `samples` bootstrap replicates of a statistic applied | ||
#' to a data cube. It resamples the data cube and computes a statistic `fun` for | ||
#' each bootstrap replicate, optionally comparing the results to a reference | ||
#' group (`ref_group`). | ||
#' | ||
#' @param data_cube A data cube object (class 'processed_cube' or 'sim_cube', | ||
#' see `b3gbi::process_cube()`) or a dataframe (from `$data` slot of | ||
#' 'processed_cube' or 'sim_cube'). | ||
#' @param fun A function which, when applied to `data_cube` returns the | ||
#' statistic(s) of interest. This function must return a dataframe with a column | ||
#' `diversity_val` containing the statistic of interest. | ||
#' @param grouping_var A string specifying the grouping variable(s) for the | ||
#' bootstrap analysis. The output of `fun(data_cube)` returns a row per group. | ||
#' @param samples The number of bootstrap replicates. A single positive integer. | ||
#' Default is 1000. | ||
#' @param ref_group A string indicating the reference group to compare the | ||
#' statistic with. Default is `NA`, meaning no reference group is used. | ||
#' @param seed A positive numeric value setting the seed for random number | ||
#' generation to ensure reproducibility. If `NA` (default), then `set.seed()` | ||
#' is not called at all. If not `NA`, then the random number generator state is | ||
#' reset (to the state before calling this function) upon exiting this function. | ||
#' @param progress Logical. Whether to show a progress bar. Set to `TRUE` to | ||
#' display a progress bar, `FALSE` (default) to suppress it. | ||
#' | ||
#' @returns A dataframe containing the bootstrap results with the following | ||
#' columns: | ||
#' - `sample`: Sample ID of the bootstrap replicate | ||
#' - `est_original`: The statistic based on the full dataset per group | ||
#' - `rep_boot`: The statistic based on a bootstrapped dataset (bootstrap | ||
#' replicate) | ||
#' - `est_boot`: The bootstrap estimate (mean of bootstrap replicates per | ||
#' group) | ||
#' - `se_boot`: The standard error of the bootstrap estimate (standard | ||
#' deviation of the bootstrap replicates per group) | ||
#' - `bias_boot`: The bias of the bootstrap estimate per group | ||
#' | ||
#' @details | ||
#' Bootstrapping is a statistical technique used to estimate the distribution of | ||
#' a statistic by resampling with replacement from the original data | ||
#' (Davison & Hinkley, 1997; Efron & Tibshirani, 1994). | ||
#' In the case of data cubes, each row is sampled with replacement. | ||
#' Below are the common notations used in bootstrapping: | ||
#' | ||
#' 1. **Original Sample Data**: \eqn{\mathbf{X} = \{X_1, X_2, \ldots, X_n\}} | ||
#' - The initial set of observed data points. Here, \eqn{n} is the sample | ||
#' size. This corresponds to the number of cells in a data cube or the number | ||
#' of rows in tabular format. | ||
#' | ||
#' 2. **Statistic of Interest**: \eqn{\theta} | ||
#' - The parameter or statistic being estimated, such as the mean | ||
#' \eqn{\bar{X}}, variance \eqn{\sigma^2}, or a biodiversity indicator. Let | ||
#' \eqn{\hat{\theta}} | ||
#' denote the estimated value of \eqn{\theta} | ||
#' calculated from the complete dataset \eqn{\mathbf{X}}. | ||
#' | ||
#' 3. **Bootstrap Sample**: \eqn{\mathbf{X}^* = \{X_1^*, X_2^*, \ldots, X_n^*\}} | ||
#' - A sample of size \eqn{n} | ||
#' drawn with replacement from the original sample | ||
#' \eqn{\mathbf{X}}. Each \eqn{X_i^*} is drawn independently from | ||
#' \eqn{\mathbf{X}}. | ||
#' - A total of \eqn{B} | ||
#' bootstrap samples are drawn from the original data. | ||
#' Common choices for \eqn{B} are 1000 or 10,000 to ensure a good | ||
#' approximation of the distribution of the bootstrap replications (see | ||
#' further). | ||
#' | ||
#' 4. **Bootstrap Replication**: \eqn{\hat{\theta}^*_b} | ||
#' - The value of the statistic of interest calculated from the \eqn{b}-th | ||
#' bootstrap sample \eqn{\mathbf{X}^*_b}. For example, if \eqn{\theta} is | ||
#' the sample mean, \eqn{\hat{\theta}^*_b = \bar{X}^*_b}. | ||
#' | ||
#' 5. **Bootstrap Statistics**: | ||
#' | ||
#' - **Bootstrap Estimate of the Statistic**: \eqn{\hat{\theta}_{\text{boot}}} | ||
#' - The average of the bootstrap replications: | ||
#' | ||
#' \deqn{\hat{\theta}_{\text{boot}} = \frac{1}{B} \sum_{b=1}^B \hat{\theta}^*_b} | ||
#' | ||
#' - **Bootstrap Bias**: \eqn{\text{Bias}_{\text{boot}}} | ||
#' - This bias indicates how much the bootstrap estimate deviates from the | ||
#' original sample estimate. It is calculated as the difference between the | ||
#' average bootstrap estimate and the original estimate: | ||
#' | ||
#' \deqn{\text{Bias}_{\text{boot}} = \frac{1}{B} \sum_{b=1}^B (\hat{\theta}^*_b | ||
#' - \hat{\theta}) = \hat{\theta}_{\text{boot}} - \hat{\theta}} | ||
#' | ||
#' - **Bootstrap Standard Error**: \eqn{\text{SE}_{\text{boot}}} | ||
#' - The standard deviation of the bootstrap replications, which estimates | ||
#' the variability of the statistic. | ||
#' | ||
#' @references | ||
#' Davison, A. C., & Hinkley, D. V. (1997). Bootstrap Methods and their | ||
#' Application (1st ed.). Cambridge University Press. | ||
#' \doi{10.1017/CBO9780511802843} | ||
#' | ||
#' Efron, B., & Tibshirani, R. J. (1994). An Introduction to the Bootstrap | ||
#' (1st ed.). Chapman and Hall/CRC. \doi{10.1201/9780429246593} | ||
#' | ||
#' @export | ||
#' | ||
#' @import dplyr | ||
#' @import assertthat | ||
#' @importFrom rlang .data inherits_any | ||
#' @importFrom modelr bootstrap | ||
#' @importFrom purrr map | ||
#' @importFrom stats sd setNames | ||
#' | ||
#' @examples | ||
#' # Get example data | ||
#' # install.packages("remotes") | ||
#' # remotes::install_github("b-cubed-eu/b3gbi") | ||
#' library(b3gbi) | ||
#' cube_path <- system.file( | ||
#' "extdata", "denmark_mammals_cube_eqdgc.csv", | ||
#' package = "b3gbi") | ||
#' denmark_cube <- process_cube( | ||
#' cube_path, | ||
#' first_year = 2014, | ||
#' last_year = 2020) | ||
#' | ||
#' # Function to calculate statistic of interest | ||
#' # Mean observations per year | ||
#' mean_obs <- function(data) { | ||
#' out_df <- aggregate(obs ~ year, data, mean) # Calculate mean obs per year | ||
#' names(out_df) <- c("year", "diversity_val") # Rename columns | ||
#' return(out_df) | ||
#' } | ||
#' mean_obs(denmark_cube$data) | ||
#' | ||
#' # Perform bootstrapping | ||
#' \donttest{ | ||
#' bootstrap_mean_obs <- bootstrap_cube( | ||
#' data_cube = denmark_cube$data, | ||
#' fun = mean_obs, | ||
#' grouping_var = "year", | ||
#' samples = 1000, | ||
#' seed = 123, | ||
#' progress = FALSE) | ||
#' head(bootstrap_mean_obs) | ||
#' } | ||
|
||
bootstrap_cube <- function( | ||
data_cube, | ||
fun, | ||
grouping_var, | ||
samples = 1000, | ||
ref_group = NA, | ||
seed = NA, | ||
progress = FALSE) { | ||
### Start checks | ||
# Check data_cube input | ||
cube_message <- paste("`data_cube` must be a data cube object (class", | ||
"'processed_cube' or 'sim_cube') or a dataframe.") | ||
do.call(stopifnot, | ||
stats::setNames(list( | ||
rlang::inherits_any(data_cube, | ||
c("processed_cube", "sim_cube", "data.frame"))), | ||
cube_message) | ||
) | ||
|
||
# Check fun input | ||
stopifnot("`fun` must be a function." = is.function(fun)) | ||
|
||
# Check if grouping_var is a character vector of length 1 | ||
stopifnot("`grouping_var` must be a character vector of length 1." = | ||
assertthat::is.string(grouping_var)) | ||
|
||
# Check if samples is a positive integer | ||
stopifnot( | ||
"`samples` must be a single positive integer." = | ||
assertthat::is.count(samples)) | ||
|
||
# Check if ref_group is NA or a number or a string | ||
stopifnot( | ||
"`ref_group` must be a numeric/character vector of length 1 or NA." = | ||
(assertthat::is.number(ref_group) | assertthat::is.string(ref_group) | | ||
is.na(ref_group)) & | ||
length(ref_group) == 1) | ||
|
||
# Check if seed is NA or a number | ||
stopifnot("`seed` must be a numeric vector of length 1 or NA." = | ||
(is.numeric(seed) | is.na(seed)) & length(seed) == 1) | ||
|
||
# Check if progress is a logical vector of length 1 | ||
stopifnot("`progress` must be a logical vector of length 1." = | ||
assertthat::is.flag(progress)) | ||
### End checks | ||
|
||
# Set seed if provided | ||
if (!is.na(seed)) { | ||
if (exists(".Random.seed", envir = .GlobalEnv)) { | ||
rng_state_old <- get(".Random.seed", envir = .GlobalEnv) | ||
on.exit(assign(".Random.seed", rng_state_old, envir = .GlobalEnv)) | ||
} | ||
set.seed(seed) | ||
} | ||
|
||
if (rlang::inherits_any(data_cube, c("processed_cube", "sim_cube"))) { | ||
# Check if grouping_var column is present in data cube | ||
stopifnot("`data_cube` should contain column `grouping_var`" = | ||
grouping_var %in% names(data_cube$data)) | ||
|
||
# Check if ref_group is present in grouping_var | ||
stopifnot( | ||
"`ref_group` is not present in `grouping_var` column of `data_cube`." = | ||
is.na(ref_group) | | ||
(ref_group %in% data_cube$data[[grouping_var]] & | ||
mode(ref_group) == mode(data_cube$data[[grouping_var]])) | ||
) | ||
|
||
# Generate bootstrap replicates | ||
resample_df <- modelr::bootstrap(data_cube$data, samples, id = "id") | ||
|
||
# Function for bootstrapping | ||
bootstrap_resample <- function(x, fun) { | ||
resample_obj <- x$strap[[1]] | ||
indices <- as.integer(resample_obj) | ||
data <- resample_obj$data[indices, ] | ||
|
||
data_cube_copy <- data_cube | ||
data_cube_copy$data <- data | ||
|
||
fun(data_cube_copy)$data %>% | ||
dplyr::mutate(sample = as.integer(x$id)) | ||
} | ||
} else { | ||
# Check if grouping_var column is present in data cube | ||
stopifnot("`data_cube` should contain column `grouping_var`" = | ||
grouping_var %in% names(data_cube)) | ||
|
||
# Check if ref_group is present in grouping_var | ||
stopifnot( | ||
"`ref_group` is not present in `grouping_var` column of `data_cube`." = | ||
is.na(ref_group) | | ||
(ref_group %in% data_cube[[grouping_var]] & | ||
mode(ref_group) == mode(data_cube[[grouping_var]])) | ||
) | ||
|
||
# Generate bootstrap replicates | ||
resample_df <- modelr::bootstrap(data_cube, samples, id = "id") | ||
|
||
# Function for bootstrapping | ||
bootstrap_resample <- function(x, fun) { | ||
resample_obj <- x$strap[[1]] | ||
indices <- as.integer(resample_obj) | ||
data <- resample_obj$data[indices, ] | ||
|
||
fun(data) %>% | ||
dplyr::mutate(sample = as.integer(x$id)) | ||
} | ||
} | ||
|
||
# Perform bootstrapping | ||
bootstrap_samples_list_raw <- resample_df %>% | ||
split(seq_len(nrow(resample_df))) %>% | ||
purrr::map( | ||
bootstrap_resample, | ||
fun = fun, | ||
.progress = ifelse(progress, "Bootstrapping", progress)) | ||
|
||
if (!is.na(ref_group)) { | ||
# Calculate true statistic | ||
if (rlang::inherits_any(data_cube, c("processed_cube", "sim_cube"))) { | ||
t0_full <- fun(data_cube)$data | ||
} else { | ||
t0_full <- fun(data_cube) | ||
} | ||
|
||
ref_val <- t0_full %>% | ||
dplyr::filter(.data[[grouping_var]] == !!ref_group) %>% | ||
dplyr::pull(.data$diversity_val) | ||
|
||
t0 <- t0_full %>% | ||
dplyr::filter(.data[[grouping_var]] != !!ref_group) %>% | ||
dplyr::mutate(diversity_val = .data$diversity_val - ref_val) | ||
|
||
# Get bootstrap samples as a list | ||
bootstrap_samples_list <- lapply(bootstrap_samples_list_raw, function(df) { | ||
ref_val <- df %>% | ||
dplyr::filter(.data[[grouping_var]] == !!ref_group) %>% | ||
dplyr::pull(.data$diversity_val) | ||
|
||
df %>% | ||
dplyr::filter(.data[[grouping_var]] != !!ref_group) %>% | ||
dplyr::mutate(diversity_val = .data$diversity_val - ref_val) | ||
}) | ||
} else { | ||
# Calculate true statistic | ||
if (rlang::inherits_any(data_cube, c("processed_cube", "sim_cube"))) { | ||
t0 <- fun(data_cube)$data | ||
} else { | ||
t0 <- fun(data_cube) | ||
} | ||
|
||
# Get bootstrap samples as a list | ||
bootstrap_samples_list <- bootstrap_samples_list_raw | ||
} | ||
|
||
# Summarise in dataframe | ||
bootstrap_samples_df <- bootstrap_samples_list %>% | ||
dplyr::bind_rows() %>% | ||
dplyr::rename("rep_boot" = "diversity_val") %>% | ||
dplyr::left_join(t0, by = grouping_var) %>% | ||
dplyr::rename("est_original" = "diversity_val") %>% | ||
dplyr::mutate( | ||
est_boot = mean(.data$rep_boot), | ||
se_boot = stats::sd(.data$rep_boot), | ||
.by = dplyr::all_of(grouping_var)) %>% | ||
dplyr::mutate(bias_boot = .data$est_boot - .data$est_original) %>% | ||
dplyr::arrange(.data[[grouping_var]]) %>% | ||
dplyr::select("sample", dplyr::all_of(grouping_var), "est_original", | ||
dplyr::everything()) | ||
|
||
return(bootstrap_samples_df) | ||
} |
Oops, something went wrong.