diff --git a/NAMESPACE b/NAMESPACE index 3405d737..05ffb3d1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -154,6 +154,7 @@ export(psislw) export(relative_eff) export(scrps) export(sis) +export(srs_diff_est) export(stacking_weights) export(tis) export(waic) diff --git a/NEWS.md b/NEWS.md index 6f182376..8ebccfd6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -17,6 +17,7 @@ * Added contribution section. by @VisruthSK in #286 * Update LOO uncertainty paper to use BA doi by @avehtari in #311 * Update documentation for `E_loo()` function by @avehtari in #312 +* Export `srs_diff_est()` function by @vinniott and @avehtari in #340 # loo 2.8.0 diff --git a/R/loo_subsample.R b/R/loo_subsample.R index bcac4b17..a5c44141 100644 --- a/R/loo_subsample.R +++ b/R/loo_subsample.R @@ -24,7 +24,7 @@ #' same length containing the posterior density and the approximation density #' for the individual draws. #' -#' @seealso [loo()], [psis()], [loo_compare()] +#' @seealso [loo()], [psis()], [loo_compare()], [srs_diff_est()] #' @template loo-large-data-references #' #' @export loo_subsample loo_subsample.function @@ -1166,12 +1166,88 @@ loo_subsample_estimation_diff_srs <- function(x) { update_psis_loo_ss_estimates(x) } -#' Difference estimation using SRS-WOR sampling (Magnusson et al., 2020) -#' @noRd -#' @param y_approx Approximated values of all observations. -#' @param y The values observed. -#' @param y_idx The index of `y` in `y_approx`. -#' @return A list with estimates. +#' Difference estimator with simple random sampling without replacement. +#' +#' The difference estimator `srs_diff()` estimates +#' the expectation \eqn{n E[y]} when we have \eqn{n} approximate values \eqn{\tilde{y}_i}, +#' \eqn{i = 1, \ldots, n} and \eqn{m < n} accurate values \eqn{y_j}, \eqn{j \in \mathcal{S}}, +#' where \eqn{m} is the subsample size and \eqn{\mathcal{S}} is +#' a simple random subsample without replacement. The original +#' approach is by Cochran (1977) and we follow the equations 7--9 by +#' Magnusson et al. (2020). +#' +#' @details In Magnusson et al. (2020) Eq (9) first row, the second `+` should +#' be a `-`; Supplementary Material Eq (6) has this correct. +#' As `srs_diff_est()` in the `loo` package is used for \eqn{n E[y]}, there is +#' a proportional difference of \eqn{1/n} compared to the paper. +#' +#' @param y_approx (numeric) `n` approximated values. +#' @param y (numeric) `m +#' # distinct() +#' # +#' # wine_scaled <- as.data.frame(scale(wine)) +#' # +#' # fitos <- brm(ordered(quality) ~ ., +#' # family = cumulative("logit"), +#' # prior = prior(R2D2(mean_R2 = 1/3, prec_R2 = 3)), +#' # data = wine_scaled, +#' # seed = 1, +#' # silent = 2, +#' # refresh = 0) +#' # +#' # log_lik_matrix <- log_lik(fitos) +#' # +#' # N <- nrow(wine_scaled) +#' # Nsub <- 100 +#' # +#' # # posterior log-score +#' # lpd <- elpd(log_lik_matrix) +#' # sum(lpd$pointwise[,"elpd"]) +#' # # Use PSIS-LOO for subsample of Nsub randomly selected observations +#' # set.seed(1) +#' # idx <- sample(1:N, Nsub) +#' # elpd_loo_sub <- loo(log_lik_matrix[,idx]) +#' # sum(elpd_loo_sub$pointwise[,"elpd_loo"]) / Nsub * N +#' # +#' # # Use difference estimator to combine fast result and subsampled accurate result +#' # loo:::srs_diff_est(lpd$pointwise[,"elpd"], elpd_loo_sub$pointwise[,"elpd_loo"], idx) +#' # +#' # # Comparison to using PSIS-LOO for all observations +#' # loo(log_lik_matrix) +#' @export srs_diff_est <- function(y_approx, y, y_idx) { checkmate::assert_numeric(y_approx) checkmate::assert_numeric(y, max.len = length(y_approx)) diff --git a/man/loo_subsample.Rd b/man/loo_subsample.Rd index 6f381db6..5889be1e 100644 --- a/man/loo_subsample.Rd +++ b/man/loo_subsample.Rd @@ -196,5 +196,5 @@ In \emph{Proceedings of the 23rd International Conference on Artificial Intelligence and Statistics (AISTATS)}, PMLR 108:341-351. } \seealso{ -\code{\link[=loo]{loo()}}, \code{\link[=psis]{psis()}}, \code{\link[=loo_compare]{loo_compare()}} +\code{\link[=loo]{loo()}}, \code{\link[=psis]{psis()}}, \code{\link[=loo_compare]{loo_compare()}}, \code{\link[=srs_diff_est]{srs_diff_est()}} } diff --git a/man/srs_diff_est.Rd b/man/srs_diff_est.Rd new file mode 100644 index 00000000..72f8faf5 --- /dev/null +++ b/man/srs_diff_est.Rd @@ -0,0 +1,97 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/loo_subsample.R +\name{srs_diff_est} +\alias{srs_diff_est} +\title{Difference estimator with simple random sampling without replacement.} +\usage{ +srs_diff_est(y_approx, y, y_idx) +} +\arguments{ +\item{y_approx}{(numeric) \code{n} approximated values.} + +\item{y}{(numeric) \code{m +# distinct() +# +# wine_scaled <- as.data.frame(scale(wine)) +# +# fitos <- brm(ordered(quality) ~ ., +# family = cumulative("logit"), +# prior = prior(R2D2(mean_R2 = 1/3, prec_R2 = 3)), +# data = wine_scaled, +# seed = 1, +# silent = 2, +# refresh = 0) +# +# log_lik_matrix <- log_lik(fitos) +# +# N <- nrow(wine_scaled) +# Nsub <- 100 +# +# # posterior log-score +# lpd <- elpd(log_lik_matrix) +# sum(lpd$pointwise[,"elpd"]) +# # Use PSIS-LOO for subsample of Nsub randomly selected observations +# set.seed(1) +# idx <- sample(1:N, Nsub) +# elpd_loo_sub <- loo(log_lik_matrix[,idx]) +# sum(elpd_loo_sub$pointwise[,"elpd_loo"]) / Nsub * N +# +# # Use difference estimator to combine fast result and subsampled accurate result +# loo:::srs_diff_est(lpd$pointwise[,"elpd"], elpd_loo_sub$pointwise[,"elpd_loo"], idx) +# +# # Comparison to using PSIS-LOO for all observations +# loo(log_lik_matrix) +} +\references{ +Magnusson, M., Riis Andersen, M., Jonasson, J. and Vehtari, A. (2020). +Leave-One-Out Cross-Validation for Model Comparison in Large Data. +In \emph{Proceedings of the 23rd International Conference on Artificial +Intelligence and Statistics (AISTATS)}, PMLR 108:341-351. + +Cochran, W. G. (1977). \emph{Sampling Techniques, 3rd Edition}. John Wiley. + +Cortez, P., Cerdeira, A.L., Almeida, F., Matos, T., & Reis, J. (2009). +Modeling wine preferences by data mining from physicochemical properties. +\emph{Decis. Support Syst.}, \emph{47}, 547-553. +} +\seealso{ +\code{\link[=loo_subsample]{loo_subsample()}} +}