|
1 | 1 | # Internal shared helpers for ModelArray analysis functions |
2 | 2 | # These are not exported — used by ModelArray.lm, ModelArray.gam, ModelArray.wrap |
3 | 3 |
|
4 | | - |
| 4 | +# Validation and dataset preparation ---- |
5 | 5 | #' Validate that data is a ModelArray |
6 | 6 | #' @noRd |
7 | 7 | .validate_modelarray_input <- function(data) { |
|
110 | 110 | max(num.subj.total * num.subj.lthr.rel, num.subj.lthr.abs) |
111 | 111 | } |
112 | 112 |
|
| 113 | +# Create analysis context ---- |
| 114 | +# Precomputed context builders for element-wise analysis functions. |
| 115 | +# These hoist loop-invariant computations out of the per-element functions |
| 116 | +# so they execute once before the loop rather than millions of times inside it. |
| 117 | + |
| 118 | +# Level 1: Base context shared by all analysis functions (lm, gam, wrap). |
| 119 | +# Handles scalar discovery, collision checks, and cross-scalar source alignment. |
| 120 | + |
| 121 | +#' Build base context shared by all element-wise analysis functions |
| 122 | +#' |
| 123 | +#' Precomputes scalar name lookups, column-name collision checks, and |
| 124 | +#' cross-scalar source-alignment indices that are invariant across elements. |
| 125 | +#' This is the foundation on which model-specific contexts are built. |
| 126 | +#' |
| 127 | +#' @param modelarray A \linkS4class{ModelArray} object. |
| 128 | +#' @param phenotypes A data.frame, already aligned to \code{modelarray} via |
| 129 | +#' \code{.align_phenotypes()}. |
| 130 | +#' @param scalar Character. The name of the response scalar. |
| 131 | +#' @param scalar_subset Character vector or \code{NULL}. Which scalars to |
| 132 | +#' attach and align. If \code{NULL}, all scalars in the ModelArray are used |
| 133 | +#' (the \code{wrap} behaviour). If a character vector, only those scalars |
| 134 | +#' are included (the \code{lm}/\code{gam} behaviour where we detect |
| 135 | +#' formula-referenced scalars). |
| 136 | +#' @return A named list with components: |
| 137 | +#' \describe{ |
| 138 | +#' \item{modelarray}{The ModelArray object (read-only reference).} |
| 139 | +#' \item{phenotypes}{The aligned phenotypes data.frame.} |
| 140 | +#' \item{scalar}{Character. The response scalar name.} |
| 141 | +#' \item{all_scalar_names}{Character vector. Names of all scalars in the |
| 142 | +#' ModelArray.} |
| 143 | +#' \item{attached_scalars}{Character vector. Names of scalars that will |
| 144 | +#' be attached to the per-element data.frame (may be a subset of |
| 145 | +#' \code{all_scalar_names} or all of them).} |
| 146 | +#' \item{predictor_reorder}{A named list of integer vectors. For each |
| 147 | +#' scalar in \code{attached_scalars}, the precomputed index vector |
| 148 | +#' from \code{match(phen_sources, scalar_sources)} that reorders |
| 149 | +#' scalar columns to match phenotype rows. For the response scalar |
| 150 | +#' (which is already aligned by \code{.align_phenotypes()}), this |
| 151 | +#' entry is \code{NULL} (no reordering needed).} |
| 152 | +#' } |
| 153 | +#' @noRd |
| 154 | +.build_base_context <- function(modelarray, phenotypes, scalar, |
| 155 | + scalar_subset = NULL) { |
| 156 | + all_scalar_names <- names(scalars(modelarray)) |
| 157 | + |
| 158 | + |
| 159 | + # Determine which scalars to attach |
| 160 | + |
| 161 | + if (is.null(scalar_subset)) { |
| 162 | + # wrap mode: attach all scalars |
| 163 | + attached_scalars <- all_scalar_names |
| 164 | + } else { |
| 165 | + # lm/gam mode: attach only the explicitly requested scalars |
| 166 | + # (response + any predictor scalars detected from the formula) |
| 167 | + attached_scalars <- unique(scalar_subset) |
| 168 | + } |
| 169 | + |
| 170 | + |
| 171 | + # Collision check: scalar names vs phenotype column names. |
| 172 | + # This is invariant — the same collision would occur at every element, |
| 173 | + |
| 174 | + # so we fail fast once rather than millions of times. |
| 175 | + |
| 176 | + collisions <- intersect(attached_scalars, colnames(phenotypes)) |
| 177 | + if (length(collisions) > 0) { |
| 178 | + stop( |
| 179 | + "Column name collision between phenotypes and scalar names: ", |
| 180 | + paste(collisions, collapse = ", "), |
| 181 | + ". Please rename or remove these columns from phenotypes before ", |
| 182 | + "modeling." |
| 183 | + ) |
| 184 | + } |
| 185 | + |
| 186 | + # Precompute source-alignment reorder indices for each attached scalar. |
| 187 | + # The response scalar is already aligned by .align_phenotypes(), so it |
| 188 | + |
| 189 | + # needs no reordering. Predictor scalars from mergeModelArrays() may |
| 190 | + # have different column orderings and need match()-based reordering. |
| 191 | + phen_sources <- phenotypes[["source_file"]] |
| 192 | + predictor_reorder <- stats::setNames( |
| 193 | + vector("list", length(attached_scalars)), |
| 194 | + attached_scalars |
| 195 | + ) |
| 196 | + |
| 197 | + for (sname in attached_scalars) { |
| 198 | + s_sources <- sources(modelarray)[[sname]] |
| 199 | + |
| 200 | + if (identical(s_sources, phen_sources)) { |
| 201 | + # Already aligned (typical for the response scalar, or when all |
| 202 | + # scalars share the same source order after mergeModelArrays) |
| 203 | + predictor_reorder[[sname]] <- NULL |
| 204 | + } else { |
| 205 | + # Validate that the sets match (once, not per element) |
| 206 | + if (!(all(s_sources %in% phen_sources) && |
| 207 | + all(phen_sources %in% s_sources))) { |
| 208 | + stop( |
| 209 | + "The source files for scalar '", sname, |
| 210 | + "' do not match phenotypes$source_file." |
| 211 | + ) |
| 212 | + } |
| 213 | + predictor_reorder[[sname]] <- match(phen_sources, s_sources) |
| 214 | + } |
| 215 | + } |
| 216 | + |
| 217 | + list( |
| 218 | + modelarray = modelarray, |
| 219 | + phenotypes = phenotypes, |
| 220 | + scalar = scalar, |
| 221 | + all_scalar_names = all_scalar_names, |
| 222 | + attached_scalars = attached_scalars, |
| 223 | + predictor_reorder = predictor_reorder |
| 224 | + ) |
| 225 | +} |
| 226 | + |
| 227 | + |
| 228 | +# Level 2: Model-specific context builders that add formula-derived |
| 229 | +# information on top of the base context. |
| 230 | + |
| 231 | +#' Build context for element-wise linear model fitting |
| 232 | +#' |
| 233 | +#' Extends the base context with formula parsing results: the LHS variable |
| 234 | +#' name, RHS variable names, and which RHS variables are scalars (as opposed |
| 235 | +#' to phenotype columns). All of this is invariant across elements. |
| 236 | +#' |
| 237 | +#' @param formula A \code{\link[stats]{formula}} to be passed to |
| 238 | +#' \code{\link[stats]{lm}}. |
| 239 | +#' @param modelarray A \linkS4class{ModelArray} object. |
| 240 | +#' @param phenotypes A data.frame, already aligned via |
| 241 | +#' \code{.align_phenotypes()}. |
| 242 | +#' @param scalar Character. The response scalar name. |
| 243 | +#' @return A named list inheriting all components from |
| 244 | +#' \code{.build_base_context()} plus: |
| 245 | +#' \describe{ |
| 246 | +#' \item{formula}{The model formula.} |
| 247 | +#' \item{lhs_name}{Character. The response variable name from the |
| 248 | +#' formula LHS.} |
| 249 | +#' \item{rhs_vars}{Character vector. All variable names on the RHS.} |
| 250 | +#' \item{scalar_predictors}{Character vector. RHS variables that are |
| 251 | +#' scalar names in the ModelArray (i.e., cross-scalar predictors).} |
| 252 | +#' } |
| 253 | +#' @noRd |
| 254 | +.build_lm_context <- function(formula, modelarray, phenotypes, scalar) { |
| 255 | + # Formula parsing — currently repeated per element inside |
| 256 | + # analyseOneElement.lm [4] |
| 257 | + all_vars <- all.vars(formula) |
| 258 | + lhs_name <- tryCatch( |
| 259 | + as.character(formula[[2]]), |
| 260 | + error = function(e) NULL |
| 261 | + ) |
| 262 | + rhs_vars <- setdiff(all_vars, lhs_name) |
| 263 | + |
| 264 | + # Detect which RHS variables are scalars in the ModelArray |
| 265 | + all_scalar_names <- names(scalars(modelarray)) |
| 266 | + scalar_predictors <- intersect(rhs_vars, all_scalar_names) |
| 267 | + |
| 268 | + # The scalars to attach: response + any scalar predictors |
| 269 | + scalar_subset <- unique(c(scalar, scalar_predictors)) |
| 270 | + |
| 271 | + # Build the base context with only the needed scalars |
| 272 | + base_ctx <- .build_base_context( |
| 273 | + modelarray = modelarray, |
| 274 | + phenotypes = phenotypes, |
| 275 | + scalar = scalar, |
| 276 | + scalar_subset = scalar_subset |
| 277 | + ) |
| 278 | + |
| 279 | + # Extend with formula-specific fields |
| 280 | + base_ctx$formula <- formula |
| 281 | + base_ctx$lhs_name <- lhs_name |
| 282 | + base_ctx$rhs_vars <- rhs_vars |
| 283 | + base_ctx$scalar_predictors <- scalar_predictors |
113 | 284 |
|
| 285 | + base_ctx |
| 286 | +} |
| 287 | + |
| 288 | + |
| 289 | +#' Build context for element-wise GAM fitting |
| 290 | +#' |
| 291 | +#' Extends the base context with formula parsing results and GAM-specific |
| 292 | +#' formula validation that currently runs inside the ModelArray.gam() |
| 293 | +#' preamble but whose results are never passed to the per-element function. |
| 294 | +#' |
| 295 | +#' @param formula A \code{\link[stats]{formula}} to be passed to |
| 296 | +#' \code{\link[mgcv]{gam}}. |
| 297 | +#' @param modelarray A \linkS4class{ModelArray} object. |
| 298 | +#' @param phenotypes A data.frame, already aligned via |
| 299 | +#' \code{.align_phenotypes()}. |
| 300 | +#' @param scalar Character. The response scalar name. |
| 301 | +#' @return A named list inheriting all components from |
| 302 | +#' \code{.build_lm_context()} (which itself inherits from |
| 303 | +#' \code{.build_base_context()}) plus: |
| 304 | +#' \describe{ |
| 305 | +#' \item{gam_formula_breakdown}{The result of |
| 306 | +#' \code{mgcv::interpret.gam(formula)}, cached for reuse.} |
| 307 | +#' } |
| 308 | +#' @noRd |
| 309 | +.build_gam_context <- function(formula, modelarray, phenotypes, scalar) { |
| 310 | + # GAM formula validation — currently runs in ModelArray.gam() preamble [1] |
| 311 | + # but the breakdown result is discarded. We cache it here. |
| 312 | + gam_breakdown <- tryCatch( |
| 313 | + mgcv::interpret.gam(formula), |
| 314 | + error = function(cond) { |
| 315 | + stop("The formula is not valid for mgcv::gam()! Please check and revise.") |
| 316 | + } |
| 317 | + ) |
| 318 | + |
| 319 | + # The formula structure is the same as lm for variable detection purposes |
| 320 | + ctx <- .build_lm_context(formula, modelarray, phenotypes, scalar) |
| 321 | + |
| 322 | + # Add GAM-specific cached data |
| 323 | + ctx$gam_formula_breakdown <- gam_breakdown |
| 324 | + |
| 325 | + ctx |
| 326 | +} |
| 327 | + |
| 328 | + |
| 329 | +#' Build context for element-wise user-supplied function execution |
| 330 | +#' |
| 331 | +#' Uses the base context in "attach all scalars" mode since |
| 332 | +#' \code{analyseOneElement.wrap} attaches every scalar in the ModelArray |
| 333 | +#' to the per-element data.frame, not just formula-referenced ones. |
| 334 | +#' |
| 335 | +#' @param modelarray A \linkS4class{ModelArray} object. |
| 336 | +#' @param phenotypes A data.frame, already aligned via |
| 337 | +#' \code{.align_phenotypes()}. |
| 338 | +#' @param scalar Character. The primary scalar name (used for the initial |
| 339 | +#' validity check). |
| 340 | +#' @return A named list: the base context with \code{scalar_subset = NULL} |
| 341 | +#' (all scalars attached). |
| 342 | +#' @noRd |
| 343 | +.build_wrap_context <- function(modelarray, phenotypes, scalar) { |
| 344 | + # scalar_subset = NULL triggers "attach all" mode in .build_base_context() |
| 345 | + ctx <- .build_base_context( |
| 346 | + modelarray = modelarray, |
| 347 | + phenotypes = phenotypes, |
| 348 | + scalar = scalar, |
| 349 | + scalar_subset = NULL |
| 350 | + ) |
| 351 | + |
| 352 | + ctx |
| 353 | +} |
| 354 | + |
| 355 | + |
| 356 | +# Level 3: Shared per-element data assembly helper. |
| 357 | +# Extracts scalar values for one element using the precomputed context, |
| 358 | +# builds the validity mask, and returns the filtered data.frame. |
| 359 | +# This replaces duplicated logic across analyseOneElement.lm, |
| 360 | +# analyseOneElement.gam, and analyseOneElement.wrap [4]. |
| 361 | + |
| 362 | +#' Assemble per-element data.frame from precomputed context |
| 363 | +#' |
| 364 | +#' Reads scalar rows from the ModelArray, applies precomputed reorder |
| 365 | +#' indices, builds the intersection validity mask, and returns the |
| 366 | +#' filtered data.frame ready for model fitting or user function execution. |
| 367 | +#' |
| 368 | +#' @param i_element Integer. 1-based element index. |
| 369 | +#' @param ctx A context list from one of the \code{.build_*_context()} |
| 370 | +#' functions. |
| 371 | +#' @param num.subj.lthr Numeric. Minimum number of subjects with finite |
| 372 | +#' values required. |
| 373 | +#' @return A list with components: |
| 374 | +#' \describe{ |
| 375 | +#' \item{dat}{A data.frame: the filtered phenotypes with scalar columns |
| 376 | +#' attached. \code{NULL} if the element has insufficient valid |
| 377 | +#' subjects.} |
| 378 | +#' \item{sufficient}{Logical. Whether the element passed the subject |
| 379 | +#' threshold.} |
| 380 | +#' \item{num_valid}{Integer. Number of subjects with finite values |
| 381 | +#' across all attached scalars.} |
| 382 | +#' } |
| 383 | +#' @noRd |
| 384 | +.assemble_element_data <- function(i_element, ctx, num.subj.lthr) { |
| 385 | + # Read the response scalar row — the only mandatory per-element I/O |
| 386 | + response_vals <- scalars(ctx$modelarray)[[ctx$scalar]][i_element, ] |
| 387 | + |
| 388 | + # Start the validity mask with the response scalar |
| 389 | + masks <- list(is.finite(response_vals)) |
| 390 | + |
| 391 | + # Read and reorder additional attached scalars |
| 392 | + scalar_values <- list() |
| 393 | + scalar_values[[ctx$scalar]] <- response_vals |
| 394 | + |
| 395 | + other_scalars <- setdiff(ctx$attached_scalars, ctx$scalar) |
| 396 | + for (sname in other_scalars) { |
| 397 | + s_vals <- scalars(ctx$modelarray)[[sname]][i_element, ] |
| 398 | + |
| 399 | + # Apply precomputed reorder index (or use as-is if NULL) |
| 400 | + reorder_idx <- ctx$predictor_reorder[[sname]] |
| 401 | + if (!is.null(reorder_idx)) { |
| 402 | + s_vals <- s_vals[reorder_idx] |
| 403 | + } |
| 404 | + |
| 405 | + scalar_values[[sname]] <- s_vals |
| 406 | + masks[[length(masks) + 1L]] <- is.finite(s_vals) |
| 407 | + } |
| 408 | + |
| 409 | + # Intersection mask across all scalars |
| 410 | + valid_mask <- Reduce("&", masks) |
| 411 | + num_valid <- sum(valid_mask) |
| 412 | + |
| 413 | + if (!(num_valid > num.subj.lthr)) { |
| 414 | + return(list(dat = NULL, sufficient = FALSE, num_valid = num_valid)) |
| 415 | + } |
| 416 | + |
| 417 | + # Build filtered data.frame |
| 418 | + dat <- ctx$phenotypes[valid_mask, , drop = FALSE] |
| 419 | + for (sname in ctx$attached_scalars) { |
| 420 | + dat[[sname]] <- scalar_values[[sname]][valid_mask] |
| 421 | + } |
| 422 | + |
| 423 | + list(dat = dat, sufficient = TRUE, num_valid = num_valid) |
| 424 | +} |
| 425 | + |
| 426 | +# Find initiator element ---- |
114 | 427 | #' Find a valid initiator element by searching middle → forward → backward |
115 | 428 | #' |
116 | 429 | #' @param analyse_one_fn The per-element analysis function |
|
186 | 499 | } |
187 | 500 |
|
188 | 501 |
|
| 502 | +# Parallelization ---- |
189 | 503 | #' Dispatch parallel/serial apply with optional progress bar |
190 | 504 | #' |
191 | 505 | #' @param element.subset Integer vector of element indices |
|
236 | 550 | fits |
237 | 551 | } |
238 | 552 |
|
239 | | - |
| 553 | +# P-value correction ---- |
240 | 554 | #' Correct p-values for a set of terms and append corrected columns |
241 | 555 | #' |
242 | 556 | #' @param df_out Data.frame of results |
|
264 | 578 | df_out |
265 | 579 | } |
266 | 580 |
|
267 | | - |
| 581 | +# Streaming writes ---- |
268 | 582 | #' Initialize an incremental writer for /results datasets |
269 | 583 | #' @noRd |
270 | 584 | .init_results_stream_writer <- function(write_results_name, |
|
0 commit comments