Skip to content

Commit df35f4b

Browse files
committed
Refactor analysers to use precomputed contexts
1 parent a46d78a commit df35f4b

9 files changed

Lines changed: 1873 additions & 1058 deletions

R/ModelArray_Constructor.R

Lines changed: 394 additions & 698 deletions
Large diffs are not rendered by default.

R/analyse-helpers.R

Lines changed: 317 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Internal shared helpers for ModelArray analysis functions
22
# These are not exported — used by ModelArray.lm, ModelArray.gam, ModelArray.wrap
33

4-
4+
# Validation and dataset preparation ----
55
#' Validate that data is a ModelArray
66
#' @noRd
77
.validate_modelarray_input <- function(data) {
@@ -110,7 +110,320 @@
110110
max(num.subj.total * num.subj.lthr.rel, num.subj.lthr.abs)
111111
}
112112

113+
# Create analysis context ----
114+
# Precomputed context builders for element-wise analysis functions.
115+
# These hoist loop-invariant computations out of the per-element functions
116+
# so they execute once before the loop rather than millions of times inside it.
117+
118+
# Level 1: Base context shared by all analysis functions (lm, gam, wrap).
119+
# Handles scalar discovery, collision checks, and cross-scalar source alignment.
120+
121+
#' Build base context shared by all element-wise analysis functions
122+
#'
123+
#' Precomputes scalar name lookups, column-name collision checks, and
124+
#' cross-scalar source-alignment indices that are invariant across elements.
125+
#' This is the foundation on which model-specific contexts are built.
126+
#'
127+
#' @param modelarray A \linkS4class{ModelArray} object.
128+
#' @param phenotypes A data.frame, already aligned to \code{modelarray} via
129+
#' \code{.align_phenotypes()}.
130+
#' @param scalar Character. The name of the response scalar.
131+
#' @param scalar_subset Character vector or \code{NULL}. Which scalars to
132+
#' attach and align. If \code{NULL}, all scalars in the ModelArray are used
133+
#' (the \code{wrap} behaviour). If a character vector, only those scalars
134+
#' are included (the \code{lm}/\code{gam} behaviour where we detect
135+
#' formula-referenced scalars).
136+
#' @return A named list with components:
137+
#' \describe{
138+
#' \item{modelarray}{The ModelArray object (read-only reference).}
139+
#' \item{phenotypes}{The aligned phenotypes data.frame.}
140+
#' \item{scalar}{Character. The response scalar name.}
141+
#' \item{all_scalar_names}{Character vector. Names of all scalars in the
142+
#' ModelArray.}
143+
#' \item{attached_scalars}{Character vector. Names of scalars that will
144+
#' be attached to the per-element data.frame (may be a subset of
145+
#' \code{all_scalar_names} or all of them).}
146+
#' \item{predictor_reorder}{A named list of integer vectors. For each
147+
#' scalar in \code{attached_scalars}, the precomputed index vector
148+
#' from \code{match(phen_sources, scalar_sources)} that reorders
149+
#' scalar columns to match phenotype rows. For the response scalar
150+
#' (which is already aligned by \code{.align_phenotypes()}), this
151+
#' entry is \code{NULL} (no reordering needed).}
152+
#' }
153+
#' @noRd
154+
.build_base_context <- function(modelarray, phenotypes, scalar,
155+
scalar_subset = NULL) {
156+
all_scalar_names <- names(scalars(modelarray))
157+
158+
159+
# Determine which scalars to attach
160+
161+
if (is.null(scalar_subset)) {
162+
# wrap mode: attach all scalars
163+
attached_scalars <- all_scalar_names
164+
} else {
165+
# lm/gam mode: attach only the explicitly requested scalars
166+
# (response + any predictor scalars detected from the formula)
167+
attached_scalars <- unique(scalar_subset)
168+
}
169+
170+
171+
# Collision check: scalar names vs phenotype column names.
172+
# This is invariant — the same collision would occur at every element,
173+
174+
# so we fail fast once rather than millions of times.
175+
176+
collisions <- intersect(attached_scalars, colnames(phenotypes))
177+
if (length(collisions) > 0) {
178+
stop(
179+
"Column name collision between phenotypes and scalar names: ",
180+
paste(collisions, collapse = ", "),
181+
". Please rename or remove these columns from phenotypes before ",
182+
"modeling."
183+
)
184+
}
185+
186+
# Precompute source-alignment reorder indices for each attached scalar.
187+
# The response scalar is already aligned by .align_phenotypes(), so it
188+
189+
# needs no reordering. Predictor scalars from mergeModelArrays() may
190+
# have different column orderings and need match()-based reordering.
191+
phen_sources <- phenotypes[["source_file"]]
192+
predictor_reorder <- stats::setNames(
193+
vector("list", length(attached_scalars)),
194+
attached_scalars
195+
)
196+
197+
for (sname in attached_scalars) {
198+
s_sources <- sources(modelarray)[[sname]]
199+
200+
if (identical(s_sources, phen_sources)) {
201+
# Already aligned (typical for the response scalar, or when all
202+
# scalars share the same source order after mergeModelArrays)
203+
predictor_reorder[[sname]] <- NULL
204+
} else {
205+
# Validate that the sets match (once, not per element)
206+
if (!(all(s_sources %in% phen_sources) &&
207+
all(phen_sources %in% s_sources))) {
208+
stop(
209+
"The source files for scalar '", sname,
210+
"' do not match phenotypes$source_file."
211+
)
212+
}
213+
predictor_reorder[[sname]] <- match(phen_sources, s_sources)
214+
}
215+
}
216+
217+
list(
218+
modelarray = modelarray,
219+
phenotypes = phenotypes,
220+
scalar = scalar,
221+
all_scalar_names = all_scalar_names,
222+
attached_scalars = attached_scalars,
223+
predictor_reorder = predictor_reorder
224+
)
225+
}
226+
227+
228+
# Level 2: Model-specific context builders that add formula-derived
229+
# information on top of the base context.
230+
231+
#' Build context for element-wise linear model fitting
232+
#'
233+
#' Extends the base context with formula parsing results: the LHS variable
234+
#' name, RHS variable names, and which RHS variables are scalars (as opposed
235+
#' to phenotype columns). All of this is invariant across elements.
236+
#'
237+
#' @param formula A \code{\link[stats]{formula}} to be passed to
238+
#' \code{\link[stats]{lm}}.
239+
#' @param modelarray A \linkS4class{ModelArray} object.
240+
#' @param phenotypes A data.frame, already aligned via
241+
#' \code{.align_phenotypes()}.
242+
#' @param scalar Character. The response scalar name.
243+
#' @return A named list inheriting all components from
244+
#' \code{.build_base_context()} plus:
245+
#' \describe{
246+
#' \item{formula}{The model formula.}
247+
#' \item{lhs_name}{Character. The response variable name from the
248+
#' formula LHS.}
249+
#' \item{rhs_vars}{Character vector. All variable names on the RHS.}
250+
#' \item{scalar_predictors}{Character vector. RHS variables that are
251+
#' scalar names in the ModelArray (i.e., cross-scalar predictors).}
252+
#' }
253+
#' @noRd
254+
.build_lm_context <- function(formula, modelarray, phenotypes, scalar) {
255+
# Formula parsing — currently repeated per element inside
256+
# analyseOneElement.lm [4]
257+
all_vars <- all.vars(formula)
258+
lhs_name <- tryCatch(
259+
as.character(formula[[2]]),
260+
error = function(e) NULL
261+
)
262+
rhs_vars <- setdiff(all_vars, lhs_name)
263+
264+
# Detect which RHS variables are scalars in the ModelArray
265+
all_scalar_names <- names(scalars(modelarray))
266+
scalar_predictors <- intersect(rhs_vars, all_scalar_names)
267+
268+
# The scalars to attach: response + any scalar predictors
269+
scalar_subset <- unique(c(scalar, scalar_predictors))
270+
271+
# Build the base context with only the needed scalars
272+
base_ctx <- .build_base_context(
273+
modelarray = modelarray,
274+
phenotypes = phenotypes,
275+
scalar = scalar,
276+
scalar_subset = scalar_subset
277+
)
278+
279+
# Extend with formula-specific fields
280+
base_ctx$formula <- formula
281+
base_ctx$lhs_name <- lhs_name
282+
base_ctx$rhs_vars <- rhs_vars
283+
base_ctx$scalar_predictors <- scalar_predictors
113284

285+
base_ctx
286+
}
287+
288+
289+
#' Build context for element-wise GAM fitting
290+
#'
291+
#' Extends the base context with formula parsing results and GAM-specific
292+
#' formula validation that currently runs inside the ModelArray.gam()
293+
#' preamble but whose results are never passed to the per-element function.
294+
#'
295+
#' @param formula A \code{\link[stats]{formula}} to be passed to
296+
#' \code{\link[mgcv]{gam}}.
297+
#' @param modelarray A \linkS4class{ModelArray} object.
298+
#' @param phenotypes A data.frame, already aligned via
299+
#' \code{.align_phenotypes()}.
300+
#' @param scalar Character. The response scalar name.
301+
#' @return A named list inheriting all components from
302+
#' \code{.build_lm_context()} (which itself inherits from
303+
#' \code{.build_base_context()}) plus:
304+
#' \describe{
305+
#' \item{gam_formula_breakdown}{The result of
306+
#' \code{mgcv::interpret.gam(formula)}, cached for reuse.}
307+
#' }
308+
#' @noRd
309+
.build_gam_context <- function(formula, modelarray, phenotypes, scalar) {
310+
# GAM formula validation — currently runs in ModelArray.gam() preamble [1]
311+
# but the breakdown result is discarded. We cache it here.
312+
gam_breakdown <- tryCatch(
313+
mgcv::interpret.gam(formula),
314+
error = function(cond) {
315+
stop("The formula is not valid for mgcv::gam()! Please check and revise.")
316+
}
317+
)
318+
319+
# The formula structure is the same as lm for variable detection purposes
320+
ctx <- .build_lm_context(formula, modelarray, phenotypes, scalar)
321+
322+
# Add GAM-specific cached data
323+
ctx$gam_formula_breakdown <- gam_breakdown
324+
325+
ctx
326+
}
327+
328+
329+
#' Build context for element-wise user-supplied function execution
330+
#'
331+
#' Uses the base context in "attach all scalars" mode since
332+
#' \code{analyseOneElement.wrap} attaches every scalar in the ModelArray
333+
#' to the per-element data.frame, not just formula-referenced ones.
334+
#'
335+
#' @param modelarray A \linkS4class{ModelArray} object.
336+
#' @param phenotypes A data.frame, already aligned via
337+
#' \code{.align_phenotypes()}.
338+
#' @param scalar Character. The primary scalar name (used for the initial
339+
#' validity check).
340+
#' @return A named list: the base context with \code{scalar_subset = NULL}
341+
#' (all scalars attached).
342+
#' @noRd
343+
.build_wrap_context <- function(modelarray, phenotypes, scalar) {
344+
# scalar_subset = NULL triggers "attach all" mode in .build_base_context()
345+
ctx <- .build_base_context(
346+
modelarray = modelarray,
347+
phenotypes = phenotypes,
348+
scalar = scalar,
349+
scalar_subset = NULL
350+
)
351+
352+
ctx
353+
}
354+
355+
356+
# Level 3: Shared per-element data assembly helper.
357+
# Extracts scalar values for one element using the precomputed context,
358+
# builds the validity mask, and returns the filtered data.frame.
359+
# This replaces duplicated logic across analyseOneElement.lm,
360+
# analyseOneElement.gam, and analyseOneElement.wrap [4].
361+
362+
#' Assemble per-element data.frame from precomputed context
363+
#'
364+
#' Reads scalar rows from the ModelArray, applies precomputed reorder
365+
#' indices, builds the intersection validity mask, and returns the
366+
#' filtered data.frame ready for model fitting or user function execution.
367+
#'
368+
#' @param i_element Integer. 1-based element index.
369+
#' @param ctx A context list from one of the \code{.build_*_context()}
370+
#' functions.
371+
#' @param num.subj.lthr Numeric. Minimum number of subjects with finite
372+
#' values required.
373+
#' @return A list with components:
374+
#' \describe{
375+
#' \item{dat}{A data.frame: the filtered phenotypes with scalar columns
376+
#' attached. \code{NULL} if the element has insufficient valid
377+
#' subjects.}
378+
#' \item{sufficient}{Logical. Whether the element passed the subject
379+
#' threshold.}
380+
#' \item{num_valid}{Integer. Number of subjects with finite values
381+
#' across all attached scalars.}
382+
#' }
383+
#' @noRd
384+
.assemble_element_data <- function(i_element, ctx, num.subj.lthr) {
385+
# Read the response scalar row — the only mandatory per-element I/O
386+
response_vals <- scalars(ctx$modelarray)[[ctx$scalar]][i_element, ]
387+
388+
# Start the validity mask with the response scalar
389+
masks <- list(is.finite(response_vals))
390+
391+
# Read and reorder additional attached scalars
392+
scalar_values <- list()
393+
scalar_values[[ctx$scalar]] <- response_vals
394+
395+
other_scalars <- setdiff(ctx$attached_scalars, ctx$scalar)
396+
for (sname in other_scalars) {
397+
s_vals <- scalars(ctx$modelarray)[[sname]][i_element, ]
398+
399+
# Apply precomputed reorder index (or use as-is if NULL)
400+
reorder_idx <- ctx$predictor_reorder[[sname]]
401+
if (!is.null(reorder_idx)) {
402+
s_vals <- s_vals[reorder_idx]
403+
}
404+
405+
scalar_values[[sname]] <- s_vals
406+
masks[[length(masks) + 1L]] <- is.finite(s_vals)
407+
}
408+
409+
# Intersection mask across all scalars
410+
valid_mask <- Reduce("&", masks)
411+
num_valid <- sum(valid_mask)
412+
413+
if (!(num_valid > num.subj.lthr)) {
414+
return(list(dat = NULL, sufficient = FALSE, num_valid = num_valid))
415+
}
416+
417+
# Build filtered data.frame
418+
dat <- ctx$phenotypes[valid_mask, , drop = FALSE]
419+
for (sname in ctx$attached_scalars) {
420+
dat[[sname]] <- scalar_values[[sname]][valid_mask]
421+
}
422+
423+
list(dat = dat, sufficient = TRUE, num_valid = num_valid)
424+
}
425+
426+
# Find initiator element ----
114427
#' Find a valid initiator element by searching middle → forward → backward
115428
#'
116429
#' @param analyse_one_fn The per-element analysis function
@@ -186,6 +499,7 @@
186499
}
187500

188501

502+
# Parallelization ----
189503
#' Dispatch parallel/serial apply with optional progress bar
190504
#'
191505
#' @param element.subset Integer vector of element indices
@@ -236,7 +550,7 @@
236550
fits
237551
}
238552

239-
553+
# P-value correction ----
240554
#' Correct p-values for a set of terms and append corrected columns
241555
#'
242556
#' @param df_out Data.frame of results
@@ -264,7 +578,7 @@
264578
df_out
265579
}
266580

267-
581+
# Streaming writes ----
268582
#' Initialize an incremental writer for /results datasets
269583
#' @noRd
270584
.init_results_stream_writer <- function(write_results_name,

0 commit comments

Comments
 (0)