From e59047b188e13648e5caaa2e01cbc6f996bc8993 Mon Sep 17 00:00:00 2001
From: William Zijie Zhang <william@gridmatic.com>
Date: Tue, 21 Apr 2026 21:54:07 -0400
Subject: [PATCH 1/7] initial iteration for convolve atom

---
 include/atoms/affine.h                      |   8 +
 include/subexpr.h                           |  17 ++
 src/atoms/affine/convolve.c                 | 274 ++++++++++++++++++++
 tests/all_tests.c                           |  11 +
 tests/forward_pass/affine/test_convolve.h   |  53 ++++
 tests/jacobian_tests/affine/test_convolve.h |  68 +++++
 tests/problem/test_param_broadcast.h        |  65 +++++
 tests/problem/test_param_prob.h             |  92 +++++++
 tests/wsum_hess/affine/test_convolve.h      |  57 ++++
 9 files changed, 645 insertions(+)
 create mode 100644 src/atoms/affine/convolve.c
 create mode 100644 tests/forward_pass/affine/test_convolve.h
 create mode 100644 tests/jacobian_tests/affine/test_convolve.h
 create mode 100644 tests/wsum_hess/affine/test_convolve.h

diff --git a/include/atoms/affine.h b/include/atoms/affine.h
index 94afe8b..a66add6 100644
--- a/include/atoms/affine.h
+++ b/include/atoms/affine.h
@@ -65,4 +65,12 @@ expr *new_scalar_mult(expr *param_node, expr *child);
  * param_node */
 expr *new_vector_mult(expr *param_node, expr *child);
 
+/* 1D full convolution: y = conv(a, child) where a is the kernel held by
+ * param_node (length m) and child is a length-n vector (shape (n, 1) or
+ * (1, n)). Output is length (m + n - 1). param_node may be PARAM_FIXED
+ * or an updatable parameter; the kernel values refresh automatically on
+ * problem_update_params (same convention as new_vector_mult /
+ * new_scalar_mult). */
+expr *new_convolve(expr *param_node, expr *child);
+
 #endif /* AFFINE_H */
diff --git a/include/subexpr.h b/include/subexpr.h
index 9c57d23..b225c84 100644
--- a/include/subexpr.h
+++ b/include/subexpr.h
@@ -148,6 +148,23 @@ typedef struct vector_mult_expr
     expr *param_source;
 } vector_mult_expr;
 
+/* 1D convolution: y = conv(a, child) where a is a length-m kernel held by
+ * param_source. Output has size (m + n - 1) where n is the child length.
+ * Forward and wsum_hess backprop are computed as direct loops; for Jacobian
+ * we materialize T(a) as a CSR once at jacobian_init and reuse the engine's
+ * block-left-mult machinery for composite children. */
+typedef struct convolve_expr
+{
+    expr base;
+    expr *param_source;  /* length-m kernel */
+    int m;               /* kernel length */
+    int n;               /* input length */
+    Matrix *T;           /* (m+n-1) x n Toeplitz Sparse_Matrix */
+    CSC_Matrix *Jchild_CSC;
+    CSC_Matrix *J_CSC;
+    int *csc_to_csr_work;
+} convolve_expr;
+
 /* Bivariate matrix multiplication: Z = f(u) @ g(u) where both children
  * may be composite expressions. */
 typedef struct matmul_expr
diff --git a/src/atoms/affine/convolve.c b/src/atoms/affine/convolve.c
new file mode 100644
index 0000000..e6466d8
--- /dev/null
+++ b/src/atoms/affine/convolve.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright 2026 Daniel Cederberg and William Zhang
+ *
+ * This file is part of the SparseDiffEngine project.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "atoms/affine.h"
+#include "subexpr.h"
+#include "utils/CSR_Matrix.h"
+#include "utils/matrix.h"
+#include "utils/tracked_alloc.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* 1D full convolution: y = conv(a, child), where a is a length-m kernel
+ * held by param_source and child is a length-n vector. Output length is
+ * m + n - 1. In matrix form, y = T(a) @ child, where T(a) is the
+ * (m+n-1) x n Toeplitz matrix with T(a)[k, j] = a[k-j] when 0 <= k-j < m.
+ *
+ * Forward and Hessian backprop (w' = T(a)^T w) are direct loops, avoiding
+ * CSR traversal and AT materialization. Jacobian values go through the
+ * existing block-left-mult path so composite children work correctly.
+ *
+ * When param_source is updatable, forward() refreshes T(a)'s CSR values
+ * from the new kernel before running the convolution. The sparsity of
+ * T(a) is kernel-independent (staircase), so only values are rewritten. */
+
+/* Fill T(a)'s CSR row pointers, column indices, and values from kernel a.
+   Row r holds T[r, col] = a[r - col] for col in
+   [max(0, r-m+1), min(n-1, r)]. */
+static void build_toeplitz(CSR_Matrix *T_csr, const double *a, int m, int n)
+{
+    int out_rows = m + n - 1;
+    int nnz = 0;
+    for (int r = 0; r < out_rows; r++)
+    {
+        T_csr->p[r] = nnz;
+        int col_lo = r - m + 1 > 0 ? r - m + 1 : 0;
+        int col_hi = r < n - 1 ? r : n - 1;
+        for (int col = col_lo; col <= col_hi; col++)
+        {
+            T_csr->i[nnz] = col;
+            T_csr->x[nnz] = a[r - col];
+            nnz++;
+        }
+    }
+    T_csr->p[out_rows] = nnz;
+}
+
+/* Overwrite T(a)'s CSR values (only) from the current kernel. Sparsity
+   pattern is unchanged, so we just walk the existing CSR entries. */
+static void refresh_toeplitz_values(convolve_expr *cnode)
+{
+    CSR_Matrix *T_csr = ((Sparse_Matrix *) cnode->T)->csr;
+    const double *a = cnode->param_source->value;
+    int out_rows = cnode->m + cnode->n - 1;
+    for (int r = 0; r < out_rows; r++)
+    {
+        for (int k = T_csr->p[r]; k < T_csr->p[r + 1]; k++)
+        {
+            T_csr->x[k] = a[r - T_csr->i[k]];
+        }
+    }
+}
+
+static void forward(expr *node, const double *u)
+{
+    expr *child = node->left;
+    convolve_expr *cnode = (convolve_expr *) node;
+
+    if (cnode->base.needs_parameter_refresh)
+    {
+        cnode->param_source->forward(cnode->param_source, NULL);
+        /* T is NULL until jacobian_init_impl runs; forward can be called
+           first in standalone tests, in which case T is built later from
+           the already-refreshed param values. */
+        if (cnode->T != NULL)
+        {
+            refresh_toeplitz_values(cnode);
+        }
+        cnode->base.needs_parameter_refresh = false;
+    }
+
+    child->forward(child, u);
+
+    const double *a = cnode->param_source->value;
+    const double *x = child->value;
+    double *y = node->value;
+    int m = cnode->m;
+    int n = cnode->n;
+
+    memset(y, 0, node->size * sizeof(double));
+    for (int j = 0; j < n; j++)
+    {
+        for (int i = 0; i < m; i++)
+        {
+            y[i + j] += a[i] * x[j];
+        }
+    }
+}
+
+static void jacobian_init_impl(expr *node)
+{
+    expr *child = node->left;
+    convolve_expr *cnode = (convolve_expr *) node;
+    int m = cnode->m;
+    int n = cnode->n;
+    const double *a = cnode->param_source->value;
+
+    jacobian_init(child);
+
+    /* Build T(a) as CSR: (m+n-1) x n, m*n nonzeros, staircase pattern. */
+    int out_rows = m + n - 1;
+    CSR_Matrix *T_csr = new_csr_matrix(out_rows, n, m * n);
+    build_toeplitz(T_csr, a, m, n);
+
+    cnode->T = new_sparse_matrix(T_csr);
+    free_csr_matrix(T_csr);
+
+    /* Reuse left_matmul's sparsity precompute: J_node = T @ J_child via
+       block_left_mult with n_blocks = 1. */
+    cnode->Jchild_CSC = csr_to_csc_alloc(child->jacobian, node->work->iwork);
+    cnode->J_CSC =
+        cnode->T->block_left_mult_sparsity(cnode->T, cnode->Jchild_CSC, 1);
+    node->jacobian = csc_to_csr_alloc(cnode->J_CSC, cnode->csc_to_csr_work);
+}
+
+static void eval_jacobian(expr *node)
+{
+    expr *child = node->left;
+    convolve_expr *cnode = (convolve_expr *) node;
+
+    child->eval_jacobian(child);
+
+    /* T values have been refreshed by forward() if the kernel changed; here
+       we just need to refresh child's Jacobian and rerun the block matmul. */
+    csr_to_csc_fill_values(child->jacobian, cnode->Jchild_CSC, node->work->iwork);
+    cnode->T->block_left_mult_values(cnode->T, cnode->Jchild_CSC, cnode->J_CSC);
+    csc_to_csr_fill_values(cnode->J_CSC, node->jacobian, cnode->csc_to_csr_work);
+}
+
+static void wsum_hess_init_impl(expr *node)
+{
+    expr *child = node->left;
+    convolve_expr *cnode = (convolve_expr *) node;
+
+    wsum_hess_init(child);
+    node->wsum_hess = new_csr_copy_sparsity(child->wsum_hess);
+    node->work->dwork = (double *) SP_MALLOC(cnode->n * sizeof(double));
+}
+
+static void eval_wsum_hess(expr *node, const double *w)
+{
+    expr *child = node->left;
+    convolve_expr *cnode = (convolve_expr *) node;
+    int m = cnode->m;
+    int n = cnode->n;
+    const double *a = cnode->param_source->value;
+    double *w_prime = node->work->dwork;
+
+    /* w' = T(a)^T w. T(a)^T[j, k] = a[k-j] for j <= k < j+m, so
+       w'[j] = sum_{i=0..m-1} a[i] * w[i + j]. */
+    memset(w_prime, 0, n * sizeof(double));
+    for (int j = 0; j < n; j++)
+    {
+        for (int i = 0; i < m; i++)
+        {
+            w_prime[j] += a[i] * w[i + j];
+        }
+    }
+
+    child->eval_wsum_hess(child, w_prime);
+    memcpy(node->wsum_hess->x, child->wsum_hess->x,
+           child->wsum_hess->nnz * sizeof(double));
+}
+
+static bool is_affine(const expr *node)
+{
+    return node->left->is_affine(node->left);
+}
+
+static void free_type_data(expr *node)
+{
+    convolve_expr *cnode = (convolve_expr *) node;
+    free_matrix(cnode->T);
+    free_csc_matrix(cnode->Jchild_CSC);
+    free_csc_matrix(cnode->J_CSC);
+    free(cnode->csc_to_csr_work);
+    if (cnode->param_source != NULL)
+    {
+        free_expr(cnode->param_source);
+    }
+    cnode->T = NULL;
+    cnode->Jchild_CSC = NULL;
+    cnode->J_CSC = NULL;
+    cnode->csc_to_csr_work = NULL;
+    cnode->param_source = NULL;
+}
+
+expr *new_convolve(expr *param_node, expr *child)
+{
+    if (param_node == NULL)
+    {
+        fprintf(stderr, "Error in new_convolve: param_node must not be NULL\n");
+        exit(1);
+    }
+
+    int m = param_node->size;
+    int n;
+    int out_d1, out_d2;
+
+    if (child->d2 == 1)
+    {
+        /* child is (n, 1) column vector */
+        n = child->d1;
+        out_d1 = m + n - 1;
+        out_d2 = 1;
+    }
+    else if (child->d1 == 1)
+    {
+        /* child is (1, n) row vector */
+        n = child->d2;
+        out_d1 = 1;
+        out_d2 = m + n - 1;
+    }
+    else
+    {
+        fprintf(stderr, "Error in new_convolve: child must be a vector "
+                        "(shape (n, 1) or (1, n))\n");
+        exit(1);
+    }
+
+    if (m < 1 || n < 1)
+    {
+        fprintf(stderr, "Error in new_convolve: m and n must be >= 1\n");
+        exit(1);
+    }
+
+    convolve_expr *cnode =
+        (convolve_expr *) SP_CALLOC(1, sizeof(convolve_expr));
+    expr *node = &cnode->base;
+    init_expr(node, out_d1, out_d2, child->n_vars, forward, jacobian_init_impl,
+              eval_jacobian, is_affine, wsum_hess_init_impl, eval_wsum_hess,
+              free_type_data);
+    node->left = child;
+    expr_retain(child);
+
+    cnode->param_source = param_node;
+    expr_retain(param_node);
+    cnode->m = m;
+    cnode->n = n;
+
+    /* iwork is used for csr_to_csc of child's Jacobian (size n_vars). */
+    node->work->iwork = (int *) SP_MALLOC(node->n_vars * sizeof(int));
+    cnode->csc_to_csr_work = (int *) SP_MALLOC(node->size * sizeof(int));
+
+    /* Ensure first forward() pulls current param values through any
+       broadcast/promote wrappers and reflects them in T (once T is built). */
+    cnode->base.needs_parameter_refresh = true;
+
+    return node;
+}
diff --git a/tests/all_tests.c b/tests/all_tests.c
index efaad33..4af1dd4 100644
--- a/tests/all_tests.c
+++ b/tests/all_tests.c
@@ -7,6 +7,7 @@
 #ifndef PROFILE_ONLY
 #include "forward_pass/affine/test_add.h"
 #include "forward_pass/affine/test_broadcast.h"
+#include "forward_pass/affine/test_convolve.h"
 #include "forward_pass/affine/test_diag_mat.h"
 #include "forward_pass/affine/test_hstack.h"
 #include "forward_pass/affine/test_left_matmul_dense.h"
@@ -25,6 +26,7 @@
 #include "forward_pass/other/test_prod_axis_one.h"
 #include "forward_pass/other/test_prod_axis_zero.h"
 #include "jacobian_tests/affine/test_broadcast.h"
+#include "jacobian_tests/affine/test_convolve.h"
 #include "jacobian_tests/affine/test_diag_mat.h"
 #include "jacobian_tests/affine/test_hstack.h"
 #include "jacobian_tests/affine/test_index.h"
@@ -65,6 +67,7 @@
 #include "utils/test_linalg_utils_matmul_chain_rule.h"
 #include "utils/test_matrix.h"
 #include "wsum_hess/affine/test_broadcast.h"
+#include "wsum_hess/affine/test_convolve.h"
 #include "wsum_hess/affine/test_diag_mat.h"
 #include "wsum_hess/affine/test_hstack.h"
 #include "wsum_hess/affine/test_index.h"
@@ -134,6 +137,8 @@ int main(void)
     mu_run_test(test_forward_prod_axis_one, tests_run);
     mu_run_test(test_matmul, tests_run);
     mu_run_test(test_left_matmul_dense, tests_run);
+    mu_run_test(test_convolve_forward, tests_run);
+    mu_run_test(test_convolve_forward_param, tests_run);
     mu_run_test(test_diag_mat_forward, tests_run);
     mu_run_test(test_upper_tri_forward_4x4, tests_run);
 
@@ -215,6 +220,8 @@ int main(void)
     mu_run_test(test_jacobian_right_matmul_log, tests_run);
     mu_run_test(test_jacobian_right_matmul_log_vector, tests_run);
     mu_run_test(test_jacobian_matmul, tests_run);
+    mu_run_test(test_jacobian_convolve, tests_run);
+    mu_run_test(test_jacobian_convolve_composite, tests_run);
     mu_run_test(test_jacobian_transpose, tests_run);
     mu_run_test(test_diag_mat_jacobian_variable, tests_run);
     mu_run_test(test_diag_mat_jacobian_of_log, tests_run);
@@ -280,6 +287,8 @@ int main(void)
     mu_run_test(test_wsum_hess_matmul_yx, tests_run);
     mu_run_test(test_wsum_hess_right_matmul, tests_run);
     mu_run_test(test_wsum_hess_right_matmul_vector, tests_run);
+    mu_run_test(test_wsum_hess_convolve, tests_run);
+    mu_run_test(test_wsum_hess_convolve_composite, tests_run);
     mu_run_test(test_wsum_hess_broadcast_row, tests_run);
     mu_run_test(test_wsum_hess_broadcast_col, tests_run);
     mu_run_test(test_wsum_hess_broadcast_scalar_to_matrix, tests_run);
@@ -374,6 +383,7 @@ int main(void)
     mu_run_test(test_param_shared_left_matmul_problem, tests_run);
     mu_run_test(test_param_fixed_skip_in_update, tests_run);
     mu_run_test(test_param_scalar_mult_problem_with_constant, tests_run);
+    mu_run_test(test_param_convolve_problem, tests_run);
 
     printf("\n--- Parameter + Broadcast Tests ---\n");
     mu_run_test(test_constant_broadcast_vector_mult, tests_run);
@@ -384,6 +394,7 @@ int main(void)
     mu_run_test(test_param_sum_scalar_mult, tests_run);
     mu_run_test(test_const_hstack_left_matmul, tests_run);
     mu_run_test(test_param_hstack_left_matmul, tests_run);
+    mu_run_test(test_param_scalar_mult_convolve, tests_run);
 #endif /* PROFILE_ONLY */
 
 #ifdef PROFILE_ONLY
diff --git a/tests/forward_pass/affine/test_convolve.h b/tests/forward_pass/affine/test_convolve.h
new file mode 100644
index 0000000..6a2cfd2
--- /dev/null
+++ b/tests/forward_pass/affine/test_convolve.h
@@ -0,0 +1,53 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "atoms/affine.h"
+#include "expr.h"
+#include "minunit.h"
+#include "subexpr.h"
+#include "test_helpers.h"
+
+const char *test_convolve_forward(void)
+{
+    /* Test: y = conv([1, 2, 3], x) where x = [1, 2, 3].
+     * Expected y = [1, 4, 10, 12, 9]. */
+    double kernel[3] = {1.0, 2.0, 3.0};
+    expr *kernel_param = new_parameter(3, 1, PARAM_FIXED, 3, kernel);
+    expr *x = new_variable(3, 1, 0, 3);
+    expr *y = new_convolve(kernel_param, x);
+
+    double u[3] = {1.0, 2.0, 3.0};
+    y->forward(y, u);
+
+    double expected[5] = {1.0, 4.0, 10.0, 12.0, 9.0};
+
+    mu_assert("convolve result should have d1=5", y->d1 == 5);
+    mu_assert("convolve result should have d2=1", y->d2 == 1);
+    mu_assert("convolve result should have size=5", y->size == 5);
+    mu_assert("Convolve forward pass test failed",
+              cmp_double_array(y->value, expected, 5));
+
+    free_expr(y);
+    return 0;
+}
+
+const char *test_convolve_forward_param(void)
+{
+    /* Same as test_convolve_forward, but the kernel is an updatable parameter
+     * (param_id = 0). Exercises the needs_parameter_refresh path in forward(). */
+    double kernel[3] = {1.0, 2.0, 3.0};
+    expr *kernel_param = new_parameter(3, 1, 0, 3, kernel);
+    expr *x = new_variable(3, 1, 0, 3);
+    expr *y = new_convolve(kernel_param, x);
+
+    double u[3] = {1.0, 2.0, 3.0};
+    y->forward(y, u);
+
+    double expected[5] = {1.0, 4.0, 10.0, 12.0, 9.0};
+
+    mu_assert("Convolve forward pass with param kernel failed",
+              cmp_double_array(y->value, expected, 5));
+
+    free_expr(y);
+    return 0;
+}
diff --git a/tests/jacobian_tests/affine/test_convolve.h b/tests/jacobian_tests/affine/test_convolve.h
new file mode 100644
index 0000000..c3aae98
--- /dev/null
+++ b/tests/jacobian_tests/affine/test_convolve.h
@@ -0,0 +1,68 @@
+#include <math.h>
+#include <stdio.h>
+
+#include "atoms/affine.h"
+#include "atoms/elementwise_restricted_dom.h"
+#include "expr.h"
+#include "minunit.h"
+#include "numerical_diff.h"
+#include "subexpr.h"
+#include "test_helpers.h"
+
+const char *test_jacobian_convolve(void)
+{
+    /* Test: Jacobian of y = conv([1, 2, 3], x) for x = Variable(3).
+     * J should equal T(kernel):
+     *   [[1, 0, 0],
+     *    [2, 1, 0],
+     *    [3, 2, 1],
+     *    [0, 3, 2],
+     *    [0, 0, 3]]
+     * stored in CSR with nnz = 9, shape 5 x 3. */
+    double kernel[3] = {1.0, 2.0, 3.0};
+    expr *kernel_param = new_parameter(3, 1, PARAM_FIXED, 3, kernel);
+    expr *x = new_variable(3, 1, 0, 3);
+    expr *y = new_convolve(kernel_param, x);
+
+    double u[3] = {1.0, 2.0, 3.0};
+    y->forward(y, u);
+    jacobian_init(y);
+    y->eval_jacobian(y);
+
+    mu_assert("Jacobian should have 5 rows", y->jacobian->m == 5);
+    mu_assert("Jacobian should have 3 columns", y->jacobian->n == 3);
+    mu_assert("Jacobian should have 9 nonzeros", y->jacobian->nnz == 9);
+
+    int expected_p[6] = {0, 1, 3, 6, 8, 9};
+    int expected_i[9] = {0, 0, 1, 0, 1, 2, 1, 2, 2};
+    double expected_x[9] = {1.0, 2.0, 1.0, 3.0, 2.0, 1.0, 3.0, 2.0, 3.0};
+
+    mu_assert("Convolve Jacobian row pointers incorrect",
+              cmp_int_array(y->jacobian->p, expected_p, 6));
+    mu_assert("Convolve Jacobian column indices incorrect",
+              cmp_int_array(y->jacobian->i, expected_i, 9));
+    mu_assert("Convolve Jacobian values incorrect",
+              cmp_double_array(y->jacobian->x, expected_x, 9));
+
+    free_expr(y);
+    return 0;
+}
+
+const char *test_jacobian_convolve_composite(void)
+{
+    /* y = conv([1, 2, 3], log(x)) — verify via numerical differentiation that
+     * the composite path (T(a) @ J_log) matches. */
+    double kernel[3] = {1.0, 2.0, 3.0};
+    double x_vals[3] = {1.0, 2.0, 3.0};
+
+    expr *kernel_param = new_parameter(3, 1, PARAM_FIXED, 3, kernel);
+    expr *x = new_variable(3, 1, 0, 3);
+    expr *log_x = new_log(x);
+    expr *y = new_convolve(kernel_param, log_x);
+
+    mu_assert("check_jacobian failed",
+              check_jacobian_num(y, x_vals, NUMERICAL_DIFF_DEFAULT_H));
+
+    free_expr(y);
+    return 0;
+}
diff --git a/tests/problem/test_param_broadcast.h b/tests/problem/test_param_broadcast.h
index 032e8eb..d9b43f1 100644
--- a/tests/problem/test_param_broadcast.h
+++ b/tests/problem/test_param_broadcast.h
@@ -341,4 +341,69 @@ const char *test_param_hstack_left_matmul(void)
     return 0;
 }
 
+const char *test_param_scalar_mult_convolve(void)
+{
+    int n = 3;
+
+    /* minimize sum(x) subject to conv(s * kernel, x), where kernel is a fixed
+       length-4 vector and s is an updatable scalar parameter. Scaling s
+       rescales the entire kernel, so Jacobian and constraint values scale
+       linearly with s. */
+    expr *x = new_variable(3, 1, 0, n);
+    expr *objective = new_sum(x, -1);
+
+    double kernel_vals[4] = {1.0, 2.0, 3.0, 4.0};
+    expr *kernel = new_parameter(4, 1, PARAM_FIXED, n, kernel_vals);
+    double s_vals[1] = {1.0};
+    expr *s = new_parameter(1, 1, 0, n, s_vals);
+    expr *scaled_kernel = new_scalar_mult(s, kernel);
+
+    expr *constraint = new_convolve(scaled_kernel, x);
+    expr *constraints[1] = {constraint};
+    problem *prob = new_problem(objective, constraints, 1, false);
+
+    expr *param_nodes[1] = {s};
+    problem_register_params(prob, param_nodes, 1);
+    problem_init_derivatives(prob);
+
+    /* point for evaluating and sparsity arrays (T is 6x3 with 12 nonzeros) */
+    double x_vals[3] = {1.0, 2.0, 3.0};
+    int Ap[7] = {0, 1, 3, 6, 9, 11, 12};
+    int Ai[12] = {0, 0, 1, 0, 1, 2, 0, 1, 2, 1, 2, 2};
+
+    /* test 1: s = 1, effective kernel = [1, 2, 3, 4] */
+    problem_constraint_forward(prob, x_vals);
+    problem_jacobian(prob);
+    double constrs[6] = {1.0, 4.0, 10.0, 16.0, 17.0, 12.0};
+    double Ax[12] = {1.0, 2.0, 1.0, 3.0, 2.0, 1.0,
+                     4.0, 3.0, 2.0, 4.0, 3.0, 4.0};
+    mu_assert("vals fail", cmp_double_array(prob->constraint_values, constrs, 6));
+    mu_assert("rows fail", cmp_int_array(prob->jacobian->p, Ap, 7));
+    mu_assert("cols fail", cmp_int_array(prob->jacobian->i, Ai, 12));
+    mu_assert("vals fail", cmp_double_array(prob->jacobian->x, Ax, 12));
+
+    mu_assert("check_jacobian failed",
+              check_jacobian_num(constraint, x_vals, NUMERICAL_DIFF_DEFAULT_H));
+
+    /* test 2: s = 10 via problem_update_params, effective kernel = [10,20,30,40] */
+    double theta[1] = {10.0};
+    problem_update_params(prob, theta);
+    problem_constraint_forward(prob, x_vals);
+    problem_jacobian(prob);
+    double updated_constrs[6] = {10.0, 40.0, 100.0, 160.0, 170.0, 120.0};
+    double updated_Ax[12] = {10.0, 20.0, 10.0, 30.0, 20.0, 10.0,
+                             40.0, 30.0, 20.0, 40.0, 30.0, 40.0};
+    mu_assert("vals fail",
+              cmp_double_array(prob->constraint_values, updated_constrs, 6));
+    mu_assert("rows fail", cmp_int_array(prob->jacobian->p, Ap, 7));
+    mu_assert("cols fail", cmp_int_array(prob->jacobian->i, Ai, 12));
+    mu_assert("vals fail", cmp_double_array(prob->jacobian->x, updated_Ax, 12));
+
+    mu_assert("check_jacobian failed",
+              check_jacobian_num(constraint, x_vals, NUMERICAL_DIFF_DEFAULT_H));
+
+    free_problem(prob);
+    return 0;
+}
+
 #endif /* TEST_PARAM_BROADCAST_H */
diff --git a/tests/problem/test_param_prob.h b/tests/problem/test_param_prob.h
index 40350d6..e3022a5 100644
--- a/tests/problem/test_param_prob.h
+++ b/tests/problem/test_param_prob.h
@@ -6,6 +6,7 @@
 #include <string.h>
 
 #include "atoms/affine.h"
+#include "atoms/elementwise_full_dom.h"
 #include "atoms/elementwise_restricted_dom.h"
 #include "expr.h"
 #include "minunit.h"
@@ -535,4 +536,95 @@ const char *test_param_shared_left_matmul_problem(void)
     return 0;
 }
 
+const char *test_param_convolve_problem(void)
+{
+    int n = 3;
+
+    /* minimize sum(x) subject to conv(a, exp(x)), with kernel a as parameter.
+       The exp nonlinearity gives a non-zero Hessian that depends on the kernel,
+       so both Jacobian and Hessian refresh paths get exercised on
+       problem_update_params. At x = [0, 0, 0] we have exp(x) = 1, so J = T
+       and H diagonal = T^T @ 1_5 = sum(a) per column. */
+    expr *x = new_variable(3, 1, 0, n);
+    expr *exp_x = new_exp(x);
+    expr *objective = new_sum(x, -1);
+    double theta[3] = {1.0, 2.0, 3.0};
+    expr *a_param = new_parameter(3, 1, 0, n, theta);
+    expr *constraint = new_convolve(a_param, exp_x);
+    expr *constraints[1] = {constraint};
+    problem *prob = new_problem(objective, constraints, 1, true);
+
+    /* register parameters and fill sparsity patterns */
+    expr *param_nodes[1] = {a_param};
+    problem_register_params(prob, param_nodes, 1);
+    problem_init_derivatives(prob);
+
+    /* point for evaluating and constant sparsity arrays */
+    double x_vals[3] = {0.0, 0.0, 0.0};
+    double w[5] = {1.0, 1.0, 1.0, 1.0, 1.0};
+    int Ap[6] = {0, 1, 3, 6, 8, 9};
+    int Ai[9] = {0, 0, 1, 0, 1, 2, 1, 2, 2};
+    int Hp[4] = {0, 1, 2, 3};
+    int Hi[3] = {0, 1, 2};
+
+    /* test 1: kernel a = [1, 2, 3] */
+    problem_constraint_forward(prob, x_vals);
+    problem_jacobian(prob);
+    problem_hessian(prob, 0.0, w);
+    double constrs[5] = {1.0, 3.0, 6.0, 5.0, 3.0};
+    double Ax[9] = {1.0, 2.0, 1.0, 3.0, 2.0, 1.0, 3.0, 2.0, 3.0};
+    double Hx[3] = {6.0, 6.0, 6.0};
+    mu_assert("vals fail", cmp_double_array(prob->constraint_values, constrs, 5));
+    mu_assert("rows fail", cmp_int_array(prob->jacobian->p, Ap, 6));
+    mu_assert("cols fail", cmp_int_array(prob->jacobian->i, Ai, 9));
+    mu_assert("vals fail", cmp_double_array(prob->jacobian->x, Ax, 9));
+    mu_assert("hess rows fail",
+              cmp_int_array(prob->lagrange_hessian->p, Hp, 4));
+    mu_assert("hess cols fail",
+              cmp_int_array(prob->lagrange_hessian->i, Hi, 3));
+    mu_assert("hess vals fail",
+              cmp_double_array(prob->lagrange_hessian->x, Hx, 3));
+
+    /* test 2: kernel a = [4, 5, 6] — refresh path must rebuild T(a) values
+       and propagate to both Jacobian and Hessian. */
+    theta[0] = 4.0;
+    theta[1] = 5.0;
+    theta[2] = 6.0;
+    problem_update_params(prob, theta);
+    problem_constraint_forward(prob, x_vals);
+    problem_jacobian(prob);
+    problem_hessian(prob, 0.0, w);
+    constrs[0] = 4.0;
+    constrs[1] = 9.0;
+    constrs[2] = 15.0;
+    constrs[3] = 11.0;
+    constrs[4] = 6.0;
+    Ax[0] = 4.0;
+    Ax[1] = 5.0;
+    Ax[2] = 4.0;
+    Ax[3] = 6.0;
+    Ax[4] = 5.0;
+    Ax[5] = 4.0;
+    Ax[6] = 6.0;
+    Ax[7] = 5.0;
+    Ax[8] = 6.0;
+    Hx[0] = 15.0;
+    Hx[1] = 15.0;
+    Hx[2] = 15.0;
+    mu_assert("vals fail", cmp_double_array(prob->constraint_values, constrs, 5));
+    mu_assert("rows fail", cmp_int_array(prob->jacobian->p, Ap, 6));
+    mu_assert("cols fail", cmp_int_array(prob->jacobian->i, Ai, 9));
+    mu_assert("vals fail", cmp_double_array(prob->jacobian->x, Ax, 9));
+    mu_assert("hess rows fail",
+              cmp_int_array(prob->lagrange_hessian->p, Hp, 4));
+    mu_assert("hess cols fail",
+              cmp_int_array(prob->lagrange_hessian->i, Hi, 3));
+    mu_assert("hess vals fail",
+              cmp_double_array(prob->lagrange_hessian->x, Hx, 3));
+
+    free_problem(prob);
+
+    return 0;
+}
+
 #endif /* TEST_PARAM_PROB_H */
diff --git a/tests/wsum_hess/affine/test_convolve.h b/tests/wsum_hess/affine/test_convolve.h
new file mode 100644
index 0000000..98dd7ab
--- /dev/null
+++ b/tests/wsum_hess/affine/test_convolve.h
@@ -0,0 +1,57 @@
+#include <stdio.h>
+
+#include "atoms/affine.h"
+#include "atoms/elementwise_restricted_dom.h"
+#include "expr.h"
+#include "minunit.h"
+#include "numerical_diff.h"
+#include "subexpr.h"
+#include "test_helpers.h"
+
+const char *test_wsum_hess_convolve(void)
+{
+    /* y = conv([1, 2, 3], x) is linear in x when the kernel is constant.
+     * Child x is a leaf variable (zero Hessian), so y's weighted Hessian
+     * should be zero for any weight vector w. */
+    double kernel[3] = {1.0, 2.0, 3.0};
+    expr *kernel_param = new_parameter(3, 1, PARAM_FIXED, 3, kernel);
+    expr *x = new_variable(3, 1, 0, 3);
+    expr *y = new_convolve(kernel_param, x);
+
+    double u[3] = {1.0, 2.0, 3.0};
+    double w[5] = {1.0, -1.0, 2.0, 3.0, -2.0};
+
+    y->forward(y, u);
+    jacobian_init(y);
+    wsum_hess_init(y);
+    y->eval_wsum_hess(y, w);
+
+    mu_assert("Convolve wsum_hess should be 3x3", y->wsum_hess->m == 3);
+    mu_assert("Convolve wsum_hess should be square", y->wsum_hess->n == 3);
+    mu_assert("Convolve wsum_hess should have zero nonzeros",
+              y->wsum_hess->nnz == 0);
+
+    free_expr(y);
+    return 0;
+}
+
+const char *test_wsum_hess_convolve_composite(void)
+{
+    /* y = conv([1, 2, 3], log(x)) — nonlinear in x, so backprop of weights
+     * through the Toeplitz should produce the same Hessian as a numerical
+     * second-derivative check. */
+    double kernel[3] = {1.0, 2.0, 3.0};
+    double x_vals[3] = {1.0, 2.0, 3.0};
+    double w[5] = {1.0, -1.0, 2.0, 3.0, -2.0};
+
+    expr *kernel_param = new_parameter(3, 1, PARAM_FIXED, 3, kernel);
+    expr *x = new_variable(3, 1, 0, 3);
+    expr *log_x = new_log(x);
+    expr *y = new_convolve(kernel_param, log_x);
+
+    mu_assert("check_wsum_hess failed",
+              check_wsum_hess(y, x_vals, w, NUMERICAL_DIFF_DEFAULT_H));
+
+    free_expr(y);
+    return 0;
+}

From d2cd16a704ca3996c2e32a0e79ad1138bbd1586f Mon Sep 17 00:00:00 2001
From: William Zijie Zhang <william@gridmatic.com>
Date: Tue, 21 Apr 2026 22:14:28 -0400
Subject: [PATCH 2/7] apply formatter

---
 include/subexpr.h                    |  8 ++++----
 src/atoms/affine/convolve.c          |  3 +--
 tests/problem/test_param_broadcast.h |  3 +--
 tests/problem/test_param_prob.h      | 18 ++++++------------
 4 files changed, 12 insertions(+), 20 deletions(-)

diff --git a/include/subexpr.h b/include/subexpr.h
index b225c84..46bbd57 100644
--- a/include/subexpr.h
+++ b/include/subexpr.h
@@ -156,10 +156,10 @@ typedef struct vector_mult_expr
 typedef struct convolve_expr
 {
     expr base;
-    expr *param_source;  /* length-m kernel */
-    int m;               /* kernel length */
-    int n;               /* input length */
-    Matrix *T;           /* (m+n-1) x n Toeplitz Sparse_Matrix */
+    expr *param_source; /* length-m kernel */
+    int m;              /* kernel length */
+    int n;              /* input length */
+    Matrix *T;          /* (m+n-1) x n Toeplitz Sparse_Matrix */
     CSC_Matrix *Jchild_CSC;
     CSC_Matrix *J_CSC;
     int *csc_to_csr_work;
diff --git a/src/atoms/affine/convolve.c b/src/atoms/affine/convolve.c
index e6466d8..133fd67 100644
--- a/src/atoms/affine/convolve.c
+++ b/src/atoms/affine/convolve.c
@@ -248,8 +248,7 @@ expr *new_convolve(expr *param_node, expr *child)
         exit(1);
     }
 
-    convolve_expr *cnode =
-        (convolve_expr *) SP_CALLOC(1, sizeof(convolve_expr));
+    convolve_expr *cnode = (convolve_expr *) SP_CALLOC(1, sizeof(convolve_expr));
     expr *node = &cnode->base;
     init_expr(node, out_d1, out_d2, child->n_vars, forward, jacobian_init_impl,
               eval_jacobian, is_affine, wsum_hess_init_impl, eval_wsum_hess,
diff --git a/tests/problem/test_param_broadcast.h b/tests/problem/test_param_broadcast.h
index d9b43f1..53e4456 100644
--- a/tests/problem/test_param_broadcast.h
+++ b/tests/problem/test_param_broadcast.h
@@ -375,8 +375,7 @@ const char *test_param_scalar_mult_convolve(void)
     problem_constraint_forward(prob, x_vals);
     problem_jacobian(prob);
     double constrs[6] = {1.0, 4.0, 10.0, 16.0, 17.0, 12.0};
-    double Ax[12] = {1.0, 2.0, 1.0, 3.0, 2.0, 1.0,
-                     4.0, 3.0, 2.0, 4.0, 3.0, 4.0};
+    double Ax[12] = {1.0, 2.0, 1.0, 3.0, 2.0, 1.0, 4.0, 3.0, 2.0, 4.0, 3.0, 4.0};
     mu_assert("vals fail", cmp_double_array(prob->constraint_values, constrs, 6));
     mu_assert("rows fail", cmp_int_array(prob->jacobian->p, Ap, 7));
     mu_assert("cols fail", cmp_int_array(prob->jacobian->i, Ai, 12));
diff --git a/tests/problem/test_param_prob.h b/tests/problem/test_param_prob.h
index e3022a5..02c1a67 100644
--- a/tests/problem/test_param_prob.h
+++ b/tests/problem/test_param_prob.h
@@ -578,12 +578,9 @@ const char *test_param_convolve_problem(void)
     mu_assert("rows fail", cmp_int_array(prob->jacobian->p, Ap, 6));
     mu_assert("cols fail", cmp_int_array(prob->jacobian->i, Ai, 9));
     mu_assert("vals fail", cmp_double_array(prob->jacobian->x, Ax, 9));
-    mu_assert("hess rows fail",
-              cmp_int_array(prob->lagrange_hessian->p, Hp, 4));
-    mu_assert("hess cols fail",
-              cmp_int_array(prob->lagrange_hessian->i, Hi, 3));
-    mu_assert("hess vals fail",
-              cmp_double_array(prob->lagrange_hessian->x, Hx, 3));
+    mu_assert("hess rows fail", cmp_int_array(prob->lagrange_hessian->p, Hp, 4));
+    mu_assert("hess cols fail", cmp_int_array(prob->lagrange_hessian->i, Hi, 3));
+    mu_assert("hess vals fail", cmp_double_array(prob->lagrange_hessian->x, Hx, 3));
 
     /* test 2: kernel a = [4, 5, 6] — refresh path must rebuild T(a) values
        and propagate to both Jacobian and Hessian. */
@@ -615,12 +612,9 @@ const char *test_param_convolve_problem(void)
     mu_assert("rows fail", cmp_int_array(prob->jacobian->p, Ap, 6));
     mu_assert("cols fail", cmp_int_array(prob->jacobian->i, Ai, 9));
     mu_assert("vals fail", cmp_double_array(prob->jacobian->x, Ax, 9));
-    mu_assert("hess rows fail",
-              cmp_int_array(prob->lagrange_hessian->p, Hp, 4));
-    mu_assert("hess cols fail",
-              cmp_int_array(prob->lagrange_hessian->i, Hi, 3));
-    mu_assert("hess vals fail",
-              cmp_double_array(prob->lagrange_hessian->x, Hx, 3));
+    mu_assert("hess rows fail", cmp_int_array(prob->lagrange_hessian->p, Hp, 4));
+    mu_assert("hess cols fail", cmp_int_array(prob->lagrange_hessian->i, Hi, 3));
+    mu_assert("hess vals fail", cmp_double_array(prob->lagrange_hessian->x, Hx, 3));
 
     free_problem(prob);
 

From b8dbf3d3fc0c31fbdec6b1ae89f88ba436fe506f Mon Sep 17 00:00:00 2001
From: William Zijie Zhang <william@gridmatic.com>
Date: Tue, 21 Apr 2026 23:03:55 -0400
Subject: [PATCH 3/7] adds test for row convolve in case it is 1d in python

---
 src/atoms/affine/convolve.c               | 36 ++++++++---------------
 tests/all_tests.c                         |  1 +
 tests/forward_pass/affine/test_convolve.h | 24 +++++++++++++++
 3 files changed, 37 insertions(+), 24 deletions(-)

diff --git a/src/atoms/affine/convolve.c b/src/atoms/affine/convolve.c
index 133fd67..51b80dc 100644
--- a/src/atoms/affine/convolve.c
+++ b/src/atoms/affine/convolve.c
@@ -59,8 +59,9 @@ static void build_toeplitz(CSR_Matrix *T_csr, const double *a, int m, int n)
     T_csr->p[out_rows] = nnz;
 }
 
-/* Overwrite T(a)'s CSR values (only) from the current kernel. Sparsity
-   pattern is unchanged, so we just walk the existing CSR entries. */
+/* Values-only refresh analogous to refresh_dense_left in left_matmul.c:
+   sparsity is kernel-independent so p/i stay intact; we just walk the
+   existing CSR entries and overwrite x from the current kernel. */
 static void refresh_toeplitz_values(convolve_expr *cnode)
 {
     CSR_Matrix *T_csr = ((Sparse_Matrix *) cnode->T)->csr;
@@ -172,13 +173,14 @@ static void eval_wsum_hess(expr *node, const double *w)
 
     /* w' = T(a)^T w. T(a)^T[j, k] = a[k-j] for j <= k < j+m, so
        w'[j] = sum_{i=0..m-1} a[i] * w[i + j]. */
-    memset(w_prime, 0, n * sizeof(double));
     for (int j = 0; j < n; j++)
     {
+        double sum = 0.0;
         for (int i = 0; i < m; i++)
         {
-            w_prime[j] += a[i] * w[i + j];
+            sum += a[i] * w[i + j];
         }
+        w_prime[j] = sum;
     }
 
     child->eval_wsum_hess(child, w_prime);
@@ -198,39 +200,25 @@ static void free_type_data(expr *node)
     free_csc_matrix(cnode->Jchild_CSC);
     free_csc_matrix(cnode->J_CSC);
     free(cnode->csc_to_csr_work);
-    if (cnode->param_source != NULL)
-    {
-        free_expr(cnode->param_source);
-    }
-    cnode->T = NULL;
-    cnode->Jchild_CSC = NULL;
-    cnode->J_CSC = NULL;
-    cnode->csc_to_csr_work = NULL;
-    cnode->param_source = NULL;
+    free_expr(cnode->param_source);
 }
 
 expr *new_convolve(expr *param_node, expr *child)
 {
-    if (param_node == NULL)
-    {
-        fprintf(stderr, "Error in new_convolve: param_node must not be NULL\n");
-        exit(1);
-    }
-
+    /* Accept both (n, 1) column and (1, n) row shapes — numpy-style 1D
+       arrays reach us as (1, n) through the Python bindings, and the math
+       is shape-agnostic (flat buffers of size m + n - 1). Output shape
+       matches the input orientation. */
     int m = param_node->size;
-    int n;
-    int out_d1, out_d2;
-
+    int n, out_d1, out_d2;
     if (child->d2 == 1)
     {
-        /* child is (n, 1) column vector */
         n = child->d1;
         out_d1 = m + n - 1;
         out_d2 = 1;
     }
     else if (child->d1 == 1)
     {
-        /* child is (1, n) row vector */
         n = child->d2;
         out_d1 = 1;
         out_d2 = m + n - 1;
diff --git a/tests/all_tests.c b/tests/all_tests.c
index 4af1dd4..76807b6 100644
--- a/tests/all_tests.c
+++ b/tests/all_tests.c
@@ -138,6 +138,7 @@ int main(void)
     mu_run_test(test_matmul, tests_run);
     mu_run_test(test_left_matmul_dense, tests_run);
     mu_run_test(test_convolve_forward, tests_run);
+    mu_run_test(test_convolve_forward_row, tests_run);
     mu_run_test(test_convolve_forward_param, tests_run);
     mu_run_test(test_diag_mat_forward, tests_run);
     mu_run_test(test_upper_tri_forward_4x4, tests_run);
diff --git a/tests/forward_pass/affine/test_convolve.h b/tests/forward_pass/affine/test_convolve.h
index 6a2cfd2..14d5988 100644
--- a/tests/forward_pass/affine/test_convolve.h
+++ b/tests/forward_pass/affine/test_convolve.h
@@ -31,6 +31,30 @@ const char *test_convolve_forward(void)
     return 0;
 }
 
+const char *test_convolve_forward_row(void)
+{
+    /* Same as test_convolve_forward but x is a (1, 3) row vector.
+     * Output orientation should match: (1, 5). */
+    double kernel[3] = {1.0, 2.0, 3.0};
+    expr *kernel_param = new_parameter(3, 1, PARAM_FIXED, 3, kernel);
+    expr *x = new_variable(1, 3, 0, 3);
+    expr *y = new_convolve(kernel_param, x);
+
+    double u[3] = {1.0, 2.0, 3.0};
+    y->forward(y, u);
+
+    double expected[5] = {1.0, 4.0, 10.0, 12.0, 9.0};
+
+    mu_assert("convolve row result should have d1=1", y->d1 == 1);
+    mu_assert("convolve row result should have d2=5", y->d2 == 5);
+    mu_assert("convolve row result should have size=5", y->size == 5);
+    mu_assert("Convolve forward pass (row vector) failed",
+              cmp_double_array(y->value, expected, 5));
+
+    free_expr(y);
+    return 0;
+}
+
 const char *test_convolve_forward_param(void)
 {
     /* Same as test_convolve_forward, but the kernel is an updatable parameter

From c3d3dfeaebb2ce20eea7a573d8482317912fac74 Mon Sep 17 00:00:00 2001
From: dance858 <danielcederberg1@gmail.com>
Date: Thu, 23 Apr 2026 09:20:35 -0700
Subject: [PATCH 4/7] improve convention of data pointer when we have
 parameters for left and right matmul

---
 include/atoms/affine.h               | 32 ++++++++++++++---------
 include/utils/dense_matrix.h         |  3 ++-
 src/atoms/affine/left_matmul.c       | 26 ++++++++++++++++---
 src/atoms/affine/right_matmul.c      | 30 +++++++++++++--------
 src/utils/dense_matrix.c             |  5 +++-
 tests/problem/test_param_broadcast.h |  8 ++----
 tests/problem/test_param_prob.h      | 39 +++++++++++++---------------
 7 files changed, 88 insertions(+), 55 deletions(-)

diff --git a/include/atoms/affine.h b/include/atoms/affine.h
index a66add6..9527d3c 100644
--- a/include/atoms/affine.h
+++ b/include/atoms/affine.h
@@ -42,19 +42,31 @@ expr *new_diag_mat(expr *child);
 expr *new_upper_tri(expr *child);
 expr *new_transpose(expr *child);
 
-/* Left matrix multiplication: A @ f(x) where A is a constant or parameter
- * sparse matrix. param_node is NULL for fixed constants. */
+/* Left matrix multiplication: A @ f(x) where A is a constant sparse matrix.
+   param_node is NULL for fixed constants. We currently do not support sparse
+   parameters, so param_node should always be null. */
 expr *new_left_matmul(expr *param_node, expr *u, const CSR_Matrix *A);
 
-/* Left matrix multiplication: A @ f(x) where A is a constant or parameter
- * dense matrix (row-major, m x n). Uses CBLAS for efficient computation. */
+/* Left matrix multiplication: A @ f(x) where A is a constant dense matrix
+   (in row-major, m x n, with values given by 'data') or a parameter
+   representing a dense m x n matrix in row-major.
+
+   The 'data' pointer only represents the values of the dense matrix if
+   param_node is null. If param_node is not null, data must be null. */
 expr *new_left_matmul_dense(expr *param_node, expr *u, int m, int n,
                             const double *data);
 
-/* Right matrix multiplication: f(x) @ A where A is a constant or parameter
- * matrix. */
+/* Right matrix multiplication: f(x) @ A where A is a constant sparse matrix.
+   We currently do not support sparse parameters, so param_node should always be
+   null. */
 expr *new_right_matmul(expr *param_node, expr *u, const CSR_Matrix *A);
 
+/* Right matrix multiplication: f(x) @ A where A is a constant dense matrix
+   (in row-major, m x n, with values given by 'data') or a parameter
+   representing a dense m x n matrix in row-major.
+
+   The 'data' pointer only represents the values of the dense matrix if
+   param_node is null. If param_node is not null, data must be null. */
 expr *new_right_matmul_dense(expr *param_node, expr *u, int m, int n,
                              const double *data);
 
@@ -65,12 +77,8 @@ expr *new_scalar_mult(expr *param_node, expr *child);
  * param_node */
 expr *new_vector_mult(expr *param_node, expr *child);
 
-/* 1D full convolution: y = conv(a, child) where a is the kernel held by
- * param_node (length m) and child is a length-n vector (shape (n, 1) or
- * (1, n)). Output is length (m + n - 1). param_node may be PARAM_FIXED
- * or an updatable parameter; the kernel values refresh automatically on
- * problem_update_params (same convention as new_vector_mult /
- * new_scalar_mult). */
+/* 1D full convolution: y = conv(param_node, child) where param_node is the
+   kernel and may either represent a constant or an updatable parameter */
 expr *new_convolve(expr *param_node, expr *child);
 
 #endif /* AFFINE_H */
diff --git a/include/utils/dense_matrix.h b/include/utils/dense_matrix.h
index ab3aea0..bd1693b 100644
--- a/include/utils/dense_matrix.h
+++ b/include/utils/dense_matrix.h
@@ -28,7 +28,8 @@ typedef struct Dense_Matrix
     double *work; /* scratch buffer, length n */
 } Dense_Matrix;
 
-/* Constructors */
+/* Constructors. If data is NULL, the value buffer is allocated but left
+   uninitialized; otherwise m*n entries are copied from data. */
 Matrix *new_dense_matrix(int m, int n, const double *data);
 
 /* Transpose helper */
diff --git a/src/atoms/affine/left_matmul.c b/src/atoms/affine/left_matmul.c
index 2928d23..d5d87a1 100644
--- a/src/atoms/affine/left_matmul.c
+++ b/src/atoms/affine/left_matmul.c
@@ -276,15 +276,35 @@ expr *new_left_matmul_dense(expr *param_node, expr *u, int m, int n,
     lnode->csc_to_csr_work = (int *) SP_MALLOC(node->size * sizeof(int));
     lnode->n_blocks = n_blocks;
 
-    lnode->A = new_dense_matrix(m, n, data);
-    lnode->AT = dense_matrix_trans((const Dense_Matrix *) lnode->A);
-
     /* parameter support */
     lnode->param_source = param_node;
     if (param_node != NULL)
     {
+        if (data != NULL)
+        {
+            fprintf(stderr, "Error in new_left_matmul_dense with ptr convention \n");
+            exit(1);
+        }
+
         expr_retain(param_node);
         lnode->refresh_param_values = refresh_dense_left;
+
+        /* A and AT buffers are filled by refresh_dense_left from the parameter. */
+        lnode->A = new_dense_matrix(m, n, NULL);
+        lnode->AT = new_dense_matrix(n, m, NULL);
+        node->needs_parameter_refresh = true;
+    }
+    /* constant matrix and param_source should be null */
+    else
+    {
+        if (data == NULL)
+        {
+            fprintf(stderr, "Error in new_left_matmul_dense with ptr convention \n");
+            exit(1);
+        }
+
+        lnode->A = new_dense_matrix(m, n, data);
+        lnode->AT = dense_matrix_trans((const Dense_Matrix *) lnode->A);
     }
 
     return node;
diff --git a/src/atoms/affine/right_matmul.c b/src/atoms/affine/right_matmul.c
index 56075dc..c3344f2 100644
--- a/src/atoms/affine/right_matmul.c
+++ b/src/atoms/affine/right_matmul.c
@@ -82,24 +82,32 @@ expr *new_right_matmul(expr *param_node, expr *u, const CSR_Matrix *A)
 expr *new_right_matmul_dense(expr *param_node, expr *u, int m, int n,
                              const double *data)
 {
-    /* We express: u @ A = (A^T @ u^T)^T. A is m x n, so A^T is n x m. */
-    double *AT = (double *) SP_MALLOC(n * m * sizeof(double));
-    A_transpose(AT, data, m, n);
+    if ((param_node != NULL && data != NULL) || (param_node == NULL && data == NULL))
+    {
+        fprintf(stderr, "Error in new_right_matmul_dense with ptr convention \n");
+        exit(1);
+    }
 
+    /* We express: u @ A = (A^T @ u^T)^T. A is m x n, so A^T is n x m. */
     expr *u_transpose = new_transpose(u);
-    expr *left_matmul_node = new_left_matmul_dense(NULL, u_transpose, n, m, AT);
+    expr *left_matmul_node;
 
-    /* If parameterized, attach param_source and custom refresh */
     if (param_node != NULL)
     {
+        left_matmul_node =
+            new_left_matmul_dense(param_node, u_transpose, n, m, NULL);
         left_matmul_expr *lnode = (left_matmul_expr *) left_matmul_node;
-        lnode->param_source = param_node;
-        expr_retain(param_node);
+
+        /* swap the refresh function */
         lnode->refresh_param_values = refresh_dense_right;
     }
+    else
+    {
+        double *AT = (double *) SP_MALLOC(n * m * sizeof(double));
+        A_transpose(AT, data, m, n);
+        left_matmul_node = new_left_matmul_dense(NULL, u_transpose, n, m, AT);
+        free(AT);
+    }
 
-    expr *node = new_transpose(left_matmul_node);
-
-    free(AT);
-    return node;
+    return new_transpose(left_matmul_node);
 }
diff --git a/src/utils/dense_matrix.c b/src/utils/dense_matrix.c
index a521f15..5c628c2 100644
--- a/src/utils/dense_matrix.c
+++ b/src/utils/dense_matrix.c
@@ -67,7 +67,10 @@ Matrix *new_dense_matrix(int m, int n, const double *data)
     dm->base.update_values = dense_update_values;
     dm->base.free_fn = dense_free;
     dm->x = (double *) SP_MALLOC(m * n * sizeof(double));
-    memcpy(dm->x, data, m * n * sizeof(double));
+    if (data != NULL)
+    {
+        memcpy(dm->x, data, m * n * sizeof(double));
+    }
     dm->work = (double *) SP_MALLOC(n * sizeof(double));
     return &dm->base;
 }
diff --git a/tests/problem/test_param_broadcast.h b/tests/problem/test_param_broadcast.h
index 53e4456..377121a 100644
--- a/tests/problem/test_param_broadcast.h
+++ b/tests/problem/test_param_broadcast.h
@@ -269,9 +269,7 @@ const char *test_const_hstack_left_matmul(void)
     expr *p2 = new_parameter(1, n, PARAM_FIXED, 2 * n, p2_vals);
     expr *param_nodes[2] = {p1, p2};
     expr *p_hstack = new_hstack(param_nodes, 2, 2 * n);
-    /* pass concatenated parameter vectors */
-    double A_data[8] = {1.0, 2.0, 3.0, 0.0, 4.0, 0.0, 5.0, 6.0};
-    expr *objective = new_left_matmul_dense(p_hstack, x, 1, 2 * n, A_data);
+    expr *objective = new_left_matmul_dense(p_hstack, x, 1, 2 * n, NULL);
     problem *prob = new_problem(objective, NULL, 0, false);
 
     problem_init_derivatives(prob);
@@ -304,9 +302,7 @@ const char *test_param_hstack_left_matmul(void)
     expr *p2 = new_parameter(1, n, n, 2 * n, p2_vals);
     expr *param_nodes[2] = {p1, p2};
     expr *p_hstack = new_hstack(param_nodes, 2, 2 * n);
-    /* pass concatenated parameter vectors */
-    double A_data[8] = {1.0, 2.0, 3.0, 0.0, 4.0, 0.0, 5.0, 6.0};
-    expr *objective = new_left_matmul_dense(p_hstack, x, 1, 2 * n, A_data);
+    expr *objective = new_left_matmul_dense(p_hstack, x, 1, 2 * n, NULL);
     problem *prob = new_problem(objective, NULL, 0, false);
 
     problem_register_params(prob, param_nodes, 2);
diff --git a/tests/problem/test_param_prob.h b/tests/problem/test_param_prob.h
index 02c1a67..a638b31 100644
--- a/tests/problem/test_param_prob.h
+++ b/tests/problem/test_param_prob.h
@@ -167,12 +167,11 @@ const char *test_param_left_matmul_problem(void)
     /* minimize sum(x) subject to Ax = ?, with A parameter */
     expr *x = new_variable(2, 1, 0, n);
     expr *objective = new_sum(x, -1);
-    double theta[4] = {1.0, 0.0, 0.0, 1.0};
+    /* column-major of A = [[2,4],[0,1]] */
+    double theta[4] = {2.0, 0.0, 4.0, 1.0};
     expr *A_param = new_parameter(2, 2, 0, n, theta);
 
-    /* dense 2x2 matrix */
-    double Ax[4] = {2.0, 0.0, 0.0, 1.0};
-    expr *constraint = new_left_matmul_dense(A_param, x, 2, 2, Ax);
+    expr *constraint = new_left_matmul_dense(A_param, x, 2, 2, NULL);
     expr *constraints[1] = {constraint};
     problem *prob = new_problem(objective, constraints, 1, false);
 
@@ -188,7 +187,8 @@ const char *test_param_left_matmul_problem(void)
 
     /* test 1: initial jacobian */
     problem_constraint_forward(prob, x_vals);
-    double constrs[2] = {2.0, 2.0};
+    double constrs[2] = {10.0, 2.0};
+    double Ax[4] = {2.0, 4.0, 0.0, 1.0};
     problem_jacobian(prob);
     mu_assert("vals fail", cmp_double_array(prob->constraint_values, constrs, 2));
     mu_assert("vals fail", cmp_double_array(prob->jacobian->x, Ax, 4));
@@ -227,12 +227,11 @@ const char *test_param_right_matmul_problem(void)
     /* minimize sum(x) subject to xA = ?, with A parameter */
     expr *x = new_variable(1, 2, 0, n);
     expr *objective = new_sum(x, -1);
-    double theta[4] = {1.0, 0.0, 0.0, 1.0};
+    /* column-major of A = [[2,0],[0,1]] */
+    double theta[4] = {2.0, 0.0, 0.0, 1.0};
     expr *A_param = new_parameter(2, 2, 0, n, theta);
 
-    /* dense 2x2 matrix */
-    double Ax[4] = {2.0, 0.0, 0.0, 1.0};
-    expr *constraint = new_right_matmul_dense(A_param, x, 2, 2, Ax);
+    expr *constraint = new_right_matmul_dense(A_param, x, 2, 2, NULL);
     expr *constraints[1] = {constraint};
     problem *prob = new_problem(objective, constraints, 1, false);
 
@@ -249,6 +248,7 @@ const char *test_param_right_matmul_problem(void)
     /* test 1: initial jacobian */
     problem_constraint_forward(prob, x_vals);
     double constrs[2] = {2.0, 2.0};
+    double Ax[4] = {2.0, 0.0, 0.0, 1.0};
     problem_jacobian(prob);
     mu_assert("vals fail", cmp_double_array(prob->constraint_values, constrs, 2));
     mu_assert("vals fail", cmp_double_array(prob->jacobian->x, Ax, 4));
@@ -343,12 +343,11 @@ const char *test_param_left_matmul_rectangular(void)
     /* minimize sum(x) subject to Ax = ?, with A parameter (3x2) */
     expr *x = new_variable(2, 1, 0, n);
     expr *objective = new_sum(x, -1);
-    double theta[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+    /* column-major of A = [[1,2],[3,4],[5,6]] */
+    double theta[6] = {1.0, 3.0, 5.0, 2.0, 4.0, 6.0};
     expr *A_param = new_parameter(3, 2, 0, n, theta);
 
-    /* dense 3x2 matrix */
-    double Ax[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
-    expr *constraint = new_left_matmul_dense(A_param, x, 3, 2, Ax);
+    expr *constraint = new_left_matmul_dense(A_param, x, 3, 2, NULL);
     expr *constraints[1] = {constraint};
     problem *prob = new_problem(objective, constraints, 1, false);
 
@@ -365,6 +364,7 @@ const char *test_param_left_matmul_rectangular(void)
     /* test 1: initial jacobian */
     problem_constraint_forward(prob, x_vals);
     double constrs[3] = {5.0, 11.0, 17.0};
+    double Ax[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
     problem_jacobian(prob);
     mu_assert("vals fail", cmp_double_array(prob->constraint_values, constrs, 3));
     mu_assert("vals fail", cmp_double_array(prob->jacobian->x, Ax, 6));
@@ -408,12 +408,11 @@ const char *test_param_right_matmul_rectangular(void)
     /* minimize sum(x) subject to xA = ?, with A parameter (2x3) */
     expr *x = new_variable(1, 2, 0, n);
     expr *objective = new_sum(x, -1);
-    double theta[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+    /* column-major of A = [[1,2,3],[4,5,6]] */
+    double theta[6] = {1.0, 4.0, 2.0, 5.0, 3.0, 6.0};
     expr *A_param = new_parameter(2, 3, 0, n, theta);
 
-    /* dense 2x3 matrix */
-    double Ax[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
-    expr *constraint = new_right_matmul_dense(A_param, x, 2, 3, Ax);
+    expr *constraint = new_right_matmul_dense(A_param, x, 2, 3, NULL);
     expr *constraints[1] = {constraint};
     problem *prob = new_problem(objective, constraints, 1, false);
 
@@ -478,11 +477,9 @@ const char *test_param_shared_left_matmul_problem(void)
     double theta[4] = {1.0, 0.0, 0.0, 1.0};
     expr *A_param = new_parameter(2, 2, 0, n, theta);
 
-    /* dense 2x2 identity */
-    double Ax[4] = {1.0, 0.0, 0.0, 1.0};
     expr *constraints[2];
-    constraints[0] = new_left_matmul_dense(A_param, x, 2, 2, Ax);
-    constraints[1] = new_left_matmul_dense(A_param, y, 2, 2, Ax);
+    constraints[0] = new_left_matmul_dense(A_param, x, 2, 2, NULL);
+    constraints[1] = new_left_matmul_dense(A_param, y, 2, 2, NULL);
     problem *prob = new_problem(objective, constraints, 2, false);
 
     /* register parameters and fill sparsity patterns */

From 82adda44534e3fa23b4c87fe2be526e8f117e116 Mon Sep 17 00:00:00 2001
From: dance858 <danielcederberg1@gmail.com>
Date: Thu, 23 Apr 2026 09:48:47 -0700
Subject: [PATCH 5/7] get rid of sparse matrix abstraction and just do CSR
 instead

---
 include/subexpr.h              |  2 +-
 include/utils/mini_numpy.h     | 11 +++++
 src/atoms/affine/convolve.c    | 76 +++++++---------------------------
 src/atoms/affine/left_matmul.c |  2 +-
 src/utils/mini_numpy.c         | 28 +++++++++++++
 5 files changed, 57 insertions(+), 62 deletions(-)

diff --git a/include/subexpr.h b/include/subexpr.h
index 46bbd57..b129f2e 100644
--- a/include/subexpr.h
+++ b/include/subexpr.h
@@ -159,7 +159,7 @@ typedef struct convolve_expr
     expr *param_source; /* length-m kernel */
     int m;              /* kernel length */
     int n;              /* input length */
-    Matrix *T;          /* (m+n-1) x n Toeplitz Sparse_Matrix */
+    CSR_Matrix *T;      /* (m+n-1) x n convolution matrix */
     CSC_Matrix *Jchild_CSC;
     CSC_Matrix *J_CSC;
     int *csc_to_csr_work;
diff --git a/include/utils/mini_numpy.h b/include/utils/mini_numpy.h
index 1011de3..c87f80a 100644
--- a/include/utils/mini_numpy.h
+++ b/include/utils/mini_numpy.h
@@ -18,6 +18,8 @@
 #ifndef MINI_NUMPY_H
 #define MINI_NUMPY_H
 
+#include "utils/CSR_Matrix.h"
+
 /* Example: a = [1, 2], len = 2, repeats = 3, result = [1, 1, 1, 2, 2, 2] */
 void repeat(double *result, const double *a, int len, int repeats);
 
@@ -41,4 +43,13 @@ void Y_kron_I_vec(int m, int k, int n, const double *Y, const double *w, double
    compute v = vec(X^T @ W). */
 void I_kron_XT_vec(int m, int k, int n, const double *X, const double *w, double *v);
 
+/* Fill T_csr's row pointers and column indices for the 1D full-convolution
+   Toeplitz matrix T(a), sized (m+n-1) x n with m*n nonzeros. Values (x) are
+   not written; call conv_matrix_fill_values to populate them. */
+void conv_matrix_fill_sparsity(CSR_Matrix *T_csr, int m, int n);
+
+/* Overwrite T_csr->x from kernel a, using the sparsity already written by
+   conv_matrix_fill_sparsity. T[r, col] = a[r - col]. */
+void conv_matrix_fill_values(CSR_Matrix *T_csr, const double *a);
+
 #endif /* MINI_NUMPY_H */
diff --git a/src/atoms/affine/convolve.c b/src/atoms/affine/convolve.c
index 51b80dc..ab0fbaf 100644
--- a/src/atoms/affine/convolve.c
+++ b/src/atoms/affine/convolve.c
@@ -18,7 +18,8 @@
 #include "atoms/affine.h"
 #include "subexpr.h"
 #include "utils/CSR_Matrix.h"
-#include "utils/matrix.h"
+#include "utils/linalg_sparse_matmuls.h"
+#include "utils/mini_numpy.h"
 #include "utils/tracked_alloc.h"
 #include <stdio.h>
 #include <stdlib.h>
@@ -37,45 +38,6 @@
  * from the new kernel before running the convolution. The sparsity of
  * T(a) is kernel-independent (staircase), so only values are rewritten. */
 
-/* Fill T(a)'s CSR row pointers, column indices, and values from kernel a.
-   Row r holds T[r, col] = a[r - col] for col in
-   [max(0, r-m+1), min(n-1, r)]. */
-static void build_toeplitz(CSR_Matrix *T_csr, const double *a, int m, int n)
-{
-    int out_rows = m + n - 1;
-    int nnz = 0;
-    for (int r = 0; r < out_rows; r++)
-    {
-        T_csr->p[r] = nnz;
-        int col_lo = r - m + 1 > 0 ? r - m + 1 : 0;
-        int col_hi = r < n - 1 ? r : n - 1;
-        for (int col = col_lo; col <= col_hi; col++)
-        {
-            T_csr->i[nnz] = col;
-            T_csr->x[nnz] = a[r - col];
-            nnz++;
-        }
-    }
-    T_csr->p[out_rows] = nnz;
-}
-
-/* Values-only refresh analogous to refresh_dense_left in left_matmul.c:
-   sparsity is kernel-independent so p/i stay intact; we just walk the
-   existing CSR entries and overwrite x from the current kernel. */
-static void refresh_toeplitz_values(convolve_expr *cnode)
-{
-    CSR_Matrix *T_csr = ((Sparse_Matrix *) cnode->T)->csr;
-    const double *a = cnode->param_source->value;
-    int out_rows = cnode->m + cnode->n - 1;
-    for (int r = 0; r < out_rows; r++)
-    {
-        for (int k = T_csr->p[r]; k < T_csr->p[r + 1]; k++)
-        {
-            T_csr->x[k] = a[r - T_csr->i[k]];
-        }
-    }
-}
-
 static void forward(expr *node, const double *u)
 {
     expr *child = node->left;
@@ -84,12 +46,12 @@ static void forward(expr *node, const double *u)
     if (cnode->base.needs_parameter_refresh)
     {
         cnode->param_source->forward(cnode->param_source, NULL);
-        /* T is NULL until jacobian_init_impl runs; forward can be called
-           first in standalone tests, in which case T is built later from
-           the already-refreshed param values. */
+        /* refresh the convolution matrix values if it exists (necessary to check
+           for null in case someone calls forward before initializing the jacobian,
+           which might happen when we expose Python bindings to SparseDiffEngine) */
         if (cnode->T != NULL)
         {
-            refresh_toeplitz_values(cnode);
+            conv_matrix_fill_values(cnode->T, cnode->param_source->value);
         }
         cnode->base.needs_parameter_refresh = false;
     }
@@ -99,13 +61,11 @@ static void forward(expr *node, const double *u)
     const double *a = cnode->param_source->value;
     const double *x = child->value;
     double *y = node->value;
-    int m = cnode->m;
-    int n = cnode->n;
 
     memset(y, 0, node->size * sizeof(double));
-    for (int j = 0; j < n; j++)
+    for (int j = 0; j < cnode->n; j++)
     {
-        for (int i = 0; i < m; i++)
+        for (int i = 0; i < cnode->m; i++)
         {
             y[i + j] += a[i] * x[j];
         }
@@ -122,19 +82,15 @@ static void jacobian_init_impl(expr *node)
 
     jacobian_init(child);
 
-    /* Build T(a) as CSR: (m+n-1) x n, m*n nonzeros, staircase pattern. */
-    int out_rows = m + n - 1;
-    CSR_Matrix *T_csr = new_csr_matrix(out_rows, n, m * n);
-    build_toeplitz(T_csr, a, m, n);
-
-    cnode->T = new_sparse_matrix(T_csr);
-    free_csr_matrix(T_csr);
+    /* Build convolution matrix of size (m+n-1) x n with m*n nonzeros */
+    cnode->T = new_csr_matrix(m + n - 1, n, m * n);
+    conv_matrix_fill_sparsity(cnode->T, m, n);
+    conv_matrix_fill_values(cnode->T, a);
 
-    /* Reuse left_matmul's sparsity precompute: J_node = T @ J_child via
-       block_left_mult with n_blocks = 1. */
+    /* J_node = T @ J_child via block_left_mult with n_blocks = 1. */
     cnode->Jchild_CSC = csr_to_csc_alloc(child->jacobian, node->work->iwork);
     cnode->J_CSC =
-        cnode->T->block_left_mult_sparsity(cnode->T, cnode->Jchild_CSC, 1);
+        block_left_multiply_fill_sparsity(cnode->T, cnode->Jchild_CSC, 1);
     node->jacobian = csc_to_csr_alloc(cnode->J_CSC, cnode->csc_to_csr_work);
 }
 
@@ -148,7 +104,7 @@ static void eval_jacobian(expr *node)
     /* T values have been refreshed by forward() if the kernel changed; here
        we just need to refresh child's Jacobian and rerun the block matmul. */
     csr_to_csc_fill_values(child->jacobian, cnode->Jchild_CSC, node->work->iwork);
-    cnode->T->block_left_mult_values(cnode->T, cnode->Jchild_CSC, cnode->J_CSC);
+    block_left_multiply_fill_values(cnode->T, cnode->Jchild_CSC, cnode->J_CSC);
     csc_to_csr_fill_values(cnode->J_CSC, node->jacobian, cnode->csc_to_csr_work);
 }
 
@@ -196,7 +152,7 @@ static bool is_affine(const expr *node)
 static void free_type_data(expr *node)
 {
     convolve_expr *cnode = (convolve_expr *) node;
-    free_matrix(cnode->T);
+    free_csr_matrix(cnode->T);
     free_csc_matrix(cnode->Jchild_CSC);
     free_csc_matrix(cnode->J_CSC);
     free(cnode->csc_to_csr_work);
diff --git a/src/atoms/affine/left_matmul.c b/src/atoms/affine/left_matmul.c
index d5d87a1..c083385 100644
--- a/src/atoms/affine/left_matmul.c
+++ b/src/atoms/affine/left_matmul.c
@@ -294,7 +294,7 @@ expr *new_left_matmul_dense(expr *param_node, expr *u, int m, int n,
         lnode->AT = new_dense_matrix(n, m, NULL);
         node->needs_parameter_refresh = true;
     }
-    /* constant matrix and param_source should be null */
+    /* constant matrix case */
     else
     {
         if (data == NULL)
diff --git a/src/utils/mini_numpy.c b/src/utils/mini_numpy.c
index aa5f862..baa00c5 100644
--- a/src/utils/mini_numpy.c
+++ b/src/utils/mini_numpy.c
@@ -100,3 +100,31 @@ void I_kron_XT_vec(int m, int k, int n, const double *X, const double *w, double
         }
     }
 }
+
+void conv_matrix_fill_sparsity(CSR_Matrix *T_csr, int m, int n)
+{
+    int nnz = 0;
+    for (int r = 0; r < T_csr->m; r++)
+    {
+        T_csr->p[r] = nnz;
+        int col_lo = r - m + 1 > 0 ? r - m + 1 : 0;
+        int col_hi = r < n - 1 ? r : n - 1;
+        for (int col = col_lo; col <= col_hi; col++)
+        {
+            T_csr->i[nnz] = col;
+            nnz++;
+        }
+    }
+    T_csr->p[T_csr->m] = nnz;
+}
+
+void conv_matrix_fill_values(CSR_Matrix *T_csr, const double *a)
+{
+    for (int r = 0; r < T_csr->m; r++)
+    {
+        for (int k = T_csr->p[r]; k < T_csr->p[r + 1]; k++)
+        {
+            T_csr->x[k] = a[r - T_csr->i[k]];
+        }
+    }
+}

From 945ae93868266e6dae839847412bef904717273a Mon Sep 17 00:00:00 2001
From: dance858 <danielcederberg1@gmail.com>
Date: Thu, 23 Apr 2026 09:54:18 -0700
Subject: [PATCH 6/7] get rid of block abstraction and use non-block logic
 instead

---
 include/subexpr.h           |  2 --
 src/atoms/affine/convolve.c | 15 ++++-----------
 2 files changed, 4 insertions(+), 13 deletions(-)

diff --git a/include/subexpr.h b/include/subexpr.h
index b129f2e..f97feef 100644
--- a/include/subexpr.h
+++ b/include/subexpr.h
@@ -161,8 +161,6 @@ typedef struct convolve_expr
     int n;              /* input length */
     CSR_Matrix *T;      /* (m+n-1) x n convolution matrix */
     CSC_Matrix *Jchild_CSC;
-    CSC_Matrix *J_CSC;
-    int *csc_to_csr_work;
 } convolve_expr;
 
 /* Bivariate matrix multiplication: Z = f(u) @ g(u) where both children
diff --git a/src/atoms/affine/convolve.c b/src/atoms/affine/convolve.c
index ab0fbaf..43877aa 100644
--- a/src/atoms/affine/convolve.c
+++ b/src/atoms/affine/convolve.c
@@ -87,11 +87,9 @@ static void jacobian_init_impl(expr *node)
     conv_matrix_fill_sparsity(cnode->T, m, n);
     conv_matrix_fill_values(cnode->T, a);
 
-    /* J_node = T @ J_child via block_left_mult with n_blocks = 1. */
+    /* J = T @ J_child */
     cnode->Jchild_CSC = csr_to_csc_alloc(child->jacobian, node->work->iwork);
-    cnode->J_CSC =
-        block_left_multiply_fill_sparsity(cnode->T, cnode->Jchild_CSC, 1);
-    node->jacobian = csc_to_csr_alloc(cnode->J_CSC, cnode->csc_to_csr_work);
+    node->jacobian = csr_csc_matmul_alloc(cnode->T, cnode->Jchild_CSC);
 }
 
 static void eval_jacobian(expr *node)
@@ -101,11 +99,9 @@ static void eval_jacobian(expr *node)
 
     child->eval_jacobian(child);
 
-    /* T values have been refreshed by forward() if the kernel changed; here
-       we just need to refresh child's Jacobian and rerun the block matmul. */
+    /* J = T @ J_child */
     csr_to_csc_fill_values(child->jacobian, cnode->Jchild_CSC, node->work->iwork);
-    block_left_multiply_fill_values(cnode->T, cnode->Jchild_CSC, cnode->J_CSC);
-    csc_to_csr_fill_values(cnode->J_CSC, node->jacobian, cnode->csc_to_csr_work);
+    csr_csc_matmul_fill_values(cnode->T, cnode->Jchild_CSC, node->jacobian);
 }
 
 static void wsum_hess_init_impl(expr *node)
@@ -154,8 +150,6 @@ static void free_type_data(expr *node)
     convolve_expr *cnode = (convolve_expr *) node;
     free_csr_matrix(cnode->T);
     free_csc_matrix(cnode->Jchild_CSC);
-    free_csc_matrix(cnode->J_CSC);
-    free(cnode->csc_to_csr_work);
     free_expr(cnode->param_source);
 }
 
@@ -207,7 +201,6 @@ expr *new_convolve(expr *param_node, expr *child)
 
     /* iwork is used for csr_to_csc of child's Jacobian (size n_vars). */
     node->work->iwork = (int *) SP_MALLOC(node->n_vars * sizeof(int));
-    cnode->csc_to_csr_work = (int *) SP_MALLOC(node->size * sizeof(int));
 
     /* Ensure first forward() pulls current param values through any
        broadcast/promote wrappers and reflects them in T (once T is built). */

From 806d0947d170c044b1cc2e4bbd84fcf709ef6806 Mon Sep 17 00:00:00 2001
From: dance858 <danielcederberg1@gmail.com>
Date: Thu, 23 Apr 2026 10:02:23 -0700
Subject: [PATCH 7/7] happy with convolve.c

---
 src/atoms/affine/convolve.c | 38 +++++++++++--------------------------
 1 file changed, 11 insertions(+), 27 deletions(-)

diff --git a/src/atoms/affine/convolve.c b/src/atoms/affine/convolve.c
index 43877aa..55e5857 100644
--- a/src/atoms/affine/convolve.c
+++ b/src/atoms/affine/convolve.c
@@ -28,15 +28,8 @@
 /* 1D full convolution: y = conv(a, child), where a is a length-m kernel
  * held by param_source and child is a length-n vector. Output length is
  * m + n - 1. In matrix form, y = T(a) @ child, where T(a) is the
- * (m+n-1) x n Toeplitz matrix with T(a)[k, j] = a[k-j] when 0 <= k-j < m.
- *
- * Forward and Hessian backprop (w' = T(a)^T w) are direct loops, avoiding
- * CSR traversal and AT materialization. Jacobian values go through the
- * existing block-left-mult path so composite children work correctly.
- *
- * When param_source is updatable, forward() refreshes T(a)'s CSR values
- * from the new kernel before running the convolution. The sparsity of
- * T(a) is kernel-independent (staircase), so only values are rewritten. */
+ * (m+n-1) x n Toeplitz/convolution matrix with T(a)[k, j] = a[k-j] when
+ * 0 <= k-j < m. */
 
 static void forward(expr *node, const double *u)
 {
@@ -118,17 +111,14 @@ static void eval_wsum_hess(expr *node, const double *w)
 {
     expr *child = node->left;
     convolve_expr *cnode = (convolve_expr *) node;
-    int m = cnode->m;
-    int n = cnode->n;
     const double *a = cnode->param_source->value;
     double *w_prime = node->work->dwork;
 
-    /* w' = T(a)^T w. T(a)^T[j, k] = a[k-j] for j <= k < j+m, so
-       w'[j] = sum_{i=0..m-1} a[i] * w[i + j]. */
-    for (int j = 0; j < n; j++)
+    /* w' = conv_matrix.T w, so w'[j] = sum_{i=0..m-1} a[i] * w[i + j]. */
+    for (int j = 0; j < cnode->n; j++)
     {
         double sum = 0.0;
-        for (int i = 0; i < m; i++)
+        for (int i = 0; i < cnode->m; i++)
         {
             sum += a[i] * w[i + j];
         }
@@ -160,18 +150,18 @@ expr *new_convolve(expr *param_node, expr *child)
        is shape-agnostic (flat buffers of size m + n - 1). Output shape
        matches the input orientation. */
     int m = param_node->size;
-    int n, out_d1, out_d2;
+    int n, d1, d2;
     if (child->d2 == 1)
     {
         n = child->d1;
-        out_d1 = m + n - 1;
-        out_d2 = 1;
+        d1 = m + n - 1;
+        d2 = 1;
     }
     else if (child->d1 == 1)
     {
         n = child->d2;
-        out_d1 = 1;
-        out_d2 = m + n - 1;
+        d1 = 1;
+        d2 = m + n - 1;
     }
     else
     {
@@ -180,15 +170,9 @@ expr *new_convolve(expr *param_node, expr *child)
         exit(1);
     }
 
-    if (m < 1 || n < 1)
-    {
-        fprintf(stderr, "Error in new_convolve: m and n must be >= 1\n");
-        exit(1);
-    }
-
     convolve_expr *cnode = (convolve_expr *) SP_CALLOC(1, sizeof(convolve_expr));
     expr *node = &cnode->base;
-    init_expr(node, out_d1, out_d2, child->n_vars, forward, jacobian_init_impl,
+    init_expr(node, d1, d2, child->n_vars, forward, jacobian_init_impl,
               eval_jacobian, is_affine, wsum_hess_init_impl, eval_wsum_hess,
               free_type_data);
     node->left = child;