## Problem data
n <- 30
M <- 50
K <- 200
set.seed(n * M * K)
X <- matrix(stats::rnorm(K * n), nrow = K, ncol = n)
beta_true <- matrix(stats::rnorm(n), nrow = n, ncol = 1)
y <- X %*% beta_true + 0.3 * sqrt(n) * stats::rnorm(K)
## Order variables based on y
idx <- order(y, decreasing = FALSE)
y_ordered <- y[idx]
X_ordered <- X[idx, ]
## Find cutoff and censor
D <- (y_ordered[M] + y_ordered[M + 1]) / 2
censored <- (y_ordered > D)
y_censored <- pmin(y_ordered, D)Censored Regression
Introduction
Data collected from an experimental study is sometimes censored, so that only partial information is known about a subset of observations. For instance, when measuring the lifespan of mice, we may find a number of subjects live beyond the duration of the project. Thus, all we know is the lower bound on their lifespan. This right censoring can be incorporated into a regression model via convex optimization.
Suppose that only
This avoids the bias introduced by standard OLS, while still utilizing all of the data points in the regression. The constraint requires only one more line in CVXR.
Example
We will generate synthetic data for this example, censoring observations beyond a value
We now fit regular OLS, OLS using just the censored data and finally the censored regression.
## Regular OLS
beta <- Variable(n)
obj <- sum((y_censored - X_ordered %*% beta)^2)
prob <- Problem(Minimize(obj))
result <- psolve(prob)
check_solver_status(prob)
beta_ols <- value(beta)
## OLS using uncensored data
obj <- sum((y_censored[1:M] - X_ordered[1:M,] %*% beta)^2)
prob <- Problem(Minimize(obj))
result <- psolve(prob)
check_solver_status(prob)
beta_unc <- value(beta)
## Censored regression
obj <- sum((y_censored[1:M] - X_ordered[1:M,] %*% beta)^2)
constr <- list(X_ordered[(M+1):K,] %*% beta >= D)
prob <- Problem(Minimize(obj), constr)
result <- psolve(prob)
check_solver_status(prob)
beta_cens <- value(beta)Here’s are some plots comparing the results. The blue diamond points are estimates.
plot_results <- function(beta_res, title) {
d <- data.frame(index = seq_len(K),
y_ordered = y_ordered,
y_fit = as.numeric(X_ordered %*% beta_res),
censored = as.factor(censored))
ggplot(data = d) +
geom_point(mapping = aes(x = index, y = y_ordered, color = censored)) +
geom_point(mapping = aes(x = index, y = y_fit), color = "blue", shape = 23) +
geom_abline(intercept = D, slope = 0, lty = "dashed") +
labs(x = "Observations", y = "y") +
ggtitle(title)
}plot_results(beta_ols, "Regular OLS.")
plot_results(beta_unc, "OLS using uncensored data.")
plot_results(beta_cens, "Censored Regression.")
Session Info
R version 4.5.2 (2025-10-31)
Platform: aarch64-apple-darwin20
Running under: macOS Tahoe 26.3.1
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: America/Los_Angeles
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] ggplot2_4.0.2 CVXR_1.8.1
loaded via a namespace (and not attached):
[1] gmp_0.7-5.1 generics_0.1.4 clarabel_0.11.2 slam_0.1-55
[5] lattice_0.22-9 digest_0.6.39 magrittr_2.0.4 evaluate_1.0.5
[9] grid_4.5.2 RColorBrewer_1.1-3 fastmap_1.2.0 rprojroot_2.1.1
[13] jsonlite_2.0.0 Matrix_1.7-4 ECOSolveR_0.6.1 backports_1.5.0
[17] scs_3.2.7 Rmosek_11.1.1 scales_1.4.0 codetools_0.2-20
[21] cli_3.6.5 rlang_1.1.7 Rglpk_0.6-5.1 withr_3.0.2
[25] yaml_2.3.12 otel_0.2.0 tools_4.5.2 osqp_1.0.0
[29] Rcplex_0.3-8 checkmate_2.3.4 dplyr_1.2.0 here_1.0.2
[33] gurobi_13.0-1 vctrs_0.7.1 R6_2.6.1 lifecycle_1.0.5
[37] htmlwidgets_1.6.4 pkgconfig_2.0.3 cccp_0.3-3 pillar_1.11.1
[41] gtable_0.3.6 glue_1.8.0 Rcpp_1.1.1 xfun_0.56
[45] tibble_3.3.1 tidyselect_1.2.1 knitr_1.51 dichromat_2.0-0.1
[49] highs_1.12.0-3 farver_2.1.2 htmltools_0.5.9 labeling_0.4.3
[53] rmarkdown_2.30 piqp_0.6.2 compiler_4.5.2 S7_0.2.1