--- title: "JSS Example Code" author: "Daniela Dunkler" date: 2016-03-09 output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{JSS Example Code} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include=FALSE} library(knitr) opts_chunk$set(out.extra='style="display:block; margin: auto"', fig.align="center") ``` This is the example code for Dunkler D, Sauerbrei W, Heinze G (2016). Global, Parameterwise and Joint Shrinkage Factor Estimation. *Journal of Statistical Software*. **69**(8), 1-19. [http://dx.doi.org/10.18637/jss.v069.i08](http://dx.doi.org/10.18637/jss.v069.i08). ## Example R Code ```{r, eval = TRUE, warning = FALSE, fig.width = 7, fig.height = 5} ################################################################################ ### R code for ### "Global, Parameterwise and Joint Shrinkage Factor Estimation" ### written by Daniela Dunkler, March 2016 ################################################################################ ## works with R 3.2.3 & shrink-package 1.2.1 ## load the packages library("shrink") library("survival") library("mfp") library("aod") # for wald-test sessionInfo() ``` ## Section 2.1: Deep Vein Thrombosis Study ```{r, eval = TRUE, warning = FALSE, fig.width = 7, fig.height = 5} ## Section 2.1: Deep Vein Thrombosis Study ## load data data("deepvein") # number of observations, events, median observation time, and names of # variables nrow(deepvein) deepvein$status2 <- abs(deepvein$status-1) survfit(Surv(time, status2) ~ 1, data = deepvein) table(deepvein$status) round(100*table(deepvein$status)/nrow(deepvein), 2) sort(names(deepvein)) ``` ## Section 2.2: Breast Cancer Study ```{r, eval = TRUE, warning = FALSE, fig.width = 7, fig.height = 5} ## Section 2.2: Breast Cancer Study ## load data data("GBSG") # number of observations, events, median observation time, and names of # variables nrow(GBSG) table(GBSG$cens) round(100*table(GBSG$cens)/nrow(GBSG), 2) GBSG$cens2 <- abs(GBSG$cens-1) # median observation time is given in days here survfit(Surv(rfst, cens2) ~ 1, data = GBSG) # median observation time in months 1645 / 30.5 sort(names(GBSG)) ``` ## Section 5.1: Deep Vein Thrombosis Study ```{r, eval = TRUE, warning = FALSE, fig.width = 7, fig.height = 5} ## Section 5.1: Deep Vein Thrombosis Study # set the reference level for all categorical variables deepvein$sex <- relevel(deepvein$sex, ref = "female") deepvein$fiimut <- relevel(deepvein$fiimut, ref = "absent") deepvein$fvleid <- relevel(deepvein$fvleid, ref = "absent") deepvein$loc <- relevel(deepvein$loc, ref = "PE") # contrasts(deepvein$loc) ## fit full model, and compute shrinkage factors (jackknife estimates and dfbeta ## approximations) fitfull <- coxph(Surv(time, status) ~ log2ddim + bmi + durther + age + sex + loc + fiimut + fvleid, data = deepvein, x = TRUE) summary(fitfull) ## wald-test for categorical predictor "loc" wald.test(Sigma = vcov(fitfull)[c("locdistal", "locproximal"), c("locdistal", "locproximal")], b = fitfull$coeff[c("locdistal", "locproximal")], Terms = 1:2) shrink(fitfull, type = "global", method = "jackknife") ## apply backward elimination to full model*, and compute shrinkage factors to ## selected model ## * dummy variables of loc are jointly tested and selected fitd <- step(fitfull, direction = "backward") print(fitd) ## wald-test for categorical predictor "loc" wald.test(Sigma = vcov(fitd)[c("locdistal", "locproximal"), c("locdistal", "locproximal")], b = fitd$coeff[c("locdistal", "locproximal")], Terms = 1:2) shrink(fitd, type = "global", method = "jackknife") pwsf <- shrink(fitd, type = "parameterwise", method = "jackknife") pwsf sqrt(diag(pwsf$ShrinkageFactorsVCOV)) shrink(fitd, type = "parameterwise", method = "jackknife", join = list(c("locdistal", "locproximal"))) ## dummy variables of loc are separately tested and selected ## generate dummy variables, fit full model and then apply backward elimination deepvein <- cbind(deepvein, model.matrix( ~ loc, data = deepvein)[, -1]) fitfull2 <- coxph(Surv(time, status) ~ log2ddim + bmi + durther + age + sex + locdistal + locproximal + fiimut + fvleid, data = deepvein, x = TRUE) # fitfull2 is identical to fitfull fitd2 <- step(fitfull2, direction = "backward") summary(fitd2) shrink(fitd2, type = "global", method = "jackknife") shrink(fitd2, type = "parameterwise", method = "jackknife") ## Table 2 t2_jack <- c(shrink(fitd, type = "global", method = "jackknife")$ShrinkageFactors, pwsf$ShrinkageFactors, shrink(fitd, type = "parameterwise", method = "jackknife", join = list(c("locdistal", "locproximal")))$ShrinkageFactors, system.time(shrink(fitd, type = "parameterwise", method = "jackknife"))[1]) t2_dfbeta <- c(shrink(fitd, type = "global", method = "dfbeta")$ShrinkageFactors, shrink(fitd, type = "parameterwise", method = "dfbeta")$ShrinkageFactors, shrink(fitd, type = "parameterwise", method = "dfbeta", join = list(c("locdistal", "locproximal")))$ShrinkageFactors, system.time(shrink(fitd, type = "parameterwise", method = "dfbeta"))[1]) t2 <- cbind(Jackknife = t2_jack, DFBETA = t2_dfbeta) Table2 <- cbind(t2, round((t2[, 2] - t2[, 1]) / t2[, 1] * 100, 1)) Table2[, 1:2] <- round(Table2[, 1:2], 4) dimnames(Table2)[[2]][3] <- "Relative difference" Table2 <- rbind(Table2[1,], rep(NA, 3), Table2[2:5,], rep(NA, 3), Table2[6:8,], rep(NA, 3), Table2[10,]) dimnames(Table2)[[1]][c(1:2, 7, 10, 12)] <- c("Global shrinkage", "Parameterwise shrinkage", "Joint shrinkage", "loc", "Computing time") Table2 # max. difference max(abs(Table2[1:10, 1] - Table2[1:10, 2]), na.rm=TRUE) ## Figure 1: Data simulated from deep vein thrombosis study: Comparison of Jackknife and ## DFBETA-approximated global shrinkage factors for selected and unselected ## models. deepvein0 <- subset(deepvein, status == 0) deepvein1 <- subset(deepvein, status == 1) n0 <- nrow(deepvein0) n1 <- nrow(deepvein1) ratio <- 0.2 # based on n1 / n0 ## Note: Running this code is time-consuming, since 2 * B * S data sets are analyzed. ## size <- seq(from = 200, to = 2000, length.out = 19) # based on nrow(deepvein) # B <- 100 ## You may want to reduce B and length.out of size: e.g. ## size <- seq(from = 200, to = 2000, length.out = 4) ## B <- 3 # # S <- length(size) # # shrinkGJ <- shrinkGD <- shrinkGJsel <- shrinkGDsel <- matrix(NA, nrow = B, ncol = S) # set.seed(954681) # # for (s in 1:S) { # cat("\n", s) # for (b in 1:B) { # cat(".") # data <- rbind(deepvein0[sample(x = 1:n0, size = size[s]*(1-ratio), replace = TRUE),], # deepvein1[sample(x = 1:n1, size = size[s]*ratio, replace = TRUE),]) # # fitfull <- coxph(Surv(time, status) ~ log2ddim + bmi + durther + age + sex + loc + # fiimut + fvleid, data = data, x = TRUE) # fitsel <- step(fitfull, direction = "backward", trace = 0) # # if (!is.null(fitsel$coefficients)) { # no null model selected # shrinkGJsel[b, s] <- shrink(fitsel, type = "global", method = "jackknife")$ShrinkageFactors # shrinkGDsel[b, s] <- shrink(fitsel, type = "global", method = "dfbeta")$ShrinkageFactors # } # # # fit <- coxph(Surv(time, status) ~ log2ddim + sex + loc, data = data, x = TRUE) # # shrinkGJ[b, s] <- shrink(fit, type = "global", method = "jackknife")$ShrinkageFactors # shrinkGD[b, s] <- shrink(fit, type = "global", method = "dfbeta")$ShrinkageFactors # } # } ## In some smaller data sets (and especially when shrinkage factors are estimated with the ## Jackknife method) there might be issues with convergence in individual data sets. ## However, coxph will issue a warning. # # # shrinkGDselm <- apply(shrinkGDsel, 2, median) # shrinkGJselm <- apply(shrinkGJsel, 2, median) # # cbind(n = size, Diff_J_D_sel = shrinkGJselm - shrinkGDselm) # # shrinkGDm <- apply(shrinkGD, 2, median) # shrinkGJm <- apply(shrinkGJ, 2, median) # # cbind(n = size, Diff_J_D = shrinkGJm - shrinkGDm) # # xrange <- c(0.5, 1) ## xrange <- range(shrinkGDselm, shrinkGJselm, shrinkGDm, shrinkGJm) ## pdf("compJvsD.pdf", width = 6, height = 4) # par(oma = c(0, 0, 0.5, 0.5), mar = c(3.5, 4, 0, 0)) # plot(range(size), xrange, type = "n", las = 1, xlab = "", ylab = "", xaxt = "n") # axis(1, at = size) # mtext(side = 1, text = "Sample size", line = 2.3) # mtext(side = 2, text = "Global shrinkage factor", line = 2.8) # # points(size, shrinkGDselm, pch = 3, col = "darkolivegreen4", cex = 1.5) # points(size, shrinkGJselm, pch = 1, col = "darkgoldenrod3", cex = 1.3) # lines(size, shrinkGDselm, lty = 2, col = "darkolivegreen4", lwd = 2) # lines(size, shrinkGJselm, lty = 3, col = "darkgoldenrod3", lwd = 2) # # points(size, shrinkGDm, pch = 3, col = "firebrick3", cex = 1.5) # points(size, shrinkGJm, pch = 1, col = "dodgerblue3", cex = 1.3) # lines(size, shrinkGDm, lty = 2, col = "firebrick3", lwd = 2) # lines(size, shrinkGJm, lty = 3, col = "dodgerblue3", lwd = 2) # # legend(x = 1600, y = 0.62, legend = c("Jackknife", "DFBETA"), lwd = 2, title = "unselected", # col = c("dodgerblue3", "firebrick3"), inset = 0.05, bty = "n", pch = c(1, 3), # text.col = c("dodgerblue3", "firebrick3"), title.col = "black") # # legend(x = 1000, y = 0.62, legend = c("Jackknife", "DFBETA"), lwd = 2, title = "selected", # col = c("darkgoldenrod3", "chartreuse4"), inset = 0.05, bty = "n", pch = c(1, 3), # text.col = c("darkgoldenrod3", "chartreuse4"), title.col = "black") ## dev.off() ``` ## Section 5.2: Breast Cancer Study ```{r, eval = TRUE, warning = FALSE, fig.width = 7, fig.height = 5} ## Section 5.2: Breast Cancer Study ## define predictors as suggested by Sauerbrei (1999, Applied Statistics) GBSG$enodes <- exp(-0.12*GBSG$posnodal) # create ordinal dummy-coded variable tumgrad for grades 1 to 3 contrasts(GBSG$tumgrad) <- matrix(c(0, 1, 1, 0, 0, 1), ncol = 2, byrow = FALSE, dimnames = list(1:3, c("tumgrad1", "tumgrad2"))) contrasts(GBSG$tumgrad) # for nicer variable names use default dimnames contrasts(GBSG$tumgrad) <- matrix(c(0, 1, 1, 0, 0, 1), ncol = 2, byrow = FALSE) ## # model selection (backward elimination) and estimation fitg <- mfp(Surv(rfst, cens) ~ fp(age) + fp(prm) + fp(esm) + fp(tumsize) + fp(enodes) + tumgrad + menostat + strata(htreat), family = cox, data = GBSG, alpha = 0.05, select = 0.05) print(fitg) ## Table 3 ## (p-values unconditional on the selected degree and power for prm, age, and ## enodes = fitg$pvalues) varorder <- c("age.1", "age.2", "prm.1", "enodes.1", "tumgrad1.1") t3_beta_j <- c(NA, round(fitg$coefficients, 3)[varorder]) t3_df <- c(fitg$df.initial[7, ], NA, NA, fitg$df.initial[c(2, 1, 3), ]) t3_p <- c(round(fitg$pvalues[7, "p.null"], 3), NA, NA, round(fitg$pvalues[c(2, 1, 3), "p.null"], 3)) t3_pwsf0 <- shrink(fitg, type = "parameterwise", method = "jackknife") t3_pwsf <- round(c(NA, t3_pwsf0$ShrinkageFactors[varorder]), 3) t3_pwsfse <- round(c(NA, sqrt(diag(t3_pwsf0$ShrinkageFactorsVCOV))[varorder]), 3) t3_jointsf0 <- shrink(fitg, type = "parameterwise", method = "jackknife", join=list(c("age.1", "age.2"))) t3_jointsf <- round(c(t3_jointsf0$ShrinkageFactors[varorder[1]], NA, NA, t3_jointsf0$ShrinkageFactors[varorder[-c(1:2)]]), 3) t3_jointsfse <- round(c(sqrt(diag(t3_jointsf0$ShrinkageFactorsVCOV))["join.age.1"], NA, NA, sqrt(diag(t3_jointsf0$ShrinkageFactorsVCOV))[varorder[-c(1:2)]]), 3) Table3 <- cbind("beta_j" = t3_beta_j, "df" = t3_df, "p value" = t3_p, "PWSF" = t3_pwsf, "PWSF SE" = t3_pwsfse, "Joint shrinkage" = t3_jointsf, "Joint shrinkage SE" = t3_jointsfse) dimnames(Table3)[[1]][1] <- "age" Table3 ## compute shrinkage factors globalsf <- shrink(fitg, type = "global", method = "jackknife") globalsf sqrt(globalsf$ShrinkageFactorsVCOV) pwsf <- shrink(fitg, type = "parameterwise", method = "jackknife") pwsf round(sqrt(diag(pwsf$ShrinkageFactorsVCOV)), 3) round(cov2cor(pwsf$ShrinkageFactorsVCOV), 3) jointsf <- shrink(fitg, type = "parameterwise", method = "jackknife", join=list(c("age.1", "age.2"))) jointsf round(sqrt(diag(jointsf$ShrinkageFactorsVCOV)), 3) ## Figure 2: selected model: Log hazard relative to 50 years # refit model fitg explicitly including the two dummy variables of # tumgrad GBSG <- cbind(GBSG, model.matrix(~tumgrad, data = GBSG)[, -1]) fitg <- mfp(Surv(rfst, cens) ~ fp(age) + fp(prm) + fp(esm) + fp(tumsize) + fp(enodes) + tumgrad1 + tumgrad2 + menostat + strata(htreat), family = cox, data = GBSG, alpha = 0.05, select = 0.05) globalsf <- shrink(fitg, type = "global", method = "jackknife") pwsf <- shrink(fitg, type = "parameterwise", method = "jackknife") jointsf <- shrink(fitg, type = "parameterwise", method = "jackknife", join=list(c("age.1", "age.2"))) # define new data for which prediction is requested # newdatref is the new data set for the reference observation age <- 30:75 newdat <- data.frame(age = age, enodes = 1, prm = 1, tumgrad1 = 0, htreat = factor(1)) newdatref <- data.frame(age = 50, enodes = 1, prm = 1, tumgrad1 = 0, htreat = factor(1)) xaxis <- seq(from = min(age), to = max(age), by = 5) # pdf("plotgbsg.pdf", width = 6, height = 6) par(fig = c(0, 1, 0, 0.3), new = FALSE, oma = c(0, 0, 0.5, 0.5), mar = c(3.5, 4, 0, 0)) hist(GBSG$age[(GBSG$age >= min(age)) & (GBSG$age <= max(age))], breaks = seq(from = min(age), to = max(age), by = 2.5), main = "", xlim = range(age), xaxs = "r", yaxs = "r", xlab = "", ylab = "", axes = FALSE, col = "gray88") axis(side = 1, at = xaxis) axis(side = 2, at = seq(from = 0, to = 60, by = 20), las = 2) mtext(side = 1, text = "Age", line = 2.3) mtext(side = 2, text = "Frequency", line = 2.8) box() par(fig = c(0,1,0.3,1), new = TRUE, oma = c(0, 0, 0.5, 0.5), mar = c(1, 4, 0, 0)) plot(range(age), c(-0.05, 0.8), xlab = "", ylab = "", type = "n", yaxs = "i", xaxt = "n", yaxt = "n") axis(side = 1, at = xaxis, labels = rep("", times = length(xaxis))) axis(side = 2, at = seq(from = 0, to = 0.7, by = 0.1), las = 2, labels = seq(from = 0, to = 0.7, by = 0.1)) mtext(side = 2, text = "Log hazard relative to 50 years", line = 2.8) lines(x = age, y = predict(fitg, newdata = newdat, type = "lp") - predict(fitg, newdata = newdatref, type = "lp"), lty = 1, col = "gray25", lwd = 2) lines(x = age, y = predict(globalsf, newdata = newdat, type = "lp") - predict(globalsf, newdata = newdatref, type = "lp"), lty = 5, col = "forestgreen", lwd = 2) lines(x = age, y = predict(pwsf, newdata = newdat, type="lp") - predict(pwsf, newdata = newdatref, type = "lp"), lty = 2, col = "dodgerblue3", lwd = 2) lines(x = age, y = predict(jointsf, newdata = newdat, type = "lp") - predict(jointsf, newdata = newdatref, type = "lp"), lty = 4, col = "firebrick3", lwd = 2) legend("topright", inset = 0.02, bty = "n", lty = c(1, 5, 4, 2), title = "SHRINKAGE", legend = c("No", "Global", "Joint", "Parameterwise"), col = c("gray25", "forestgreen", "firebrick3", "dodgerblue3"), lwd = 2, text.col = c("gray25", "forestgreen", "firebrick3", "dodgerblue3")) # dev.off() ```