This is the survival analysis of a dataset #####################
#####################
install.packages("car", dependencies=TRUE) install.packages("survival", dependencies=TRUE) install.packages("flexsurv", dependencies=TRUE) install.packages("KMsurv", dependencies=TRUE) install.packages("e1071", dependencies=TRUE) install.packages("rms", dependencies=TRUE)
#####################
#####################
gbcs <- read.csv("https://ryanwomack.com/data/gbcs.csv") attach(gbcs) names(gbcs) dim(gbcs) summary(gbcs) hist(age) plot(density(age)) table(menopause) table(hormone) hist(prog_recp) plot(density(prog_recp), main="Progesterone receptors, density plot")
#Fig 1, prog_recp par(mfrow=c(1,2)) plot(prog_recp, main="Progesterone receptors") plot(log(prog_recp), main="log of Progesterone receptors") par(mfrow=c(1,1))
hist(estrg_recp) plot(density(estrg_recp)) table(censrec) hist(rectime) plot(density(rectime))
#plots
plot(rectimeid)
plot(rectimeage)
plot(rectimemenopause)
plot(rectimehormone)
plot(rectimeprog_recp)
plot(rectimeestrg_recp)
plot(rectime~censrec)
#correlation matrix cor(gbcs)
#fancy scatterplot library(car) scatterplotMatrix(gbcs)
#####################
#####################
#reuseable survival objects library(survival) recsurv<-Surv(rectime,censrec)
#survfit fits survival curves with various methods #Kaplan-Meier is most common, so use that if in doubt
fit_KM <- survfit(recsurv1,type="kaplan-meier", conf.type="log-log")
fit_FH <- survfit(recsurv1,type="fleming-harrington", conf.type="log-log")
fit_FH2 <- survfit(recsurv~1, type="fh2", conf.type="log-log")
#plot survival plot(fit_KM, main="survival function for rectime (K-M estimate)", xlab="days", ylab="p") plot(fit_FH) plot(fit_FH2)
#print restricted means print(fit_KM,print.rmean=TRUE) print(fit_FH,print.rmean=TRUE) print(fit_FH2,print.rmean=TRUE)
#can plot cumulative hazard, and other mods as plot(fit_KM, fun="cumhaz")
#cumulative events (f(y)=1-y) plot(fit_KM, fun="event")
#log(-log(y)) plot(fit_KM, fun="cloglog")
#log t plot(fit_KM, fun="log")
#to generate figure for report par(mfrow=c(1,2)) plot(fit_KM, main="survival function for rectime (K-M estimate)", xlab="days", ylab="p") plot(fit_KM, fun="cumhaz", main="cumulative hazard function for rectime (K-M estimate)", xlab="days", ylab="p") par(mfrow=c(1,1))
#survfits to illustrate impact of variables leg.txt<-c("0", "1") fit <- survfit(recsurv~as.numeric(age>median(age))) plot(fit, col=c(2,4)) legend("topright",leg.txt,col=c(2,4),lty=1)
fit <- survfit(recsurv~menopause) plot(fit, col=c(2,4)) legend("topright",leg.txt,col=c(2,4),lty=1)
fit <- survfit(recsurv~hormone) plot(fit, col=c(2,4)) legend("topright",leg.txt,col=c(2,4),lty=1)
fit <- survfit(recsurv~as.numeric(prog_recp>median(prog_recp))) plot(fit, col=c(2,4)) legend("topright",leg.txt,col=c(2,4),lty=1)
fit <- survfit(recsurv~as.numeric(estrg_recp>median(estrg_recp))) plot(fit, col=c(2,4)) legend("topright",leg.txt,col=c(2,4),lty=1)
#####################
#####################
#categorical variables progtest <- as.numeric(prog_recp>median(prog_recp)) estrgtest <- as.numeric(estrg_recp>median(estrg_recp)) agetest <- as.numeric(age>median(age))
#ecdf plots empirical cdf plot(ecdf(age[menopause==1]), verticals=TRUE, pch=46, col=2) lines(ecdf(age[menopause==2]), verticals=TRUE, pch=46, col=4)
plot(ecdf(age[hormone==1]), verticals=TRUE, pch=46, col=2) lines(ecdf(age[hormone==2]), verticals=TRUE, pch=46, col=4)
plot(ecdf(age[prog_recp>median(prog_recp)]), verticals=TRUE, pch=46, col=2) lines(ecdf(age[prog_recp<=median(prog_recp)]), verticals=TRUE, pch=46, col=4)
plot(ecdf(age[estrg_recp>median(estrg_recp)]), verticals=TRUE, pch=46, col=2) lines(ecdf(age[estrg_recp<=median(estrg_recp)]), verticals=TRUE, pch=46, col=4)
#####################
#####################
#survreg, 1 variable
#here we are checking which distributional assumption provides
#the best fit
fit_exp<-survreg(recsurv1, dist="exponential")
fit_weibull<-survreg(recsurv1, dist="weibull")
fit_gauss<-survreg(recsurv1, dist="gaussian")
fit_logistic<-survreg(recsurv1, dist="logistic")
fit_lognormal<-survreg(recsurv1, dist="lognormal")
fit_loglogistic<-survreg(recsurv1, dist="loglogistic")
summary(fit_exp)
summary(fit_weibull)
summary(fit_gauss)
summary(fit_logistic)
summary(fit_lognormal)
summary(fit_loglogistic)
#try with package "flexsurv" #flexsurv provides access to additional distributions
library(flexsurv)
fit_exp<-flexsurvreg(recsurv1, dist="exp")
fit_weibull<-flexsurvreg(recsurv1, dist="weibull")
fit_gamma<-flexsurvreg(recsurv1, dist="gamma")
fit_gengamma<-flexsurvreg(recsurv1, dist="gengamma")
fit_genf<-flexsurvreg(recsurv1, dist="genf")
fit_lognormal<-flexsurvreg(recsurv1, dist="lnorm")
fit_gompertz<-flexsurvreg(recsurv~1, dist="gompertz")
fit_exp
fit_weibull
fit_gamma
fit_gengamma
fit_genf
fit_lognormal
fit_gompertz
plot(fit_exp)
plot(fit_weibull)
plot(fit_gamma)
plot(fit_gengamma)
plot(fit_genf)
plot(fit_lognormal)
plot(fit_gompertz)
#log likelihood test fit_exp$loglik fit_weibull$loglik fit_gamma$loglik fit_gengamma$loglik fit_lognormal$loglik fit_gompertz$loglik
qchisq(.95,2) qchisq(.95,1)
#test gengamma vs. lognormal f_lognormal<-2*(fit_gengamma$loglik - fit_lognormal$loglik) #2 df f_lognormal
#AIC is reported by flexsurv fit_exp$AIC fit_weibull$AIC fit_gamma$AIC fit_gengamma$AIC fit_lognormal$AIC fit_gompertz$AIC
#####################
#####################
#KMsurv package can generate life table #not really needed in this analysis library(KMsurv) lifetab()
#log transforms logprog<-log(prog_recp+.1) logestrg<-log(estrg_recp+.1) logrectime<-log(rectime) logrecsurv<-Surv(logrectime,censrec)
#work with logrec a bit
fit_exp<-flexsurvreg(logrecsurv1, dist="exp")
fit_weibull<-flexsurvreg(logrecsurv1, dist="weibull")
fit_gamma<-flexsurvreg(logrecsurv1, dist="gamma")
fit_gengamma<-flexsurvreg(logrecsurv1, dist="gengamma")
fit_genf<-flexsurvreg(logrecsurv1, dist="genf")
fit_lognormal<-flexsurvreg(logrecsurv1, dist="lnorm")
fit_gompertz<-flexsurvreg(logrecsurv~1, dist="gompertz")
fit_exp
fit_weibull
fit_gamma
fit_gengamma
fit_genf
fit_lognormal
fit_gompertz
plot(fit_exp)
plot(fit_weibull)
plot(fit_gamma)
plot(fit_gengamma)
plot(fit_genf)
plot(fit_lognormal)
plot(fit_gompertz)
#these options work too plot(fit_gamma, type="hazard") plot(fit_gamma, type="cumhaz") plot(fit_gamma, type="survival")
#log likelihood test fit_exp$loglik fit_weibull$loglik fit_gamma$loglik fit_gengamma$loglik fit_genf$loglik fit_lognormal$loglik fit_gompertz$loglik
qchisq(.95,2) qchisq(.95,1)
#test genf vs. gengamma f_gengamma<-2*(fit_genf$loglik-fit_gengamma$loglik) #2 df f_gengamma
#test genf vs. lognormal f_lognormal<-2*(fit_genf$loglik - fit_lognormal$loglik) #2 df f_lognormal
#lognormal is good enough here
#AIC fit_exp$AIC fit_weibull$AIC fit_gamma$AIC fit_gengamma$AIC fit_genf$AIC fit_lognormal$AIC fit_gompertz$AIC
#confirms choice of lognormal
#create graphic
par(mfrow=c(1,2)) fit_gengamma<-flexsurvreg(recsurv~1, dist="gengamma") plot(fit_gengamma, main="rectime fit to generalized gamma", xlab="days", ylab="p")
fit_lognormal<-flexsurvreg(logrecsurv~1, dist="lnorm") plot(fit_lognormal, main="log(rectime) fit to lognormal", xlab="days", ylab="p") par(mfrow=c(1,1))
#####################
##################### #probability plots to test for distribution of response variable library(e1071) probplot(rectime) probplot(rectime, "qunif") #best fit probplot(rectime, "qexp") probplot(rectime, "qnorm") probplot(rectime, "qweibull", shape=1) probplot(rectime, "qlnorm") probplot(rectime, "qgamma", shape=1)
#check subgroups, strata
#try different "type" options in coxph
#####################
#####################
#survplot from rms package #npsurv is equivalent to survreg #shows CIs easily library(rms) fit <- npsurv(recsurv~hormone) survplot(fit)
#survdiffplot #shows significant CI survdiffplot(fit)
#survdiff - numerical test of whether variable influences survival #rho=0 logrank (Mantel-Haenszel test #rho=1 Peto & Peto modification of the Gehan-Wilcoxon test
survdiff(recsurv ~ hormone, rho=0) survdiff(recsurv ~ menopause, rho=0) survdiff(recsurv ~ agetest, rho=0) survdiff(recsurv ~ progtest, rho=0) survdiff(recsurv ~ estrgtest, rho=0)
survdiff(recsurv ~ hormone, rho=1) survdiff(recsurv ~ menopause, rho=1) survdiff(recsurv ~ agetest, rho=1) survdiff(recsurv ~ progtest, rho=1) survdiff(recsurv ~ estrgtest, rho=1)
#####################
#####################
#coxph
fit <- coxph(recsurv~age) fit cox.zph(fit) plot(cox.zph(fit)) res_martingale<-residuals(fit, type="martingale") scatter.smooth(age,res_martingale)
fit <- coxph(recsurv~menopause) fit cox.zph(fit) plot(cox.zph(fit)) res_martingale<-residuals(fit, type="martingale") scatter.smooth(menopause,res_martingale)
fit <- coxph(recsurv~hormone) fit cox.zph(fit) plot(cox.zph(fit)) res_martingale<-residuals(fit, type="martingale") scatter.smooth(hormone,res_martingale)
fit <- coxph(recsurv~prog_recp) fit cox.zph(fit) plot(cox.zph(fit)) res_martingale<-residuals(fit, type="martingale") scatter.smooth(prog_recp,res_martingale)
fit <- coxph(recsurv~estrg_recp) fit cox.zph(fit) plot(cox.zph(fit)) res_martingale<-residuals(fit, type="martingale") scatter.smooth(estrg_recp,res_martingale)
#exact results are not really different than approximate methods #those methods will allow us to use Schoenfeld residuals #to check fit with cox.ph #looking for non-straight line, indicating violation of proportional hazards assumption
#dichotomize estrg_recp #still a bad violation, must stratify
fit <- coxph(recsurv ~ estrgtest) fit cox.zph(fit) plot(cox.zph(fit))
#inverse does not converge fit <- coxph(recsurv ~ I(estrgtest^(-1))) fit cox.zph(fit) plot(cox.zph(fit))
#####################
#####################
#can also access residuals of other types with mres<-residuals(fit, type="martingale") dfb<-residuals(fit, type="dfbetas") scatter.smooth(rectime,mres)
#get baseline survival survfit(fit)
#check collinearity cor(logprog,logestrg) cor(gbcs) cor(prog_recp,estrg_recp)
#do stepAIC to select model #from MASS package, has stepwisemethod for coxph #stepAIC also accepts forward and backward options for direction
#no interaction, no stratification library(MASS) coxbasemodel<-coxph(recsurv~prog_recp) no_I_no_S<-stepAIC(coxbasemodel, direction="both", scope=list(lower=.~1,upper=.~prog_recp+hormone+age))
#no interaction, stratification by estrg no_I_estrg<-coxph(recsurv~prog_recp+hormone+age+strata(estrgtest))
#it is very clear that the stratification makes a big difference in log-likelihood no_I_no_S$loglik no_I_estrg$loglik
#interaction, no stratification I_no_S<-stepAIC(coxbasemodel, direction="both", scope=list(lower=.~1,upper=.~hormone+prog_recp+estrg_recp+hormone:prog_recp+hormone:estrg_recp+prog_recp:estrg_recp))
extractAIC(no_I_estrg) extractAIC(no_I_no_S) extractAIC(I_no_S)
#check residuals fit <- no_I_estrg fit cox.zph(fit) plot(cox.zph(fit))
#####################
#####################
#can also access residuals of other types with res_martingale<-residuals(fit, type="martingale") res_dfbetas<-residuals(fit, type="dfbetas") res_score<-residuals(fit, type="score") res_deviance<-residuals(fit, type="deviance") res_schoenfeld<-residuals(fit, type="schoenfeld") res_dfbeta<-residuals(fit, type="dfbeta") res_scaledsch<-residuals(fit, type="scaledsch") res_partial<-residuals(fit, type="partial")
scatter.smooth(logrectime,res_martingale) scatter.smooth(logrectime,res_deviance)
#cox-snell residuals res_cox_snell=censrec-res_martingale
fit_cs=survfit(Surv(res_cox_snell,censrec)~1) Htilde=cumsum(fit_cs$n.event/fit_cs$n.risk) plot(fit_cs$time,Htilde,type='s',col='blue') abline(0,1,col='red',lty=2)
#####################
#####################
#AFT model uses survreg function #using lognormal distribution according to earlier fit #use survreg for stepwise #then flexsurvreg to generate graphs
fit_lognormal<- survreg(logrecsurv1, dist="lognormal")
aftmodel <- survreg(logrecsurvhormone+prog_recp+estrg_recp, dist="lognormal")
aftmodel
flexAFT<-flexsurvreg(logrecsurv~hormone+prog_recp, dist="lnorm") plot(flexAFT, type="survival") par(mfrow=c(1,2)) plot(flexAFT, type="cumhaz", sub="cumulative hazard for lognormal AFT") plot(flexAFT, type="hazard") par(mfrow=c(1,1))