Statified Case-Cohort Data Analysis using the Additive Hazards Model

This R script demonstrates how to use R package addhazard to fit the additive hazards (AH) model to a dataset that uses a cohort, case-cohort, or stratified case-cohort study design with Bernoulli sampling. It also illustrates how to obtain the model fitting results when the sampling design has changed from Bernoulli sampling to finite population stratified sampling (FPSS) for selecting the phase II subsample. In addition, it demonstrates how to utilize the auxiliary information in the full cohort to further increase estimation precision when fitting an AH model to the (stratified) case-cohort data with calibration techniques. Users can request either model-based or robust standard errors for all the estimators. Results are based on R package addhazard 1.1.0 as of 2017-03-21 and tb data available on the book website.

Load the required library

library(addhazard)
library(survival)

1. Full Cohort Analysis

# cohort data, sample size = 1720
tb <- read.table("tb.cohort.txt", header = T)
# fit the AH model to the full cohort with model-based standard errors (SEs)
fit1.mod <-  ah(Surv(age.in, age.out, bc) ~  dose + age.rx, data = tb,
                robust = F, ties = F)
# fit the AH model to the full cohort with robust standard errors (SEs)
fit1.rob <- ah(Surv(age.in, age.out, bc) ~  dose + age.rx, data = tb,
               robust = T, ties = F)
# get results for table 20.4
summary(fit1.mod)

## Call:
## ah(formula = Surv(age.in, age.out, bc) ~ dose + age.rx, data = tb, 
##     robust = F, ties = F)
## 
##              coef         se   lower.95   upper.95      z p.value  
## dose    4.782e-04  2.105e-04  6.571e-05  8.907e-04  2.272  0.0231 *
## age.rx -3.630e-05  2.004e-05 -7.558e-05  2.984e-06 -1.811  0.0701 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

### alternatively ahaz package can be used

2. Case-cohort Data analysis

# cohort data with the case-cohort sample labeled, sample size = 1720
tbccS <- read.table("tb.cascoh.S.txt", header = T)

Assign sampling probabilities from a sample survey approach

# note subjects not selected into phase II are labeled as NA
# convert the phase II sampling indicator incc to 1/0
tbccS$incc[is.na(tbccS$incc)] <- 0
# extract only phase II subsample
tbccS.phase2 <- tbccS[tbccS$incc == 1, ]
# calculate the sampling fractions for case-cohort data, one for the case group
# (bc = 1) and one for the control group (bc = 0)
fracs <- table(tbccS.phase2$bc)/table(tbccS$bc)
# assign a sampling probability to each cohort member based on  whether this member
# belongs to the case or the control group and fracs obtained above
for(i in seq_along(fracs)){
  tbccS$frac[tbccS$bc == (i-1)] <- fracs[i]
}

fit the AH model with Bernoulli Sampling, model-based and robust SEs

fit2.mod<- ah.2ph(Surv(age.in, age.out, bc)  ~  dose + age.rx , data = tbccS,
                  R = incc, Pi = frac, robust = FALSE, ties = F)
fit2.rob<- ah.2ph(Surv(age.in, age.out, bc)  ~  dose + age.rx , data = tbccS,
                  R = incc, Pi = frac, robust = TRUE, ties = F)

fit the AH model with finite population stratified sampling (FPSS) according to Chapt.20.6.3. We first fit the AH model to the case-cohort data using the sampling fraction frac as the Bernoulli sampling probability Pi and then we calibrate design weights calculated from Pi to the strata frequencies by using stratum membership strt1 and strt2 as our calibration variables. See the details below.

# create strt indicator columns, which become our calibration variables
for (i in 1:2){
  tbccS$strt_ind <- as.numeric(tbccS$bc == (i-1))
  names(tbccS)[ncol(tbccS)]= paste0("strt", i)
}
# fit the AH model with FPSS
fit2.1.mod <- ah.2ph(Surv(age.in, age.out, bc)  ~  dose + age.rx , data = tbccS,
                     R = incc, Pi = frac, robust = FALSE, ties = F,
                     calibration.variables = c("strt1","strt2"))
# get the results for table 20.4
summary(fit2.1.mod)

## Call:
## ah.2ph(formula = Surv(age.in, age.out, bc) ~ dose + age.rx, data = tbccS, 
##     R = incc, Pi = frac, ties = F, robust = FALSE, calibration.variables = c("strt1", 
##         "strt2"))
## 
##            coef         se   lower.95   upper.95      z p.value  
## [1,]  4.765e-04  2.820e-04 -7.616e-05  1.029e-03  1.690   0.091 .
## [2,] -1.795e-05  2.625e-05 -6.940e-05  3.350e-05 -0.684   0.494  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# obtain the phase I and phase II SEs
sqrt(diag(fit2.1.mod$var.pha1))

##       dose     age.rx 
## 1.8983e-04 2.1908e-05

sqrt(diag(fit2.1.mod$var.pha2))

##       dose     age.rx 
## 2.0851e-04 1.4458e-05

3. Stratified Case-cohort Data Analysis

# ingest data, cohort data with the case-cohort sample labeled, sample size = 1720
tbsaRxccS <-read.table("tb.saRxcc.S.txt", header=T)

Assign Bernoulli sampling probabilities from a sample survey approach

# note subjects not selected into phase II are labeled as NA
# convert the phase II sampling indicator incc to binary 1/0
tbsaRxccS$incc[is.na(tbsaRxccS$incc)] <- 0

# create strata variable based on age and bc disease outcomes
tbsaRxccS <- within(tbsaRxccS,
                    {saRx <- 1 + as.numeric(age.rx >= 19) + as.numeric(age.rx >= 29)})
tbsaRxccS$strata <- tbsaRxccS$saRx
tbsaRxccS$strata[tbsaRxccS$bc == 1] <- 4

# extract phase II subjects
tbsaRxccS.phase2 <- tbsaRxccS[tbsaRxccS$incc == TRUE, ]
# calculate the sampling fractions
fracs <- table(tbsaRxccS.phase2$strata)/table(tbsaRxccS$strata)

# assign a sampling probability to each cohort member
for (i in 1:4){
  tbsaRxccS$frac[tbsaRxccS$strata == i] <- fracs[i]
}

fit the AH model with Bernoulli Sampling

# fit the AH model with Bernoulli Sampling
fit3.mod<- ah.2ph(Surv(age.in, age.out, bc)  ~  dose + age.rx , data = tbsaRxccS,
                  R = incc, Pi = frac, robust = FALSE, ties = F)

fit the AH model with FPSS

# create strt indicators, which become our calibration variables
for (i in 1:4){
  tbsaRxccS$strat_ind <- as.numeric(tbsaRxccS$strat ==i)
   names(tbsaRxccS)[ncol(tbsaRxccS)]= paste0("strt", i)
}
# fit the AH model with FPSS by calibrating the sampling weights to the strata
# frequencies by using stratum membership as our calibration variables, see the detailed
# explanations in section 2
fit3.1.mod <- ah.2ph(Surv(age.in, age.out, bc)  ~  dose + age.rx , data = tbsaRxccS,
                     R = incc, Pi = frac,  robust = FALSE, ties = F,
                     calibration.variables = c("strt1", "strt2", "strt3", "strt4"))
# get the results for table 20.4
summary(fit3.1.mod)

## Call:
## ah.2ph(formula = Surv(age.in, age.out, bc) ~ dose + age.rx, data = tbsaRxccS, 
##     R = incc, Pi = frac, ties = F, robust = FALSE, calibration.variables = c("strt1", 
##         "strt2", "strt3", "strt4"))
## 
##            coef         se   lower.95   upper.95      z p.value  
## [1,]  7.165e-04  3.770e-04 -2.249e-05  1.455e-03  1.900  0.0574 .
## [2,] -3.430e-05  2.131e-05 -7.606e-05  7.472e-06 -1.609  0.1075  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# obtain the phase I and phase II SEs
sqrt(diag(fit3.1.mod$var.pha1))

##       dose     age.rx 
## 2.8579e-04 1.9259e-05

sqrt(diag(fit3.1.mod$var.pha2))

##       dose     age.rx 
## 2.4592e-04 9.1204e-06

4. Analysis with Calibration

construct calibration variables

# step 1: fit the AH model to the full cohort using the total number of
# fluoroscopies/100 and ageRx as the covariates
tb$no.scaled <- tb$no/100
fit.for.calibration <- ah(Surv(age.in, age.out, bc) ~ no.scaled + age.rx, data = tb, ties = F)
# obtain the (integrated) martingale residuals as the calibration variables
calvars <- fit.for.calibration$resid
tbsaRxccS[paste("calvar", 1:2, sep="")] <- calvars

fit the AH model to stratified case-cohort data with calibration assuming Bernoulli sampling

fit4.mod<- ah.2ph(Surv(age.in, age.out, bc) ~ dose + age.rx, data = tbsaRxccS,
                  R = incc, Pi = frac,  robust = FALSE,  ties = F,
                  calibration.variables = c("calvar1", "calvar2"))

fit the AH model to stratified case-cohort data with calibration assuming FPSS. See the detailed explanations in section 2

fit4.1.mod <- ah.2ph(Surv(age.in, age.out, bc) ~ dose + age.rx, data = tbsaRxccS,
         R = incc, Pi = frac,  robust = FALSE, ties = F,
         calibration.variables = c("calvar1", "calvar2", "strt1", "strt2", "strt3", "strt4"))

# get the results for table 20.4
summary(fit4.1.mod)

## Call:
## ah.2ph(formula = Surv(age.in, age.out, bc) ~ dose + age.rx, data = tbsaRxccS, 
##     R = incc, Pi = frac, ties = F, robust = FALSE, calibration.variables = c("calvar1", 
##         "calvar2", "strt1", "strt2", "strt3", "strt4"))
## 
##            coef         se   lower.95   upper.95      z p.value  
## [1,]  5.167e-04  2.938e-04 -5.918e-05  1.093e-03  1.759  0.0787 .
## [2,] -3.650e-05  1.951e-05 -7.474e-05  1.727e-06 -1.871  0.0613 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

###  obtain the phase I and phase II SEs
sqrt(diag(fit4.1.mod$var.pha1))

##       dose     age.rx 
## 2.6244e-04 1.9176e-05

sqrt(diag(fit4.1.mod$var.pha2))

##       dose     age.rx 
## 1.3208e-04 3.5742e-06

Statified Case-Cohort Data Analysis using the Additive Hazards Model

Kate Jie Hu, jhu@uw.edu

May 20th, 2018

1. Full Cohort Analysis

2. Case-cohort Data analysis

3. Stratified Case-cohort Data Analysis

4. Analysis with Calibration