6 min read

Multiple imputation

This is the tutorial for the paper “Missing data in TBI research: A five step approach for multiple imputation”.

Step 1 - Exploration

For the quantity and patterns of missingness

VIM::aggr(dti)

For the correlation between variables

corr <- cor(sapply(dti, as.numeric), use = "pairwise.complete.obs", method = "spearman")
corrplot::corrplot(corr, type = "lower")

For overall distributions and % missingess

# The first time:
# library(devtools)
# install_github("bgravesteijn/bgravesteijn")
bgravesteijn::distr.na(dti)

To test MAR

summary(glm(I(is.na(Pupils))~mGCS+Age+ISS+GOSE, data=dti, family="binomial"))
## 
## Call:
## glm(formula = I(is.na(Pupils)) ~ mGCS + Age + ISS + GOSE, family = "binomial", 
##     data = dti)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3502  -0.2998  -0.2797  -0.2559   2.8021  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.284e+00  6.052e-01  -5.427 5.74e-08 ***
## mGCS        -3.960e-03  7.834e-02  -0.051    0.960    
## Age          6.136e-03  4.244e-03   1.446    0.148    
## ISS         -1.085e-02  6.992e-03  -1.552    0.121    
## GOSE         9.707e-07  5.064e-02   0.000    1.000    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1210.7  on 3698  degrees of freedom
## Residual deviance: 1204.2  on 3694  degrees of freedom
##   (823 observations deleted due to missingness)
## AIC: 1214.2
## 
## Number of Fisher Scoring iterations: 6
summary(glm(I(is.na(mGCS))~Pupils+Age+ISS+GOSE, data=dti, family="binomial"))
## 
## Call:
## glm(formula = I(is.na(mGCS)) ~ Pupils + Age + ISS + GOSE, family = "binomial", 
##     data = dti)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3886  -0.1504  -0.1199  -0.0959   3.3980  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -2.251663   0.919995  -2.447  0.01439 * 
## Pupils1      0.236735   0.646505   0.366  0.71423   
## Pupils2     -1.372729   0.787067  -1.744  0.08114 . 
## Age         -0.016638   0.008650  -1.924  0.05440 . 
## ISS          0.005061   0.011033   0.459  0.64644   
## GOSE        -0.288974   0.093874  -3.078  0.00208 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 384.53  on 3589  degrees of freedom
## Residual deviance: 368.39  on 3584  degrees of freedom
##   (932 observations deleted due to missingness)
## AIC: 380.39
## 
## Number of Fisher Scoring iterations: 8
summary(glm(I(is.na(ISS))~mGCS+Age+Pupils+GOSE, data=dti, family="binomial"))
## 
## Call:
## glm(formula = I(is.na(ISS)) ~ mGCS + Age + Pupils + GOSE, family = "binomial", 
##     data = dti)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.1677  -0.1266  -0.1213  -0.1150   3.4560  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.295091   1.212123  -4.368 1.25e-05 ***
## mGCS         0.146638   0.193995   0.756    0.450    
## Age          0.002723   0.009820   0.277    0.782    
## Pupils1      0.072399   1.065004   0.068    0.946    
## Pupils2     -0.800449   1.107279  -0.723    0.470    
## GOSE        -0.081076   0.107459  -0.754    0.451    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 307.94  on 3581  degrees of freedom
## Residual deviance: 305.70  on 3576  degrees of freedom
##   (940 observations deleted due to missingness)
## AIC: 317.7
## 
## Number of Fisher Scoring iterations: 8
summary(glm(I(is.na(GOSE))~mGCS+Age+ISS+Pupils, data=dti, family="binomial"))
## 
## Call:
## glm(formula = I(is.na(GOSE)) ~ mGCS + Age + ISS + Pupils, family = "binomial", 
##     data = dti)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.6931  -0.6144  -0.5683  -0.4778   2.3585  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.308427   0.278584  -4.697 2.64e-06 ***
## mGCS         0.008975   0.041375   0.217 0.828279    
## Age         -0.003581   0.002023  -1.771 0.076622 .  
## ISS         -0.013434   0.003480  -3.860 0.000113 ***
## Pupils1      0.048037   0.254093   0.189 0.850051    
## Pupils2     -0.309961   0.212622  -1.458 0.144895    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3580.2  on 4194  degrees of freedom
## Residual deviance: 3543.4  on 4189  degrees of freedom
##   (327 observations deleted due to missingness)
## AIC: 3555.4
## 
## Number of Fisher Scoring iterations: 4

Step 3 - Imputation

Multiple - single level

library(mice)

miceimp <- dti

miceimp0 <- mice(miceimp, maxit=0)

meth <- miceimp0$method
pred <- miceimp0$predictorMatrix

meth[which(meth=="pmm")] <- "midastouch" #the improved version of PMM, of the miceadds package

pred[, "Site"] <- 0 #don't use sitecode to impute

miceimp <- mice(data = miceimp, method = meth, predictorMatrix = pred, m=5, printFlag = FALSE, set.seed=1234)

Multiple - multilevel

library(jomo)

jomoimp <- dti

jomoimp$GOSE <- factor(jomoimp$GOSE, ordered=TRUE)

# specify the level of each variable
lvl <- c(GOSE=1, 
         Pupils = 1, 
         mGCS = 1, 
         Age = 1, 
         ISS = 1, 
         Site = 2)

jomoimp <- data.frame(jomoimp) #so the subsetting within the function works

fml <- as.formula("GOSE~Pupils+mGCS+Age+ISS+(1|Site)")

jomo.chain <- jomo.clmm.MCMCchain(fml,data = jomoimp[,names(lvl)], level=lvl)

jomoimp <- jomo.clmm(fml,data = jomoimp[,names(lvl)], level=lvl, nimp = 5)

jomoimp.2 <- datalist2mids(split(jomoimp,
                                 jomoimp$Imputation)[-1])

Step 4 - Diagnostics

print("MICE")
## [1] "MICE"
plot(miceimp)

bgravesteijn::distr.na(complete(miceimp,1))
densityplot(miceimp)

print("JOMO")
## [1] "JOMO"
par(mfcol = c(2, 3), mar = c(3, 2.5, 0.5, 0.5), mgp = c(2, 0.6, 0))

apply(jomo.chain$collectbeta[1, ,], 1, plot, type = "l",
      xlab = 'iteration', ylab = '')
## NULL
for (k in 1:dim(jomo.chain$collectomega)[1]) {
  apply(jomo.chain$collectomega[k, , ], 1, plot, type = "l",
        xlab = 'iteration', ylab = '')
}

apply(jomo.chain$collectbetaY[1, ,], 1, plot, type = "l",
      xlab = 'iteration', ylab = '')

## NULL
plot(jomo.chain$collectvarY, type = 'l')

bgravesteijn::distr.na(complete(jomoimp.2,1))
densityplot(jomoimp.2)

Step 5 - Fitting

library(rms)
fit2 <- fit.mult.impute(GOSE~mGCS+Age+Pupils, fitter=lrm, xtrans = miceimp, data=dti)
## 
## Variance Inflation Factors Due to Imputation:
## 
##     y>=3     y>=4     y>=5     y>=6     y>=7     y>=8     mGCS      Age 
##     1.35     1.38     1.35     1.36     1.38     1.49     1.20     1.30 
## Pupils=1 Pupils=2 
##     1.10     1.19 
## 
## Rate of Missing Information:
## 
##     y>=3     y>=4     y>=5     y>=6     y>=7     y>=8     mGCS      Age 
##     0.26     0.27     0.26     0.26     0.28     0.33     0.16     0.23 
## Pupils=1 Pupils=2 
##     0.09     0.16 
## 
## d.f. for t-distribution for Tests of Single Coefficients:
## 
##     y>=3     y>=4     y>=5     y>=6     y>=7     y>=8     mGCS      Age 
##    58.86    53.56    60.35    57.13    52.56    37.41   150.10    73.84 
## Pupils=1 Pupils=2 
##   486.50   163.26 
## 
## The following fit components were averaged over the 5 model fits:
## 
##   stats linear.predictors
fit2
## Logistic Regression Model
##  
##  fit.mult.impute(formula = GOSE ~ mGCS + Age + Pupils, fitter = lrm, 
##      xtrans = miceimp, data = dti)
##  
##  
##  Frequencies of Responses
##  
##     2    3    4    5    6    7    8 
##   521  292  192  272  464  745 2036 
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs          4522    LR chi2    1464.45    R2       0.288    C       0.703    
##  max |deriv| 1e-09    d.f.             4    g        1.173    Dxy     0.406    
##                       Pr(> chi2) <0.0001    gr       3.233    gamma   0.408    
##                                             gp       0.194    tau-a   0.299    
##                                             Brier    0.120                     
##  
##           Coef    S.E.   Wald Z Pr(>|Z|)
##  y>=3      0.8848 0.1580   5.60 <0.0001 
##  y>=4      0.1476 0.1575   0.94 0.3486  
##  y>=5     -0.2216 0.1559  -1.42 0.1552  
##  y>=6     -0.6654 0.1573  -4.23 <0.0001 
##  y>=7     -1.2672 0.1602  -7.91 <0.0001 
##  y>=8     -2.0792 0.1686 -12.33 <0.0001 
##  mGCS      0.5453 0.0247  22.05 <0.0001 
##  Age      -0.0209 0.0016 -13.05 <0.0001 
##  Pupils=1 -0.9522 0.1555  -6.13 <0.0001 
##  Pupils=2 -1.5360 0.1273 -12.07 <0.0001 
##