## Read
filename <- "Data_Rezidential.RDS"
Data <- readRDS(filename)
Data$CYW <- ifelse(Data$CYW == 0, 0, Data$CYW - 1)
## Bar Plot
cat("### Frequencies for ACE Score")
### Frequencies for ACE Score
Data %>%
ggplot(aes(x = as.factor(CYW))) +
geom_bar(aes(y = (..count..)/sum(..count..))) +
geom_text(aes(y = ((..count..)/sum(..count..)), label = scales::percent((..count..)/sum(..count..))), stat = "count", vjust = -0.25) +
scale_y_continuous(labels = percent) +
labs(title = "ACE Score frequency", y = "Percent", x = "ACE Score")
## Is ACE Scoare Poisson distributed?
cat("### Does ACE Scoare follow a Poisson distribution?")
### Does ACE Scoare follow a Poisson distribution?
# Note that if the p value is larger than 0.05, we can not reject h0: the process is a Poisson process.
# Or else, it is not a Poisson process.
gf <- vcd::goodfit(Data$CYW, type = "poisson", method = "ML") # based on load the vcd package
# plot(gf, main = "Poisson", shade = TRUE, legend = FALSE)
# summary(gf) # Likelihood Ratio Test
## to automatically get the pvalue
gf.summary <- capture.output(summary(gf))[[7]] # if p-value is smaller pick [5] from list
pvalue <- unlist(strsplit(gf.summary, split = " "))
pvalue <- as.numeric(pvalue[length(pvalue)]);
cat("Goodness-of-fit test for poisson distribution p-value: ", round(pvalue, 3))
Goodness-of-fit test for poisson distribution p-value: 0
if(pvalue > .05) cat("Yes, it is Poisson") else cat("No, it is not Poisson")
No, it is not Poisson
## Rootograms
hroot_plot1 <- plot(gf, type = "hanging", shade = TRUE, main = "Hanging Rootogram", return_grob = TRUE) # hanging rootogram
droot_plot1 <- plot(gf, type = "deviation", shade = TRUE, main = "Deviation Rootogram", return_grob = TRUE) # deviation rootogram
subtext1 <- "Left: hanging rootogram; Right: deviation rootogram. Hanging rootogram Color reflects the sign and magnitude of the contributions
to lack of fit. moves the rootogram bars so their tops are at the expected frequencies for poisson distribution."
vcd::mplot(hroot_plot1, droot_plot1, sub = subtext1, gp_sub = grid::gpar(fontsize = 11))
## Poissonness plots
dist_plot1 <- vcd::distplot(Data$CYW, type = "poisson", xlab = "ACE", return_grob = TRUE)
subtext2 <- "The fitted line is not within the confidence intervals, indicating the Poisson model is not adequate for these data"
vcd::mplot(dist_plot1, sub = subtext2, gp_sub = grid::gpar(fontsize = 11))
## Negative Binomial?
cat("### Does ACE Scoare follow a Negative Binomial distribution?")
### Does ACE Scoare follow a Negative Binomial distribution?
gf2 <- vcd::goodfit(Data$CYW, type = "nbinomial")
# summary(gf2) # Likelihood Ratio Test
# plot(gf2, main = "Negative binomial", shade = TRUE, legend = FALSE)
dist_plot2 <- vcd::distplot(Data$CYW, type = "nbinomial", xlab = "ACE", return_grob = TRUE)
subtext3 <- "The fitted line is within the confidence intervals, indicating the adequacy of the Poisson model for these data"
vcd::mplot(dist_plot2, sub = subtext3, gp_sub = grid::gpar(fontsize = 11))
## Ord plots: Diagnostic slope and intercept for four discrete distributions
vcd::Ord_plot(Data$CYW, main = "Ord plot", gp = grid::gpar(cex = 1), pch = 16)
cat("#### Test a good liniar model")
#### Test a good liniar model
## Data for Linear Regression Step
Data_lm <-
Data %>% # recode v_mama_nastere to binary
mutate(v_mama_nastere_d = fct_recode(v_mama_nastere, "1" = "<19" , "0" = "20-25", "0" = "26–34", "0" = "35>")) %>%
mutate_at(vars(v_mama_nastere_d), funs(as.numeric(as.character(.)))) %>%
select(CYW, varsta, gen, 9:29, v_mama_nastere_d) %>%
mutate_at(vars(expunere_tox:comunit), funs(replace_na(., 0)))
library(gvlma)
library(olsrr)
## Linear Regression for Test - full model
mod_lm_full <- lm(CYW ~ expunere_tox + varsta_inst + tras_dez + schimb_dom + neglijare + varsta + boli, data = Data_lm)
moderndive::get_regression_table(mod_lm_full)
moderndive::get_regression_summaries(mod_lm_full)
par(mfrow = c(2, 2)); plot(mod_lm_full)
gvlma::gvlma(mod_lm_full)
Call:
lm(formula = CYW ~ expunere_tox + varsta_inst + tras_dez + schimb_dom +
neglijare + varsta + boli, data = Data_lm)
Coefficients:
(Intercept) expunere_tox varsta_inst tras_dez schimb_dom neglijare varsta boli
1.8865 0.9103 0.0810 1.1212 0.7154 0.5677 0.0968 0.6371
ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
Level of Significance = 0.05
Call:
gvlma::gvlma(x = mod_lm_full)
# Influential Observations -- Cook's D plot
# identify D values > 4/(n-k-1)
cutoff <- 4/((nrow(Data)-length(mod_lm_full$coefficients)-2))
plot(mod_lm_full, which = 4, cook.levels=cutoff)
# Influence Plot
car::influencePlot(mod_lm_full, main = "Influence Plot", sub = "Circle size is proportial to Cook's Distance")
# Evaluate Collinearity
olsrr::ols_coll_diag(mod_lm_full) # VIF si Tolerance din olsrr
Tolerance and Variance Inflation Factor
---------------------------------------
Eigenvalue and Condition Index
------------------------------
car::vif(mod_lm_full) # variance inflation factors
expunere_tox varsta_inst tras_dez schimb_dom neglijare varsta boli
1.085937 1.128838 1.082916 1.067616 1.047941 1.138849 1.074788
sqrt(vif(mod_lm_full)) > 2 # problem?
expunere_tox varsta_inst tras_dez schimb_dom neglijare varsta boli
FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Evaluate Nonlinearity
# component + residual plot
car::crPlots(mod_lm_full, ask = FALSE)
# Ceres plots
# car::ceresPlots(mod_lm_full, ask = FALSE)
Data_var_imp <-
Data %>% # recode v_mama_nastere to binary
select(CYW, varsta, gen, 9:29, v_mama_nastere, tip_chestionar) %>%
mutate_at(vars(expunere_tox:comunit), funs(replace_na(., 0)))
with(Data_var_imp,
by(CYW, INDICES = v_mama_nastere, FUN = summarytools::descr, transpose = TRUE,
stats = c("n.valid", "mean", "sd", "min", "med", "max", "skewness", "kurtosis"), plain.ascii = FALSE, headings = FALSE)) %>%
view(method = "render", style = "rmarkdown", footnote = NA)
N.Valid | Mean | Std.Dev | Min | Median | Max | Skewness | Kurtosis | |
---|---|---|---|---|---|---|---|---|
<19 | 157 | 4.61 | 3.44 | 0.00 | 4.00 | 17.00 | 0.76 | 0.16 |
20-25 | 497 | 4.84 | 3.24 | 0.00 | 5.00 | 18.00 | 0.46 | -0.15 |
2634 | 369 | 4.18 | 2.84 | 0.00 | 4.00 | 13.00 | 0.33 | -0.53 |
35> | 115 | 4.60 | 2.66 | 0.00 | 5.00 | 13.00 | 0.50 | 0.06 |
ggplot(Data_var_imp, aes(x = v_mama_nastere, y = CYW)) +
geom_boxplot() +
stat_summary(fun.data = mean_se, colour = "darkred") +
ggpubr::stat_compare_means(method = "t.test",
label = "p.signif", # to avoid scientific notation of very small p-values
paired = FALSE,
comparisons = list(c("<19", "20-25"),
c("<19", "26–34"),
c("20-25", "26–34"),
c("20-25", "35>"),
c("26–34", "35>")))
ggplot(Data_var_imp, aes(x = gen, y = CYW)) +
facet_wrap(~v_mama_nastere) +
geom_boxplot() +
stat_summary(fun.data = mean_se, colour = "darkred") +
ggpubr::stat_compare_means(method = "t.test",
label = "p.signif", # to avoid scientific notation of very small p-values
paired = FALSE,
comparisons = list(c("m", "f")))
with(Data_var_imp,
by(CYW, INDICES = gen, FUN = summarytools::descr, transpose = TRUE,
stats = c("n.valid", "mean", "sd", "min", "med", "max", "skewness", "kurtosis"), plain.ascii = FALSE, headings = FALSE)) %>%
view(method = "render", style = "rmarkdown", footnote = NA)
N.Valid | Mean | Std.Dev | Min | Median | Max | Skewness | Kurtosis | |
---|---|---|---|---|---|---|---|---|
f | 666 | 4.64 | 3.20 | 0.00 | 4.00 | 18.00 | 0.50 | -0.13 |
m | 609 | 4.54 | 3.13 | 0.00 | 4.00 | 15.00 | 0.48 | -0.32 |
tadaatoolbox::tadaa_t.test(data = Data_var_imp,
response = CYW, group = gen, paired = FALSE) # , print = "markdown"
ggplot(Data_var_imp, aes(x = gen, y = CYW)) +
geom_boxplot() +
stat_summary(fun.data = mean_se, colour = "darkred") +
ggpubr::stat_compare_means(method = "t.test",
label = "p.signif", # to avoid scientific notation of very small p-values
paired = FALSE,
comparisons = list(c("f", "m")))
with(Data_var_imp,
by(CYW, INDICES = tip_chestionar, FUN = summarytools::descr, transpose = TRUE,
stats = c("n.valid", "mean", "sd", "min", "med", "max", "skewness", "kurtosis"), plain.ascii = FALSE, headings = FALSE)) %>%
view(method = "render", style = "rmarkdown", footnote = NA)
N.Valid | Mean | Std.Dev | Min | Median | Max | Skewness | Kurtosis | |
---|---|---|---|---|---|---|---|---|
5-8ani | 152 | 3.78 | 2.66 | 0.00 | 3.50 | 12.00 | 0.39 | -0.45 |
5-8intarziere | 78 | 6.13 | 3.20 | 0.00 | 6.00 | 13.00 | -0.32 | -0.39 |
9-18ani | 1045 | 4.60 | 3.19 | 0.00 | 4.00 | 18.00 | 0.54 | -0.14 |
t.test(CYW ~ tip_chestionar, data = Data_var_imp[Data_var_imp$tip_chestionar %in% c("5-8ani", "5-8intarziere"),])
Welch Two Sample t-test
data: CYW by tip_chestionar
t = -5.5818, df = 132.89, p-value = 0.0000001286
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-3.185314 -1.518465
sample estimates:
mean in group 5-8ani mean in group 5-8intarziere
3.776316 6.128205
tadaatoolbox::tadaa_t.test(data = Data_var_imp[Data_var_imp$tip_chestionar %in% c("5-8ani", "5-8intarziere"),],
response = CYW, group = tip_chestionar, paired = FALSE) # , print = "markdown"
t.test(CYW ~ tip_chestionar, data = Data_var_imp[Data_var_imp$tip_chestionar %in% c("5-8intarziere", "9-18ani"),]) # this works, tadaatoolbox bug
Welch Two Sample t-test
data: CYW by tip_chestionar
t = 4.0759, df = 88.871, p-value = 0.00009951
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
0.7836904 2.2746337
sample estimates:
mean in group 5-8intarziere mean in group 9-18ani
6.128205 4.599043
# tadaatoolbox::tadaa_t.test(data = Data_var_imp[Data_var_imp$tip_chestionar %in% c("5-8intarziere", "9-18ani"),],
# response = CYW, group = tip_chestionar, paired = FALSE) # , print = "markdown"
#
t.test(CYW ~ tip_chestionar, data = Data_var_imp[Data_var_imp$tip_chestionar %in% c("5-8ani", "9-18ani"),]) # this works, tadaatoolbox bug
Welch Two Sample t-test
data: CYW by tip_chestionar
t = -3.467, df = 219.6, p-value = 0.0006332
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-1.2904142 -0.3550403
sample estimates:
mean in group 5-8ani mean in group 9-18ani
3.776316 4.599043
# tadaatoolbox::tadaa_t.test(data = Data_var_imp[Data_var_imp$tip_chestionar %in% c("5-8ani", "9-18ani"),],
# response = CYW, group = tip_chestionar, paired = FALSE) # , print = "markdown"
ggplot(Data_var_imp, aes(x = tip_chestionar, y = CYW)) +
geom_boxplot() +
stat_summary(fun.data = mean_se, colour = "darkred") +
ggpubr::stat_compare_means(method = "t.test",
label = "p.signif", # to avoid scientific notation of very small p-values
paired = FALSE,
comparisons = list(c("5-8ani", "5-8intarziere"),
c("5-8intarziere", "9-18ani"),
c("5-8ani", "9-18ani")))
## Poisson Regression Step
step_pois_null <- glm(CYW ~ 1, family = poisson, data = Data_lm_step)
step_pois_full <- glm(CYW ~ ., family = poisson, data = Data_lm_step)
cat("#### Poisson - Bidirectional step by BIC")
#### Poisson - Bidirectional step by BIC
mod_pois_BIC <- step(step_pois_null, scope=list(lower=formula(step_pois_null), upper=formula(step_pois_full)), direction="both", k = log(nrow(Data_lm_step)), trace = FALSE)
summary(mod_pois_BIC)
Call:
glm(formula = CYW ~ expunere_tox + varsta_inst + tras_dez + schimb_dom +
neglijare + varsta + boli + comunit + nr_frati, family = poisson,
data = Data_lm_step)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.9623 -1.0943 -0.1427 0.8560 3.4029
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.817902 0.073330 11.154 < 0.0000000000000002 ***
expunere_tox 0.159382 0.033891 4.703 0.00000256711 ***
varsta_inst 0.024757 0.004187 5.913 0.00000000335 ***
tras_dez 0.189401 0.052076 3.637 0.000276 ***
schimb_dom 0.146103 0.035932 4.066 0.00004779879 ***
neglijare 0.130465 0.031784 4.105 0.00004047885 ***
varsta 0.017343 0.005047 3.437 0.000589 ***
boli 0.150929 0.045275 3.334 0.000857 ***
comunit 0.105935 0.036907 2.870 0.004101 **
nr_frati 0.015987 0.005643 2.833 0.004609 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 2165.1 on 929 degrees of freedom
Residual deviance: 1938.7 on 920 degrees of freedom
AIC: 4776.7
Number of Fisher Scoring iterations: 5
# summary(step(step_pois_full, ~.^2, direction = "both", k = log(nrow(Data_lm_step)), trace = FALSE)) # step for all terms and all interactions !!!
cat("#### Poisson - Bidirectional step by AIC")
#### Poisson - Bidirectional step by AIC
mod_pois_AIC <- step(step_pois_null, scope=list(lower=formula(step_pois_null), upper=formula(step_pois_full)), direction="both", trace = FALSE)
summary(mod_pois_AIC)
Call:
glm(formula = CYW ~ expunere_tox + varsta_inst + schimb_dom +
neglijare + varsta + boli + comunit + nr_frati + v_mama_nastere +
TCC + gen + temperam + scoala_spec + intarziere + asfixie +
tulb_cond + abuz_sub, family = poisson, data = Data_lm_step)
Deviance Residuals:
Min 1Q Median 3Q Max
-4.2716 -1.0910 -0.1607 0.8802 3.4389
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.786797 0.085342 9.219 < 0.0000000000000002 ***
expunere_tox 0.136271 0.034537 3.946 0.0000795897 ***
varsta_inst 0.023153 0.004238 5.463 0.0000000469 ***
schimb_dom 0.137522 0.036335 3.785 0.000154 ***
neglijare 0.143642 0.032056 4.481 0.0000074308 ***
varsta 0.020166 0.005173 3.898 0.0000969802 ***
boli 0.131460 0.046740 2.813 0.004914 **
comunit 0.085488 0.037782 2.263 0.023657 *
nr_frati 0.019418 0.005775 3.362 0.000773 ***
v_mama_nastere20-25 0.065748 0.049539 1.327 0.184444
v_mama_nastere26–34 -0.054876 0.053270 -1.030 0.302933
v_mama_nastere35> -0.040525 0.066041 -0.614 0.539458
TCC 0.179279 0.101340 1.769 0.076880 .
genm -0.063241 0.031126 -2.032 0.042177 *
temperam 0.079530 0.046975 1.693 0.090448 .
scoala_spec -0.237303 0.075600 -3.139 0.001696 **
intarziere 0.106579 0.047019 2.267 0.023409 *
asfixie 0.223835 0.105030 2.131 0.033077 *
tulb_cond 0.120407 0.050427 2.388 0.016952 *
abuz_sub -0.117746 0.072784 -1.618 0.105715
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 2165.1 on 929 degrees of freedom
Residual deviance: 1900.5 on 910 degrees of freedom
AIC: 4758.6
Number of Fisher Scoring iterations: 5
# summary(step(step_pois_full, ~.^2, direction = "both", trace = FALSE)) # step for all terms and all interactions !!!
## Data for GLMs
Data_glm <-
Data %>% # recode v_mama_nastere to binary
mutate(v_mama_nastere_d = fct_recode(v_mama_nastere, "1" = "<19" , "0" = "20-25", "0" = "26–34", "0" = "35>")) %>%
mutate_at(vars(v_mama_nastere_d), funs(as.numeric(as.character(.)))) %>%
select(CYW, varsta, gen, 9:29, v_mama_nastere_d) %>%
mutate_at(vars(expunere_tox:comunit), funs(replace_na(., 0)))
## GLM - Poisson
cat("#### Test a good poisson model")
#### Test a good poisson model
# mod_pois <- glm(CYW ~ expunere_tox + varsta_inst + tras_dez + schimb_dom + boli + neglijare + varsta +
# nr_frati + TCC, family = poisson, data = Data_glm) # fisrt decent model
mod_pois <- glm(CYW ~ expunere_tox + varsta_inst + schimb_dom + boli + varsta +
nr_frati + gen + comunit + intarziere + TCC + neglijare +
scoala_spec + tulb_cond, family = poisson, data = Data_glm) # best possible model
summary(mod_pois) # tidy(mod_pois) %>% mutate_if(is.numeric, round, 2) %>% xlsx::write.xlsx(., file = "pois.xlsx")
Call:
glm(formula = CYW ~ expunere_tox + varsta_inst + schimb_dom +
boli + varsta + nr_frati + gen + comunit + intarziere + TCC +
neglijare + scoala_spec + tulb_cond, family = poisson, data = Data_glm)
Deviance Residuals:
Min 1Q Median 3Q Max
-4.1038 -1.1869 -0.1411 0.8357 4.4377
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.874702 0.070492 12.409 < 0.0000000000000002 ***
expunere_tox 0.139297 0.031750 4.387 0.000011479 ***
varsta_inst 0.015875 0.003743 4.241 0.000022227 ***
schimb_dom 0.130945 0.033140 3.951 0.000077725 ***
boli 0.098082 0.043801 2.239 0.025140 *
varsta 0.023857 0.004688 5.089 0.000000359 ***
nr_frati 0.011953 0.005202 2.298 0.021579 *
genm -0.059882 0.028346 -2.113 0.034643 *
comunit 0.085733 0.034140 2.511 0.012032 *
intarziere 0.128884 0.043780 2.944 0.003241 **
TCC 0.219910 0.083426 2.636 0.008389 **
neglijare 0.113752 0.028729 3.959 0.000075111 ***
scoala_spec -0.217585 0.065682 -3.313 0.000924 ***
tulb_cond 0.148534 0.040335 3.683 0.000231 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 2666.1 on 1106 degrees of freedom
Residual deviance: 2416.2 on 1093 degrees of freedom
(168 observations deleted due to missingness)
AIC: 5800.8
Number of Fisher Scoring iterations: 5
plot(mod_pois, ask = FALSE)
# pr <- sum(residuals(mod_pois, type="pearson")^2) # Pearson Chi2
# pr/mod_pois$df.residual # dispersion statistic
msme:::P__disp(mod_pois) # commented the 2 lines above, this function does both
pearson.chi2 dispersion
2145.03392 1.96252
dev <- deviance(mod_pois)
df <- df.residual(mod_pois)
p_value <- 1-pchisq(dev,df)
print(matrix(c("Deviance GOF"," ","D",round(dev,4),"df",df, # the deviance GOF test, a Chi2 p < 0.05 indicates that the model is considered well fit
"p_value",p_value), ncol=2))
[,1] [,2]
[1,] "Deviance GOF" "df"
[2,] " " "1093"
[3,] "D" "p_value"
[4,] "2416.2099" "0"
# these assess the overall performance of a model in reproducing the data. The commonly used measures include the Pearson chi-square and likelihoodratio
# deviance statistics, which can be seen as weighted sums of residuals.
COUNT::modelfit(mod_pois)
$AIC
[1] 5800.816
$AICn
[1] 5.240123
$BIC
[1] 5870.948
$BICqh
[1] 5.28158
# cnt <- table(Data_lm$CYW)
# dataf <- data.frame(prop.table(table(Data_lm$CYW) ) )
# dataf$cumulative <- cumsum(dataf$Freq)
# datafall <- data.frame(cnt, dataf$Freq*100, dataf$cumulative * 100)
mod_pois$aic / (mod_pois$df.null+1) # AIC/n
[1] 5.240123
exp(coef(mod_pois)) # IRR
(Intercept) expunere_tox varsta_inst schimb_dom boli varsta nr_frati genm comunit intarziere TCC
2.3981606 1.1494650 1.0160019 1.1399049 1.1030528 1.0241442 1.0120251 0.9418756 1.0895156 1.1375582 1.2459646
neglijare scoala_spec tulb_cond
1.1204746 0.8044595 1.1601325
exp(coef(mod_pois))*sqrt(diag(vcov(mod_pois))) # delta method
(Intercept) expunere_tox varsta_inst schimb_dom boli varsta nr_frati genm comunit intarziere TCC
0.169051093 0.036495835 0.003802940 0.037775996 0.048315120 0.004800822 0.005264885 0.026698830 0.037196372 0.049802209 0.103946225
neglijare scoala_spec tulb_cond
0.032190189 0.052838615 0.046793765
exp(confint.default(mod_pois)) # CI of IRR
2.5 % 97.5 %
(Intercept) 2.0886967 2.7534751
expunere_tox 1.0801146 1.2232680
varsta_inst 1.0085755 1.0234829
schimb_dom 1.0682186 1.2164019
boli 1.0123078 1.2019324
varsta 1.0147779 1.0335970
nr_frati 1.0017585 1.0223968
genm 0.8909740 0.9956853
comunit 1.0189977 1.1649136
intarziere 1.0440182 1.2394789
TCC 1.0580181 1.4672979
neglijare 1.0591264 1.1853763
scoala_spec 0.7072866 0.9149827
tulb_cond 1.0719500 1.2555693
## Test for overdispersion
# Z-Score Test (assumtions: The data set on which the test is used is large. z is t-distributed.), ns = overdispersed
mu <-predict(mod_pois, type="response")
z <- ((Data_glm$CYW - mu)^2 - Data_glm$CYW)/ (mu * sqrt(2))
summary(zscore <- lm(z ~ 1)) # the hypothesis of no overdispersion is rejected (i.e., that it is likely that real overdispersion exists in the data)
Call:
lm(formula = z ~ 1)
Residuals:
Min 1Q Median 3Q Max
-1.7800 -1.5225 -0.7561 0.8374 19.2377
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.02212 0.06267 16.31 <0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.238 on 1274 degrees of freedom
# Lagrange Multiplier Test, ns = overdispersed
obs <- nrow(Data_glm) # continue from Table 3.2
mmu <- mean(mu); nybar <- obs*mmu; musq <- mu*mu
mu2 <- mean(musq)*obs
chival <- (mu2 - nybar)^2/(2*mu2); chival
[1] 9217.919
pchisq(chival,1,lower.tail = FALSE) # the hypothesis of no overdispersion is again rejected
[1] 0
# Many statisticians argue that robust standard errors should be the default standard errors for all count response regression models.
# A robust variance estimator adjusts standard errors for correlation in the data. That is, robust standard
# errors should be used when the data are not independent, perhaps gathered
# over different households, hospitals, schools, cities, litters, and so forth.
# Robust variance estimators have also been referred to as sandwich variance
# estimators or heteroskedastic robust estimators.
# Lack of fit in a GLM for count data can result either from a mis-specified model for the systematic
# component (omitted or unmeasured predictors, nonlinear relations, etc.) or from failure of the
# Poisson mean = variance assumption. Thus, use of these methods requires some high degree of
# confidence that the systematic part of the model has been correctly specified, so that any lack of fit
# can be attributed to overdispersion.
# One way of dealing with this is to base inference on so-called sandwich covariance estimators
# that are robust against some types of model mis-specification.
require("sandwich") # over-dispersion is present in this data set, we re-compute the Wald tests using sandwich standard errors
sandw_coefse <- lmtest::coeftest(mod_pois, vcov = sandwich) # sandwich-adjusted Poisson
sandw_coefse
z test of coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.8747020 0.0964586 9.0682 < 0.00000000000000022 ***
expunere_tox 0.1392966 0.0424894 3.2784 0.0010440 **
varsta_inst 0.0158752 0.0056217 2.8239 0.0047442 **
schimb_dom 0.1309448 0.0430544 3.0414 0.0023549 **
boli 0.0980816 0.0596660 1.6438 0.1002080
varsta 0.0238574 0.0063528 3.7554 0.0001731 ***
nr_frati 0.0119533 0.0072897 1.6398 0.1010544
genm -0.0598821 0.0387525 -1.5452 0.1222875
comunit 0.0857332 0.0437573 1.9593 0.0500791 .
intarziere 0.1288840 0.0616466 2.0907 0.0365559 *
TCC 0.2199100 0.1162442 1.8918 0.0585185 .
neglijare 0.1137524 0.0400693 2.8389 0.0045271 **
scoala_spec -0.2175847 0.1003456 -2.1684 0.0301319 *
tulb_cond 0.1485342 0.0554158 2.6804 0.0073543 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
sandw_ci <- lmtest::coefci(mod_pois, vcov = sandwich)
exp(sandw_ci) # CI of sandwich-adjusted IRR
2.5 % 97.5 %
(Intercept) 1.9850552 2.8972365
expunere_tox 1.0576177 1.2492885
varsta_inst 1.0048687 1.0272585
schimb_dom 1.0476607 1.2402709
boli 0.9813153 1.2398926
varsta 1.0114714 1.0369759
nr_frati 0.9976686 1.0265882
genm 0.8729861 1.0162013
comunit 0.9999704 1.1870794
intarziere 1.0080914 1.2836521
TCC 0.9921069 1.5647787
neglijare 1.0358455 1.2120180
scoala_spec 0.6608301 0.9793063
tulb_cond 1.0407289 1.2932354
library(effects)
plot(allEffects(mod_pois), band.colors = "blue", lwd = 3, ylab = "ACE Score", main = "", rows=5, cols=3) # plot meta-array: rows=5, cols=3
## GLM - Quasi Poisson
# In R, Poisson models with scaled standard errors are called quasipoisson:
# A Pearson dispersion in excess of 1.0 indicates likely Poisson model
# overdispersion. Whether the overdispersion is significant depends on
# (1) the value of the dispersion statistic, (2) the number of observations
# in the model, and (3) the structure of the data; for example, if the data
# are highly unbalanced.
# mod_qpois <- glm(CYW ~ expunere_tox + varsta_inst + tras_dez + schimb_dom + boli + neglijare + varsta +
# nr_frati + TCC, family = quasipoisson, data = Data_glm) # first decent model
mod_qpois <- glm(CYW ~ expunere_tox + varsta_inst + schimb_dom + boli + varsta +
nr_frati + gen + comunit + intarziere + TCC + neglijare +
scoala_spec + tulb_cond, family = quasipoisson, data = Data_glm) # best possible model
summary(mod_qpois) # Dispersion parameter for quasipoisson family taken to be 1.976578 -- is > 1
Call:
glm(formula = CYW ~ expunere_tox + varsta_inst + schimb_dom +
boli + varsta + nr_frati + gen + comunit + intarziere + TCC +
neglijare + scoala_spec + tulb_cond, family = quasipoisson,
data = Data_glm)
Deviance Residuals:
Min 1Q Median 3Q Max
-4.1038 -1.1869 -0.1411 0.8357 4.4377
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.874702 0.098752 8.858 < 0.0000000000000002 ***
expunere_tox 0.139297 0.044479 3.132 0.001784 **
varsta_inst 0.015875 0.005244 3.028 0.002523 **
schimb_dom 0.130945 0.046425 2.821 0.004881 **
boli 0.098082 0.061361 1.598 0.110236
varsta 0.023857 0.006567 3.633 0.000293 ***
nr_frati 0.011953 0.007288 1.640 0.101261
genm -0.059882 0.039711 -1.508 0.131853
comunit 0.085733 0.047827 1.793 0.073319 .
intarziere 0.128884 0.061331 2.101 0.035831 *
TCC 0.219910 0.116872 1.882 0.060151 .
neglijare 0.113752 0.040247 2.826 0.004793 **
scoala_spec -0.217585 0.092014 -2.365 0.018219 *
tulb_cond 0.148534 0.056505 2.629 0.008692 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for quasipoisson family taken to be 1.96252)
Null deviance: 2666.1 on 1106 degrees of freedom
Residual deviance: 2416.2 on 1093 degrees of freedom
(168 observations deleted due to missingness)
AIC: NA
Number of Fisher Scoring iterations: 5
# over-dispersion can be confirmed by comparison of the log-likelihoods of the Poisson and negative binomial model
# tidy(mod_qpois) %>% mutate_if(is.numeric, round, 2) %>% xlsx::write.xlsx(., file = "qpois.xlsx")
exp(coef(mod_qpois)) # IRR
(Intercept) expunere_tox varsta_inst schimb_dom boli varsta nr_frati genm comunit intarziere TCC
2.3981606 1.1494650 1.0160019 1.1399049 1.1030528 1.0241442 1.0120251 0.9418756 1.0895156 1.1375582 1.2459646
neglijare scoala_spec tulb_cond
1.1204746 0.8044595 1.1601325
exp(coef(mod_qpois))*sqrt(diag(vcov(mod_qpois))) # delta method
(Intercept) expunere_tox varsta_inst schimb_dom boli varsta nr_frati genm comunit intarziere TCC
0.236823632 0.051127006 0.005327538 0.052920383 0.067684638 0.006725471 0.007375576 0.037402384 0.052108388 0.069767902 0.145618239
neglijare scoala_spec tulb_cond
0.045095227 0.074021602 0.065553372
exp(confint.default(mod_qpois)) # CI of IRR
2.5 % 97.5 %
(Intercept) 1.9761515 2.9102902
expunere_tox 1.0535015 1.2541697
varsta_inst 1.0056136 1.0264975
schimb_dom 1.0407618 1.2484923
boli 0.9780602 1.2440191
varsta 1.0110470 1.0374111
nr_frati 0.9976720 1.0265847
genm 0.8713485 1.0181112
comunit 0.9920257 1.1965861
intarziere 1.0087146 1.2828589
TCC 0.9908871 1.5667050
neglijare 1.0354857 1.2124391
scoala_spec 0.6717097 0.9634445
tulb_cond 1.0385093 1.2959994
## Test if there is overdispersion in BIC step selected model (best fitting) --- overdispersion still present
# mod_qpoisBIC <- glm(CYW ~ expunere_tox + varsta_inst + tras_dez + schimb_dom + neglijare +
# varsta + boli + comunit, family = quasipoisson, data = Data_glm) # best BIC
# summary(mod_qpoisBIC)
# Step selection NB Regression
step_glm_nb_full <- MASS::glm.nb(CYW ~ ., data = Data_lm_step)
cat("#### NB GLM - Bidirectional step by AIC")
#### NB GLM - Bidirectional step by AIC
mod_nb_AIC <- MASS::stepAIC(step_glm_nb_full, direction = "both", trace = FALSE)
summary(mod_nb_AIC)
Call:
MASS::glm.nb(formula = CYW ~ varsta + gen + varsta_inst + nr_frati +
expunere_tox + boli + TCC + intarziere + tras_dez + neglijare +
temperam + scoala_spec + schimb_dom + comunit, data = Data_lm_step,
init.theta = 4.675848344, link = log)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.1972 -0.8322 -0.0907 0.6144 2.1442
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.792959 0.106074 7.476 0.0000000000000769 ***
varsta 0.021459 0.007186 2.986 0.002824 **
genm -0.064499 0.043807 -1.472 0.140931
varsta_inst 0.023296 0.006062 3.843 0.000122 ***
nr_frati 0.015695 0.008152 1.925 0.054193 .
expunere_tox 0.149244 0.049391 3.022 0.002514 **
boli 0.132322 0.068679 1.927 0.054022 .
TCC 0.243798 0.145721 1.673 0.094317 .
intarziere 0.116584 0.068857 1.693 0.090432 .
tras_dez 0.142578 0.085173 1.674 0.094134 .
neglijare 0.131975 0.044996 2.933 0.003357 **
temperam 0.096056 0.063181 1.520 0.128426
scoala_spec -0.206528 0.104283 -1.980 0.047651 *
schimb_dom 0.133804 0.052676 2.540 0.011081 *
comunit 0.091172 0.054269 1.680 0.092958 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for Negative Binomial(4.6758) family taken to be 1)
Null deviance: 1231.5 on 929 degrees of freedom
Residual deviance: 1106.3 on 915 degrees of freedom
AIC: 4556.5
Number of Fisher Scoring iterations: 1
Theta: 4.676
Std. Err.: 0.497
2 x log-likelihood: -4524.514
cat("#### NB GLM - Bidirectional step by BIC")
#### NB GLM - Bidirectional step by BIC
mod_nb_BIC <- MASS::stepAIC(step_glm_nb_full, direction = "both", k = log(nrow(Data_lm_step)), trace = FALSE)
summary(mod_nb_BIC)
Call:
MASS::glm.nb(formula = CYW ~ varsta + varsta_inst + expunere_tox +
tras_dez + neglijare + schimb_dom, data = Data_lm_step, init.theta = 4.368457033,
link = log)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.0604 -0.8216 -0.1146 0.5956 2.2701
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.877921 0.099373 8.835 < 0.0000000000000002 ***
varsta 0.020118 0.007127 2.823 0.004761 **
varsta_inst 0.023193 0.006071 3.821 0.000133 ***
expunere_tox 0.210956 0.047733 4.420 0.00000989 ***
tras_dez 0.235963 0.079463 2.969 0.002983 **
neglijare 0.135866 0.045429 2.991 0.002783 **
schimb_dom 0.155043 0.052708 2.942 0.003266 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for Negative Binomial(4.3685) family taken to be 1)
Null deviance: 1198.2 on 929 degrees of freedom
Residual deviance: 1102.6 on 923 degrees of freedom
AIC: 4566.2
Number of Fisher Scoring iterations: 1
Theta: 4.368
Std. Err.: 0.448
2 x log-likelihood: -4550.245
# mod_nb <- MASS::glm.nb(CYW ~ varsta + varsta_inst + expunere_tox + TCC + intarziere +
# neglijare + temperam + scoala_spec + schimb_dom, data = Data_glm) # a good model on Poisson and NB
# summary(mod_nb)
## Negative Binomial GLM - a good model
cat("#### Test a good NB model")
#### Test a good NB model
# NB2
mod_nb2 <- MASS::glm.nb(CYW ~ expunere_tox + varsta_inst + schimb_dom +
varsta + intarziere + TCC + neglijare + scoala_spec + tulb_cond,
data = Data_glm)
summary(mod_nb2) # tidy(mod_nb2) %>% mutate_if(is.numeric, round, 2) %>% xlsx::write.xlsx(., file = "mod_nb2.xlsx")
Call:
MASS::glm.nb(formula = CYW ~ expunere_tox + varsta_inst + schimb_dom +
varsta + intarziere + TCC + neglijare + scoala_spec + tulb_cond,
data = Data_glm, init.theta = 3.853972983, link = log)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.0243 -0.8670 -0.1277 0.5529 2.6456
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.858096 0.093796 9.149 < 0.0000000000000002 ***
expunere_tox 0.182474 0.046301 3.941 0.0000811 ***
varsta_inst 0.015630 0.005415 2.886 0.00390 **
schimb_dom 0.140069 0.049631 2.822 0.00477 **
varsta 0.026321 0.006679 3.941 0.0000812 ***
intarziere 0.178918 0.065683 2.724 0.00645 **
TCC 0.296167 0.136563 2.169 0.03010 *
neglijare 0.129852 0.041878 3.101 0.00193 **
scoala_spec -0.231707 0.094070 -2.463 0.01377 *
tulb_cond 0.155874 0.060333 2.584 0.00978 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for Negative Binomial(3.854) family taken to be 1)
Null deviance: 1483.7 on 1157 degrees of freedom
Residual deviance: 1370.0 on 1148 degrees of freedom
(117 observations deleted due to missingness)
AIC: 5736.4
Number of Fisher Scoring iterations: 1
Theta: 3.854
Std. Err.: 0.333
2 x log-likelihood: -5714.372
exp(coef(mod_nb2)); # exp(coef(mod_nb2))*sqrt(diag(vcov(mod_nb2))) # adjust SE???, not here
(Intercept) expunere_tox varsta_inst schimb_dom varsta intarziere TCC neglijare scoala_spec tulb_cond
2.3586654 1.2001827 1.0157523 1.1503535 1.0266700 1.1959221 1.3446951 1.1386603 0.7931786 1.1686793
exp(confint.default(mod_nb2))
2.5 % 97.5 %
(Intercept) 1.9625789 2.8346899
expunere_tox 1.0960647 1.3141911
varsta_inst 1.0050279 1.0265911
schimb_dom 1.0437240 1.2678765
varsta 1.0133183 1.0401977
intarziere 1.0514624 1.3602291
TCC 1.0289180 1.7573847
neglijare 1.0489332 1.2360627
scoala_spec 0.6596274 0.9537693
tulb_cond 1.0383405 1.3153791
mod_pois_new <- glm(formula = CYW ~ expunere_tox + varsta_inst + schimb_dom +
varsta + intarziere + TCC + neglijare + scoala_spec + tulb_cond,
family = poisson, data = Data_glm)
summary(mod_pois_new) # tidy(mod_pois_new) %>% mutate_if(is.numeric, round, 2) %>% xlsx::write.xlsx(., file = "mod_pois_new.xlsx")
Call:
glm(formula = CYW ~ expunere_tox + varsta_inst + schimb_dom +
varsta + intarziere + TCC + neglijare + scoala_spec + tulb_cond,
family = poisson, data = Data_glm)
Deviance Residuals:
Min 1Q Median 3Q Max
-4.1411 -1.2151 -0.1879 0.8585 4.5745
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.869963 0.064547 13.478 < 0.0000000000000002 ***
expunere_tox 0.176596 0.030316 5.825 0.0000000057 ***
varsta_inst 0.015592 0.003594 4.338 0.0000143585 ***
schimb_dom 0.144852 0.032228 4.495 0.0000069713 ***
varsta 0.025715 0.004551 5.651 0.0000000160 ***
intarziere 0.165831 0.042378 3.913 0.0000911183 ***
TCC 0.269712 0.081106 3.325 0.000883 ***
neglijare 0.129210 0.028203 4.582 0.0000046162 ***
scoala_spec -0.220913 0.063488 -3.480 0.000502 ***
tulb_cond 0.151754 0.038747 3.916 0.0000898524 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 2833.2 on 1157 degrees of freedom
Residual deviance: 2582.8 on 1148 degrees of freedom
(117 observations deleted due to missingness)
AIC: 6094
Number of Fisher Scoring iterations: 5
exp(coef(mod_pois_new))
(Intercept) expunere_tox varsta_inst schimb_dom varsta intarziere TCC neglijare scoala_spec tulb_cond
2.3868231 1.1931493 1.0157145 1.1558685 1.0260483 1.1803735 1.3095875 1.1379295 0.8017862 1.1638733
exp(confint.default(mod_pois_new))
2.5 % 97.5 %
(Intercept) 2.1031870 2.7087105
expunere_tox 1.1243206 1.2661915
varsta_inst 1.0085846 1.0228947
schimb_dom 1.0851148 1.2312357
varsta 1.0169375 1.0352406
intarziere 1.0862928 1.2826022
TCC 1.1171117 1.5352265
neglijare 1.0767362 1.2026006
scoala_spec 0.7079749 0.9080281
tulb_cond 1.0787575 1.2557050
lmtest::lrtest(mod_pois_new, mod_nb2) # likelihood ratio test
Likelihood ratio test
Model 1: CYW ~ expunere_tox + varsta_inst + schimb_dom + varsta + intarziere +
TCC + neglijare + scoala_spec + tulb_cond
Model 2: CYW ~ expunere_tox + varsta_inst + schimb_dom + varsta + intarziere +
TCC + neglijare + scoala_spec + tulb_cond
#Df LogLik Df Chisq Pr(>Chisq)
1 10 -3037.0
2 11 -2857.2 1 359.65 < 0.00000000000000022 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
## ATTENTION AT DISPERSION PARAM -- nbinomial has alpha (direct rel), glm.nb has theta (indirect rel)
# ATTENTION -- when using na.omit do it on dataset that has only the predictors and outcome from the model to not exclude other cases
# NB2 with alpha dispersion param instead of theta
# mod_nb2 <- msme::nbinomial(CYW ~ expunere_tox + varsta_inst + tras_dez + schimb_dom + boli + neglijare + varsta +
# nr_frati + TCC, data = na.omit(Data_glm))
# summary(mod_nb2)
## NB1
# library(gamlss)
# mod_nb1 <- gamlss::gamlss(formula = CYW ~ expunere_tox + varsta_inst + tras_dez + schimb_dom + boli + neglijare + varsta +
# nr_frati + TCC, family = NBI, data = na.omit(Data_glm))
# summary(mod_nb1); plot(mod_nb1)
# lmtest::lrtest(mod_nb1, mod_nb2) # likelihood ratio test
# https://data.library.virginia.edu/getting-started-with-negative-binomial-regression-modeling/
# install.packages("countreg", repos="http://R-Forge.R-project.org") # Zeileis and Kleiber, 2014 not on CRAN
library(countreg)
countreg::rootogram(mod_pois_new, ylim = c(-7, 18), main = "Poisson") # rootogram on the fitted model objects (not possible in vcd::rootogram)
countreg::rootogram(mod_nb2, ylim = c(-7, 18), main = "Negative Binomial")
# For the negativebinomial the underfitting of the count for 0 and overfitting for counts 1–2 is characteristic of data with excess zeros.
# Data without NA for functions that dont exclude NAs by default
Data_glm_nona <-
Data_glm %>%
dplyr::select(CYW, expunere_tox, varsta_inst, schimb_dom, varsta,
intarziere, TCC, neglijare, scoala_spec, tulb_cond) %>%
drop_na()
# Formlula for 9 prector best model (Poisson and NB)
formula_model <- as.formula("CYW ~ expunere_tox + varsta_inst + schimb_dom +
varsta + intarziere + TCC + neglijare + scoala_spec + tulb_cond")
library(pscl)
mod_hurd_pois <- hurdle(formula_model, data = Data_glm , dist = "poisson")
mod_hurd_nb <- hurdle(formula_model, data = Data_glm , dist = "negbin")
mod_zip <- zeroinfl(formula_model, data = Data_glm , dist = "poisson")
mod_znb <- zeroinfl(formula_model, data = Data_glm , dist = "negbin")
mod_zpig <- gamlss::gamlss(formula_model, data = Data_glm_nona , family = "ZIPIG")
GAMLSS-RS iteration 1: Global Deviance = 5691.461
GAMLSS-RS iteration 2: Global Deviance = 5702.617
GAMLSS-RS iteration 3: Global Deviance = 5707.807
GAMLSS-RS iteration 4: Global Deviance = 5709.887
GAMLSS-RS iteration 5: Global Deviance = 5710.591
GAMLSS-RS iteration 6: Global Deviance = 5710.963
GAMLSS-RS iteration 7: Global Deviance = 5711.088
GAMLSS-RS iteration 8: Global Deviance = 5711.162
GAMLSS-RS iteration 9: Global Deviance = 5711.193
GAMLSS-RS iteration 10: Global Deviance = 5711.206
GAMLSS-RS iteration 11: Global Deviance = 5711.213
GAMLSS-RS iteration 12: Global Deviance = 5711.212
GAMLSS-RS iteration 13: Global Deviance = 5711.215
GAMLSS-RS iteration 14: Global Deviance = 5711.213
GAMLSS-RS iteration 15: Global Deviance = 5711.214
GAMLSS-RS iteration 16: Global Deviance = 5711.214
countreg::rootogram(mod_hurd_pois, max = 50, main = "Hurdle Poisson")
countreg::rootogram(mod_hurd_nb, max = 50, main = "Hurdle Negative Binomial")
countreg::rootogram(mod_zip, max = 50, main = "Zero-inflated Poisson")
countreg::rootogram(mod_znb, max = 50, main = "Zero-inflated Negative Binomial")
vcdExtra::LRstats(mod_pois_new, mod_nb2, mod_hurd_pois, mod_hurd_nb, mod_zip, mod_znb, mod_zpig, sortby = "AIC") #%>% xlsx::write.xlsx(., file = "LRtest.xlsx")
Likelihood summary table:
AIC BIC LR Chisq Df Pr(>Chisq)
mod_pois_new 6094.0 6144.6 6074.0 1148 < 0.00000000000000022 ***
mod_hurd_pois 5846.2 5947.3 5806.2 1138 < 0.00000000000000022 ***
mod_zip 5845.4 5946.5 5805.4 1138 < 0.00000000000000022 ***
mod_nb2 5736.4 5792.0 5714.4 1148 < 0.00000000000000022 ***
mod_zpig 5735.2 5795.9 5711.2 1146 < 0.00000000000000022 ***
mod_hurd_nb 5696.6 5802.7 5654.6 1137 < 0.00000000000000022 ***
mod_znb 5694.5 5800.7 5652.5 1137 < 0.00000000000000022 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
lmtest::lrtest(mod_hurd_nb, mod_znb)
Likelihood ratio test
Model 1: CYW ~ expunere_tox + varsta_inst + schimb_dom + varsta + intarziere +
TCC + neglijare + scoala_spec + tulb_cond
Model 2: CYW ~ expunere_tox + varsta_inst + schimb_dom + varsta + intarziere +
TCC + neglijare + scoala_spec + tulb_cond
#Df LogLik Df Chisq Pr(>Chisq)
1 21 -2827.3
2 21 -2826.3 0 2.0362 < 0.00000000000000022 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Very Best Model - Zero-inflated Negative Binomial
summary(mod_znb)
Call:
zeroinfl(formula = formula_model, data = Data_glm, dist = "negbin")
Pearson residuals:
Min 1Q Median 3Q Max
-1.8238 -0.7583 -0.1148 0.6524 4.0239
Count model coefficients (negbin with log link):
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.974121 0.091627 10.631 < 0.0000000000000002 ***
expunere_tox 0.181976 0.042720 4.260 0.0000205 ***
varsta_inst 0.013161 0.005181 2.540 0.01108 *
schimb_dom 0.100949 0.045208 2.233 0.02555 *
varsta 0.026730 0.006530 4.094 0.0000425 ***
intarziere 0.182133 0.059343 3.069 0.00215 **
TCC 0.236223 0.118234 1.998 0.04572 *
neglijare 0.060959 0.039219 1.554 0.12011
scoala_spec -0.193458 0.088012 -2.198 0.02794 *
tulb_cond 0.160337 0.054983 2.916 0.00354 **
Log(theta) 1.848631 0.123964 14.913 < 0.0000000000000002 ***
Zero-inflation model coefficients (binomial with logit link):
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.869657 0.785924 -2.379 0.01736 *
expunere_tox 0.052760 0.382341 0.138 0.89025
varsta_inst -0.046119 0.043698 -1.055 0.29125
schimb_dom -1.084011 0.639696 -1.695 0.09016 .
varsta 0.001132 0.057044 0.020 0.98417
intarziere 0.032617 0.516929 0.063 0.94969
TCC -13.943212 904.443543 -0.015 0.98770
neglijare -1.349598 0.447732 -3.014 0.00258 **
scoala_spec 0.477429 0.576483 0.828 0.40757
tulb_cond 0.086141 0.447328 0.193 0.84730
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Theta = 6.3511
Number of iterations in BFGS optimization: 31
Log-likelihood: -2826 on 21 Df
exp(coef(mod_znb)) # %>% as.data.frame() %>% mutate_if(is.numeric, round, 2) %>% xlsx::write.xlsx(., file = "mod_znb.xlsx")
count_(Intercept) count_expunere_tox count_varsta_inst count_schimb_dom count_varsta count_intarziere count_TCC
2.6488366843609 1.1995853584739 1.0132478388526 1.1062199094659 1.0270903997878 1.1997741031525 1.2664572338295
count_neglijare count_scoala_spec count_tulb_cond zero_(Intercept) zero_expunere_tox zero_varsta_inst zero_schimb_dom
1.0628551671389 0.8241040929909 1.1739061865554 0.1541765294175 1.0541760856719 0.9549287565637 0.3382360724541
zero_varsta zero_intarziere zero_TCC zero_neglijare zero_scoala_spec zero_tulb_cond
1.0011324796663 1.0331549499947 0.0000008801157 0.2593444676465 1.6119250943723 1.0899595920461
exp(confint.default(mod_znb)) # %>% as.data.frame() %>% mutate_if(is.numeric, round, 2) %>% xlsx::write.xlsx(., file = "mod_znb.xlsx")
2.5 % 97.5 %
count_(Intercept) 2.21341150 3.1699193
count_expunere_tox 1.10323438 1.3043511
count_varsta_inst 1.00301080 1.0235894
count_schimb_dom 1.01241962 1.2087108
count_varsta 1.01402968 1.0403193
count_intarziere 1.06803869 1.3477582
count_TCC 1.00449834 1.5967313
count_neglijare 0.98421686 1.1477766
count_scoala_spec 0.69353203 0.9792591
count_tulb_cond 1.05397803 1.3074805
zero_(Intercept) 0.03303989 0.7194456
zero_expunere_tox 0.49826880 2.2302966
zero_varsta_inst 0.87654607 1.0403206
zero_schimb_dom 0.09654044 1.1850333
zero_varsta 0.89523247 1.1195598
zero_intarziere 0.37510626 2.8456180
zero_TCC 0.00000000 Inf
zero_neglijare 0.10783677 0.6237163
zero_scoala_spec 0.52076440 4.9894012
zero_tulb_cond 0.45357021 2.6192459
pred <- round(colSums(predict(mod_znb, type="prob")[,1:18])) # expected counts
obs <- table(Data_glm$CYW)[1:18] # observed counts
rbind(obs, pred)
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 17 18
obs 119 128 144 142 121 130 147 105 81 68 34 31 14 6 2 1 1 1
pred 101 87 133 155 153 135 111 85 63 45 31 21 14 9 6 4 3 2
# Other Tests
mod_znb2 <- zeroinfl(CYW ~ expunere_tox + varsta_inst + schimb_dom +
varsta + intarziere + TCC + neglijare + scoala_spec + tulb_cond | neglijare , data = Data_glm , dist = "negbin")
summary(mod_znb2)
Call:
zeroinfl(formula = CYW ~ expunere_tox + varsta_inst + schimb_dom + varsta + intarziere + TCC + neglijare + scoala_spec + tulb_cond |
neglijare, data = Data_glm, dist = "negbin")
Pearson residuals:
Min 1Q Median 3Q Max
-1.8155 -0.7717 -0.1139 0.6484 3.9855
Count model coefficients (negbin with log link):
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.964337 0.089380 10.789 < 0.0000000000000002 ***
expunere_tox 0.181433 0.042378 4.281 0.0000186 ***
varsta_inst 0.014176 0.005077 2.792 0.00523 **
schimb_dom 0.113385 0.045010 2.519 0.01177 *
varsta 0.026448 0.006335 4.175 0.0000298 ***
intarziere 0.180370 0.059126 3.051 0.00228 **
TCC 0.235590 0.118591 1.987 0.04697 *
neglijare 0.064806 0.039276 1.650 0.09894 .
scoala_spec -0.204271 0.087691 -2.329 0.01984 *
tulb_cond 0.161198 0.054905 2.936 0.00333 **
Log(theta) 1.835746 0.123039 14.920 < 0.0000000000000002 ***
Zero-inflation model coefficients (binomial with logit link):
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.3071 0.1855 -12.439 <0.0000000000000002 ***
neglijare -1.3395 0.4244 -3.156 0.0016 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Theta = 6.2698
Number of iterations in BFGS optimization: 25
Log-likelihood: -2831 on 13 Df
lmtest::lrtest(mod_znb, mod_znb2) # isnt better if zero counts predicted just by neglijare
Likelihood ratio test
Model 1: CYW ~ expunere_tox + varsta_inst + schimb_dom + varsta + intarziere +
TCC + neglijare + scoala_spec + tulb_cond
Model 2: CYW ~ expunere_tox + varsta_inst + schimb_dom + varsta + intarziere +
TCC + neglijare + scoala_spec + tulb_cond | neglijare
#Df LogLik Df Chisq Pr(>Chisq)
1 21 -2826.3
2 13 -2831.3 -8 10.021 0.2636
# NB2 again -- nobs and results are the same as mod_nb2 --- THE DATA IS UNDERDISPERED HERE: Disp = 0.933
mod_nbnb <- msme::nbinomial(formula_model, data = Data_glm_nona)
summary(mod_nbnb)
# HNB
mod_hnb <- msme::nbinomial(formula1 = formula_model,
formula2 =~ expunere_tox + varsta_inst + schimb_dom + varsta + # modelling predictors of dispersion
intarziere + TCC + neglijare + scoala_spec + tulb_cond,
family = "negBinomial", mean.link = "log", scale.link = "log_s", data = Data_glm_nona)
summary(mod_hnb)
exp(coef(mod_hnb))
# NB1 -- seems a little better than NB2 but not sure
mod_nb1 <- gamlss::gamlss(formula = formula_model, family = NBI, data = Data_glm_nona)
summary(mod_nb1); # plot(mod_nb1)
countreg::rootogram(mod_nb1, max = 50, main = "NB1")
vcdExtra::LRstats(mod_nb2, mod_nb1)
R version 3.5.2 (2018-12-20)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows >= 8 x64 (build 9200)
Matrix products: default
locale:
[1] LC_COLLATE=Romanian_Romania.1250 LC_CTYPE=Romanian_Romania.1250 LC_MONETARY=Romanian_Romania.1250 LC_NUMERIC=C
[5] LC_TIME=Romanian_Romania.1250
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] pscl_1.5.2 countreg_0.2-1 MASS_7.3-51.1 effects_4.1-0 sandwich_2.5-1
[6] olsrr_0.5.2 gvlma_1.0.0.2 bindrcpp_0.2.2 car_3.0-2 carData_3.0-2
[11] RColorBrewer_1.1-2 corrplot_0.84 GGally_1.4.0 Hmisc_4.1-1 Formula_1.2-3
[16] survival_2.43-3 lattice_0.20-38 rio_0.5.16 scales_1.0.0 ggpubr_0.2
[21] magrittr_1.5 PerformanceAnalytics_1.5.2 xts_0.11-2 zoo_1.8-4 tadaatoolbox_0.16.1
[26] summarytools_0.9.3 broom_0.5.1 psycho_0.4.0 psych_1.8.10 plyr_1.8.4
[31] forcats_0.3.0 stringr_1.3.1 dplyr_0.7.8 purrr_0.2.5 readr_1.3.0
[36] tidyr_0.8.2 tibble_1.4.2 ggplot2_3.1.0 tidyverse_1.2.1 papaja_0.1.0.9842
[41] kableExtra_1.0.1 knitr_1.21 pacman_0.5.0
loaded via a namespace (and not attached):
[1] statnet.common_4.1.4 vcd_1.4-4 corpcor_1.6.9 class_7.3-14 formula.tools_1.7.1
[6] assertive.properties_0.0-4 ps_1.2.1 d3Network_0.5.2.1 relimp_1.0-5 lmtest_0.9-36
[11] crayon_1.3.4 nonnest2_0.5-2 nlme_3.1-137 backports_1.1.3 infer_0.4.0.1
[16] ggcorrplot_0.1.2 ellipse_0.4.1 colourpicker_1.0 huge_1.2.7 rlang_0.3.0.1
[21] readxl_1.1.0 SparseM_1.77 nloptr_1.2.1 callr_3.1.1 ca_0.71
[26] rjson_0.2.20 glue_1.3.1 loo_2.0.0 rstan_2.18.2 parallel_3.5.2
[31] processx_3.2.1 moderndive_0.2.0 tcltk_3.5.2 mcmc_0.9-5 haven_2.1.0
[36] tidyselect_0.2.5 assertive.types_0.0-3 blavaan_0.3-4 xtable_1.8-3 MatrixModels_0.4-1
[41] ggm_2.3 evaluate_0.12 cli_1.0.1 rstudioapi_0.8 miniUI_0.1.1.1
[46] whisker_0.3-2 rpart_4.1-13 shinystan_2.5.0 shiny_1.2.0 xfun_0.4
[51] inline_0.3.15 pkgbuild_1.0.2 cluster_2.0.7-1 nFactors_2.3.3 expm_0.999-3
[56] quantreg_5.38 assertive.sets_0.0-3 threejs_0.3.1 png_0.1-7 reshape_0.8.8
[61] ipred_0.9-8 withr_2.1.2 bitops_1.0-6 cellranger_1.1.0 assertive.base_0.0-7
[66] survey_3.35 assertive.models_0.0-2 coda_0.19-2 pillar_1.3.1 multcomp_1.4-10
[71] assertive.matrices_0.0-2 msme_0.5.3 assertive.reflection_0.0-4 BDgraph_2.53 gamlss.data_5.1-4
[76] pbivnorm_0.6.0 generics_0.0.2 dygraphs_1.1.1.6 gh_1.0.1 nortest_1.0-4
[81] lava_1.6.4 tools_3.5.2 foreign_0.8-71 munsell_0.5.0 emmeans_1.3.1
[86] compiler_3.5.2 abind_1.4-5 httpuv_1.4.5 manipulate_1.0.1 assertive.data.uk_0.0-2
[91] DescTools_0.99.27 prodlim_2018.04.18 gridExtra_2.3 MCMCpack_1.4-4 ppcor_1.1
[96] assertive.data.us_0.0-2 later_0.7.5 recipes_0.1.4 vcdExtra_0.7-1 jsonlite_1.6
[101] arm_1.10-1 pbapply_1.3-4 estimability_1.3 lazyeval_0.2.1 promises_1.0.1
[106] latticeExtra_0.6-28 goftest_1.1-1 sna_2.4 checkmate_1.8.5 rapportools_1.0
[111] rmarkdown_1.11 openxlsx_4.1.0 webshot_0.5.1 pander_0.6.3 igraph_1.2.2
[116] numDeriv_2016.8-1 rsconnect_0.8.13 yaml_2.2.0 bayesplot_1.6.0 htmltools_0.3.6
[121] rstantools_1.5.1 lavaan_0.6-3 quadprog_1.5-5 viridisLite_0.3.0 digest_0.6.18
[126] assertthat_0.2.1 mime_0.6 MuMIn_1.42.1 assertive.code_0.0-3 assertive.strings_0.0-3
[131] data.table_1.12.2 gnm_1.1-0 shinythemes_1.1.2 splines_3.5.2 labeling_0.3
[136] RCurl_1.95-4.11 assertive.numbers_0.0-2 hms_0.4.2 modelr_0.1.2 colorspace_1.3-2
[141] base64enc_0.1-3 mnormt_1.5-5 operator.tools_1.6.3 assertive.files_0.0-2 nnet_7.3-12
[146] Rcpp_1.0.1 mvtnorm_1.0-10 matrixcalc_1.0-3 R6_2.4.0 grid_3.5.2
[151] ggridges_0.5.1 acepack_1.4.1 StanHeaders_2.18.0-1 zip_1.0.0 BayesFactor_0.9.12-4.2
[156] qvcalc_1.0.0 curl_3.2 ggsignif_0.4.0 pryr_0.1.4 minqa_1.2.4
[161] mi_1.0 snakecase_0.9.2 qgraph_1.5 Matrix_1.2-15 assertive.data_0.0-3
[166] glasso_1.10 TH.data_1.0-9 pixiedust_0.8.6 gower_0.1.2 htmlwidgets_1.3
[171] markdown_0.9 network_1.13.0.1 crosstalk_1.0.0 gamlss_5.1-4 COUNT_1.3.4
[176] rvest_0.3.2 rstanarm_2.18.2 htmlTable_1.12 codetools_0.2-15 matrixStats_0.54.0
[181] lubridate_1.7.4 gtools_3.8.1 prettyunits_1.0.2 gtable_0.2.0 stats4_3.5.2
[186] httr_1.4.0 stringi_1.2.4 reshape2_1.4.3 viridis_0.5.1 fdrtool_1.2.15
[191] magick_2.0 timeDate_3043.102 DT_0.5 xml2_1.2.0 assertive_0.3-5
[196] assertive.datetimes_0.0-2 boot_1.3-20 shinyjs_1.0 lme4_1.1-19 sem_3.1-9
[201] pwr_1.2-2 CompQuadForm_1.4.3 jpeg_0.1-8 janitor_1.1.1 pkgconfig_2.0.2
[206] gamlss.dist_5.1-4 lmerTest_3.0-1 bindr_0.1.1
A work by Claudiu Papasteri