# Analysis of killdeer data using the logistic exposure model
# 2019-05-01 CJS Initial code

# This is the killdeer data that ships with RMark

library(AICcmodavg)
library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
library(plyr)
library(readxl)

source(file.path("..","..","logistic-exposure-model.R"))

# The dataframe must contain the following fields with the following names
#
#    FirstFound: day the nest was first found
#    LastPresent: last day that a chick was present in the nest
#    LastChecked: last day the nest was checked
#    Fate: fate of the nest; 0=hatch an
#    Freq: number of nests with this data
#
# In this example, the multiple visits to a nest have been collapsed
# to a single record for each nest.
# In more complex examples, you may have multple records per nest
# as shown in the mallard example.
#

killdata <- readxl::read_excel(file.path("..","Killdeer.xlsx"), 
                               sheet="killdeer")
head(killdata)
## # A tibble: 6 x 6
##   id    FirstFound LastPresent LastChecked  Fate  Freq
##   <chr>      <dbl>       <dbl>       <dbl> <dbl> <dbl>
## 1 /*A*/          1           9           9     0     1
## 2 /*B*/          5           5           9     1     1
## 3 /*C*/          5          40          40     0     1
## 4 /*D*/          9          32          32     0     1
## 5 /*E*/          7           8           8     0     1
## 6 /*F*/          3          15          15     0     1
# We expand the data to generate the effective sample size
killdata2 <- expand.nest.data(killdata)
head(killdata2)
##      id FirstFound LastPresent LastChecked Fate Freq Day Exposure Survive
## 1 /*A*/          1           9           9    0    1   1        1       1
## 2 /*A*/          1           9           9    0    1   2        1       1
## 3 /*A*/          1           9           9    0    1   3        1       1
## 4 /*A*/          1           9           9    0    1   4        1       1
## 5 /*A*/          1           9           9    0    1   5        1       1
## 6 /*A*/          1           9           9    0    1   6        1       1
# Now to fit the logistic exposure model
fit.Sdot <- glm(Survive~1,
         family=binomial(link=logexp(killdata2$Exposure)),
         data=killdata2)
summary(fit.Sdot)
## 
## Call:
## glm(formula = Survive ~ 1, family = binomial(link = logexp(killdata2$Exposure)), 
##     data = killdata2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6777   0.2372   0.2372   0.2372   0.2372  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   3.5570     0.4085   8.708   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 54.491  on 209  degrees of freedom
## Residual deviance: 42.510  on 209  degrees of freedom
## AIC: 44.51
## 
## Number of Fisher Scoring iterations: 7
-2*logLik(fit.Sdot)
## 'log Lik.' 42.51028 (df=1)
# Convert the logit(DSR) to DSR
DSR <- expit(coef(fit.Sdot))
DSR.se <- arm::se.coef(fit.Sdot)*DSR*(1-DSR)
cat("DSR ", DSR, "(SE ", DSR.se, ")\n")
## DSR  0.9722669 (SE  0.01101397 )
# Find confidence intervals by taking expit of confit of coefficients
expit(confint(fit.Sdot))
## Waiting for profiling to be done...
##     2.5 %    97.5 % 
## 0.9445889 0.9888876
# make a data frame for model averaging
dsr.dot <- data.frame(Day=1:39, logit.dsr=coef(fit.Sdot), logit.dsr.sr=arm::se.coef(fit.Sdot),
                                dsr=DSR, dsr.se=DSR.se)
## Warning in data.frame(Day = 1:39, logit.dsr = coef(fit.Sdot), logit.dsr.sr
## = arm::se.coef(fit.Sdot), : row names were found from a short variable and
## have been discarded
# Compute the nest survival
days <- 39
NS <- DSR^days
NS.se <- DSR.se * days * DSR^(days-1)
cat("NS ", days," days ", NS, "(SE ", NS.se, ")\n")
## NS  39  days  0.3339134 (SE  0.147522 )
# Linear in date
# This is a close approximation as the final interval uses the midpoint of the interval 
# rather than the individual daily survival probabilities computed with 
# the actual date

head(killdata2)
##      id FirstFound LastPresent LastChecked Fate Freq Day Exposure Survive
## 1 /*A*/          1           9           9    0    1   1        1       1
## 2 /*A*/          1           9           9    0    1   2        1       1
## 3 /*A*/          1           9           9    0    1   3        1       1
## 4 /*A*/          1           9           9    0    1   4        1       1
## 5 /*A*/          1           9           9    0    1   5        1       1
## 6 /*A*/          1           9           9    0    1   6        1       1
fit.linear <- glm(Survive~Day,
         family=binomial(link=logexp(killdata2$Exposure)),
         data=killdata2)
summary(fit.linear)
## 
## Call:
## glm(formula = Survive ~ Day, family = binomial(link = logexp(killdata2$Exposure)), 
##     data = killdata2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6334   0.1677   0.2185   0.2679   0.4007  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  5.17135    1.39604   3.704 0.000212 ***
## Day         -0.06896    0.05177  -1.332 0.182842    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 54.491  on 209  degrees of freedom
## Residual deviance: 40.636  on 208  degrees of freedom
## AIC: 44.636
## 
## Number of Fisher Scoring iterations: 7
-2*logLik(fit.linear)
## 'log Lik.' 40.6357 (df=2)
# We now want to predict the DSR for days 1..39
# In this case predict() will work on the logit scale because the exposure=1 default is ok, 
# but we need to expit the results
pred.data <- data.frame(Day=1:39)
logit.dsr.pred.linear <- predict(fit.linear, newdata=pred.data, se.fit=TRUE)  

# put these together in a data frame
dsr.linear <- cbind(pred.data, logit.dsr=logit.dsr.pred.linear$fit, logit.dsr.se=logit.dsr.pred.linear$se.fit)
head(dsr.linear)
##   Day logit.dsr logit.dsr.se
## 1   1  5.102386     1.346623
## 2   2  5.033424     1.297391
## 3   3  4.964463     1.248365
## 4   4  4.895501     1.199570
## 5   5  4.826539     1.151034
## 6   6  4.757578     1.102793
dsr.linear$dsr <- expit(dsr.linear$logit.dsr)
dsr.linear$dsr.se <- dsr.linear$logit.dsr.se* dsr.linear$dsr * (1-dsr.linear$dsr)
head(dsr.linear)
##   Day logit.dsr logit.dsr.se       dsr      dsr.se
## 1   1  5.102386     1.346623 0.9939546 0.008091722
## 2   2  5.033424     1.297391 0.9935257 0.008345278
## 3   3  4.964463     1.248365 0.9930667 0.008595274
## 4   4  4.895501     1.199570 0.9925754 0.008840225
## 5   5  4.826539     1.151034 0.9920495 0.009078528
## 6   6  4.757578     1.102793 0.9914867 0.009308462
# plot this in the usual way
killdeer.linear.le <- ggplot(data=dsr.linear, aes(x=Day, y=dsr, group=1))+
  ggtitle("Estimated linear fit on logit scale", subtitle="Logistic exposure model")+
  geom_line()+
  geom_ribbon(aes(ymin=expit(logit.dsr-1.96*logit.dsr.se),
                  ymax=expit(logit.dsr+1.96*logit.dsr.se)), alpha=.1)+
  ylim(0.5, 1)
killdeer.linear.le

ggsave(killdeer.linear.le,
       file=file.path("..","..","..","..","MyStuff","Images","killdeer-linear-S-le.png"),h=4, w=6, units="in", dpi=300)
# Quadratic in date
# This is a close approximation as the final interval uses the midpoint of the interval 
# rather than the individual daily survival probabilities computed with 
# the actual date
killdata2$Day2 <- (killdata2$Day-20)^2

fit.quad <- glm(Survive~Day+Day2,
         family=binomial(link=logexp(killdata2$Exposure)),
         data=killdata2)
summary(fit.quad)
## 
## Call:
## glm(formula = Survive ~ Day + Day2, family = binomial(link = logexp(killdata2$Exposure)), 
##     data = killdata2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7486   0.1553   0.1815   0.2335   0.7316  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  5.166929   1.080126   4.784 1.72e-06 ***
## Day         -0.035588   0.039617  -0.898   0.3690    
## Day2        -0.007196   0.003861  -1.864   0.0623 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 54.491  on 209  degrees of freedom
## Residual deviance: 37.762  on 207  degrees of freedom
## AIC: 43.762
## 
## Number of Fisher Scoring iterations: 7
-2*logLik(fit.quad)
## 'log Lik.' 37.76182 (df=3)
# We now want to predict the DSR for days 1..39
# In this case predict() will work on the logit scale because the exposure=1 default is ok, 
# but we need to expit the results
pred.data <- data.frame(Day=1:39)
pred.data$Day2 <- (pred.data$Day-20)^2  # we need to match coding above

logit.dsr.pred.quad <- predict(fit.quad, newdata=pred.data, se.fit=TRUE)  

# put these together in a data frame
dsr.quad  <- cbind(pred.data, logit.dsr=logit.dsr.pred.quad$fit, logit.dsr.se=logit.dsr.pred.quad$se.fit)
head(dsr.quad )
##   Day Day2 logit.dsr logit.dsr.se
## 1   1  361  2.533762    1.5006084
## 2   2  324  2.764408    1.3653107
## 3   3  289  2.980664    1.2407822
## 4   4  256  3.182528    1.1275686
## 5   5  225  3.370001    1.0262679
## 6   6  196  3.543083    0.9374879
dsr.quad $dsr <- expit(dsr.quad $logit.dsr)
dsr.quad $dsr.se <- dsr.quad $logit.dsr.se* dsr.quad$dsr * (1-dsr.quad$dsr)
head(dsr.quad)
##   Day Day2 logit.dsr logit.dsr.se       dsr     dsr.se
## 1   1  361  2.533762    1.5006084 0.9264750 0.10222003
## 2   2  324  2.764408    1.3653107 0.9407219 0.07613543
## 3   3  289  2.980664    1.2407822 0.9516929 0.05704315
## 4   4  256  3.182528    1.1275686 0.9601715 0.04312075
## 5   5  225  3.370001    1.0262679 0.9667537 0.03298523
## 6   6  196  3.543083    0.9374879 0.9718891 0.02561283
# plot this in the usual way
killdeer.quad.le <- ggplot(data=dsr.quad, aes(x=Day, y=dsr, group=1))+
  ggtitle("Estimated quadratic fit on logit scale", subtitle="Logistic exposure model")+
  geom_line()+
  geom_ribbon(aes(ymin=expit(logit.dsr-1.96*logit.dsr.se),
                  ymax=expit(logit.dsr+1.96*logit.dsr.se)), alpha=.1)+
  ylim(0.5, 1)
killdeer.quad.le

ggsave(killdeer.quad.le,
       file=file.path("..","..","..","..","MyStuff","Images","killdeer-quad-S-le.png"),h=4, w=6, units="in", dpi=300)
# DSR varies in the first and second half of the study
# need to define the part of the study
# some ambiguity if an interval straddles the breakpoint

killdata2$studyhalf <- car::recode(killdata2$Day,
                                  " lo:20='1st'; 21:hi='2nd'")
## Registered S3 methods overwritten by 'car':
##   method                          from
##   influence.merMod                lme4
##   cooks.distance.influence.merMod lme4
##   dfbeta.influence.merMod         lme4
##   dfbetas.influence.merMod        lme4
head(killdata2)
##      id FirstFound LastPresent LastChecked Fate Freq Day Exposure Survive
## 1 /*A*/          1           9           9    0    1   1        1       1
## 2 /*A*/          1           9           9    0    1   2        1       1
## 3 /*A*/          1           9           9    0    1   3        1       1
## 4 /*A*/          1           9           9    0    1   4        1       1
## 5 /*A*/          1           9           9    0    1   5        1       1
## 6 /*A*/          1           9           9    0    1   6        1       1
##   Day2 studyhalf
## 1  361       1st
## 2  324       1st
## 3  289       1st
## 4  256       1st
## 5  225       1st
## 6  196       1st
fit.2part <- glm(Survive~studyhalf,
         family=binomial(link=logexp(killdata2$Exposure)),
         data=killdata2)
summary(fit.2part)
## 
## Call:
## glm(formula = Survive ~ studyhalf, family = binomial(link = logexp(killdata2$Exposure)), 
##     data = killdata2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6061   0.2042   0.2611   0.2611   0.2611  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    3.8604     0.7077   5.455 4.89e-08 ***
## studyhalf2nd  -0.4986     0.8666  -0.575    0.565    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 54.491  on 209  degrees of freedom
## Residual deviance: 42.173  on 208  degrees of freedom
## AIC: 46.173
## 
## Number of Fisher Scoring iterations: 7
-2*logLik(fit.2part)
## 'log Lik.' 42.17273 (df=2)
# We now want to predict the DSR for days 1..39
# In this case predict() will work on the logit scale because the exposure=1 default is ok, 
# but we need to expit the results
pred.data <- data.frame(Day=1:39)
pred.data$studyhalf <- car::recode(pred.data$Day,
                                  " lo:20='1st'; 21:hi='2nd'")
logit.dsr.pred.2part <- predict(fit.2part, newdata=pred.data, se.fit=TRUE)  

# put these together in a data frame
dsr.2part <- cbind(pred.data, logit.dsr=logit.dsr.pred.2part$fit, logit.dsr.se=logit.dsr.pred.2part$se.fit)
head(dsr.2part)
##   Day studyhalf logit.dsr logit.dsr.se
## 1   1       1st    3.8604    0.7076596
## 2   2       1st    3.8604    0.7076596
## 3   3       1st    3.8604    0.7076596
## 4   4       1st    3.8604    0.7076596
## 5   5       1st    3.8604    0.7076596
## 6   6       1st    3.8604    0.7076596
dsr.2part$dsr <- expit(dsr.2part$logit.dsr)
dsr.2part$dsr.se <- dsr.2part$logit.dsr.se* dsr.2part$dsr * (1-dsr.2part$dsr)
head(dsr.2part)
##   Day studyhalf logit.dsr logit.dsr.se       dsr     dsr.se
## 1   1       1st    3.8604    0.7076596 0.9793748 0.01429459
## 2   2       1st    3.8604    0.7076596 0.9793748 0.01429459
## 3   3       1st    3.8604    0.7076596 0.9793748 0.01429459
## 4   4       1st    3.8604    0.7076596 0.9793748 0.01429459
## 5   5       1st    3.8604    0.7076596 0.9793748 0.01429459
## 6   6       1st    3.8604    0.7076596 0.9793748 0.01429459
tail(dsr.2part)
##    Day studyhalf logit.dsr logit.dsr.se       dsr     dsr.se
## 34  34       2nd   3.36182      0.50029 0.9664898 0.01620304
## 35  35       2nd   3.36182      0.50029 0.9664898 0.01620304
## 36  36       2nd   3.36182      0.50029 0.9664898 0.01620304
## 37  37       2nd   3.36182      0.50029 0.9664898 0.01620304
## 38  38       2nd   3.36182      0.50029 0.9664898 0.01620304
## 39  39       2nd   3.36182      0.50029 0.9664898 0.01620304
# plot this in the usual way
killdeer.2part.le <- ggplot(data=dsr.2part, aes(x=Day, y=dsr, group=1))+
  ggtitle("Estimated 2part fit on logit scale", subtitle="Logistic exposure model")+
  geom_line()+
  geom_ribbon(aes(ymin=expit(logit.dsr-1.96*logit.dsr.se),
                  ymax=expit(logit.dsr+1.96*logit.dsr.se)), alpha=.1)+
  ylim(0.5, 1)
killdeer.2part.le

ggsave(killdeer.2part.le,
       file=file.path("..","..","..","..","MyStuff","Images","killdeer-2part-S-le.png"),h=4, w=6, units="in", dpi=300)