# Analysis of killdeer data illustrating model averaging
# and logistic exposure models

# 2019-05-01 CJS Initial version

# This is the killdeer data that ships with RMark

library(AICcmodavg)
library(ggplot2)
library(plyr)
library(readxl)
library(RMark)
## This is RMark 2.2.6
##  Documentation available at http://www.phidot.org/software/mark/rmark/RMarkDocumentation.zip
source(file.path("..","..","logistic-exposure-model.R"))
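
# The logexp() link and expit() back-transform used below come from the
# sourced file. For reference, a minimal sketch of the standard
# logistic-exposure link (Shaffer 2004; after Ben Bolker's well-known
# implementation) is shown commented out -- the sourced version may differ:
#
# logexp <- function(exposure = 1) {
#   linkfun  <- function(mu)  qlogis(mu^(1/exposure))
#   linkinv  <- function(eta) plogis(eta)^exposure
#   mu.eta   <- function(eta) exposure * plogis(eta)^(exposure - 1) *
#                             binomial()$mu.eta(eta)
#   valideta <- function(eta) TRUE
#   structure(list(linkfun = linkfun, linkinv = linkinv, mu.eta = mu.eta,
#                  valideta = valideta, name = "logexp"),
#             class = "link-glm")
# }
# expit <- plogis   # inverse-logit back-transform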

# The data frame must contain the following fields with the following names:
#
#    FirstFound:  day the nest was first found
#    LastPresent: last day that a chick was present in the nest
#    LastChecked: last day the nest was checked
#    Fate:        fate of the nest; 0=hatched and 1=depredated (failed)
#    Freq:        number of nests with this data
#
# In this example, the multiple visits to a nest have been collapsed
# to a single record for each nest.
# In more complex examples, you may have multiple records per nest
# as shown in the mallard example.
#

killdata <- readxl::read_excel(file.path("..","Killdeer.xlsx"), 
                               sheet="killdeer")
head(killdata)
## # A tibble: 6 x 6
##   id    FirstFound LastPresent LastChecked  Fate  Freq
##   <chr>      <dbl>       <dbl>       <dbl> <dbl> <dbl>
## 1 /*A*/          1           9           9     0     1
## 2 /*B*/          5           5           9     1     1
## 3 /*C*/          5          40          40     0     1
## 4 /*D*/          9          32          32     0     1
## 5 /*E*/          7           8           8     0     1
## 6 /*F*/          3          15          15     0     1
# We expand the data to one record per exposure interval; this generates the effective sample size
killdata2 <- expand.nest.data(killdata)
head(killdata2)
##      id FirstFound LastPresent LastChecked Fate Freq Day Exposure Survive
## 1 /*A*/          1           9           9    0    1   1        1       1
## 2 /*A*/          1           9           9    0    1   2        1       1
## 3 /*A*/          1           9           9    0    1   3        1       1
## 4 /*A*/          1           9           9    0    1   4        1       1
## 5 /*A*/          1           9           9    0    1   5        1       1
## 6 /*A*/          1           9           9    0    1   6        1       1
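# In this example each expanded row is a single exposure day (Exposure = 1),
# so the number of rows is the effective sample size used later for AICc
# (it appears as nobs = 210 in the model.info table below).
nrow(killdata2)
## [1] 210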
# Add the Day2 term and the first/second half variables
killdata2$Day2 <- (killdata2$Day-20)^2
killdata2$studyhalf <- car::recode(killdata2$Day,
                                  " lo:20='1st'; 21:hi='2nd'")

#  Set up the set of models to fit
model.list.csv <- textConnection(
" S
 ~1
 ~Day
 ~Day+Day2
 ~studyhalf
")

model.list <- read.csv(model.list.csv, header=TRUE, as.is=TRUE, strip.white=TRUE)
close(model.list.csv)  # close the text connection after reading
model.list$model.number <- 1:nrow(model.list)
model.list
##            S model.number
## 1         ~1            1
## 2       ~Day            2
## 3  ~Day+Day2            3
## 4 ~studyhalf            4
model.fits <- plyr::dlply(model.list, c("S","model.number"), function(x, input.data){
  cat("\n\n***** Starting ", unlist(x), "\n")

  # fit the logistic-exposure model for this formula
  fit <- glm(formula=as.formula(paste("Survive", eval(x$S))),
             family=binomial(link=logexp(input.data$Exposure)),
             data=input.data)
  fit

}, input.data=killdata2)
## 
## 
## ***** Starting  ~1 1 
## 
## 
## ***** Starting  ~Day 2 
## 
## 
## ***** Starting  ~Day+Day2 3 
## 
## 
## ***** Starting  ~studyhalf 4
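# Why an ordinary glm() works here: a nest alive at the start of an interval
# of length t survives it with probability s^t, where s is the daily survival
# rate (DSR). The logexp(Exposure) link absorbs the exponent t, so the fitted
# coefficients are on the logit scale of the daily rate s itself (Shaffer 2004).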
# examine individual model results
model.number <- 1
names(model.fits[[model.number]])
##  [1] "coefficients"      "residuals"         "fitted.values"    
##  [4] "effects"           "R"                 "rank"             
##  [7] "qr"                "family"            "linear.predictors"
## [10] "deviance"          "aic"               "null.deviance"    
## [13] "iter"              "weights"           "prior.weights"    
## [16] "df.residual"       "df.null"           "y"                
## [19] "converged"         "boundary"          "model"            
## [22] "call"              "formula"           "terms"            
## [25] "data"              "offset"            "control"          
## [28] "method"            "contrasts"         "xlevels"
model.fits[[model.number]]$formula
## Survive ~ 1
## <environment: 0x7fa15003c050>
summary(model.fits[[model.number]])
## 
## Call:
## glm(formula = as.formula(paste("Survive", eval(x$S))), family = binomial(link = logexp(input.data$Exposure)), 
##     data = input.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6777   0.2372   0.2372   0.2372   0.2372  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   3.5570     0.4085   8.708   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 54.491  on 209  degrees of freedom
## Residual deviance: 42.510  on 209  degrees of freedom
## AIC: 44.51
## 
## Number of Fisher Scoring iterations: 7
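# Back-transform the intercept to get the constant DSR implied by this model
expit(coef(model.fits[[model.number]]))
## (Intercept) 
##   0.9722669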
# Model comparison and averaging
# collect models and make AICc table
AICcmodavg::aictab(model.fits)
## 
## Model selection based on AICc:
## 
##              K  AICc Delta_AICc AICcWt Cum.Wt     LL
## ~Day+Day2.3  3 43.88       0.00   0.37   0.37 -18.88
## ~1.1         1 44.53       0.65   0.27   0.64 -21.26
## ~Day.2       2 44.69       0.82   0.25   0.89 -20.32
## ~studyhalf.4 2 46.23       2.35   0.11   1.00 -21.09
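# For reference, the AICc column can be reproduced by hand from the
# log-likelihoods (this is what aictab() computes internally):
K    <- sapply(model.fits, function(f) length(coef(f)))
logL <- sapply(model.fits, logLik)
n    <- nrow(killdata2)
round(-2*logL + 2*K + 2*K*(K+1)/(n-K-1), 2)  # matches the AICc column above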
# compute the DSR and model average the daily values
# we need to set up the prediction matrix with variables that match the models

pred.data      <- data.frame(Day=1:39)
pred.data$Day2 <- (pred.data$Day-20)^2  # we need to match coding above
pred.data$studyhalf <- car::recode(pred.data$Day,
                                  " lo:20='1st'; 21:hi='2nd'")

dsr.indiv <- plyr::ldply(model.fits, function(x, pred.data){
    # get the predictions on the logit scale and then back transform
    logit.dsr.pred <- predict(x, newdata=pred.data, se.fit=TRUE)

    # put these together in a data frame
    dsr <- cbind(pred.data, logit.dsr=logit.dsr.pred$fit, logit.dsr.se=logit.dsr.pred$se.fit)
    dsr$dsr    <- expit(dsr$logit.dsr)
    # delta method: SE on the probability scale is SE(logit) * p * (1-p)
    dsr$dsr.se <- dsr$logit.dsr.se * dsr$dsr * (1-dsr$dsr)
    dsr
}, pred.data=pred.data)

plotdata.indiv <- dsr.indiv
head(plotdata.indiv)
##    S model.number Day Day2 studyhalf logit.dsr logit.dsr.se       dsr
## 1 ~1            1   1  361       1st  3.557002    0.4084693 0.9722669
## 2 ~1            1   2  324       1st  3.557002    0.4084693 0.9722669
## 3 ~1            1   3  289       1st  3.557002    0.4084693 0.9722669
## 4 ~1            1   4  256       1st  3.557002    0.4084693 0.9722669
## 5 ~1            1   5  225       1st  3.557002    0.4084693 0.9722669
## 6 ~1            1   6  196       1st  3.557002    0.4084693 0.9722669
##       dsr.se
## 1 0.01101397
## 2 0.01101397
## 3 0.01101397
## 4 0.01101397
## 5 0.01101397
## 6 0.01101397
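# Spot-check the delta-method SE for the first row by hand:
# 0.4084693 * 0.9722669 * (1 - 0.9722669) = 0.01101, matching dsr.se above.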
# do the model averaging for each day's DSR
# Extract the logL and K (number of parameters) for each model in the model set.
# The model number is used to index the values
model.info <- plyr::ldply(model.fits, function(x){
    logL <- logLik(x)
    K    <- length(coef(x))
    nobs <- nrow(x$data)
    data.frame(logL=logL, K=K, nobs=nobs)
})
model.info
##            S model.number      logL K nobs
## 1         ~1            1 -21.25514 1  210
## 2       ~Day            2 -20.31785 2  210
## 3  ~Day+Day2            3 -18.88091 3  210
## 4 ~studyhalf            4 -21.08636 2  210
dsr.ma <- plyr::ddply(dsr.indiv, c("Day"), function(x, model.info){
   # merge the model information with the estimates
   x <- merge(x, model.info)
   # get the model-averaged values
   ma <- AICcmodavg::modavgCustom(x$logL, x$K, modnames=x$S, nobs=x$nobs,
                                  estimate=x$dsr, se=x$dsr.se)
   data.frame(dsr=ma$Mod.avg.est, dsr.se=ma$Uncond.SE, dsr.lcl=ma$Lower.CL, dsr.ucl=ma$Upper.CL)
}, model.info=model.info)
head(dsr.ma)
##   Day       dsr     dsr.se   dsr.lcl  dsr.ucl
## 1   1 0.9614451 0.06878004 0.8266387 1.096251
## 2   2 0.9666242 0.05174868 0.8651987 1.068050
## 3   3 0.9705807 0.03934304 0.8934697 1.047692
## 4   4 0.9736046 0.03041289 0.9139964 1.033213
## 5   5 0.9759166 0.02408579 0.9287093 1.023124
## 6   6 0.9776827 0.01970441 0.9390627 1.016303
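# What modavgCustom() computes for each day (w_i are the AICc weights):
#   model-averaged estimate:  dsr.ma = sum(w_i * dsr_i)
#   unconditional SE (revised estimator of Burnham and Anderson 2002, the
#   package default):  se.ma = sqrt(sum(w_i * (se_i^2 + (dsr_i - dsr.ma)^2)))
# The normal-theory confidence limits are formed on the probability scale,
# which is why the upper limit can exceed 1 near the start of the season.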
plotdata.ma  <- dsr.ma
plotdata.ma$S <- "Model avg"

# plot the individual model fits and the model-averaged daily survival rates
final.fit <- ggplot(data=plotdata.indiv, aes(x=Day, y=dsr, color=S))+
  ggtitle("Comparing the model fits", subtitle="Fit using logistic exposure models")+
  geom_line(aes(group=S))+
  geom_line(data=plotdata.ma, color='black', size=2, group=1)+
  theme(legend.justification=c(0,0), legend.position=c(0,0))
final.fit
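
# A common follow-up: the overall probability that a nest survives the whole
# 39-day period is the product of the model-averaged daily rates
# (a point estimate only; a SE would need the covariances of the daily DSRs)
prod(dsr.ma$dsr)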

ggsave(final.fit,
       file=file.path("..","..","..","..","MyStuff","Images","killdeer-modavg-le.png"),
       height=4, width=6, units="in", dpi=300)