# Analysis of mallard data illustrating logistic exposure models
# How to fit a nest level categorical variable (habitat)

# 2019-05-01 CJS Initial code

# This is the mallard data that ships with RMark

library(AICcmodavg)
library(emmeans)
library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
library(plyr)
library(readxl)

source(file.path("..","..","logistic-exposure-model.R"))

# The dataframe must contain the following fields with the following names
#
#    FirstFound: day the nest was first found
#    LastPresent: last day that a chick was present in the nest
#    LastChecked: last day the nest was checked
#    Fate: fate of the nest; 0=hatch an
#    Freq: number of nests with this data
#   
# Also contains the following fields
#   Robel    -  Measurement of Robel pole of visibility of nest
#   PpnGrass - proportion of grassland cover on the 10.4 km2 study site t
#   AgeFound - Age of nest when found
#   AgeDay1  - Age of nest on day 1 of study (can be negative)
#   Habitat - N=Native; P=Planted; W=Wetland; R=roadside right of way
#
# In this example, the multiple visits to a nest have been collapsed
# to a single record for each nest.
# In more complex examples, you may have multple records per nest
# as shown in the mallard example.
#

malldata <- readxl::read_excel(file.path("..","mallard.xlsx"), 
                               sheet="mallard")
head(malldata)
## # A tibble: 6 x 10
##   FirstFound LastPresent LastChecked  Fate  Freq Robel PpnGrass AgeFound
##        <dbl>       <dbl>       <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl>
## 1         73          89          89     0     1  6       0.800       13
## 2         63          90          90     0     1  3.75    0.668        5
## 3         70          70          76     1     1  3.75    0.800       13
## 4         63          81          85     1     1  3.12    0.668        6
## 5         61          61          66     1     1  4.5     0.668        4
## 6         57          57          61     1     1  4.25    0.668        1
## # … with 2 more variables: AgeDay1 <dbl>, Habitat <chr>
malldata2 <- expand.nest.data(malldata)
rm(malldata)

malldata2$Habitat <- factor(malldata2$Habitat)  # RMark wants categorical variable to be factors



# Fit a particular model
# This is a model with S varying by habitat type
mod.hab <-  glm(Survive~Habitat,
         family=binomial(link=logexp(malldata2$Exposure)),
         data=malldata2)
summary(mod.hab)
## 
## Call:
## glm(formula = Survive ~ Habitat, family = binomial(link = logexp(malldata2$Exposure)), 
##     data = malldata2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5021   0.2990   0.2990   0.3333   0.3333  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.86293    0.09508  30.111   <2e-16 ***
## HabitatP     0.22268    0.12234   1.820   0.0687 .  
## HabitatR     0.21111    0.22719   0.929   0.3528    
## HabitatW     0.09291    0.23509   0.395   0.6927    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2494.8  on 6125  degrees of freedom
## Residual deviance: 1564.0  on 6122  degrees of freedom
## AIC: 1572
## 
## Number of Fisher Scoring iterations: 7
# You can make predictions just as before
pred.data <- data.frame(Habitat=unique(malldata2$Habitat))
logit.dsr.pred.hab <- predict(mod.hab, newdata=pred.data, se.fit=TRUE)  

# put these together in a data frame
dsr.hab <- cbind(pred.data, logit.dsr=logit.dsr.pred.hab$fit, logit.dsr.se=logit.dsr.pred.hab$se.fit)
dsr.hab$dsr <- expit(dsr.hab$logit.dsr)
dsr.hab$dsr.se <- dsr.hab$logit.dsr.se* dsr.hab$dsr * (1-dsr.hab$dsr)
dsr.hab
##   Habitat logit.dsr logit.dsr.se       dsr      dsr.se
## 1       R  3.074044   0.20633908 0.9558093 0.008715326
## 2       P  3.085609   0.07698050 0.9562952 0.003217376
## 3       N  2.862929   0.09507968 0.9459832 0.004858478
## 4       W  2.955842   0.21500819 0.9505389 0.010108552
# extract the logit(DSR) for each habitat using emmeans
mod.hab.emmo <- emmeans::emmeans(mod.hab, ~Habitat)
dsr.logit <- CLD(mod.hab.emmo)
dsr.logit
##  Habitat emmean     SE  df asymp.LCL asymp.UCL .group
##  N         2.86 0.0951 Inf      2.68      3.05  1    
##  W         2.96 0.2150 Inf      2.53      3.38  1    
##  R         3.07 0.2063 Inf      2.67      3.48  1    
##  P         3.09 0.0770 Inf      2.93      3.24  1    
## 
## Results are given on the logexp(malldata2$Exposure) (not the response) scale. 
## Confidence level used: 0.95 
## P value adjustment: tukey method for comparing a family of 4 estimates 
## significance level used: alpha = 0.05
# Compute the multiple comparison on the ordinary scale
# We need to update the transformation so we can get answers on the original scale as well
mod.hab.rg <- update(ref_grid(mod.hab, at=list(exposure=1)), tran = logexp())

mod.hab.emmo2 <- emmeans::emmeans(mod.hab.rg, ~Habitat)
CLD(mod.hab.emmo2)
##  Habitat emmean     SE  df asymp.LCL asymp.UCL .group
##  N         2.86 0.0951 Inf      2.68      3.05  1    
##  W         2.96 0.2150 Inf      2.53      3.38  1    
##  R         3.07 0.2063 Inf      2.67      3.48  1    
##  P         3.09 0.0770 Inf      2.93      3.24  1    
## 
## Results are given on the logexp(1) (not the response) scale. 
## Confidence level used: 0.95 
## P value adjustment: tukey method for comparing a family of 4 estimates 
## significance level used: alpha = 0.05
dsr <- CLD(mod.hab.emmo2, type="response") # on DSR scale
dsr
##  Habitat  prob      SE  df asymp.LCL asymp.UCL .group
##  N       0.946 0.00486 Inf     0.936     0.955  1    
##  W       0.951 0.01011 Inf     0.927     0.967  1    
##  R       0.956 0.00872 Inf     0.935     0.970  1    
##  P       0.956 0.00322 Inf     0.950     0.962  1    
## 
## Confidence level used: 0.95 
## Intervals are back-transformed from the logexp(1) scale 
## P value adjustment: tukey method for comparing a family of 4 estimates 
## significance level used: alpha = 0.05
# pairwise effects on logit scale
summary(pairs(mod.hab.emmo2),infer=TRUE)
##  contrast estimate    SE  df asymp.LCL asymp.UCL z.ratio p.value
##  N - P     -0.2227 0.122 Inf    -0.537    0.0916 -1.820  0.2637 
##  N - R     -0.2111 0.227 Inf    -0.795    0.3725 -0.929  0.7892 
##  N - W     -0.0929 0.235 Inf    -0.697    0.5110 -0.395  0.9791 
##  P - R      0.0116 0.220 Inf    -0.554    0.5773  0.053  0.9999 
##  P - W      0.1298 0.228 Inf    -0.457    0.7165  0.568  0.9415 
##  R - W      0.1182 0.298 Inf    -0.647    0.8838  0.397  0.9789 
## 
## Confidence level used: 0.95 
## Conf-level adjustment: tukey method for comparing a family of 4 estimates 
## P value adjustment: tukey method for comparing a family of 4 estimates
# pairwise effects on the DSR scale 
mod.hab.emmo3 <- regrid(emmeans(mod.hab, ~Habitat), transform="log")
summary(pairs(mod.hab.emmo3, type="response"), infer=TRUE)
##  contrast ratio     SE  df asymp.LCL asymp.UCL z.ratio p.value
##  N / P    0.928 0.0385 Inf     0.834      1.03 -1.803  0.2717 
##  N / R    0.931 0.0697 Inf     0.768      1.13 -0.950  0.7777 
##  N / W    0.969 0.0774 Inf     0.789      1.19 -0.399  0.9784 
##  P / R    1.004 0.0719 Inf     0.835      1.21  0.052  0.9999 
##  P / W    1.044 0.0803 Inf     0.857      1.27  0.559  0.9442 
##  R / W    1.040 0.1029 Inf     0.806      1.34  0.396  0.9789 
## 
## Confidence level used: 0.95 
## Conf-level adjustment: tukey method for comparing a family of 4 estimates 
## Intervals are back-transformed from the log scale 
## P value adjustment: tukey method for comparing a family of 4 estimates 
## Tests are performed on the log scale
dsr.hab.plot <- ggplot(data=dsr, aes(x=Habitat, y=prob))+
  ggtitle("DSR in different habitats for mallards", subtitle="Logistic exposure model")+
  geom_point()+
  geom_errorbar(aes(ymin=asymp.LCL, ymax=asymp.UCL), width=.1)+
  ylab("DSR (95% ci)")
dsr.hab.plot 

ggsave(dsr.hab.plot,
       file=file.path("..","..","..","..","MyStuff","Images","mallard-dsr-habitat-le.png"), h=4, w=6, units="in", dpi=300)






# you can get the nest survival probability in the usual way by using the estimated DSRs.
# we want to do this for each habitat 
ns <- plyr::ddply(dsr, "Habitat", function(x, ndays){
  #browser()
  ns    <- x$prob^ndays
  ns.se <- x$SE* ndays * x$prob^(ndays-1) 
  data.frame(ns=ns, ns.se=ns.se)
}, ndays=20)
ns
##   Habitat        ns      ns.se
## 1       N 0.3293580 0.03383101
## 2       P 0.4091095 0.02752830
## 3       R 0.4049720 0.07385288
## 4       W 0.3625748 0.07711638
# fit a null models
mod.null <-  glm(Survive~1,
         family=binomial(link=logexp(malldata2$Exposure)),
         data=malldata2)
summary(mod.null)
## 
## Call:
## glm(formula = Survive ~ 1, family = binomial(link = logexp(malldata2$Exposure)), 
##     data = malldata2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4714   0.3109   0.3109   0.3109   0.3109  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.00565    0.05551   54.15   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2494.8  on 6125  degrees of freedom
## Residual deviance: 1567.1  on 6125  degrees of freedom
## AIC: 1569.1
## 
## Number of Fisher Scoring iterations: 7
AICcmodavg::aictab( list(mod.hab, mod.null))
## Warning in aictab.AICglm.lm(list(mod.hab, mod.null)): 
## Model names have been supplied automatically in the table
## 
## Model selection based on AICc:
## 
##      K    AICc Delta_AICc AICcWt Cum.Wt      LL
## Mod2 1 1569.12       0.00   0.81   0.81 -783.56
## Mod1 4 1571.96       2.84   0.19   1.00 -781.98