salamander-gof.R

# Single Species Single Season Occupancy models 

# MacKenzie-Bailey goodness of fit test
# Illustration of using bootstrapping to estimate standard errors rather than using C-hat

# Using the unmarked package

#  unmarked package

library(AICcmodavg)
library(readxl)
library(unmarked)

## Loading required package: reshape

## Loading required package: lattice

## Loading required package: parallel

## Loading required package: Rcpp

library(ggplot2)

# get the data read in
# Data for detections should be a data frame with rows corresponding to sites
# and columns to visits.
# The usual 1=detected; 0=not detected; NA=not visited is used.

input.data <- readxl::read_excel("../salamander.xls",
                                 sheet="CompleteData",
                                 na="-",
                                 col_names=FALSE)  # notice no column names in row 1 of data file. 
head(input.data)

## # A tibble: 6 x 5
##    X__1  X__2  X__3  X__4  X__5
##   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     0     0     0     1     1
## 2     0     1     0     0     0
## 3     0     1     0     0     0
## 4     1     1     1     1     0
## 5     0     0     1     0     0
## 6     0     0     1     0     0

input.history <- input.data[, 1:5]
# do some basic checks on your data 
# e.g. check number of sites; number of visits etc
nrow(input.history)

## [1] 39

ncol(input.history)

## [1] 5

range(input.history, na.rm=TRUE)

## [1] 0 1

sum(is.na(input.history))

## [1] 0

head(input.history)

## # A tibble: 6 x 5
##    X__1  X__2  X__3  X__4  X__5
##   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     0     0     0     1     1
## 2     0     1     0     0     0
## 3     0     1     0     0     0
## 4     1     1     1     1     0
## 5     0     0     1     0     0
## 6     0     0     1     0     0

# Create the data structore for the occupancy model
# unmarked does not have any reserved keywords like RPresence so 
# we need to construct a covariate data frame with the covariates
# again stacked. The ordering is different in unmarked than in Rpresence or Presence.
# Here you want for site 1, the visit specific covariates, then
# for site 2, etc.
# This needs to be added to the unmarked dataframe.

# We also want to set up covariate values where we
# set the detection probability equal in first 2 occasions and last 3 occasions.
# We need to define a survey covariate that has two levels 
obs.covar <- data.frame( Site=rep(1:nrow(input.history), each=ncol(input.history)),
                         Visit=rep(1:ncol(input.history), nrow(input.history)),
                         Time =rep(c("T1","T2","T3","T4","T5"), nrow(input.history)),
                         D =rep(c("D1","D1","D2","D2","D2"), nrow(input.history)))
obs.covar[1:10,]

##    Site Visit Time  D
## 1     1     1   T1 D1
## 2     1     2   T2 D1
## 3     1     3   T3 D2
## 4     1     4   T4 D2
## 5     1     5   T5 D2
## 6     2     1   T1 D1
## 7     2     2   T2 D1
## 8     2     3   T3 D2
## 9     2     4   T4 D2
## 10    2     5   T5 D2

salamander.UMF <- unmarked::unmarkedFrameOccu(
                     y = input.history,
                     obsCovs=obs.covar) 
summary(salamander.UMF)

## unmarkedFrame Object
## 
## 39 sites
## Maximum number of observations per site: 5 
## Mean number of observations per site: 5 
## Sites with at least one detection: 18 
## 
## Tabulation of y observations:
##   0   1 
## 165  30 
## 
## Observation-level covariates:
##       Site        Visit   Time     D      
##  Min.   : 1   Min.   :1   T1:39   D1: 78  
##  1st Qu.:10   1st Qu.:2   T2:39   D2:117  
##  Median :20   Median :3   T3:39           
##  Mean   :20   Mean   :3   T4:39           
##  3rd Qu.:30   3rd Qu.:4   T5:39           
##  Max.   :39   Max.   :5

plot(salamander.UMF)

# Fit one model.
# Note that formula DO NOT HAVE AN = SIGN
# The two formula are for detecton and occupancy in that order
mod.pdot.u <- unmarked::occu(~1 ~1 , data=salamander.UMF, se=TRUE)

# This creates a complicated S4 object with SLOTS that can be accessed using various
# functions. This is more complcated than with RPresence
slotNames(mod.pdot.u)

##  [1] "knownOcc"         "fitType"          "call"            
##  [4] "formula"          "data"             "sitesRemoved"    
##  [7] "estimates"        "AIC"              "opt"             
## [10] "negLogLike"       "nllFun"           "bootstrapSamples"
## [13] "covMatBS"

# These functions return estimates on the LOGIT scale - not very useful for most people
coef(mod.pdot.u, type="state")

## psi(Int) 
## 0.383108

SE(mod.pdot.u, type="state")

##  psi(Int) 
## 0.5086094

confint(mod.pdot.u, type="state")

##               0.025    0.975
## psi(Int) -0.6137481 1.379964

# It is possible to use backTransform() but only if NO covariates
backTransform(mod.pdot.u, type='state')

## Backtransformed linear combination(s) of Occupancy estimate(s)
## 
##  Estimate    SE LinComb (Intercept)
##     0.595 0.123   0.383           1
## 
## Transformation: logistic

# better to use predict() to do the estimation.
# Because there are no covariates here, the data.frame is empty
newdata <- data.frame(factor=1:1)
predict(mod.pdot.u, type='state', newdata=newdata)

##   Predicted        SE     lower     upper
## 1 0.5946225 0.1225986 0.3512047 0.7989852

# Ditto for the detection probability
coef(mod.pdot.u,    type="det")

##    p(Int) 
## -1.052585

SE(mod.pdot.u,      type="det")

##    p(Int) 
## 0.3008626

confint(mod.pdot.u, type="det")

##            0.025      0.975
## p(Int) -1.642265 -0.4629054

backTransform(mod.pdot.u, type="det")

## Backtransformed linear combination(s) of Detection estimate(s)
## 
##  Estimate     SE LinComb (Intercept)
##     0.259 0.0577   -1.05           1
## 
## Transformation: logistic

predict(mod.pdot.u, type='det', newdata=newdata)

##   Predicted         SE     lower     upper
## 1  0.258729 0.05770192 0.1621571 0.3862968

# Look at the posterior probability of detection
# The best unbiased predictor of the posterior probability of detection
bup(ranef(mod.pdot.u))

##  [1] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
##  [8] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [15] 0.2471563 0.2471563 0.2471563 0.2471563 0.2471563 0.2471563 0.2471563
## [22] 0.2471563 0.2471563 0.2471563 0.2471563 0.2471563 0.2471563 0.2471563
## [29] 0.2471563 0.2471563 0.2471563 0.2471563 0.2471563 0.2471563 0.2471563
## [36] 1.0000000 1.0000000 1.0000000 1.0000000

# Model where p(t) varies across survey occasions
# 

# Note that formula DO NOT HAVE AN = SIGN
# The two formula are for detecton and occupancy in that order
mod.pt.u <- unmarked::occu(~Time ~1 , data=salamander.UMF, se=TRUE)

# This creates a complicated S4 object with SLOTS that can be accessed using various
# functions. This is more complcated than with RPresence

# These functions return estimates on the LOGIT scale - not very useful for most people
# better to use predict() to do the estimation.
# Because there are no covariates here, the data.frame is empty
newdata <- data.frame(Time=c("T1","T2","T3","T4","T5"))
predict(mod.pt.u, type='state', newdata=newdata)[1,]

##   Predicted        SE   lower     upper
## 1 0.5798805 0.1175673 0.34897 0.7804233

# Ditto for the detection probability
cbind(newdata, predict(mod.pt.u, type='det', newdata=newdata))

##   Time Predicted         SE      lower     upper
## 1   T1 0.1768709 0.08451237 0.06443848 0.4013243
## 2   T2 0.1326551 0.07405439 0.04152004 0.3506471
## 3   T3 0.3979631 0.11900439 0.19981035 0.6363510
## 4   T4 0.3537424 0.11369993 0.17115987 0.5919834
## 5   T5 0.2653093 0.10101845 0.11564692 0.4993021

# Look at the posterior probability of detection
# The best unbiased predictor of the posterior probability of detection
bup(ranef(mod.pt.u))

##  [1] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
##  [8] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [15] 0.2197751 0.2197751 0.2197751 0.2197751 0.2197751 0.2197751 0.2197751
## [22] 0.2197751 0.2197751 0.2197751 0.2197751 0.2197751 0.2197751 0.2197751
## [29] 0.2197751 0.2197751 0.2197751 0.2197751 0.2197751 0.2197751 0.2197751
## [36] 1.0000000 1.0000000 1.0000000 1.0000000

sum(bup(ranef(mod.pt.u)))

## [1] 22.61528

# Note that formula DO NOT HAVE AN = SIGN
# The two formula are for detecton and occupancy in that order
mod.pcustom.u <- unmarked::occu(~D ~1 , data=salamander.UMF, se=TRUE)

# This creates a complicated S4 object with SLOTS that can be accessed using various
# functions. This is more complcated than with RPresence

# These functions return estimates on the LOGIT scale - not very useful for most people
# better to use predict() to do the estimation.
# Because there are no covariates here, the data.frame is empty
newdata <- data.frame(D=c("D1","D1","D2","D2","D2"))
predict(mod.pcustom.u, type='state', newdata=newdata)[1,]

##   Predicted        SE     lower     upper
## 1  0.583171 0.1186577 0.3495705 0.7845761

# Ditto for the detection probability
cbind(newdata, predict(mod.pcustom.u, type='det', newdata=newdata))

##    D Predicted         SE      lower     upper
## 1 D1 0.1539190 0.05839225 0.07024011 0.3046249
## 2 D1 0.1539190 0.05839225 0.07024011 0.3046249
## 3 D2 0.3370867 0.07679144 0.20589609 0.4993087
## 4 D2 0.3370867 0.07679144 0.20589609 0.4993087
## 5 D2 0.3370867 0.07679144 0.20589609 0.4993087

# Look at the posterior probability of detection
# The best unbiased predictor of the posterior probability of detection
bup(ranef(mod.pcustom.u))

##  [1] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
##  [8] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [15] 0.2258649 0.2258649 0.2258649 0.2258649 0.2258649 0.2258649 0.2258649
## [22] 0.2258649 0.2258649 0.2258649 0.2258649 0.2258649 0.2258649 0.2258649
## [29] 0.2258649 0.2258649 0.2258649 0.2258649 0.2258649 0.2258649 0.2258649
## [36] 1.0000000 1.0000000 1.0000000 1.0000000

sum(bup(ranef(mod.pcustom.u)))

## [1] 22.74316

# Model averaging
models.u <-unmarked::fitList(mod.pdot.u, mod.pt.u, mod.pcustom.u)

## Warning in unmarked::fitList(mod.pdot.u, mod.pt.u, mod.pcustom.u): Your
## list was unnamed, so model names were added as object names

aic.table.u <- unmarked::modSel(models.u)
aic.table.u

##               nPars    AIC delta AICwt cumltvWt
## mod.pcustom.u     3 162.82  0.00 0.760     0.76
## mod.pdot.u        2 165.76  2.94 0.175     0.93
## mod.pt.u          6 167.71  4.90 0.066     1.00

# Get model averaged estimates of occupancy
predict(models.u, type="state")[1,]

##   Predicted       SE     lower     upper
## 1 0.5849543 0.119358 0.3498164 0.7868192

# Get model averaged estimates of detection. 
# Notice these are one big list of nsites x nvisits long
# with the detection probabilities for each site
cbind(obs.covar, predict(models.u, type="det"))[1:10,]

##    Site Visit Time  D Predicted         SE      lower     upper
## 1     1     1   T1 D1 0.1737253 0.07033638 0.08590731 0.3252341
## 2     1     2   T2 D1 0.1708219 0.07001399 0.08440237 0.3219064
## 3     1     3   T3 D2 0.3274033 0.08354904 0.19785991 0.4885763
## 4     1     4   T4 D2 0.3244996 0.08210524 0.19597858 0.4856629
## 5     1     5   T5 D2 0.3186927 0.08202387 0.19233333 0.4795770
## 6     2     1   T1 D1 0.1737253 0.07033638 0.08590731 0.3252341
## 7     2     2   T2 D1 0.1708219 0.07001399 0.08440237 0.3219064
## 8     2     3   T3 D2 0.3274033 0.08354904 0.19785991 0.4885763
## 9     2     4   T4 D2 0.3244996 0.08210524 0.19597858 0.4856629
## 10    2     5   T5 D2 0.3186927 0.08202387 0.19233333 0.4795770

# Do we need to worry about lack of it

# See https://groups.google.com/forum/#!searchin/unmarked/goodness$20of$20fit%7Csort:date/unmarked/3wvnDyLlxok/ct1U723vCAAJ
# https://www.rdocumentation.org/packages/AICcmodavg/versions/2.1-1/topics/mb.gof.test
# MacKenzie, D. I., Bailey, L. L. (2004) Assessing the fit of site-occupancy models. Journal of Agricultural, Biological, and Environmental Statistics 9, 300--318.

library(AICcmodavg)

#  Mackenzie Bailey Goodness of fit test
#  compute chi-square for the top model
mod.pcustom.u.chi   = mb.chisq(mod.pcustom.u)
mod.pcustom.u.chi

## 
## MacKenzie and Bailey goodness-of-fit for single-season occupancy model
## 
## Pearson chi-square table:
## 
##       Cohort Observed Expected Chi-square
## 00000      0       21    21.00       0.00
## 00001      0        2     2.41       0.07
## 00010      0        2     2.41       0.07
## 00011      0        1     1.23       0.04
## 00100      0        5     2.41       2.78
## 00111      0        2     0.62       3.04
## 01000      0        2     0.86       1.50
## 10000      0        1     0.86       0.02
## 10011      0        1     0.22       2.71
## 10110      0        1     0.22       2.71
## 11110      0        1     0.04      22.68
## 
## Chi-square statistic = 42.311

mod.pcustom.u.boot = mb.gof.test(mod.pcustom.u, nsim = 100) # should do about 1000 sims

print(mod.pcustom.u.boot, digit.vals=4, digits.chisq=4)

## 
## MacKenzie and Bailey goodness-of-fit for single-season occupancy model
## 
## Pearson chi-square table:
## 
##       Cohort Observed Expected Chi-square
## 00000      0       21    21.00       0.00
## 00001      0        2     2.41       0.07
## 00010      0        2     2.41       0.07
## 00011      0        1     1.23       0.04
## 00100      0        5     2.41       2.78
## 00111      0        2     0.62       3.04
## 01000      0        2     0.86       1.50
## 10000      0        1     0.86       0.02
## 10011      0        1     0.22       2.71
## 10110      0        1     0.22       2.71
## 11110      0        1     0.04      22.68
## 
## Chi-square statistic = 42.311 
## Number of bootstrap samples = 100
## P-value = 0.04
## 
## Quantiles of bootstrapped statistics:
##   0%  25%  50%  75% 100% 
##   11   19   25   30  119 
## 
## Estimate of c-hat = 1.58

# estimating chat
names(mod.pcustom.u.boot)

## [1] "model.type"  "chisq.table" "chi.square"  "t.star"      "p.value"    
## [6] "c.hat.est"   "nsim"

chat <- mod.pcustom.u.boot$c.hat.est
chat

## [1] 1.582574

# Not possible to insert chat in the unmarked::modSel (groan) so we use te aictab from AICmodavg
AICcmodavg::aictab(list(mod.pdot.u, mod.pt.u, mod.pcustom.u) )

## Warning in aictab.AICunmarkedFitOccu(list(mod.pdot.u, mod.pt.u, mod.pcustom.u)): 
## Model names have been supplied automatically in the table

## 
## Model selection based on AICc:
## 
##      K   AICc Delta_AICc AICcWt Cum.Wt     LL
## Mod3 3 163.50       0.00   0.77   0.77 -78.41
## Mod1 2 166.09       2.59   0.21   0.97 -80.88
## Mod2 6 170.34       6.84   0.03   1.00 -77.86

AICcmodavg::aictab(list(mod.pdot.u, mod.pt.u, mod.pcustom.u), c.hat=chat )

## Warning in aictab.AICunmarkedFitOccu(list(mod.pdot.u, mod.pt.u, mod.pcustom.u), : 
## Model names have been supplied automatically in the table

## 
## Model selection based on QAICc:
## (c-hat estimate = 1.582574)
## 
##      K  QAICc Delta_QAICc QAICcWt Cum.Wt Quasi.LL
## Mod3 4 108.27        0.00    0.57   0.57   -49.55
## Mod1 3 108.90        0.63    0.42   0.99   -51.11
## Mod2 7 116.01        7.74    0.01   1.00   -49.20

# The bootstrap goodness of fit can also be done directly in umarked by defining
# our own statistics.
# Function returning three fit-statistics.
fitstats <- function(fm) {
    observed <- getY(fm@data)
    expected <- fitted(fm)
    resids <- residuals(fm)
    sse <- sum(resids^2)
    chisq <- sum((observed - expected)^2 / expected)
    freeTuke <- sum((sqrt(observed) - sqrt(expected))^2)
    out <- c(SSE=sse, Chisq=chisq, freemanTukey=freeTuke)
    return(out)
    }

mod.pcustom.u.pboot <- unmarked::parboot(mod.pcustom.u, fitstats, nsim=99, report=100)

## t0 = 24.85043 164.9871 35.41157

mod.pcustom.u.pboot@t0

##          SSE        Chisq freemanTukey 
##     24.85043    164.98712     35.41157

head(mod.pcustom.u.pboot@t.star)

##           SSE    Chisq freemanTukey
## [1,] 22.41026 169.0001     32.75553
## [2,] 24.01709 165.0151     33.67414
## [3,] 19.02564 172.9995     28.06794
## [4,] 26.30769 162.0000     36.73104
## [5,] 21.91026 168.9998     31.67448
## [6,] 23.40171 166.9964     33.59240

# How extreme is t0 relative to the bootstrap values.
# Small probabilities indicate lack of fit
colMeans(mod.pcustom.u.pboot@t.star > matrix(mod.pcustom.u.pboot@t0, byrow=TRUE,
                                    ncol=length(mod.pcustom.u.pboot@t0),
                                    nrow=nrow(mod.pcustom.u.pboot@t.star)))

##          SSE        Chisq freemanTukey 
##    0.5151515    0.4646465    0.4848485

# No easy way to automatically inflate the se and ci using c.hat in the unmarked package

salamander-gof.R

cschwarz

Thu Aug 9 11:36:14 2018