# BACI-fish

# This is the example that appears in Smith (2002, Table 6). 
# Samples of fish were taken for a period
# of 12 months before and 13 months after a nuclear
# power plant began operations. 

# The power plant is cooled by water that is drawn from a river.

# When the water exits the plant, its temperature is
# elevated. 
# The concern is that the warmed water will adversely affect the abundance 
# and composition of fish below the plant. 

# [It wasn't clear from Smith (2002) what the response measure is, 
# so I'll arbitrarily assume that it is Counts of fish]


library(ggplot2)
library(lmerTest)
## Loading required package: lme4
## Loading required package: Matrix
## 
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
## 
##     lmer
## The following object is masked from 'package:stats':
## 
##     step
library(emmeans)
## Welcome to emmeans.
## Caution: You lose important information if you filter this package's results.
## See '? untidy'
library(plyr)

source("../../schwarz.functions.r")
# Get the BACI power program
source("../baci-power.r")

# Read in the actual data
# sink("baci-fish-R-001.txt",split=TRUE)
##***part001b;
fish <- read.csv("baci-fish.csv", header=TRUE, as.is=TRUE, strip.white=TRUE)

fish$trt <- interaction(fish$SiteClass, fish$Site, fish$Period)
fish$Period        <- factor(fish$Period)
fish$SiteClass     <- factor(fish$SiteClass)
fish$Site          <- factor(fish$Site)
fish$SamplingTimeF <- factor(fish$SamplingTime)
fish$trt           <- factor(fish$trt)

fish$logCount      <- log(fish$Count+ .5) # add 1/2 of smallest postive count

fish[1:10,]
##    SamplingTime Year Period SiteClass Site Count               trt
## 1             1   75 Before   Control   C1     1 Control.C1.Before
## 2             1   75 Before    Impact   I1     1  Impact.I1.Before
## 3             2   75 Before   Control   C1     0 Control.C1.Before
## 4             2   75 Before    Impact   I1     1  Impact.I1.Before
## 5             3   75 Before   Control   C1     0 Control.C1.Before
## 6             3   75 Before    Impact   I1     0  Impact.I1.Before
## 7             4   75 Before   Control   C1     2 Control.C1.Before
## 8             4   75 Before    Impact   I1     6  Impact.I1.Before
## 9             5   75 Before   Control   C1    47 Control.C1.Before
## 10            5   75 Before    Impact   I1   103  Impact.I1.Before
##    SamplingTimeF   logCount
## 1              1  0.4054651
## 2              1  0.4054651
## 3              2 -0.6931472
## 4              2  0.4054651
## 5              3 -0.6931472
## 6              3 -0.6931472
## 7              4  0.9162907
## 8              4  1.8718022
## 9              5  3.8607297
## 10             5  4.6395716
##***part001e;
# sink()

str(fish)
## 'data.frame':    50 obs. of  9 variables:
##  $ SamplingTime : int  1 1 2 2 3 3 4 4 5 5 ...
##  $ Year         : int  75 75 75 75 75 75 75 75 75 75 ...
##  $ Period       : Factor w/ 2 levels "After","Before": 2 2 2 2 2 2 2 2 2 2 ...
##  $ SiteClass    : Factor w/ 2 levels "Control","Impact": 1 2 1 2 1 2 1 2 1 2 ...
##  $ Site         : Factor w/ 2 levels "C1","I1": 1 2 1 2 1 2 1 2 1 2 ...
##  $ Count        : int  1 1 0 1 0 0 2 6 47 103 ...
##  $ trt          : Factor w/ 4 levels "Control.C1.After",..: 3 4 3 4 3 4 3 4 3 4 ...
##  $ SamplingTimeF: Factor w/ 25 levels "1","2","3","4",..: 1 1 2 2 3 3 4 4 5 5 ...
##  $ logCount     : num  0.405 0.405 -0.693 0.405 -0.693 ...
# Preliminary plot

# Get plot of series over time
##***part010b;
prelimplot <- ggplot(data=fish, aes(x=SamplingTime, y=logCount,
                          group=Site, color=SiteClass, shape=Site))+
  ggtitle("Fish counts over time")+
  geom_point()+
  geom_line()+
  geom_vline(xintercept=-0.5+min(fish$SamplingTime[as.character(fish$Period) == "After"]))
prelimplot

##***part010e;
#ggsave(plot=prelimplot, file="baci-fish-R-010.png",
#       h=4, w=6, units="in", dpi=300)



# There are several ways in which this BACI design can be analyzed.


###############################################################################
# Do a t-test on the differeces of the averages at each sampling time for each site
# There is only one observtion at each sampling time, so we don't have compute
# the averages here


# sink('baci-fish-R-101.txt',split=TRUE)
##***part101b;
# Convert file from long- to wide-format
# to get the Control and Impact measurements on the same line
# and compute the difference
fish.month <- tidyr::pivot_wider(fish,
                                 id_cols=c("SamplingTime","Period"),
                                 names_from="SiteClass",
                                 values_from="logCount")
fish.month$diff <- fish.month$Impact - fish.month$Control
fish.month[1:10,]
## # A tibble: 10 × 5
##    SamplingTime Period Control Impact    diff
##           <int> <fct>    <dbl>  <dbl>   <dbl>
##  1            1 Before   0.405  0.405  0     
##  2            2 Before  -0.693  0.405  1.10  
##  3            3 Before  -0.693 -0.693  0     
##  4            4 Before   0.916  1.87   0.956 
##  5            5 Before   3.86   4.64   0.779 
##  6            6 Before   4.15   3.60  -0.554 
##  7            7 Before   4.36   1.87  -2.49  
##  8            8 Before   2.80   4.97   2.16  
##  9            9 Before   4.97   4.98   0.0138
## 10           10 Before   3.35   2.25  -1.10
##***part101e;
# sink()


# Plot the difference over time
##***part102b;
plotdiff <- ggplot(data=fish.month, aes(x=SamplingTime, y=diff))+
  ggtitle("Plot of differences over time")+
  ylab("Difference (Impact-Control)")+
  geom_point()+
  geom_line()+
  geom_vline(xintercept=-0.5+min(fish$SamplingTime[as.character(fish$Period) == "After"]))
plotdiff

##***part102e;
#ggsave(plot=plotdiff, file="baci-fish-R-102.png", h=4, w=6, units="in", dpi=300)


# do the two sample t-test not assuming equal variances
# sink('baci-fish-R-104.txt',split=TRUE)
##***part104b;
result <- try(t.test(diff ~ Period, data=fish.month),silent=TRUE)
if(class(result)=="try-error")
     {cat("Unable to do unequal variance t-test because of small sample size\n")} else
     { 
   result$diff.in.means <- sum(result$estimate*c(1,-1))
   names(result$diff.in.means)<- "diff.in.means"
   print(result)
   cat("Mean difference in log(Count) :,",result$diff.in.means," (SE ", result$stderr,")\n")
     }
## 
##  Welch Two Sample t-test
## 
## data:  diff by Period
## t = 0.28689, df = 20.742, p-value = 0.777
## alternative hypothesis: true difference in means between group After and group Before is not equal to 0
## 95 percent confidence interval:
##  -0.8284787  1.0934134
## sample estimates:
##  mean in group After mean in group Before 
##            0.4418295            0.3093621 
## 
## Mean difference in log(Count) :, 0.1324674  (SE  0.4617297 )
##***part104e;
# sink()


# do the two sample t-test  assuming equal variances
# sink('baci-fish-R-105.txt',split=TRUE)
##***part105b;
result <- t.test(diff ~ Period, data=fish.month, var.equal=TRUE)
result$diff.in.means <- sum(result$estimate*c(1,-1))
names(result$diff.in.means)<- "diff.in.means"
result
## 
##  Two Sample t-test
## 
## data:  diff by Period
## t = 0.28989, df = 23, p-value = 0.7745
## alternative hypothesis: true difference in means between group After and group Before is not equal to 0
## 95 percent confidence interval:
##  -0.8128144  1.0777491
## sample estimates:
##  mean in group After mean in group Before 
##            0.4418295            0.3093621
cat("Mean difference in log(Count) :,",result$diff.in.means," (SE ", result$stderr,")\n")
## Mean difference in log(Count) :, 0.1324674  (SE  0.4569542 )
##***part105e;
# sink()




# do the two sample Wilcoxon test
# sink('baci-fish-R-107.txt',split=TRUE)
##***part107b;
result <- wilcox.test(diff ~ Period, data=fish.month, conf.int=TRUE)
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact confidence intervals with ties
result
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  diff by Period
## W = 81, p-value = 0.8916
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -0.8611464  1.0985632
## sample estimates:
## difference in location 
##             0.02032091
##***part107e;
# sink()





###############################################################################
# Do a Mixed effect linear model on the individual values


# sink('baci-fish-R-300-type3.txt',split=TRUE)
##***part300b;
# Because there is ONLY one measurement per year, the SamplingTime*Site and
# residual variance are total confounded and cannot be separated. This is 
# the residual term.
result.lmer <- lmerTest::lmer(log(Count+.5) ~ SiteClass+Period+SiteClass:Period +
                                     (1|SamplingTimeF),
                                        data=fish)
##***part300e;

##***part301b;
anova(result.lmer, ddf="Kenward-Roger")
## Type III Analysis of Variance Table with Kenward-Roger's method
##                   Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
## SiteClass        1.76058 1.76058     1    23  2.7024 0.1138
## Period           0.11746 0.11746     1    23  0.1803 0.6751
## SiteClass:Period 0.05475 0.05475     1    23  0.0840 0.7745
##***part301e;
# sink()

summary(result.lmer)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: log(Count + 0.5) ~ SiteClass + Period + SiteClass:Period + (1 |  
##     SamplingTimeF)
##    Data: fish
## 
## REML criterion at convergence: 174.6
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.62164 -0.51054 -0.01742  0.50000  1.84821 
## 
## Random effects:
##  Groups        Name        Variance Std.Dev.
##  SamplingTimeF (Intercept) 3.0325   1.7414  
##  Residual                  0.6515   0.8071  
## Number of obs: 50, groups:  SamplingTimeF, 25
## 
## Fixed effects:
##                              Estimate Std. Error      df t value Pr(>|t|)   
## (Intercept)                    1.6423     0.5323 27.4202   3.085  0.00461 **
## SiteClassImpact                0.4418     0.3166 23.0000   1.396  0.17616   
## PeriodBefore                   0.3777     0.7684 27.4202   0.492  0.62690   
## SiteClassImpact:PeriodBefore  -0.1325     0.4570 23.0000  -0.290  0.77450   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) StClsI PrdBfr
## StClssImpct -0.297              
## PeriodBefor -0.693  0.206       
## StClssIm:PB  0.206 -0.693 -0.297
# sink('baci-fish-R-300-vc.txt',split=TRUE)
##***part300vcb;
# Extract the variance components
vc <- VarCorr(result.lmer)
vc
##  Groups        Name        Std.Dev.
##  SamplingTimeF (Intercept) 1.74142 
##  Residual                  0.80714
##***part300vce;
# sink()


# emmeans after a lm() fit
# sink('baci-fish-R-s300LSM-SiteClass.txt',split=TRUE)
##***parts300LSM-SiteClassb;
result.lmer.emmo.S <- emmeans::emmeans(result.lmer, ~SiteClass)
## NOTE: Results may be misleading due to involvement in interactions
cat("\nEstimated marginal means for SiteClass \n\n")
## 
## Estimated marginal means for SiteClass
summary(result.lmer.emmo.S)
##  SiteClass emmean    SE   df lower.CL upper.CL
##  Control     1.83 0.384 27.4     1.04     2.62
##  Impact      2.21 0.384 27.4     1.42     2.99
## 
## Results are averaged over the levels of: Period 
## Degrees-of-freedom method: kenward-roger 
## Results are given on the log(mu + 0.5) (not the response) scale. 
## Confidence level used: 0.95
##***parts300LSM-SiteClasse;
# sink()

# sink('baci-fish-R-300LSM-Period.txt',split=TRUE)
##***part300LSM-Periodb;
result.lmer.emmo.P <- emmeans::emmeans(result.lmer, ~Period)
## NOTE: Results may be misleading due to involvement in interactions
cat("\nEstimated marginal means \n\n")
## 
## Estimated marginal means
summary(result.lmer.emmo.P)
##  Period emmean    SE df lower.CL upper.CL
##  After    1.86 0.508 23    0.812     2.91
##  Before   2.17 0.529 23    1.080     3.27
## 
## Results are averaged over the levels of: SiteClass 
## Degrees-of-freedom method: kenward-roger 
## Results are given on the log(mu + 0.5) (not the response) scale. 
## Confidence level used: 0.95
##***part300LSM-Periode;
# sink()

# sink('baci-fish-R-300LSM-int.txt',split=TRUE)
##***part300LSM-intb;
result.lmer.emmo.SP <- emmeans::emmeans(result.lmer, ~SiteClass:Period)
cat("\nEstimated marginal means \n\n")
## 
## Estimated marginal means
summary(result.lmer.emmo.SP)
##  SiteClass Period emmean    SE   df lower.CL upper.CL
##  Control   After    1.64 0.532 27.4    0.551     2.73
##  Impact    After    2.08 0.532 27.4    0.993     3.18
##  Control   Before   2.02 0.554 27.4    0.884     3.16
##  Impact    Before   2.33 0.554 27.4    1.193     3.47
## 
## Degrees-of-freedom method: kenward-roger 
## Results are given on the log(mu + 0.5) (not the response) scale. 
## Confidence level used: 0.95
##***part300LSM-inte;
# sink()

# Estimate the BACI contrast
# You could look at the entry in the summary table from the model fit, but
# this is dangerous as these entries depend on the contrast matrix.
# It is far safer to the contrast function applied to an emmeans object
temp <- summary(result.lmer)$coefficients # get all the coefficients
temp[grepl("SiteClass",rownames(temp)) & grepl("Period", rownames(temp)),]
##   Estimate Std. Error         df    t value   Pr(>|t|) 
## -0.1324674  0.4569542 23.0000004 -0.2898920  0.7744963
# sink("baci-fish-R-300baci.txt",split=TRUE)
##***part300bacib; 
# Estimate the BACI contrast along with a se
summary(contrast(result.lmer.emmo.SP, list(baci=c(1,-1,-1,1))), infer=TRUE)
##  contrast estimate    SE df lower.CL upper.CL t.ratio p.value
##  baci       -0.132 0.457 23    -1.08    0.813  -0.290  0.7745
## 
## Note: contrasts are still on the log(mu + 0.5) scale. Consider using
##       regrid() if you want contrasts of back-transformed estimates. 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95
##***part300bacie;
# sink()




# Check the residuals etc

##***part300diagnosticb;
diagplot <- sf.autoplot.lmer(result.lmer)
## Loading required package: broom.mixed
## Loading required package: grid
## Loading required package: gridExtra
## Loading required package: lattice
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
plot(diagplot)

##***part300diagnostice;
#ggsave.ggmultiplot(plotdiag, # see https://github.com/sinhrks/ggfortify/issues/98 for bug in autoplot  
#       file='baci-fish-R-300-diagnostic.png', h=4, w=6, units="in", dpi=300)



#######################################################################################
#######################################################################################
#######################################################################################

# Power analysis for BACI design with a single control and single impact site, but 
# with multiple years before/after impact

# Example of of the fish analysis.
# Fish counts are quite variable and a BACI difference of .5 on the log scale is important
#    We use mu_IA=20, mu_IB=30, mu_CB=30, mu_cA=30
#
# We have the year-to-year variation (sdYear) 
# The site-to-site variation can be set to any arbitrary value (e.g. 0)
# The residual and site-year variance components are confounded to set
# the sdYearSite and the sdResidual to 0
# Also set the n_IA=1 etc.
#
# We examine various scenarios for the number of sites and number subsamples 




# Illustrate how to get computations for a single scenario
baci.power(n_IA=1, n_IB=1, n_CA=1, n_CB=1, 
           ns_I=1, ns_C=1, 
           ny_B=12, ny_A=13, 
           mu_IA=.2, mu_IB=0, mu_CA=0, mu_CB=0, 
           sdYear=31.71, sdSite=99, sdSiteYear=27.07, sdResid=0)
##   alpha sdSite sdYear sdSiteYear sdResid n_IA n_IB n_CA n_CB ns_I ns_C ny_B
## 1  0.05     99  31.71      27.07       0    1    1    1    1    1    1   12
##   ny_A mu_IA mu_IB mu_CA mu_CB baci  baci.se dfdenom          ncp    Fcrit
## 1   13   0.2     0     0     0  0.2 15.32537      23 0.0001703092 4.279344
##        power    Tcrit  os.power1  os.power2
## 1 0.05001794 1.713872 0.05132061 0.04870662
vc <- as.data.frame(VarCorr(result.lmer))
##***part500b;
# Note that because there is only 1 site, there is NO variance component
# associated with site. Use any value here. In general, the site variance
# component is not important because when you meaure the same sites over time
# the site effects cancel out (like blocking) and so site-to-site variability
# is not important. The site-year variation really is what drives the BACI power.

sdResid = vc[ vc$grp=="Residual",      "sdcor"]
sdYear  = vc[ vc$grp=="SamplingTimeF", "sdcor"]

scenarios <- expand.grid(n_quad=1,
                         n.months_B =seq(12,20,2),
                         n.months_A =seq(13,20,1),
                         baci_effect=0.5,
                         sdYear = sdYear, sdResid=sdResid)
head(scenarios)
##   n_quad n.months_B n.months_A baci_effect   sdYear  sdResid
## 1      1         12         13         0.5 1.741419 0.807142
## 2      1         14         13         0.5 1.741419 0.807142
## 3      1         16         13         0.5 1.741419 0.807142
## 4      1         18         13         0.5 1.741419 0.807142
## 5      1         20         13         0.5 1.741419 0.807142
## 6      1         12         14         0.5 1.741419 0.807142
power <- plyr::adply(scenarios,1,function(x){
  #browser()
  power <- baci.power(
    n_IA=x$n_quad, n_IB=x$n_quad, n_CA=x$n_quad, n_CB=x$n_quad, 
    ns_I=1, ns_C=1, 
    ny_B=x$n.months_B, ny_A=x$n.months_A, 
    mu_IA=x$baci_effect, mu_IB=0, mu_CA=0, mu_CB=0, 
    sdYear=x$sdYear, sdSite=0, sdSiteYear=x$sdResid, sdResid=0)
  power
})

head(power)
##   n_quad n.months_B n.months_A baci_effect alpha sdSite   sdYear sdSiteYear
## 1      1         12         13         0.5  0.05      0 1.741419   0.807142
## 2      1         14         13         0.5  0.05      0 1.741419   0.807142
## 3      1         16         13         0.5  0.05      0 1.741419   0.807142
## 4      1         18         13         0.5  0.05      0 1.741419   0.807142
## 5      1         20         13         0.5  0.05      0 1.741419   0.807142
## 6      1         12         14         0.5  0.05      0 1.741419   0.807142
##   sdResid n_IA n_IB n_CA n_CB ns_I ns_C ny_B ny_A mu_IA mu_IB mu_CA mu_CB baci
## 1       0    1    1    1    1    1    1   12   13   0.5     0     0     0  0.5
## 2       0    1    1    1    1    1    1   14   13   0.5     0     0     0  0.5
## 3       0    1    1    1    1    1    1   16   13   0.5     0     0     0  0.5
## 4       0    1    1    1    1    1    1   18   13   0.5     0     0     0  0.5
## 5       0    1    1    1    1    1    1   20   13   0.5     0     0     0  0.5
## 6       0    1    1    1    1    1    1   12   14   0.5     0     0     0  0.5
##     baci.se dfdenom      ncp    Fcrit     power    Tcrit os.power1   os.power2
## 1 0.4569542      23 1.197277 4.279344 0.1824510 1.713872 0.2800207 0.003387034
## 2 0.4396541      25 1.293355 4.241699 0.1943614 1.708141 0.2951721 0.002960494
## 3 0.4262185      27 1.376181 4.210008 0.2047466 1.703288 0.3081427 0.002641309
## 4 0.4154683      29 1.448319 4.182964 0.2138748 1.699127 0.3193705 0.002394624
## 5 0.4066636      31 1.511714 4.159615 0.2219566 1.695519 0.3291837 0.002198952
## 6 0.4490524      24 1.239784 4.259677 0.1877758 1.710882 0.2868063 0.003188366
##***part500e;



##***part510b;
power.plot <- ggplot(data=power, aes(x=ny_A, y=power, color=as.factor(ny_B)))+
  ggtitle("Estimated power",
          subtitle=paste("alpha: ", power$alpha[1],"; baci_effect:", power$baci_effect[1]))+
  geom_point()+
  geom_line()+
  ylim(0,1)+
  geom_hline(yintercept=0.8, color="blue")+
  xlab("Number of months of monitoring after impact")+
  scale_color_discrete(name="# months\nbefore")
power.plot

##***part510e;