-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathR Workshop Script - Trend Analysis.R
339 lines (296 loc) · 15.5 KB
/
R Workshop Script - Trend Analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
###########################################################################################
#Script for the R workshop held in conjunction with the OBBN Annual Biomonitoring Meeting
#January 18 2017
#
#Part 2: Best statistical practices for benthic biomonitoring
#Prepared by: Patrick Schaefer ([email protected])
###########################################################################################
###########################################################################################
#Scenario 2: Temporal Trend Analysis - A framework for multiscale trend analysis
#
#Questions:
#1) Are metrics at individual sites increasing or decreasing significantly?
#2) Is there evidence for declines on a watershed/subwatershed scale?
#3) Are some watersheds/subwatersheds significantly better or worse than others?
#
#Study Design:
#Long term environmental monitoring, repeated sampling at sites, stratified random sampling,
#minimum data requirments depend on questions being asked
#
#Example dataset: trenddata.csv
###########################################################################################
###########################################################################################
#1) Are metrics at individual sites increasing or decreasing significantly?
###########################################################################################
require(lme4)
require(MuMIn)
require(plotrix)
require(merTools)
require(lattice)
data("trenddata")
set.seed(101)
head(trenddata)
str(trenddata)
trenddata$Year.factor<-as.factor(trenddata$Year.factor)
trenddata$Site<-as.factor(trenddata$Site)
#########################################
#Step 1 - Determine the response distribution
#
#Usually best to use theoretical understanding of variables rather than actual distributions:
# - Binomial - presence/absence or percentages
# - Poisson - counts, richness (mean<8)
# - Gausian (or normal) - HBI, ratios (sometimes), richness (mean >10)
##########################################
#Will only examine HBI (Gaussian) and Intolerants Richness (Poisson)
hist(trenddata$HBI)
hist(trenddata$Intolerants.Richness)
##########################################
#Step 2 - Determine fixed and random predictor variables
#
#Because we are only interested in the effect of time, including too many additional predictors will
#limit the interprative power of time.
#
#If building multiple models, keep pool of predictor variables consistant between models, or it will be difficult to
#compare results
##########################################
#Site specific trend models
site.model1<-lmer(HBI~1+(MonYr|Site)+(1|Year.factor),
data=trenddata,
na.action=na.fail,
REML=F)
site.model2<-glmer(EPT.Richness~1+(MonYr|Site)+(1|Year.factor),
data=trenddata,
family=poisson,
na.action=na.fail)
##########################################
#Step 3 - Model Validation and Checking model assumptions
#
#Using models standardized residuals to check for normality, homoscedasticity, outliers and influential values
##########################################
#Check Model Assumptions
hist(resid(site.model1,type="pearson")) # The residual frequency distribution should vary around zero, showing no substantial skewness pattern
scatter.smooth(x=fitted(site.model1), y=resid(site.model1,type="pearson")) #The plot of standardised residuals (model.res) against fitted response values (fitted(model1)) must not show any change in the residual's variability along the fitted values
qqnorm(resid(site.model1))
qqline(resid(site.model1))
hist(resid(site.model2,type="pearson"))
scatter.smooth(x=fitted(site.model2), y=resid(site.model2,type="pearson"))
qqnorm(resid(site.model2))
qqline(resid(site.model2))
#Check Model Fit
summary(site.model1)
r.squaredGLMM(site.model1)
scatter.smooth(x=trenddata$HBI, y=fitted(site.model1))
xyplot(trenddata$HBI~trenddata$MonYr|trenddata$Site)
xyplot(fitted(site.model1)~trenddata$MonYr|trenddata$Site)
summary(site.model2)
r.squaredGLMM(site.model2)
scatter.smooth(x=trenddata$EPT.Richness, y=fitted(site.model2))
xyplot(trenddata$EPT.Richness~trenddata$MonYr|trenddata$Site)
xyplot(fitted(site.model2)~trenddata$MonYr|trenddata$Site)
#########################################
#Step 4 - Interpret Results
#
#Examine 95% confidence-intervals for random effect slope parameters. If they dont overlap 0 there is a significant
#trend at that site
#########################################
ci1<-ci_fun2(model=site.model1,
model.variable="MonYr",
data.variable="Site",
col.variable="Zone",
plot=T,
stat="median",
level=0.95,
data=trenddata)
ci2<-ci_fun2(model=site.model2,
model.variable="MonYr",
data.variable="Site",
col.variable="Zone",
plot=T,
stat="median",
level=0.85,
data=trenddata)
xyplot_fun2(model=site.model1,
model.variable="MonYr",
data.variable="Site",
data=trenddata)
xyplot_fun2(model=site.model2,
model.variable="MonYr",
data.variable="Site",
level=0.85,
data=trenddata)
###########################################################################################
#2) Is there evidence for declines on a watershed/subwatershed scale?
#3) Are some watersheds/subwatersheds significantly better or worse than others?
###########################################################################################
require(lme4)
require(MuMIn)
require(plotrix)
require(merTools)
require(lattice)
data("trenddata")
set.seed(101)
head(trenddata)
str(trenddata)
trenddata$Year.factor<-as.factor(trenddata$Year.factor)
trenddata$Site<-as.factor(trenddata$Site)
#########################################
#Step 1 - Determine the response distribution
#
#Usually best to use theoretical understanding of variables rather than actual distributions:
# - Binomial - presence/absence or percentages
# - Poisson - counts, richness (mean<8)
# - Gausian (or normal) - HBI, ratios (sometimes), richness (mean >10)
#Will only examine HBI (Gaussian) and Percent.ICHAEBO (Binomial)
##########################################
##########################################
#Step 2 - Determine fixed and random predictor variables
#
#Here we are interested in determining if there are significant differences in our metrics in lower (L), middle (M) and
#upper (U) portions of the watershed, and if there is evidence for significant trends in these regions.
#
#Because we are interested in the effect of time and space, these two variables are now added as fixed effects.
#
##########################################
#Watershed/subwatershed model
watershed.model1<-lmer(HBI~0+MonYr*Zone+(MonYr|Site)+(1|Year.factor),
data=trenddata,
na.action=na.fail,
REML=F)
watershed.model2<-glmer(EPT.Richness~0+MonYr*Zone+(MonYr|Site)+(1|Year.factor)+(1|obs),
data=trenddata,
na.action=na.fail,
family=poisson)
#Here we run into one of the most common warning messages using glmer. Essentially, this warning tells us
#the heiristic algorithms that estimate model parameters may not have landed on the best parameter estimate.
#see:
?convergence
#and a checklist for troubleshooting convergence errors can be found here:
#https://rstudio-pubs-static.s3.amazonaws.com/33653_57fc7b8e5d484c909b615d8633c01d51.html
#
#In this case we can fix the warning by changing the optimizer:
watershed.model2<-glmer(EPT.Richness~0+MonYr*Zone+(MonYr|Site)+(1|Year.factor)+(1|obs),
data=trenddata,
na.action=na.fail,
family=poisson,
glmerControl(optimizer = "bobyqa"))
##########################################
#Step 3 - Model Validation and Checking model assumptions
#
#Using models standardized residuals to check for normality, homoscedasticity, outliers and influential values
##########################################
#Check Model Assumptions
hist(resid(watershed.model1,type="pearson"))
scatter.smooth(x=fitted(watershed.model1), y=resid(watershed.model1,type="pearson"))
qqnorm(resid(watershed.model1))
qqline(resid(watershed.model1))
hist(resid(watershed.model2,type="pearson"))
scatter.smooth(x=fitted(watershed.model2), y=resid(watershed.model2,type="pearson"))
qqnorm(resid(watershed.model2))
qqline(resid(watershed.model2))
#Check Model Fit
summary(watershed.model1)
r.squaredGLMM(watershed.model1)
scatter.smooth(x=trenddata$HBI, y=fitted(watershed.model1))
xyplot(trenddata$HBI~trenddata$MonYr|trenddata$Zone,groups=trenddata$Site)
xyplot(fitted(watershed.model1)~trenddata$MonYr|trenddata$Zone,groups=trenddata$Site)
summary(watershed.model2)
r.squaredGLMM(watershed.model2)
scatter.smooth(x=trenddata$EPT.Richness, y=fitted(watershed.model2))
xyplot(trenddata$EPT.Richness~trenddata$MonYr|trenddata$Zone,groups=trenddata$Site)
xyplot(fitted(watershed.model2)~trenddata$MonYr|trenddata$Zone,groups=trenddata$Site)
##########################################
#Step 4 - Multi-model inference (part 1)
#
#Using more classical hypothesis testing, we could test the significance of the time and space parameters in the previous model as well as
#their interactions to gain insights into their importance. For many reasons (that are beyond the scope of this workshop),
#this is not the approach that should be taken.
#Simply, parameter significance depends on what other parameters are in the model. This complicates matters when there isn't overwhelming
#support for a single model. Multiple models may be nearly equally good at describing variation in the dataset. A more objective approach is
#to build multiple models that provide a concensus of variable importance.
#
#The dredge() function takes the full (i.e. most complex, including all terms of interest and interactions) model and automatically
#builds all combinations of simpler models from the provided terms. For each model AIC values are computed where measure the model fit
#in relation to the numer of parameters used to achieve the fit.
##########################################
mm1<-dredge(watershed.model1,rank="AIC")
mm1
mm2<-dredge(watershed.model2,rank="AIC")
mm2
########################################
##########################################
#Step 4 - Multi-model inference (part 2)
#
#In both models, there is stronger support for the top fitting model than the rest, but the difference is not so great that subsequent models
#should not be evaluated. the subset argument lets you test different thresholds of ΔAIC to test:
# i.e. ΔAIC ≤ 2, ΔAIC ≤ 4, ΔAIC ≤ 6, cumulative AICw ≤ 0.95
#
##########################################
top.models1<-get.models(mm1,subset=cumsum(mm1$weight)<=0.95)
top.models2<-get.models(mm2,subset=mm2$delta<4)
##########################################
#Step 5 - Multi-model inference (part 3)
#
#Once we have selected our top models, we can avergae parameter estimates from top models to better understand their importance.
#
#There are two methods to average model coefficients and their errors:
#The natural method provides a weighted mean for each variable coefficient and its errors, based on AICw values, but only,
#if this variable occurs in a model. Variables not occurring in a model are neglected. In contrast, the zero method
#calculates a weighted mean for coefficients and errors, while substituting variable coefficients absent in a model by zero.
#
#Note that the argument revised.var=TRUE provides a corrected estimation of the standard errors. The function
#summary (mm1.av) provides the model outputs for the natural (called subset/conditional) and zero methods (called full), including averaged
#model parameters (SES), errors and significance
##########################################
mm1.av<-model.avg(get.models(mm1,subset=cumsum(mm1$weight)<=0.95), revised.var=T)
mm2.av<-model.avg(get.models(mm2,subset=mm2$delta<8), revised.var=T)
summary(mm1.av)
summary(mm2.av)
xyplot(trenddata$HBI~trenddata$MonYr|trenddata$Zone,groups=trenddata$Site)
xyplot(trenddata$EPT.Richness~trenddata$MonYr|trenddata$Zone,groups=trenddata$Site)
xyplot(fitted(watershed.model2)~trenddata$MonYr|trenddata$Zone,groups=trenddata$Site)
##########################################
#Step 6 - Interpreting parameter estimates
#
#For lmer models, parameter interpretation is fairly easy if you fit each level of the model with its own intercept term
#(i.e. response~1+predictors, vs. response~0+predictors).
summary(mm1.av)
confint(mm1.av,full=F)
#There is strong evidence for differences between zones (L=6.7,M=5.4,U=5.7) - parameter estimates can be treated as model derived averages
#
#Because monyr*zone interactions were significant in at least one of the selected models, there is no global slope
#parameter estimate, only zone specific slopes. The parameter estimate "MonYr" is the slope in the reference
#treatment (i.e. the first in the list or L here). Here HBI decreases by 0.02 per year in the Lower zone. Parameter estimates in
#the other 2 zones are also based on the slope in the reference zone (i.e. the slope in the upper zone = "MonYr" + "MonYr:ZoneM").
#Using the more conservative full average, U zone slope is = -0.024848+-0.003120 = -0.027968, and M zone slope is = -0.024848 + 0.003259 = -0.021589
#U and M zones slopes are not significantly different from the refernce zone (L) slope, and the relative importance of the MonYr*Zone interaction
#is quite low. This indicates zonal differences in slope are likely not signifcant.
#
#The ecological interpretation is that there are significant differences in HBI values between zones. There is also evidence for a slight
#decline in HBI values across the watershed (~0.02 per year). This trend was not significantly different in different parts of the watershed.
#
#NOT: If you did not fit a intercept term to each level of the model (i.e. response~1+predictors), then grouping parameters need to be treated like
#slope was above (i.e. each level of the treatment is relative to the reference group)
#
###############
#
#For glmer models, interpretation is a little more dificult.
summary(mm2.av)
confint(mm1.av,full=F)
#Group/intercept estimates need to be back transformed based on the family used in the model. For default: Poisson use exp(x), for binomial use 1/(1+1/exp(x)).
coef(mm2.av,full=F)[c("ZoneL","ZoneM","ZoneU")]
exp(coef(mm2.av,full=F)[c("ZoneL","ZoneM","ZoneU")])
#The average number of EPT taxa is 3.6 in the L zone, and ~6.4 for M and U. Significant differences detected between zones.
#
#Slope estimates are interpreted as % changes when raw and as multiplicative factors when exponentiated.
coef(mm2.av,full=F)["MonYr"]
coef(mm2.av,full=F)["MonYr"]+coef(mm2.av,full=F)["MonYr:ZoneU"]
exp(coef(mm2.av,full=F)["MonYr"])
exp(coef(mm2.av,full=F)["MonYr"]+coef(mm2.av,full=F)["MonYr:ZoneU"])
#In the L zone, EPT richness is decreasing by ~1% every year or multiplied by ~0.99. Alternatively, in the upper zone,
#richness is increasing by ~2% or multiplied by 1.02. In this case, the model with MonYr and the MonYr*Zone interaction performed
#considerably better than the model without, resulting in higher relative importance of the temporal trend, and different trends between zones.
#
#Ecological Interpretation: There are significant differences in intolerant richness between zones. There is evidence for different trends in
#different parts of the watershed. Specifically, it appears the U zone is increasing in intolerants richness relative to M and L zones.
#
##########################################