knitr::opts_chunk$set(warning = FALSE, message = FALSE)
rm(list = ls())
library(sf)
library(dplyr)
library(ggplot2)
library(cowplot)
library(mgcv)
library(car)
library(ggpubr)
library(gridExtra)
library(GGally)
library(e1071)
library(tidyr)
# Assuming `gam_pond_simplified` is your fitted GAM model
outdir <- "H:/My Drive/Projects/Beavers/Beaver_EarthEngine/RFanalysis_clusters/outputs"
outfig <- "H:/My Drive/Manuscript/BeavePond_westernUS/Figs"
load(paste0(outdir,"/merged_pond_var_update.RData"))
names(merged_pond_var_update)
## [1] "ClusterID" "Clay_median"
## [3] "Thetas_median" "woody_cover_percent"
## [5] "woody_height" "NHDPlusHR_Flowline_Length_m"
## [7] "MainChannelSlope" "ValleyBottWidth"
## [9] "upa_km2" "CTI_max"
## [11] "SPI_median" "STI_median"
## [13] "Max_Summer_Temp_C" "Total_Spring_Precip_mm"
## [15] "Total_Precip_Annual_mm" "Max_SWE_March_snodas_m"
## [17] "DamLenMcluster" "pondAream2cluster"
merged_pond_var <- merged_pond_var_update # in seek of convenience
names(merged_pond_var)
## [1] "ClusterID" "Clay_median"
## [3] "Thetas_median" "woody_cover_percent"
## [5] "woody_height" "NHDPlusHR_Flowline_Length_m"
## [7] "MainChannelSlope" "ValleyBottWidth"
## [9] "upa_km2" "CTI_max"
## [11] "SPI_median" "STI_median"
## [13] "Max_Summer_Temp_C" "Total_Spring_Precip_mm"
## [15] "Total_Precip_Annual_mm" "Max_SWE_March_snodas_m"
## [17] "DamLenMcluster" "pondAream2cluster"
# change name
names(merged_pond_var) <- c("ClusterID","ClayMedian","ThetasMedian","WoodyCover","WoodyHeight","FlowlineLength",
"MainChannelSlope","ValleyBottWidth","UPA","CTImax","SPImedian",
"STImedian","SummerTempMax","SpringPrecip","AnnualPrecip","SWEMarchMax","DamLength","PondArea")
merged_pond_var_select <- merged_pond_var[,c("ClusterID","WoodyHeight","SPImedian","DamLength","PondArea")]
predictors_all <- merged_pond_var[,c("ClayMedian","ThetasMedian","WoodyCover","WoodyHeight","FlowlineLength",
"MainChannelSlope","ValleyBottWidth","UPA","CTImax","SPImedian",
"STImedian","SummerTempMax","SpringPrecip","AnnualPrecip","SWEMarchMax","DamLength")]
# wide to long format
predictors_all_long <- predictors_all %>%
pivot_longer(
cols = c("ClayMedian","ThetasMedian","WoodyCover","WoodyHeight","FlowlineLength",
"MainChannelSlope","ValleyBottWidth","UPA","CTImax","SPImedian",
"STImedian","SummerTempMax","SpringPrecip","AnnualPrecip","SWEMarchMax","DamLength"),
names_to = c("Variable"), # New column names
values_to = "Value" # Name of the new value column
)
# Manually set the order of the facets
predictors_all_long$Variable <- factor(predictors_all_long$Variable,
levels = c("ClayMedian","ThetasMedian","WoodyCover","WoodyHeight","FlowlineLength",
"MainChannelSlope","ValleyBottWidth","UPA","CTImax","SPImedian",
"STImedian","SummerTempMax","SpringPrecip","AnnualPrecip","SWEMarchMax","DamLength"))
# remove cluster ID
names(merged_pond_var)
## [1] "ClusterID" "ClayMedian" "ThetasMedian" "WoodyCover"
## [5] "WoodyHeight" "FlowlineLength" "MainChannelSlope" "ValleyBottWidth"
## [9] "UPA" "CTImax" "SPImedian" "STImedian"
## [13] "SummerTempMax" "SpringPrecip" "AnnualPrecip" "SWEMarchMax"
## [17] "DamLength" "PondArea"
merged_pond_allx_y <- merged_pond_var[,c("ClayMedian","ThetasMedian","WoodyCover","WoodyHeight","FlowlineLength",
"MainChannelSlope","ValleyBottWidth","UPA","CTImax","SPImedian",
"STImedian","SummerTempMax","SpringPrecip","AnnualPrecip","SWEMarchMax","DamLength","PondArea")]
necessary for both linear and non-linear models when your goal is to explore which factors are impacting ponding size and interpret individual predictors.
# VIF is less critical if goal is pure prediction
names(merged_pond_allx_y)
## [1] "ClayMedian" "ThetasMedian" "WoodyCover" "WoodyHeight"
## [5] "FlowlineLength" "MainChannelSlope" "ValleyBottWidth" "UPA"
## [9] "CTImax" "SPImedian" "STImedian" "SummerTempMax"
## [13] "SpringPrecip" "AnnualPrecip" "SWEMarchMax" "DamLength"
## [17] "PondArea"
merged_pond_allx_y$log_PondArea <- log(merged_pond_allx_y$PondArea) # natural logarithms
names(merged_pond_allx_y)
## [1] "ClayMedian" "ThetasMedian" "WoodyCover" "WoodyHeight"
## [5] "FlowlineLength" "MainChannelSlope" "ValleyBottWidth" "UPA"
## [9] "CTImax" "SPImedian" "STImedian" "SummerTempMax"
## [13] "SpringPrecip" "AnnualPrecip" "SWEMarchMax" "DamLength"
## [17] "PondArea" "log_PondArea"
lm_model <- lm(PondArea ~ ., data = merged_pond_allx_y)
vif_values <- vif(lm_model)
vif_values_df <- as.data.frame(vif_values)
print(vif_values)
## ClayMedian ThetasMedian WoodyCover WoodyHeight
## 3.781273 4.126833 2.260386 2.624755
## FlowlineLength MainChannelSlope ValleyBottWidth UPA
## 2.556603 2.818023 2.343186 3.473111
## CTImax SPImedian STImedian SummerTempMax
## 5.838891 1.426137 1.657281 5.337581
## SpringPrecip AnnualPrecip SWEMarchMax DamLength
## 8.122911 4.009046 5.798889 4.033476
## log_PondArea
## 2.704626
# unselect pond area as we use log of pond area
merged_pond_allx_logY <- merged_pond_allx_y[,c("ClayMedian","ThetasMedian","WoodyCover","WoodyHeight","FlowlineLength",
"MainChannelSlope","ValleyBottWidth","UPA","CTImax","SPImedian",
"STImedian","SummerTempMax","SpringPrecip","AnnualPrecip","SWEMarchMax","DamLength","log_PondArea")]
lm_model_log <- lm(log_PondArea ~ ., data = merged_pond_allx_logY)
vif_values_log <- vif(lm_model_log)
vif_values_log <- as.data.frame(vif_values_log)
# Use scaled predictors if their magnitudes differ significantly.
# Define the min-max normalization function to scale data to 0-100
normalize_0_100 <- function(x) {
return ((x - min(x)) / (max(x) - min(x)) * 100)
}
# Separate the response variable and predictors
names(merged_pond_allx_logY)
## [1] "ClayMedian" "ThetasMedian" "WoodyCover" "WoodyHeight"
## [5] "FlowlineLength" "MainChannelSlope" "ValleyBottWidth" "UPA"
## [9] "CTImax" "SPImedian" "STImedian" "SummerTempMax"
## [13] "SpringPrecip" "AnnualPrecip" "SWEMarchMax" "DamLength"
## [17] "log_PondArea"
response <- merged_pond_allx_logY$log_PondArea
predictors <- merged_pond_allx_logY[, !(names(merged_pond_allx_logY) %in% "log_PondArea")]
names(predictors)
## [1] "ClayMedian" "ThetasMedian" "WoodyCover" "WoodyHeight"
## [5] "FlowlineLength" "MainChannelSlope" "ValleyBottWidth" "UPA"
## [9] "CTImax" "SPImedian" "STImedian" "SummerTempMax"
## [13] "SpringPrecip" "AnnualPrecip" "SWEMarchMax" "DamLength"
# Apply the min-max normalization to scale the predictors to 0-100
scaled_predictors_0_100 <- as.data.frame(lapply(predictors, normalize_0_100))
# Combine the scaled predictors with the original response variable
merged_pond_allx_logY_scaled_0_100 <- cbind(scaled_predictors_0_100, log_PondArea = response)
# View the resulting dataframe
names(merged_pond_allx_logY_scaled_0_100)
## [1] "ClayMedian" "ThetasMedian" "WoodyCover" "WoodyHeight"
## [5] "FlowlineLength" "MainChannelSlope" "ValleyBottWidth" "UPA"
## [9] "CTImax" "SPImedian" "STImedian" "SummerTempMax"
## [13] "SpringPrecip" "AnnualPrecip" "SWEMarchMax" "DamLength"
## [17] "log_PondArea"
head(merged_pond_allx_logY_scaled_0_100)
## ClayMedian ThetasMedian WoodyCover WoodyHeight FlowlineLength
## 1 34.29783 2.701224 99.90321 79.06769 9.198218
## 2 32.88167 2.249796 100.00000 86.23323 21.333077
## 3 37.03081 9.547493 83.75932 65.58021 23.992004
## 4 37.04036 12.977138 94.52426 66.71171 100.000000
## 5 32.35430 2.172239 80.85986 72.89917 9.035877
## 6 33.89057 1.546819 90.69703 84.92204 22.891123
## MainChannelSlope ValleyBottWidth UPA CTImax SPImedian STImedian
## 1 30.35212 55.02588 3.16497634 53.28812 2.64401671 35.14212
## 2 39.09153 72.72291 3.16360280 51.37204 7.03225968 52.02101
## 3 14.40434 66.96410 3.17184406 70.44844 5.26346020 22.99742
## 4 5.73680 91.06715 6.51495481 80.24080 0.00000000 7.49354
## 5 18.14229 13.49863 0.01861389 33.21678 0.02457321 12.91990
## 6 24.54358 62.26085 4.60572900 39.10227 7.44632979 36.69251
## SummerTempMax SpringPrecip AnnualPrecip SWEMarchMax DamLength log_PondArea
## 1 59.58284 32.64623 56.61523 89.77432 22.231690 8.826174
## 2 57.53816 35.32843 62.07297 100.00000 63.220050 9.825994
## 3 59.83798 32.16254 54.84491 84.37797 34.217803 8.766188
## 4 62.80078 28.75636 47.88093 85.15618 86.887273 10.599311
## 5 63.31363 23.27517 44.70349 87.26372 9.047785 8.542570
## 6 69.07890 15.14048 30.00758 87.69873 18.605815 7.899021
# for convenience
pondFinal <- merged_pond_allx_logY_scaled_0_100
sum(is.na(pondFinal)) # Should be 0 after na.omit if previously handled
## [1] 0
# Fit the refined GAM without the interaction term
names(pondFinal)
## [1] "ClayMedian" "ThetasMedian" "WoodyCover" "WoodyHeight"
## [5] "FlowlineLength" "MainChannelSlope" "ValleyBottWidth" "UPA"
## [9] "CTImax" "SPImedian" "STImedian" "SummerTempMax"
## [13] "SpringPrecip" "AnnualPrecip" "SWEMarchMax" "DamLength"
## [17] "log_PondArea"
gam_pond <- gam(
log_PondArea ~
s(ClayMedian) +
s(ThetasMedian) +
s(WoodyCover) +
s(WoodyHeight) +
s(FlowlineLength) +
s(MainChannelSlope)+
s(ValleyBottWidth) +
s(UPA) +
s(CTImax) +
s(SPImedian) +
s(STImedian) +
s(SummerTempMax) +
s(SpringPrecip) +
s(AnnualPrecip) +
s(SWEMarchMax) +
s(DamLength),
data = pondFinal,
method = "REML"
)
# Summarize the refined model
summary(gam_pond)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log_PondArea ~ s(ClayMedian) + s(ThetasMedian) + s(WoodyCover) +
## s(WoodyHeight) + s(FlowlineLength) + s(MainChannelSlope) +
## s(ValleyBottWidth) + s(UPA) + s(CTImax) + s(SPImedian) +
## s(STImedian) + s(SummerTempMax) + s(SpringPrecip) + s(AnnualPrecip) +
## s(SWEMarchMax) + s(DamLength)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.08456 0.05662 142.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(ClayMedian) 1.000 1.000 0.743 0.392162
## s(ThetasMedian) 1.455 1.797 0.147 0.844672
## s(WoodyCover) 2.586 3.229 0.880 0.538247
## s(WoodyHeight) 6.430 7.538 4.407 0.000946 ***
## s(FlowlineLength) 1.000 1.000 0.001 0.976094
## s(MainChannelSlope) 2.166 2.607 0.489 0.647521
## s(ValleyBottWidth) 1.000 1.000 0.617 0.435226
## s(UPA) 1.000 1.000 0.280 0.599009
## s(CTImax) 1.061 1.114 0.149 0.739770
## s(SPImedian) 1.000 1.000 4.256 0.043593 *
## s(STImedian) 1.000 1.000 0.183 0.670679
## s(SummerTempMax) 1.000 1.000 0.861 0.357175
## s(SpringPrecip) 1.000 1.000 1.549 0.218312
## s(AnnualPrecip) 1.000 1.000 0.243 0.623891
## s(SWEMarchMax) 1.000 1.000 2.001 0.162541
## s(DamLength) 4.547 5.440 16.891 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.807 Deviance explained = 87%
## -REML = 102.89 Scale est. = 0.2789 n = 87
# Simplifying a GAM by retaining only the significant factors helps improve interpretability while maintaining predictive power.
# Refit the GAM with significant predictors
gam_pond_simplified <- gam(
log_PondArea ~
s(WoodyHeight) +
s(DamLength) +
s(SPImedian),
data = pondFinal,
method = "REML"
)
# Summarize the refined model
summary(gam_pond_simplified)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log_PondArea ~ s(WoodyHeight) + s(DamLength) + s(SPImedian)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.08456 0.06555 123.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(WoodyHeight) 6.299 7.424 4.370 0.000361 ***
## s(DamLength) 4.950 5.944 28.903 < 2e-16 ***
## s(SPImedian) 2.280 2.705 4.408 0.012177 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.741 Deviance explained = 78.1%
## -REML = 100.37 Scale est. = 0.37381 n = 87
# plot(gam_pond_simplified, pages = 1, se = TRUE)
AIC(gam_pond, gam_pond_simplified)
## df AIC
## gam_pond 33.72506 167.6106
## gam_pond_simplified 17.47484 180.3388
# Plotting all smooth terms and residuals
# par(mfrow = c(2, 2))
# plot(gam_pond_simplified, pages = 1, shade = TRUE, seWithMean = TRUE)
# gam.check(gam_pond_simplified)
# Your model has successfully converged using the REML method with the Outer Newton optimizer. The small gradient values and positive definite Hessian indicate a stable and reliable solution. The full model rank confirms that all predictors contribute uniquely without redundancy.
predicted <- predict(gam_pond_simplified, newdata = pondFinal, type = "response")
# plot(gam_pond_simplified, pages = 1, se = TRUE)
# Extract the summary from the model
gam_pond_simplified_summary <- summary(gam_pond_simplified)
# Extract the relevant data for each smooth term
edf <- gam_pond_simplified_summary$s.table[, "edf"]
f_value <- gam_pond_simplified_summary$s.table[, "F"]
p_value <- gam_pond_simplified_summary$s.table[, "p-value"]
# Generate significance levels based on p-values
significance <- ifelse(p_value < 0.001, "***",
ifelse(p_value < 0.01, "**",
ifelse(p_value < 0.05, "*",
ifelse(p_value < 0.1, ".", " "))))
# Create a data frame with predictor names, EDF, F-values, and significance levels
importance_df_pond <- data.frame(
Predictor = rownames(gam_pond_simplified_summary$s.table),
F_value = f_value,
EDF = edf,
Significance = significance
)
pond_order <- ggplot(importance_df_pond, aes(x = reorder(Predictor, F_value), y = F_value)) +
geom_bar(stat = "identity", fill = "#0070FF", width = 0.5) +
coord_flip() +
geom_text(aes(label = paste("EDF =", round(EDF, 2), Significance)), hjust = -0.05, size = 3) +
labs(title = "(a) Factor contributions to log-transformed pond area",
y = "F-value") +
theme_bw(base_size = 12) +
scale_y_continuous(breaks = seq(0, 30, by = 5), # Ticks every 5 units
expand = expansion(mult = c(0, 0.2))) + # Extra space for labels
theme(plot.title = element_text(hjust = 0.5),
axis.title.y = element_blank())
# pond_order
names(pondFinal)
## [1] "ClayMedian" "ThetasMedian" "WoodyCover" "WoodyHeight"
## [5] "FlowlineLength" "MainChannelSlope" "ValleyBottWidth" "UPA"
## [9] "CTImax" "SPImedian" "STImedian" "SummerTempMax"
## [13] "SpringPrecip" "AnnualPrecip" "SWEMarchMax" "DamLength"
## [17] "log_PondArea"
# Fit the refined GAM without the interaction term
gam_model_noDam <- gam(
log_PondArea ~
s(ClayMedian) +
s(ThetasMedian) +
s(WoodyCover) +
s(WoodyHeight) +
s(FlowlineLength) +
s(MainChannelSlope) +
s(ValleyBottWidth) +
s(UPA) +
s(CTImax) +
s(SPImedian) +
s(STImedian) +
s(SummerTempMax) +
s(SpringPrecip) +
s(AnnualPrecip) +
s(SWEMarchMax),
data = pondFinal,
method = "REML"
)
# Summarize the refined model
summary(gam_model_noDam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log_PondArea ~ s(ClayMedian) + s(ThetasMedian) + s(WoodyCover) +
## s(WoodyHeight) + s(FlowlineLength) + s(MainChannelSlope) +
## s(ValleyBottWidth) + s(UPA) + s(CTImax) + s(SPImedian) +
## s(STImedian) + s(SummerTempMax) + s(SpringPrecip) + s(AnnualPrecip) +
## s(SWEMarchMax)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.08456 0.08122 99.54 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(ClayMedian) 1.000 1.000 1.266 0.265
## s(ThetasMedian) 1.575 1.963 0.275 0.752
## s(WoodyCover) 1.720 2.160 0.318 0.660
## s(WoodyHeight) 1.000 1.000 2.051 0.157
## s(FlowlineLength) 2.457 3.015 14.765 2.41e-07 ***
## s(MainChannelSlope) 2.202 2.654 0.503 0.732
## s(ValleyBottWidth) 1.000 1.000 0.900 0.347
## s(UPA) 2.125 2.624 1.509 0.304
## s(CTImax) 1.000 1.000 0.691 0.409
## s(SPImedian) 1.496 1.802 0.080 0.924
## s(STImedian) 4.151 5.091 1.503 0.235
## s(SummerTempMax) 1.000 1.000 0.009 0.925
## s(SpringPrecip) 1.000 1.000 0.003 0.954
## s(AnnualPrecip) 1.000 1.000 0.773 0.383
## s(SWEMarchMax) 2.212 2.711 0.713 0.641
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.602 Deviance explained = 71.7%
## -REML = 121.48 Scale est. = 0.57392 n = 87
# log dam length
names(merged_pond_allx_y)
## [1] "ClayMedian" "ThetasMedian" "WoodyCover" "WoodyHeight"
## [5] "FlowlineLength" "MainChannelSlope" "ValleyBottWidth" "UPA"
## [9] "CTImax" "SPImedian" "STImedian" "SummerTempMax"
## [13] "SpringPrecip" "AnnualPrecip" "SWEMarchMax" "DamLength"
## [17] "PondArea" "log_PondArea"
# Calculate skewness for DamLenMcluster
skewness_value <- skewness(merged_pond_allx_y$DamLength, na.rm = TRUE)
print(paste("Skewness of Dam Length: ", skewness_value))
## [1] "Skewness of Dam Length: 3.02584821891829"
merged_pond_allx_y$log_DamLength <- log(merged_pond_allx_y$DamLength)
# Separate the response variable and predictors
response <- merged_pond_allx_y$log_DamLength
names(merged_pond_allx_y)
## [1] "ClayMedian" "ThetasMedian" "WoodyCover" "WoodyHeight"
## [5] "FlowlineLength" "MainChannelSlope" "ValleyBottWidth" "UPA"
## [9] "CTImax" "SPImedian" "STImedian" "SummerTempMax"
## [13] "SpringPrecip" "AnnualPrecip" "SWEMarchMax" "DamLength"
## [17] "PondArea" "log_PondArea" "log_DamLength"
predictors <- merged_pond_allx_y[, !(names(merged_pond_allx_y) %in% c("DamLength","log_DamLength",
"PondArea","log_PondArea"))]
names(predictors)
## [1] "ClayMedian" "ThetasMedian" "WoodyCover" "WoodyHeight"
## [5] "FlowlineLength" "MainChannelSlope" "ValleyBottWidth" "UPA"
## [9] "CTImax" "SPImedian" "STImedian" "SummerTempMax"
## [13] "SpringPrecip" "AnnualPrecip" "SWEMarchMax"
# Apply the min-max normalization to scale the predictors to 0-100
scaled_predictors_0_100 <- as.data.frame(lapply(predictors, normalize_0_100))
# Combine the scaled predictors with the original response variable
merged_pond_allx_logY_scaled_0_100 <- cbind(scaled_predictors_0_100, log_DamLength = response)
# View the resulting dataframe
head(merged_pond_allx_logY_scaled_0_100)
## ClayMedian ThetasMedian WoodyCover WoodyHeight FlowlineLength
## 1 34.29783 2.701224 99.90321 79.06769 9.198218
## 2 32.88167 2.249796 100.00000 86.23323 21.333077
## 3 37.03081 9.547493 83.75932 65.58021 23.992004
## 4 37.04036 12.977138 94.52426 66.71171 100.000000
## 5 32.35430 2.172239 80.85986 72.89917 9.035877
## 6 33.89057 1.546819 90.69703 84.92204 22.891123
## MainChannelSlope ValleyBottWidth UPA CTImax SPImedian STImedian
## 1 30.35212 55.02588 3.16497634 53.28812 2.64401671 35.14212
## 2 39.09153 72.72291 3.16360280 51.37204 7.03225968 52.02101
## 3 14.40434 66.96410 3.17184406 70.44844 5.26346020 22.99742
## 4 5.73680 91.06715 6.51495481 80.24080 0.00000000 7.49354
## 5 18.14229 13.49863 0.01861389 33.21678 0.02457321 12.91990
## 6 24.54358 62.26085 4.60572900 39.10227 7.44632979 36.69251
## SummerTempMax SpringPrecip AnnualPrecip SWEMarchMax log_DamLength
## 1 59.58284 32.64623 56.61523 89.77432 6.964924
## 2 57.53816 35.32843 62.07297 100.00000 7.982165
## 3 59.83798 32.16254 54.84491 84.37797 7.381195
## 4 62.80078 28.75636 47.88093 85.15618 8.295972
## 5 63.31363 23.27517 44.70349 87.26372 6.125848
## 6 69.07890 15.14048 30.00758 87.69873 6.795105
# # Linear regression using scaled predictors
# model_normalize <- lm(log_DamLength ~ ., data = merged_pond_allx_logY_scaled_0_100)
#
# # Summary of the model
# summary(model_normalize)
# Fit the GAM with log-transformed dam length
names(merged_pond_allx_logY_scaled_0_100)
## [1] "ClayMedian" "ThetasMedian" "WoodyCover" "WoodyHeight"
## [5] "FlowlineLength" "MainChannelSlope" "ValleyBottWidth" "UPA"
## [9] "CTImax" "SPImedian" "STImedian" "SummerTempMax"
## [13] "SpringPrecip" "AnnualPrecip" "SWEMarchMax" "log_DamLength"
# for convenience
damFinal <- merged_pond_allx_logY_scaled_0_100
sum(is.na(damFinal)) # Should be 0 after na.omit if previously handled
## [1] 0
# linear
# Linear regression using scaled predictors
lm_dam <- lm(log_DamLength ~ ., data = damFinal)
# Summary of the model
summary(lm_dam)
##
## Call:
## lm(formula = log_DamLength ~ ., data = damFinal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.48972 -0.36130 0.00599 0.39598 1.12182
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.3909802 1.0653344 4.122 0.000101 ***
## ClayMedian -0.0133187 0.0051462 -2.588 0.011697 *
## ThetasMedian -0.0153722 0.0054834 -2.803 0.006515 **
## WoodyCover -0.0010211 0.0041070 -0.249 0.804359
## WoodyHeight 0.0058499 0.0043056 1.359 0.178556
## FlowlineLength 0.0307176 0.0042019 7.310 3.16e-10 ***
## MainChannelSlope 0.0169841 0.0075195 2.259 0.026977 *
## ValleyBottWidth 0.0149132 0.0038903 3.833 0.000271 ***
## UPA -0.0111440 0.0062542 -1.782 0.079050 .
## CTImax 0.0058328 0.0087375 0.668 0.506580
## SPImedian -0.0020626 0.0053996 -0.382 0.703614
## STImedian 0.0030488 0.0043080 0.708 0.481449
## SummerTempMax 0.0126297 0.0076264 1.656 0.102123
## SpringPrecip -0.0009297 0.0074973 -0.124 0.901661
## AnnualPrecip 0.0076791 0.0066095 1.162 0.249196
## SWEMarchMax -0.0074151 0.0051926 -1.428 0.157669
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5986 on 71 degrees of freedom
## Multiple R-squared: 0.6964, Adjusted R-squared: 0.6323
## F-statistic: 10.86 on 15 and 71 DF, p-value: 5.218e-13
# gam
gam_dam <- gam(
log_DamLength ~
s(ClayMedian) +
s(ThetasMedian) +
s(WoodyCover) +
s(WoodyHeight) +
s(FlowlineLength) +
s(MainChannelSlope) +
s(ValleyBottWidth) +
s(UPA) +
s(CTImax) +
s(SPImedian) +
s(STImedian) +
s(SummerTempMax) +
s(SpringPrecip) +
s(AnnualPrecip) +
s(SWEMarchMax),
data = damFinal,
method = "REML"
)
# Summarize the transformed model
summary(gam_dam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log_DamLength ~ s(ClayMedian) + s(ThetasMedian) + s(WoodyCover) +
## s(WoodyHeight) + s(FlowlineLength) + s(MainChannelSlope) +
## s(ValleyBottWidth) + s(UPA) + s(CTImax) + s(SPImedian) +
## s(STImedian) + s(SummerTempMax) + s(SpringPrecip) + s(AnnualPrecip) +
## s(SWEMarchMax)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.925 0.046 128.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(ClayMedian) 2.738 3.427 4.364 0.007836 **
## s(ThetasMedian) 2.262 2.800 0.231 0.888512
## s(WoodyCover) 1.000 1.000 0.187 0.666910
## s(WoodyHeight) 1.000 1.000 0.918 0.341852
## s(FlowlineLength) 3.017 3.676 19.339 < 2e-16 ***
## s(MainChannelSlope) 1.833 2.206 4.743 0.016170 *
## s(ValleyBottWidth) 1.000 1.000 7.688 0.007461 **
## s(UPA) 3.272 3.947 6.577 0.000321 ***
## s(CTImax) 1.000 1.000 6.057 0.016850 *
## s(SPImedian) 1.000 1.000 1.604 0.210385
## s(STImedian) 4.674 5.667 2.483 0.068117 .
## s(SummerTempMax) 1.000 1.000 1.902 0.173198
## s(SpringPrecip) 1.096 1.176 0.804 0.340079
## s(AnnualPrecip) 1.000 1.001 10.210 0.002256 **
## s(SWEMarchMax) 1.739 2.117 0.238 0.842493
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.811 Deviance explained = 87.2%
## -REML = 84.924 Scale est. = 0.18407 n = 87
# AIC and BIC
AIC(lm_dam, gam_dam) # gam should be chosen
## df AIC
## lm_dam 17.00000 173.9276
## gam_dam 34.01632 132.9637
BIC(lm_dam, gam_dam) # difference is 1 BIC has a stronger emphasis on simplicity.
## df BIC
## lm_dam 17.00000 215.8481
## gam_dam 34.01632 216.8448
# Fit a Generalized Additive Model (GAM) with log-transformed dam length
# using significant predictors
gam_dam_simplified <- gam(
log_DamLength ~
s(ClayMedian) +
s(FlowlineLength) +
s(MainChannelSlope) +
s(ValleyBottWidth) +
s(UPA) +
s(CTImax) +
s(AnnualPrecip),
data = damFinal,
method = "REML"
)
# Summarize the transformed model
summary(gam_dam_simplified)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log_DamLength ~ s(ClayMedian) + s(FlowlineLength) + s(MainChannelSlope) +
## s(ValleyBottWidth) + s(UPA) + s(CTImax) + s(AnnualPrecip)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.92498 0.05242 113 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(ClayMedian) 2.497 3.127 3.259 0.025323 *
## s(FlowlineLength) 2.921 3.591 25.520 < 2e-16 ***
## s(MainChannelSlope) 1.000 1.000 6.384 0.013748 *
## s(ValleyBottWidth) 1.002 1.004 25.321 3.72e-06 ***
## s(UPA) 3.492 4.250 6.342 0.000153 ***
## s(CTImax) 1.045 1.085 4.897 0.031073 *
## s(AnnualPrecip) 2.961 3.685 4.687 0.002589 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.755 Deviance explained = 79.7%
## -REML = 80.155 Scale est. = 0.2391 n = 87
model_data <- model.frame(gam_dam_simplified)
summary(gam_dam_simplified)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log_DamLength ~ s(ClayMedian) + s(FlowlineLength) + s(MainChannelSlope) +
## s(ValleyBottWidth) + s(UPA) + s(CTImax) + s(AnnualPrecip)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.92498 0.05242 113 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(ClayMedian) 2.497 3.127 3.259 0.025323 *
## s(FlowlineLength) 2.921 3.591 25.520 < 2e-16 ***
## s(MainChannelSlope) 1.000 1.000 6.384 0.013748 *
## s(ValleyBottWidth) 1.002 1.004 25.321 3.72e-06 ***
## s(UPA) 3.492 4.250 6.342 0.000153 ***
## s(CTImax) 1.045 1.085 4.897 0.031073 *
## s(AnnualPrecip) 2.961 3.685 4.687 0.002589 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.755 Deviance explained = 79.7%
## -REML = 80.155 Scale est. = 0.2391 n = 87
gam_test <- gam(
log_DamLength ~
s(ClayMedian) +
s(FlowlineLength) +
ti(ClayMedian,FlowlineLength),
family = gaussian(),
method = "REML",
data = damFinal
)
# Print model summary
summary(gam_test)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log_DamLength ~ s(ClayMedian) + s(FlowlineLength) + ti(ClayMedian,
## FlowlineLength)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.92028 0.08048 73.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(ClayMedian) 2.603 3.271 3.484 0.0168 *
## s(FlowlineLength) 1.000 1.001 44.288 <2e-16 ***
## ti(ClayMedian,FlowlineLength) 1.001 1.001 2.275 0.1352
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.423 Deviance explained = 45.4%
## -REML = 101.53 Scale est. = 0.56269 n = 87
# Print all terms in s.table
print(rownames(summary(gam_test)$s.table))
## [1] "s(ClayMedian)" "s(FlowlineLength)"
## [3] "ti(ClayMedian,FlowlineLength)"
# Extract interaction term values manually
interaction_index <- which(rownames(summary(gam_test)$s.table) == "ti(ClayMedian,FlowlineLength)")
if (length(interaction_index) > 0) {
interaction_row <- summary(gam_test)$s.table[interaction_index, ]
print(interaction_row) # Check the actual values for edf, F, and p-value
} else {
print("Interaction term not found in model summary.")
}
## edf Ref.df F p-value
## 1.0005592 1.0011051 2.2748559 0.1352136
gam_final <- gam(
log_DamLength ~
s(ClayMedian) +
s(FlowlineLength) +
s(MainChannelSlope) +
s(ValleyBottWidth) +
s(UPA) +
s(CTImax) +
s(AnnualPrecip) +
ti(ClayMedian,UPA) +
ti(ClayMedian,AnnualPrecip) +
ti(FlowlineLength,MainChannelSlope) +
ti(ValleyBottWidth,CTImax),
family = gaussian(),
method = "REML",
data = damFinal
)
# Print summary
summary(gam_final)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log_DamLength ~ s(ClayMedian) + s(FlowlineLength) + s(MainChannelSlope) +
## s(ValleyBottWidth) + s(UPA) + s(CTImax) + s(AnnualPrecip) +
## ti(ClayMedian, UPA) + ti(ClayMedian, AnnualPrecip) + ti(FlowlineLength,
## MainChannelSlope) + ti(ValleyBottWidth, CTImax)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.99962 0.08532 70.31 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(ClayMedian) 1.000 1.000 12.236 0.000873 ***
## s(FlowlineLength) 2.467 2.985 17.975 < 2e-16 ***
## s(MainChannelSlope) 1.000 1.000 2.502 0.118807
## s(ValleyBottWidth) 1.037 1.065 39.333 < 2e-16 ***
## s(UPA) 3.113 3.803 4.640 0.001536 **
## s(CTImax) 1.000 1.000 2.228 0.140635
## s(AnnualPrecip) 1.000 1.000 10.793 0.001679 **
## ti(ClayMedian,UPA) 1.000 1.000 0.703 0.405031
## ti(ClayMedian,AnnualPrecip) 5.536 7.265 2.373 0.026231 *
## ti(FlowlineLength,MainChannelSlope) 1.000 1.000 0.029 0.864590
## ti(ValleyBottWidth,CTImax) 6.001 7.883 1.655 0.125327
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.805 Deviance explained = 86%
## -REML = 68.845 Scale est. = 0.19033 n = 87
# ti(ClayMedian, UPA) and ti(FlowlineLength, MainChannelSlope) are NOT significant, could be removed
# Keep the simplified interaction model (gam_final_refined), which includes only significant interactions (ti(ClayMedian, AnnualPrecip) and ti(ValleyBottWidth, CTImax)).
interactive_dam_gam <- gam(
log_DamLength ~
s(ClayMedian) +
s(FlowlineLength) +
s(MainChannelSlope) +
s(ValleyBottWidth) +
s(UPA) +
s(CTImax) +
s(AnnualPrecip) +
ti(ClayMedian,AnnualPrecip) +
ti(ValleyBottWidth,CTImax),
family = gaussian(),
method = "REML",
data = damFinal
)
summary(interactive_dam_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log_DamLength ~ s(ClayMedian) + s(FlowlineLength) + s(MainChannelSlope) +
## s(ValleyBottWidth) + s(UPA) + s(CTImax) + s(AnnualPrecip) +
## ti(ClayMedian, AnnualPrecip) + ti(ValleyBottWidth, CTImax)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.97185 0.06602 90.46 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(ClayMedian) 1.099 1.165 9.200 0.002164 **
## s(FlowlineLength) 2.460 2.988 38.879 < 2e-16 ***
## s(MainChannelSlope) 1.000 1.000 7.739 0.007091 **
## s(ValleyBottWidth) 1.478 1.761 25.023 2.73e-07 ***
## s(UPA) 3.101 3.748 8.008 0.000262 ***
## s(CTImax) 1.000 1.000 2.439 0.123258
## s(AnnualPrecip) 1.000 1.000 10.608 0.001803 **
## ti(ClayMedian,AnnualPrecip) 5.679 7.390 2.792 0.010731 *
## ti(ValleyBottWidth,CTImax) 5.509 7.273 1.921 0.065889 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.805 Deviance explained = 85.6%
## -REML = 73.207 Scale est. = 0.18974 n = 87
# Summarize the transformed model
summary(interactive_dam_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log_DamLength ~ s(ClayMedian) + s(FlowlineLength) + s(MainChannelSlope) +
## s(ValleyBottWidth) + s(UPA) + s(CTImax) + s(AnnualPrecip) +
## ti(ClayMedian, AnnualPrecip) + ti(ValleyBottWidth, CTImax)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.97185 0.06602 90.46 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(ClayMedian) 1.099 1.165 9.200 0.002164 **
## s(FlowlineLength) 2.460 2.988 38.879 < 2e-16 ***
## s(MainChannelSlope) 1.000 1.000 7.739 0.007091 **
## s(ValleyBottWidth) 1.478 1.761 25.023 2.73e-07 ***
## s(UPA) 3.101 3.748 8.008 0.000262 ***
## s(CTImax) 1.000 1.000 2.439 0.123258
## s(AnnualPrecip) 1.000 1.000 10.608 0.001803 **
## ti(ClayMedian,AnnualPrecip) 5.679 7.390 2.792 0.010731 *
## ti(ValleyBottWidth,CTImax) 5.509 7.273 1.921 0.065889 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.805 Deviance explained = 85.6%
## -REML = 73.207 Scale est. = 0.18974 n = 87
# Extract the summary from the model
gam_summary <- summary(interactive_dam_gam)
# Extract the relevant data for each smooth term
edf <- gam_summary$s.table[, "edf"]
f_value <- gam_summary$s.table[, "F"]
p_value <- gam_summary$s.table[, "p-value"]
# Generate significance levels based on p-values
significance <- ifelse(p_value < 0.001, "***",
ifelse(p_value < 0.01, "**",
ifelse(p_value < 0.05, "*",
ifelse(p_value < 0.1, ".", " "))))
# Create a data frame with predictor names, EDF, F-values, and significance levels
importance_df <- data.frame(
Predictor = rownames(gam_summary$s.table),
F_value = f_value,
EDF = edf,
Significance = significance
)
# Print the importance dataframe for verification
print(importance_df)
## Predictor F_value EDF
## s(ClayMedian) s(ClayMedian) 9.199565 1.099375
## s(FlowlineLength) s(FlowlineLength) 38.878674 2.459791
## s(MainChannelSlope) s(MainChannelSlope) 7.739055 1.000041
## s(ValleyBottWidth) s(ValleyBottWidth) 25.022960 1.478208
## s(UPA) s(UPA) 8.007724 3.101110
## s(CTImax) s(CTImax) 2.439481 1.000068
## s(AnnualPrecip) s(AnnualPrecip) 10.608375 1.000013
## ti(ClayMedian,AnnualPrecip) ti(ClayMedian,AnnualPrecip) 2.792059 5.679249
## ti(ValleyBottWidth,CTImax) ti(ValleyBottWidth,CTImax) 1.920829 5.509033
## Significance
## s(ClayMedian) **
## s(FlowlineLength) ***
## s(MainChannelSlope) **
## s(ValleyBottWidth) ***
## s(UPA) ***
## s(CTImax)
## s(AnnualPrecip) **
## ti(ClayMedian,AnnualPrecip) *
## ti(ValleyBottWidth,CTImax) .
# Plot predictor importance from highest to lowest, with significance levels and EDF
dam_order <- ggplot(importance_df, aes(x = reorder(Predictor, F_value), y = F_value)) +
geom_bar(stat = "identity", fill = "#ff8f00",width = 0.5) +
coord_flip() +
geom_text(aes(label = paste("EDF =", round(EDF, 2), Significance)), hjust = -0.05, size = 3) +
labs(title = "(b) Factor contributions to log-transformed dam length",
# x = "Predictor",
y = "F-value") +
theme_bw(base_size = 12) +
scale_y_continuous(breaks = seq(0, 40, by = 5), # Ticks every 5 units
expand = expansion(mult = c(0, 0.2))) + # Extra space for labels
theme(plot.title = element_text(hjust = 0.5),
axis.title.y = element_blank())
# dam_order
# pond order from summary(gam_pond_simplified)
# dam order from summary(interactive_dam_gam)
x_limit <- c(0, 35) # Set a consistent range across both plots
# Aligning plots with discrete scales
pond_order_aligned <- pond_order +
coord_flip() + # Flip coordinates for horizontal bars
theme(
plot.margin = unit(c(5, 5, 5, 5), "pt") # Add consistent plot margins
)
dam_order_aligned <- dam_order +
coord_flip() + # Flip coordinates for horizontal bars
theme(
plot.margin = unit(c(5, 5, 5, 5), "pt") # Add consistent plot margins
)
combined_plot <- plot_grid(
pond_order_aligned,
dam_order_aligned,
ncol = 1, # Arrange in a single column
rel_heights = c(3, 7), # Proportional heights
align = "v" # Align vertically
)
# Print or save the combined plot
print(combined_plot)
# Save the plot
# fname <- paste0(outfig, '/var_importance_update.png')
# fname <- paste0(outfig, '/var_importance_update.pdf')
# ggsave(filename = fname, plot = combined_plot, dpi = 600, width = 7, units = 'in')