knitr::opts_chunk$set(warning = FALSE, message = FALSE)

rm(list = ls())
library(sf)
library(dplyr)
library(ggplot2)
library(cowplot)
library(mgcv)
library(car)
library(ggpubr)
library(gridExtra)
library(GGally)
library(e1071)
library(tidyr)

# Assuming `gam_pond_simplified` is your fitted GAM model
outdir <- "H:/My Drive/Projects/Beavers/Beaver_EarthEngine/RFanalysis_clusters/outputs"
outfig <- "H:/My Drive/Manuscript/BeavePond_westernUS/Figs"

load(paste0(outdir,"/merged_pond_var_update.RData"))

names(merged_pond_var_update)

##  [1] "ClusterID"                   "Clay_median"                
##  [3] "Thetas_median"               "woody_cover_percent"        
##  [5] "woody_height"                "NHDPlusHR_Flowline_Length_m"
##  [7] "MainChannelSlope"            "ValleyBottWidth"            
##  [9] "upa_km2"                     "CTI_max"                    
## [11] "SPI_median"                  "STI_median"                 
## [13] "Max_Summer_Temp_C"           "Total_Spring_Precip_mm"     
## [15] "Total_Precip_Annual_mm"      "Max_SWE_March_snodas_m"     
## [17] "DamLenMcluster"              "pondAream2cluster"

merged_pond_var <- merged_pond_var_update # in seek of convenience 

names(merged_pond_var)

##  [1] "ClusterID"                   "Clay_median"                
##  [3] "Thetas_median"               "woody_cover_percent"        
##  [5] "woody_height"                "NHDPlusHR_Flowline_Length_m"
##  [7] "MainChannelSlope"            "ValleyBottWidth"            
##  [9] "upa_km2"                     "CTI_max"                    
## [11] "SPI_median"                  "STI_median"                 
## [13] "Max_Summer_Temp_C"           "Total_Spring_Precip_mm"     
## [15] "Total_Precip_Annual_mm"      "Max_SWE_March_snodas_m"     
## [17] "DamLenMcluster"              "pondAream2cluster"

# change name 
names(merged_pond_var) <- c("ClusterID","ClayMedian","ThetasMedian","WoodyCover","WoodyHeight","FlowlineLength",
                                         "MainChannelSlope","ValleyBottWidth","UPA","CTImax","SPImedian",
                                         "STImedian","SummerTempMax","SpringPrecip","AnnualPrecip","SWEMarchMax","DamLength","PondArea")

merged_pond_var_select <- merged_pond_var[,c("ClusterID","WoodyHeight","SPImedian","DamLength","PondArea")]

1. Variable data checking

1.1 sixteen predictors distribution

predictors_all <- merged_pond_var[,c("ClayMedian","ThetasMedian","WoodyCover","WoodyHeight","FlowlineLength",
                                         "MainChannelSlope","ValleyBottWidth","UPA","CTImax","SPImedian",
                                         "STImedian","SummerTempMax","SpringPrecip","AnnualPrecip","SWEMarchMax","DamLength")]

# wide to long format 
predictors_all_long <- predictors_all %>%
  pivot_longer(
    cols = c("ClayMedian","ThetasMedian","WoodyCover","WoodyHeight","FlowlineLength",
                                         "MainChannelSlope","ValleyBottWidth","UPA","CTImax","SPImedian",
                                         "STImedian","SummerTempMax","SpringPrecip","AnnualPrecip","SWEMarchMax","DamLength"),  
    names_to = c("Variable"),  # New column names
    values_to = "Value"  # Name of the new value column
  )

# Manually set the order of the facets
predictors_all_long$Variable <- factor(predictors_all_long$Variable, 
                                       levels = c("ClayMedian","ThetasMedian","WoodyCover","WoodyHeight","FlowlineLength",
                                         "MainChannelSlope","ValleyBottWidth","UPA","CTImax","SPImedian",
                                         "STImedian","SummerTempMax","SpringPrecip","AnnualPrecip","SWEMarchMax","DamLength"))

# remove cluster ID
names(merged_pond_var)

##  [1] "ClusterID"        "ClayMedian"       "ThetasMedian"     "WoodyCover"      
##  [5] "WoodyHeight"      "FlowlineLength"   "MainChannelSlope" "ValleyBottWidth" 
##  [9] "UPA"              "CTImax"           "SPImedian"        "STImedian"       
## [13] "SummerTempMax"    "SpringPrecip"     "AnnualPrecip"     "SWEMarchMax"     
## [17] "DamLength"        "PondArea"

merged_pond_allx_y <- merged_pond_var[,c("ClayMedian","ThetasMedian","WoodyCover","WoodyHeight","FlowlineLength",
                                         "MainChannelSlope","ValleyBottWidth","UPA","CTImax","SPImedian",
                                         "STImedian","SummerTempMax","SpringPrecip","AnnualPrecip","SWEMarchMax","DamLength","PondArea")]

2. VIF (Variance Inflation Factor)

2.1 VIF, all < 10

necessary for both linear and non-linear models when your goal is to explore which factors are impacting ponding size and interpret individual predictors.

# VIF is less critical if goal is pure prediction 
names(merged_pond_allx_y)

##  [1] "ClayMedian"       "ThetasMedian"     "WoodyCover"       "WoodyHeight"     
##  [5] "FlowlineLength"   "MainChannelSlope" "ValleyBottWidth"  "UPA"             
##  [9] "CTImax"           "SPImedian"        "STImedian"        "SummerTempMax"   
## [13] "SpringPrecip"     "AnnualPrecip"     "SWEMarchMax"      "DamLength"       
## [17] "PondArea"

merged_pond_allx_y$log_PondArea <- log(merged_pond_allx_y$PondArea) # natural logarithms
names(merged_pond_allx_y)

##  [1] "ClayMedian"       "ThetasMedian"     "WoodyCover"       "WoodyHeight"     
##  [5] "FlowlineLength"   "MainChannelSlope" "ValleyBottWidth"  "UPA"             
##  [9] "CTImax"           "SPImedian"        "STImedian"        "SummerTempMax"   
## [13] "SpringPrecip"     "AnnualPrecip"     "SWEMarchMax"      "DamLength"       
## [17] "PondArea"         "log_PondArea"

lm_model <- lm(PondArea ~ ., data = merged_pond_allx_y)
vif_values <- vif(lm_model)
vif_values_df <- as.data.frame(vif_values)
print(vif_values)

##       ClayMedian     ThetasMedian       WoodyCover      WoodyHeight 
##         3.781273         4.126833         2.260386         2.624755 
##   FlowlineLength MainChannelSlope  ValleyBottWidth              UPA 
##         2.556603         2.818023         2.343186         3.473111 
##           CTImax        SPImedian        STImedian    SummerTempMax 
##         5.838891         1.426137         1.657281         5.337581 
##     SpringPrecip     AnnualPrecip      SWEMarchMax        DamLength 
##         8.122911         4.009046         5.798889         4.033476 
##     log_PondArea 
##         2.704626

# unselect pond area as we use log of pond area
merged_pond_allx_logY <- merged_pond_allx_y[,c("ClayMedian","ThetasMedian","WoodyCover","WoodyHeight","FlowlineLength",
                                         "MainChannelSlope","ValleyBottWidth","UPA","CTImax","SPImedian",
                                         "STImedian","SummerTempMax","SpringPrecip","AnnualPrecip","SWEMarchMax","DamLength","log_PondArea")]
lm_model_log <- lm(log_PondArea ~ ., data = merged_pond_allx_logY)
vif_values_log <- vif(lm_model_log)
vif_values_log <- as.data.frame(vif_values_log)

3. normalization

3.1 min-max normalization (0-100) - allow more intuitive comparision about which factors is more important

# Use scaled predictors if their magnitudes differ significantly.
# Define the min-max normalization function to scale data to 0-100
normalize_0_100 <- function(x) {
  return ((x - min(x)) / (max(x) - min(x)) * 100)
}

# Separate the response variable and predictors
names(merged_pond_allx_logY)

##  [1] "ClayMedian"       "ThetasMedian"     "WoodyCover"       "WoodyHeight"     
##  [5] "FlowlineLength"   "MainChannelSlope" "ValleyBottWidth"  "UPA"             
##  [9] "CTImax"           "SPImedian"        "STImedian"        "SummerTempMax"   
## [13] "SpringPrecip"     "AnnualPrecip"     "SWEMarchMax"      "DamLength"       
## [17] "log_PondArea"

response <- merged_pond_allx_logY$log_PondArea
predictors <- merged_pond_allx_logY[, !(names(merged_pond_allx_logY) %in% "log_PondArea")]
names(predictors)

##  [1] "ClayMedian"       "ThetasMedian"     "WoodyCover"       "WoodyHeight"     
##  [5] "FlowlineLength"   "MainChannelSlope" "ValleyBottWidth"  "UPA"             
##  [9] "CTImax"           "SPImedian"        "STImedian"        "SummerTempMax"   
## [13] "SpringPrecip"     "AnnualPrecip"     "SWEMarchMax"      "DamLength"

# Apply the min-max normalization to scale the predictors to 0-100
scaled_predictors_0_100 <- as.data.frame(lapply(predictors, normalize_0_100))

# Combine the scaled predictors with the original response variable
merged_pond_allx_logY_scaled_0_100 <- cbind(scaled_predictors_0_100, log_PondArea = response)

# View the resulting dataframe
names(merged_pond_allx_logY_scaled_0_100)

##  [1] "ClayMedian"       "ThetasMedian"     "WoodyCover"       "WoodyHeight"     
##  [5] "FlowlineLength"   "MainChannelSlope" "ValleyBottWidth"  "UPA"             
##  [9] "CTImax"           "SPImedian"        "STImedian"        "SummerTempMax"   
## [13] "SpringPrecip"     "AnnualPrecip"     "SWEMarchMax"      "DamLength"       
## [17] "log_PondArea"

head(merged_pond_allx_logY_scaled_0_100)

##   ClayMedian ThetasMedian WoodyCover WoodyHeight FlowlineLength
## 1   34.29783     2.701224   99.90321    79.06769       9.198218
## 2   32.88167     2.249796  100.00000    86.23323      21.333077
## 3   37.03081     9.547493   83.75932    65.58021      23.992004
## 4   37.04036    12.977138   94.52426    66.71171     100.000000
## 5   32.35430     2.172239   80.85986    72.89917       9.035877
## 6   33.89057     1.546819   90.69703    84.92204      22.891123
##   MainChannelSlope ValleyBottWidth        UPA   CTImax  SPImedian STImedian
## 1         30.35212        55.02588 3.16497634 53.28812 2.64401671  35.14212
## 2         39.09153        72.72291 3.16360280 51.37204 7.03225968  52.02101
## 3         14.40434        66.96410 3.17184406 70.44844 5.26346020  22.99742
## 4          5.73680        91.06715 6.51495481 80.24080 0.00000000   7.49354
## 5         18.14229        13.49863 0.01861389 33.21678 0.02457321  12.91990
## 6         24.54358        62.26085 4.60572900 39.10227 7.44632979  36.69251
##   SummerTempMax SpringPrecip AnnualPrecip SWEMarchMax DamLength log_PondArea
## 1      59.58284     32.64623     56.61523    89.77432 22.231690     8.826174
## 2      57.53816     35.32843     62.07297   100.00000 63.220050     9.825994
## 3      59.83798     32.16254     54.84491    84.37797 34.217803     8.766188
## 4      62.80078     28.75636     47.88093    85.15618 86.887273    10.599311
## 5      63.31363     23.27517     44.70349    87.26372  9.047785     8.542570
## 6      69.07890     15.14048     30.00758    87.69873 18.605815     7.899021

# for convenience 
pondFinal <- merged_pond_allx_logY_scaled_0_100

sum(is.na(pondFinal))  # Should be 0 after na.omit if previously handled

## [1] 0

4. Pond Generalized Additive Models (GAM)

4.1 Model 1a - full pond model

# Fit the refined GAM without the interaction term
names(pondFinal)

##  [1] "ClayMedian"       "ThetasMedian"     "WoodyCover"       "WoodyHeight"     
##  [5] "FlowlineLength"   "MainChannelSlope" "ValleyBottWidth"  "UPA"             
##  [9] "CTImax"           "SPImedian"        "STImedian"        "SummerTempMax"   
## [13] "SpringPrecip"     "AnnualPrecip"     "SWEMarchMax"      "DamLength"       
## [17] "log_PondArea"

gam_pond <- gam(
  log_PondArea ~ 
    s(ClayMedian) + 
    s(ThetasMedian) + 
    s(WoodyCover) + 
    s(WoodyHeight) + 
    s(FlowlineLength) + 
    s(MainChannelSlope)+
    s(ValleyBottWidth) + 
    s(UPA) + 
    s(CTImax) + 
    s(SPImedian) + 
    s(STImedian) + 
    s(SummerTempMax) + 
    s(SpringPrecip) + 
    s(AnnualPrecip) + 
    s(SWEMarchMax) + 
    s(DamLength),
  data = pondFinal,
  method = "REML"
)

# Summarize the refined model
summary(gam_pond)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log_PondArea ~ s(ClayMedian) + s(ThetasMedian) + s(WoodyCover) + 
##     s(WoodyHeight) + s(FlowlineLength) + s(MainChannelSlope) + 
##     s(ValleyBottWidth) + s(UPA) + s(CTImax) + s(SPImedian) + 
##     s(STImedian) + s(SummerTempMax) + s(SpringPrecip) + s(AnnualPrecip) + 
##     s(SWEMarchMax) + s(DamLength)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.08456    0.05662   142.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                       edf Ref.df      F  p-value    
## s(ClayMedian)       1.000  1.000  0.743 0.392162    
## s(ThetasMedian)     1.455  1.797  0.147 0.844672    
## s(WoodyCover)       2.586  3.229  0.880 0.538247    
## s(WoodyHeight)      6.430  7.538  4.407 0.000946 ***
## s(FlowlineLength)   1.000  1.000  0.001 0.976094    
## s(MainChannelSlope) 2.166  2.607  0.489 0.647521    
## s(ValleyBottWidth)  1.000  1.000  0.617 0.435226    
## s(UPA)              1.000  1.000  0.280 0.599009    
## s(CTImax)           1.061  1.114  0.149 0.739770    
## s(SPImedian)        1.000  1.000  4.256 0.043593 *  
## s(STImedian)        1.000  1.000  0.183 0.670679    
## s(SummerTempMax)    1.000  1.000  0.861 0.357175    
## s(SpringPrecip)     1.000  1.000  1.549 0.218312    
## s(AnnualPrecip)     1.000  1.000  0.243 0.623891    
## s(SWEMarchMax)      1.000  1.000  2.001 0.162541    
## s(DamLength)        4.547  5.440 16.891  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.807   Deviance explained =   87%
## -REML = 102.89  Scale est. = 0.2789    n = 87

4.2 model 1b - simplified pond model

# Simplifying a GAM by retaining only the significant factors helps improve interpretability while maintaining predictive power. 
# Refit the GAM with significant predictors
gam_pond_simplified <- gam(
  log_PondArea ~ 
    s(WoodyHeight) + 
    s(DamLength) + 
    s(SPImedian),
  data = pondFinal,
  method = "REML"
)

# Summarize the refined model
summary(gam_pond_simplified)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log_PondArea ~ s(WoodyHeight) + s(DamLength) + s(SPImedian)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.08456    0.06555   123.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                  edf Ref.df      F  p-value    
## s(WoodyHeight) 6.299  7.424  4.370 0.000361 ***
## s(DamLength)   4.950  5.944 28.903  < 2e-16 ***
## s(SPImedian)   2.280  2.705  4.408 0.012177 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.741   Deviance explained = 78.1%
## -REML = 100.37  Scale est. = 0.37381   n = 87

# plot(gam_pond_simplified, pages = 1, se = TRUE)

AIC(gam_pond, gam_pond_simplified)

##                           df      AIC
## gam_pond            33.72506 167.6106
## gam_pond_simplified 17.47484 180.3388

# Plotting all smooth terms and residuals
# par(mfrow = c(2, 2))
# plot(gam_pond_simplified, pages = 1, shade = TRUE, seWithMean = TRUE)

# gam.check(gam_pond_simplified)
# Your model has successfully converged using the REML method with the Outer Newton optimizer. The small gradient values and positive definite Hessian indicate a stable and reliable solution. The full model rank confirms that all predictors contribute uniquely without redundancy.

predicted <- predict(gam_pond_simplified, newdata = pondFinal, type = "response")
# plot(gam_pond_simplified, pages = 1, se = TRUE)

4.2.0 importance order

# Extract the summary from the model
gam_pond_simplified_summary <- summary(gam_pond_simplified)

# Extract the relevant data for each smooth term
edf <- gam_pond_simplified_summary$s.table[, "edf"]
f_value <- gam_pond_simplified_summary$s.table[, "F"]
p_value <- gam_pond_simplified_summary$s.table[, "p-value"]

# Generate significance levels based on p-values
significance <- ifelse(p_value < 0.001, "***", 
                       ifelse(p_value < 0.01, "**", 
                              ifelse(p_value < 0.05, "*", 
                                     ifelse(p_value < 0.1, ".", " "))))

# Create a data frame with predictor names, EDF, F-values, and significance levels
importance_df_pond  <- data.frame(
  Predictor = rownames(gam_pond_simplified_summary$s.table),
  F_value = f_value,
  EDF = edf,
  Significance = significance
)

pond_order <- ggplot(importance_df_pond, aes(x = reorder(Predictor, F_value), y = F_value)) +
  geom_bar(stat = "identity", fill = "#0070FF", width = 0.5) +
  coord_flip() +
  geom_text(aes(label = paste("EDF =", round(EDF, 2), Significance)), hjust = -0.05, size = 3) +
  labs(title = "(a) Factor contributions to log-transformed pond area", 
       y = "F-value") +
  theme_bw(base_size = 12) +
  scale_y_continuous(breaks = seq(0, 30, by = 5),  # Ticks every 5 units
                     expand = expansion(mult = c(0, 0.2))) +  # Extra space for labels
  theme(plot.title = element_text(hjust = 0.5),
        axis.title.y = element_blank())
# pond_order

4.3 model 1c - reduced pond model

names(pondFinal)

##  [1] "ClayMedian"       "ThetasMedian"     "WoodyCover"       "WoodyHeight"     
##  [5] "FlowlineLength"   "MainChannelSlope" "ValleyBottWidth"  "UPA"             
##  [9] "CTImax"           "SPImedian"        "STImedian"        "SummerTempMax"   
## [13] "SpringPrecip"     "AnnualPrecip"     "SWEMarchMax"      "DamLength"       
## [17] "log_PondArea"

# Fit the refined GAM without the interaction term
gam_model_noDam <- gam(
  log_PondArea ~ 
    s(ClayMedian) + 
    s(ThetasMedian) + 
    s(WoodyCover) + 
    s(WoodyHeight) + 
    s(FlowlineLength) + 
    s(MainChannelSlope) + 
    s(ValleyBottWidth) + 
    s(UPA) + 
    s(CTImax) + 
    s(SPImedian) + 
    s(STImedian) + 
    s(SummerTempMax) + 
    s(SpringPrecip) + 
    s(AnnualPrecip) + 
    s(SWEMarchMax),
  data = pondFinal,
  method = "REML"
)

# Summarize the refined model
summary(gam_model_noDam)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log_PondArea ~ s(ClayMedian) + s(ThetasMedian) + s(WoodyCover) + 
##     s(WoodyHeight) + s(FlowlineLength) + s(MainChannelSlope) + 
##     s(ValleyBottWidth) + s(UPA) + s(CTImax) + s(SPImedian) + 
##     s(STImedian) + s(SummerTempMax) + s(SpringPrecip) + s(AnnualPrecip) + 
##     s(SWEMarchMax)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.08456    0.08122   99.54   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                       edf Ref.df      F  p-value    
## s(ClayMedian)       1.000  1.000  1.266    0.265    
## s(ThetasMedian)     1.575  1.963  0.275    0.752    
## s(WoodyCover)       1.720  2.160  0.318    0.660    
## s(WoodyHeight)      1.000  1.000  2.051    0.157    
## s(FlowlineLength)   2.457  3.015 14.765 2.41e-07 ***
## s(MainChannelSlope) 2.202  2.654  0.503    0.732    
## s(ValleyBottWidth)  1.000  1.000  0.900    0.347    
## s(UPA)              2.125  2.624  1.509    0.304    
## s(CTImax)           1.000  1.000  0.691    0.409    
## s(SPImedian)        1.496  1.802  0.080    0.924    
## s(STImedian)        4.151  5.091  1.503    0.235    
## s(SummerTempMax)    1.000  1.000  0.009    0.925    
## s(SpringPrecip)     1.000  1.000  0.003    0.954    
## s(AnnualPrecip)     1.000  1.000  0.773    0.383    
## s(SWEMarchMax)      2.212  2.711  0.713    0.641    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.602   Deviance explained = 71.7%
## -REML = 121.48  Scale est. = 0.57392   n = 87

5. Dam GAM

# log dam length 
names(merged_pond_allx_y)

##  [1] "ClayMedian"       "ThetasMedian"     "WoodyCover"       "WoodyHeight"     
##  [5] "FlowlineLength"   "MainChannelSlope" "ValleyBottWidth"  "UPA"             
##  [9] "CTImax"           "SPImedian"        "STImedian"        "SummerTempMax"   
## [13] "SpringPrecip"     "AnnualPrecip"     "SWEMarchMax"      "DamLength"       
## [17] "PondArea"         "log_PondArea"

# Calculate skewness for DamLenMcluster
skewness_value <- skewness(merged_pond_allx_y$DamLength, na.rm = TRUE)
print(paste("Skewness of Dam Length: ", skewness_value))

## [1] "Skewness of Dam Length:  3.02584821891829"

merged_pond_allx_y$log_DamLength <- log(merged_pond_allx_y$DamLength)

# Separate the response variable and predictors
response <- merged_pond_allx_y$log_DamLength
names(merged_pond_allx_y)

##  [1] "ClayMedian"       "ThetasMedian"     "WoodyCover"       "WoodyHeight"     
##  [5] "FlowlineLength"   "MainChannelSlope" "ValleyBottWidth"  "UPA"             
##  [9] "CTImax"           "SPImedian"        "STImedian"        "SummerTempMax"   
## [13] "SpringPrecip"     "AnnualPrecip"     "SWEMarchMax"      "DamLength"       
## [17] "PondArea"         "log_PondArea"     "log_DamLength"

predictors <- merged_pond_allx_y[, !(names(merged_pond_allx_y) %in% c("DamLength","log_DamLength",
                                                                      "PondArea","log_PondArea"))]
names(predictors)

##  [1] "ClayMedian"       "ThetasMedian"     "WoodyCover"       "WoodyHeight"     
##  [5] "FlowlineLength"   "MainChannelSlope" "ValleyBottWidth"  "UPA"             
##  [9] "CTImax"           "SPImedian"        "STImedian"        "SummerTempMax"   
## [13] "SpringPrecip"     "AnnualPrecip"     "SWEMarchMax"

# Apply the min-max normalization to scale the predictors to 0-100
scaled_predictors_0_100 <- as.data.frame(lapply(predictors, normalize_0_100))

# Combine the scaled predictors with the original response variable
merged_pond_allx_logY_scaled_0_100 <- cbind(scaled_predictors_0_100, log_DamLength = response)

# View the resulting dataframe
head(merged_pond_allx_logY_scaled_0_100)

##   ClayMedian ThetasMedian WoodyCover WoodyHeight FlowlineLength
## 1   34.29783     2.701224   99.90321    79.06769       9.198218
## 2   32.88167     2.249796  100.00000    86.23323      21.333077
## 3   37.03081     9.547493   83.75932    65.58021      23.992004
## 4   37.04036    12.977138   94.52426    66.71171     100.000000
## 5   32.35430     2.172239   80.85986    72.89917       9.035877
## 6   33.89057     1.546819   90.69703    84.92204      22.891123
##   MainChannelSlope ValleyBottWidth        UPA   CTImax  SPImedian STImedian
## 1         30.35212        55.02588 3.16497634 53.28812 2.64401671  35.14212
## 2         39.09153        72.72291 3.16360280 51.37204 7.03225968  52.02101
## 3         14.40434        66.96410 3.17184406 70.44844 5.26346020  22.99742
## 4          5.73680        91.06715 6.51495481 80.24080 0.00000000   7.49354
## 5         18.14229        13.49863 0.01861389 33.21678 0.02457321  12.91990
## 6         24.54358        62.26085 4.60572900 39.10227 7.44632979  36.69251
##   SummerTempMax SpringPrecip AnnualPrecip SWEMarchMax log_DamLength
## 1      59.58284     32.64623     56.61523    89.77432      6.964924
## 2      57.53816     35.32843     62.07297   100.00000      7.982165
## 3      59.83798     32.16254     54.84491    84.37797      7.381195
## 4      62.80078     28.75636     47.88093    85.15618      8.295972
## 5      63.31363     23.27517     44.70349    87.26372      6.125848
## 6      69.07890     15.14048     30.00758    87.69873      6.795105

  # # Linear regression using scaled predictors
  # model_normalize <- lm(log_DamLength ~ ., data = merged_pond_allx_logY_scaled_0_100)
  # 
  # # Summary of the model
  # summary(model_normalize)

# Fit the GAM with log-transformed dam length
names(merged_pond_allx_logY_scaled_0_100)

##  [1] "ClayMedian"       "ThetasMedian"     "WoodyCover"       "WoodyHeight"     
##  [5] "FlowlineLength"   "MainChannelSlope" "ValleyBottWidth"  "UPA"             
##  [9] "CTImax"           "SPImedian"        "STImedian"        "SummerTempMax"   
## [13] "SpringPrecip"     "AnnualPrecip"     "SWEMarchMax"      "log_DamLength"

# for convenience 
damFinal <- merged_pond_allx_logY_scaled_0_100

sum(is.na(damFinal))  # Should be 0 after na.omit if previously handled

## [1] 0

5.1 model2a - full dam gam

# linear 
# Linear regression using scaled predictors
lm_dam <- lm(log_DamLength ~ ., data = damFinal)

# Summary of the model
summary(lm_dam)

## 
## Call:
## lm(formula = log_DamLength ~ ., data = damFinal)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.48972 -0.36130  0.00599  0.39598  1.12182 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       4.3909802  1.0653344   4.122 0.000101 ***
## ClayMedian       -0.0133187  0.0051462  -2.588 0.011697 *  
## ThetasMedian     -0.0153722  0.0054834  -2.803 0.006515 ** 
## WoodyCover       -0.0010211  0.0041070  -0.249 0.804359    
## WoodyHeight       0.0058499  0.0043056   1.359 0.178556    
## FlowlineLength    0.0307176  0.0042019   7.310 3.16e-10 ***
## MainChannelSlope  0.0169841  0.0075195   2.259 0.026977 *  
## ValleyBottWidth   0.0149132  0.0038903   3.833 0.000271 ***
## UPA              -0.0111440  0.0062542  -1.782 0.079050 .  
## CTImax            0.0058328  0.0087375   0.668 0.506580    
## SPImedian        -0.0020626  0.0053996  -0.382 0.703614    
## STImedian         0.0030488  0.0043080   0.708 0.481449    
## SummerTempMax     0.0126297  0.0076264   1.656 0.102123    
## SpringPrecip     -0.0009297  0.0074973  -0.124 0.901661    
## AnnualPrecip      0.0076791  0.0066095   1.162 0.249196    
## SWEMarchMax      -0.0074151  0.0051926  -1.428 0.157669    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5986 on 71 degrees of freedom
## Multiple R-squared:  0.6964, Adjusted R-squared:  0.6323 
## F-statistic: 10.86 on 15 and 71 DF,  p-value: 5.218e-13

# gam 
gam_dam <- gam(
  log_DamLength ~ 
    s(ClayMedian) + 
    s(ThetasMedian) + 
    s(WoodyCover) + 
    s(WoodyHeight) + 
    s(FlowlineLength) + 
    s(MainChannelSlope) + 
    s(ValleyBottWidth) + 
    s(UPA) + 
    s(CTImax) + 
    s(SPImedian) + 
    s(STImedian) + 
    s(SummerTempMax) + 
    s(SpringPrecip) + 
    s(AnnualPrecip) + 
    s(SWEMarchMax),
  data = damFinal,
  method = "REML"
)

# Summarize the transformed model
summary(gam_dam)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log_DamLength ~ s(ClayMedian) + s(ThetasMedian) + s(WoodyCover) + 
##     s(WoodyHeight) + s(FlowlineLength) + s(MainChannelSlope) + 
##     s(ValleyBottWidth) + s(UPA) + s(CTImax) + s(SPImedian) + 
##     s(STImedian) + s(SummerTempMax) + s(SpringPrecip) + s(AnnualPrecip) + 
##     s(SWEMarchMax)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    5.925      0.046   128.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                       edf Ref.df      F  p-value    
## s(ClayMedian)       2.738  3.427  4.364 0.007836 ** 
## s(ThetasMedian)     2.262  2.800  0.231 0.888512    
## s(WoodyCover)       1.000  1.000  0.187 0.666910    
## s(WoodyHeight)      1.000  1.000  0.918 0.341852    
## s(FlowlineLength)   3.017  3.676 19.339  < 2e-16 ***
## s(MainChannelSlope) 1.833  2.206  4.743 0.016170 *  
## s(ValleyBottWidth)  1.000  1.000  7.688 0.007461 ** 
## s(UPA)              3.272  3.947  6.577 0.000321 ***
## s(CTImax)           1.000  1.000  6.057 0.016850 *  
## s(SPImedian)        1.000  1.000  1.604 0.210385    
## s(STImedian)        4.674  5.667  2.483 0.068117 .  
## s(SummerTempMax)    1.000  1.000  1.902 0.173198    
## s(SpringPrecip)     1.096  1.176  0.804 0.340079    
## s(AnnualPrecip)     1.000  1.001 10.210 0.002256 ** 
## s(SWEMarchMax)      1.739  2.117  0.238 0.842493    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.811   Deviance explained = 87.2%
## -REML = 84.924  Scale est. = 0.18407   n = 87

# AIC and BIC
AIC(lm_dam, gam_dam) # gam should be chosen

##               df      AIC
## lm_dam  17.00000 173.9276
## gam_dam 34.01632 132.9637

BIC(lm_dam, gam_dam) # difference is 1 BIC has a stronger emphasis on simplicity.

##               df      BIC
## lm_dam  17.00000 215.8481
## gam_dam 34.01632 216.8448

5.2 model2b - simplified dam gam

# Fit a Generalized Additive Model (GAM) with log-transformed dam length
# using significant predictors
gam_dam_simplified <- gam(
  log_DamLength ~ 
    s(ClayMedian) + 
    s(FlowlineLength) + 
    s(MainChannelSlope) + 
    s(ValleyBottWidth) + 
    s(UPA) + 
    s(CTImax) + 
    s(AnnualPrecip),
  data = damFinal,
  method = "REML"
)

# Summarize the transformed model
summary(gam_dam_simplified)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log_DamLength ~ s(ClayMedian) + s(FlowlineLength) + s(MainChannelSlope) + 
##     s(ValleyBottWidth) + s(UPA) + s(CTImax) + s(AnnualPrecip)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.92498    0.05242     113   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                       edf Ref.df      F  p-value    
## s(ClayMedian)       2.497  3.127  3.259 0.025323 *  
## s(FlowlineLength)   2.921  3.591 25.520  < 2e-16 ***
## s(MainChannelSlope) 1.000  1.000  6.384 0.013748 *  
## s(ValleyBottWidth)  1.002  1.004 25.321 3.72e-06 ***
## s(UPA)              3.492  4.250  6.342 0.000153 ***
## s(CTImax)           1.045  1.085  4.897 0.031073 *  
## s(AnnualPrecip)     2.961  3.685  4.687 0.002589 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.755   Deviance explained = 79.7%
## -REML = 80.155  Scale est. = 0.2391    n = 87

5.3 model 2c- interactive dam GAM

5.3.1 test

model_data <- model.frame(gam_dam_simplified)
summary(gam_dam_simplified)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log_DamLength ~ s(ClayMedian) + s(FlowlineLength) + s(MainChannelSlope) + 
##     s(ValleyBottWidth) + s(UPA) + s(CTImax) + s(AnnualPrecip)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.92498    0.05242     113   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                       edf Ref.df      F  p-value    
## s(ClayMedian)       2.497  3.127  3.259 0.025323 *  
## s(FlowlineLength)   2.921  3.591 25.520  < 2e-16 ***
## s(MainChannelSlope) 1.000  1.000  6.384 0.013748 *  
## s(ValleyBottWidth)  1.002  1.004 25.321 3.72e-06 ***
## s(UPA)              3.492  4.250  6.342 0.000153 ***
## s(CTImax)           1.045  1.085  4.897 0.031073 *  
## s(AnnualPrecip)     2.961  3.685  4.687 0.002589 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.755   Deviance explained = 79.7%
## -REML = 80.155  Scale est. = 0.2391    n = 87

gam_test <- gam(
  log_DamLength ~ 
    s(ClayMedian) + 
    s(FlowlineLength) + 
    ti(ClayMedian,FlowlineLength),
  family = gaussian(), 
  method = "REML", 
  data = damFinal
)

# Print model summary
summary(gam_test)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log_DamLength ~ s(ClayMedian) + s(FlowlineLength) + ti(ClayMedian, 
##     FlowlineLength)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.92028    0.08048   73.56   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                 edf Ref.df      F p-value    
## s(ClayMedian)                 2.603  3.271  3.484  0.0168 *  
## s(FlowlineLength)             1.000  1.001 44.288  <2e-16 ***
## ti(ClayMedian,FlowlineLength) 1.001  1.001  2.275  0.1352    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.423   Deviance explained = 45.4%
## -REML = 101.53  Scale est. = 0.56269   n = 87

# Print all terms in s.table
print(rownames(summary(gam_test)$s.table))

## [1] "s(ClayMedian)"                 "s(FlowlineLength)"            
## [3] "ti(ClayMedian,FlowlineLength)"

# Extract interaction term values manually
interaction_index <- which(rownames(summary(gam_test)$s.table) == "ti(ClayMedian,FlowlineLength)")

if (length(interaction_index) > 0) {
  interaction_row <- summary(gam_test)$s.table[interaction_index, ]
  print(interaction_row)  # Check the actual values for edf, F, and p-value
} else {
  print("Interaction term not found in model summary.")
}

##       edf    Ref.df         F   p-value 
## 1.0005592 1.0011051 2.2748559 0.1352136

5.3.2 Loop Through Interactions

5.3.3 gam final with significant (p_value < 0.1) interactions

gam_final <- gam(
  log_DamLength ~ 
    s(ClayMedian) + 
    s(FlowlineLength) + 
    s(MainChannelSlope) + 
    s(ValleyBottWidth) + 
    s(UPA) + 
    s(CTImax) + 
    s(AnnualPrecip) + 
    ti(ClayMedian,UPA) + 
    ti(ClayMedian,AnnualPrecip) + 
    ti(FlowlineLength,MainChannelSlope) + 
    ti(ValleyBottWidth,CTImax),
  family = gaussian(), 
  method = "REML", 
  data = damFinal
)

# Print summary
summary(gam_final)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log_DamLength ~ s(ClayMedian) + s(FlowlineLength) + s(MainChannelSlope) + 
##     s(ValleyBottWidth) + s(UPA) + s(CTImax) + s(AnnualPrecip) + 
##     ti(ClayMedian, UPA) + ti(ClayMedian, AnnualPrecip) + ti(FlowlineLength, 
##     MainChannelSlope) + ti(ValleyBottWidth, CTImax)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.99962    0.08532   70.31   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                       edf Ref.df      F  p-value    
## s(ClayMedian)                       1.000  1.000 12.236 0.000873 ***
## s(FlowlineLength)                   2.467  2.985 17.975  < 2e-16 ***
## s(MainChannelSlope)                 1.000  1.000  2.502 0.118807    
## s(ValleyBottWidth)                  1.037  1.065 39.333  < 2e-16 ***
## s(UPA)                              3.113  3.803  4.640 0.001536 ** 
## s(CTImax)                           1.000  1.000  2.228 0.140635    
## s(AnnualPrecip)                     1.000  1.000 10.793 0.001679 ** 
## ti(ClayMedian,UPA)                  1.000  1.000  0.703 0.405031    
## ti(ClayMedian,AnnualPrecip)         5.536  7.265  2.373 0.026231 *  
## ti(FlowlineLength,MainChannelSlope) 1.000  1.000  0.029 0.864590    
## ti(ValleyBottWidth,CTImax)          6.001  7.883  1.655 0.125327    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.805   Deviance explained =   86%
## -REML = 68.845  Scale est. = 0.19033   n = 87

# ti(ClayMedian, UPA) and ti(FlowlineLength, MainChannelSlope) are NOT significant, could be removed 
# Keep the simplified interaction model (gam_final_refined), which includes only significant interactions (ti(ClayMedian, AnnualPrecip) and ti(ValleyBottWidth, CTImax)).

interactive_dam_gam <- gam(
  log_DamLength ~ 
    s(ClayMedian) + 
    s(FlowlineLength) + 
    s(MainChannelSlope) + 
    s(ValleyBottWidth) + 
    s(UPA) + 
    s(CTImax) + 
    s(AnnualPrecip) + 
    ti(ClayMedian,AnnualPrecip) + 
    ti(ValleyBottWidth,CTImax),
  family = gaussian(), 
  method = "REML", 
  data = damFinal
)

summary(interactive_dam_gam)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log_DamLength ~ s(ClayMedian) + s(FlowlineLength) + s(MainChannelSlope) + 
##     s(ValleyBottWidth) + s(UPA) + s(CTImax) + s(AnnualPrecip) + 
##     ti(ClayMedian, AnnualPrecip) + ti(ValleyBottWidth, CTImax)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.97185    0.06602   90.46   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                               edf Ref.df      F  p-value    
## s(ClayMedian)               1.099  1.165  9.200 0.002164 ** 
## s(FlowlineLength)           2.460  2.988 38.879  < 2e-16 ***
## s(MainChannelSlope)         1.000  1.000  7.739 0.007091 ** 
## s(ValleyBottWidth)          1.478  1.761 25.023 2.73e-07 ***
## s(UPA)                      3.101  3.748  8.008 0.000262 ***
## s(CTImax)                   1.000  1.000  2.439 0.123258    
## s(AnnualPrecip)             1.000  1.000 10.608 0.001803 ** 
## ti(ClayMedian,AnnualPrecip) 5.679  7.390  2.792 0.010731 *  
## ti(ValleyBottWidth,CTImax)  5.509  7.273  1.921 0.065889 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.805   Deviance explained = 85.6%
## -REML = 73.207  Scale est. = 0.18974   n = 87

5.3.7 model2c - interactive dam gam - dam order

# Summarize the transformed model
summary(interactive_dam_gam)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log_DamLength ~ s(ClayMedian) + s(FlowlineLength) + s(MainChannelSlope) + 
##     s(ValleyBottWidth) + s(UPA) + s(CTImax) + s(AnnualPrecip) + 
##     ti(ClayMedian, AnnualPrecip) + ti(ValleyBottWidth, CTImax)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.97185    0.06602   90.46   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                               edf Ref.df      F  p-value    
## s(ClayMedian)               1.099  1.165  9.200 0.002164 ** 
## s(FlowlineLength)           2.460  2.988 38.879  < 2e-16 ***
## s(MainChannelSlope)         1.000  1.000  7.739 0.007091 ** 
## s(ValleyBottWidth)          1.478  1.761 25.023 2.73e-07 ***
## s(UPA)                      3.101  3.748  8.008 0.000262 ***
## s(CTImax)                   1.000  1.000  2.439 0.123258    
## s(AnnualPrecip)             1.000  1.000 10.608 0.001803 ** 
## ti(ClayMedian,AnnualPrecip) 5.679  7.390  2.792 0.010731 *  
## ti(ValleyBottWidth,CTImax)  5.509  7.273  1.921 0.065889 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.805   Deviance explained = 85.6%
## -REML = 73.207  Scale est. = 0.18974   n = 87

# Extract the summary from the model
gam_summary <- summary(interactive_dam_gam)

# Extract the relevant data for each smooth term
edf <- gam_summary$s.table[, "edf"]
f_value <- gam_summary$s.table[, "F"]
p_value <- gam_summary$s.table[, "p-value"]

# Generate significance levels based on p-values
significance <- ifelse(p_value < 0.001, "***", 
                       ifelse(p_value < 0.01, "**", 
                              ifelse(p_value < 0.05, "*", 
                                     ifelse(p_value < 0.1, ".", " "))))

# Create a data frame with predictor names, EDF, F-values, and significance levels
importance_df <- data.frame(
  Predictor = rownames(gam_summary$s.table),
  F_value = f_value,
  EDF = edf,
  Significance = significance
)

# Print the importance dataframe for verification
print(importance_df)

##                                               Predictor   F_value      EDF
## s(ClayMedian)                             s(ClayMedian)  9.199565 1.099375
## s(FlowlineLength)                     s(FlowlineLength) 38.878674 2.459791
## s(MainChannelSlope)                 s(MainChannelSlope)  7.739055 1.000041
## s(ValleyBottWidth)                   s(ValleyBottWidth) 25.022960 1.478208
## s(UPA)                                           s(UPA)  8.007724 3.101110
## s(CTImax)                                     s(CTImax)  2.439481 1.000068
## s(AnnualPrecip)                         s(AnnualPrecip) 10.608375 1.000013
## ti(ClayMedian,AnnualPrecip) ti(ClayMedian,AnnualPrecip)  2.792059 5.679249
## ti(ValleyBottWidth,CTImax)   ti(ValleyBottWidth,CTImax)  1.920829 5.509033
##                             Significance
## s(ClayMedian)                         **
## s(FlowlineLength)                    ***
## s(MainChannelSlope)                   **
## s(ValleyBottWidth)                   ***
## s(UPA)                               ***
## s(CTImax)                               
## s(AnnualPrecip)                       **
## ti(ClayMedian,AnnualPrecip)            *
## ti(ValleyBottWidth,CTImax)             .

# Plot predictor importance from highest to lowest, with significance levels and EDF
dam_order <- ggplot(importance_df, aes(x = reorder(Predictor, F_value), y = F_value)) +
  geom_bar(stat = "identity", fill = "#ff8f00",width = 0.5) +
  coord_flip() +
  geom_text(aes(label = paste("EDF =", round(EDF, 2), Significance)), hjust = -0.05, size = 3) +
  labs(title = "(b) Factor contributions to log-transformed dam length", 
     # x = "Predictor", 
     y = "F-value") +
  theme_bw(base_size = 12) +
  scale_y_continuous(breaks = seq(0, 40, by = 5),  # Ticks every 5 units
                     expand = expansion(mult = c(0, 0.2))) +  # Extra space for labels
  theme(plot.title = element_text(hjust = 0.5),
        axis.title.y = element_blank())
# dam_order

4&5. merge pond and dam order

# pond order from summary(gam_pond_simplified)
# dam order from summary(interactive_dam_gam)

x_limit <- c(0, 35)  # Set a consistent range across both plots
# Aligning plots with discrete scales
pond_order_aligned <- pond_order + 
  coord_flip() +  # Flip coordinates for horizontal bars
  theme(
    plot.margin = unit(c(5, 5, 5, 5), "pt")  # Add consistent plot margins
  )

dam_order_aligned <- dam_order + 
  coord_flip() +  # Flip coordinates for horizontal bars
  theme(
    plot.margin = unit(c(5, 5, 5, 5), "pt")  # Add consistent plot margins
  )


combined_plot <- plot_grid(
  pond_order_aligned, 
  dam_order_aligned, 
  ncol = 1,                 # Arrange in a single column
  rel_heights = c(3, 7),    # Proportional heights
  align = "v"               # Align vertically
)

# Print or save the combined plot
print(combined_plot)

# Save the plot
# fname <- paste0(outfig, '/var_importance_update.png')
# fname <- paste0(outfig, '/var_importance_update.pdf')
# ggsave(filename = fname, plot = combined_plot, dpi = 600, width = 7, units = 'in')

Pond clusters

Luwen Wan

2024-06-18

1. Variable data checking

1.1 sixteen predictors distribution

2. VIF (Variance Inflation Factor)

2.1 VIF, all < 10

3. normalization

3.1 min-max normalization (0-100) - allow more intuitive comparision about which factors is more important

4. Pond Generalized Additive Models (GAM)

4.1 Model 1a - full pond model

4.2 model 1b - simplified pond model

4.2.0 importance order

4.3 model 1c - reduced pond model

5. Dam GAM

5.1 model2a - full dam gam

5.2 model2b - simplified dam gam

5.3 model 2c- interactive dam GAM

5.3.1 test

5.3.2 Loop Through Interactions

5.3.3 gam final with significant (p_value < 0.1) interactions

5.3.7 model2c - interactive dam gam - dam order

4&5. merge pond and dam order