Workshop given in preparation for the International Congress for Conservation Biology held mid-June in Brisbane 2025.

Workshop slides

Species Data

Install and import packages

# install.packages("galah")

library(dplyr)
library(purrr)
library(ggplot2)
library(galah)
library(terra)
library(sf)
library(predicts)
library(tidyterra)

To cite the galah package use the following code.

citation(package = "galah")

To cite galah in publications use:

  Westgate M, Kellie D, Stevenson M, Newman P (2025). _galah:
  Biodiversity Data from the GBIF Node Network_. R package version
  2.1.1, <https://CRAN.R-project.org/package=galah>.

A BibTeX entry for LaTeX users is

  @Manual{,
    title = {galah: Biodiversity Data from the GBIF Node Network},
    author = {Martin Westgate and Dax Kellie and Matilda Stevenson and Peggy Newman},
    year = {2025},
    note = {R package version 2.1.1},
    url = {https://CRAN.R-project.org/package=galah},
  }

Load South East Queensland (SEQ) boundary

We made this boundary in the ‘ICCB_Environmental_data.qmd’ script. Also load local government area polygons (LGAs) just for plotting.

SEQ_extent.vect <- vect("Data/Environmental_variables/SEQ_extent.shp")

# Define an sf object as well
SEQ_extent <- st_as_sf(SEQ_extent.vect, 
                        coords = c("x", "y"), 
                        crs = 3112)
                        


# Load the study area shapefile
LGA <- st_read("Data/Environmental_variables/Local_Government_Areas.shp")

Reading layer `Local_Government_Areas' from data source 
  `C:\Users\kimc7\OneDrive - Queensland University of Technology\Outreach\GeospatialShare\geocommunity_blog\posts\2025-04-16-species-distribution-modelling\Data\Environmental_variables\Local_Government_Areas.shp' 
  using driver `ESRI Shapefile'
Simple feature collection with 78 features and 8 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 137.9946 ymin: -29.17927 xmax: 153.5519 ymax: -9.087991
Geodetic CRS:  GDA94

# Check the coordinate reference system (CRS)
st_crs(LGA)

Coordinate Reference System:
  User input: GDA94 
  wkt:
GEOGCRS["GDA94",
    DATUM["Geocentric Datum of Australia 1994",
        ELLIPSOID["GRS 1980",6378137,298.257222101,
            LENGTHUNIT["metre",1]]],
    PRIMEM["Greenwich",0,
        ANGLEUNIT["degree",0.0174532925199433]],
    CS[ellipsoidal,2],
        AXIS["geodetic latitude (Lat)",north,
            ORDER[1],
            ANGLEUNIT["degree",0.0174532925199433]],
        AXIS["geodetic longitude (Lon)",east,
            ORDER[2],
            ANGLEUNIT["degree",0.0174532925199433]],
    USAGE[
        SCOPE["Horizontal component of 3D system."],
        AREA["Australia including Lord Howe Island, Macquarie Island, Ashmore and Cartier Islands, Christmas Island, Cocos (Keeling) Islands, Norfolk Island. All onshore and offshore."],
        BBOX[-60.55,93.41,-8.47,173.34]],
    ID["EPSG",4283]]

# Convert to WGS84
LGA <- LGA %>% st_transform(3112)

# Select local govt. areas for South East Queensland
LGA_SEQ <- LGA %>% 
  filter(lga %in% c("Brisbane City", 
                    "Moreton Bay City", 
                    "Logan City", 
                    "Ipswich City", 
                    "Redland City", 
                    "Scenic Rim Regional", 
                    "Somerset Regional", 
                    "Lockyer Valley Regional", 
                    "Gold Coast City", 
                    "Sunshine Coast Regional", 
                    "Toowoomba Regional", 
                    "Noosa Shire"))

Atlas of Living Australia using “galah” package

To access records from the ALA, you will need to be registered.

There are two registration options:

If you are affiliated with an Australian institution, you should be able to register for ALA through your institution. Find your institution in the list of institutions when you select ‘Login’ on the ALA website. Follow the prompts to enter your institution email and password. If your institution is not listed, you will need to register for an ALA account.
If you are not affiliated with an Australian institution, you will need to register for an ALA account. You can do this by selecting ‘Register’ on the ALA website. Follow the prompts to enter your email and password.

galah_config(atlas = "ALA",
             # username = "scott.forrest@hdr.qut.edu.au",
             email = "scott.forrest@hdr.qut.edu.au"
             # password = "Password"
             )

Download koala data from ALA

Usually you would check that the CRS for the occurrences is the same as the shapefile. However, we know that the `galah’ package operates in WGS84.

koala_occurrences <- galah_call() %>% 
  galah_identify("Phascolarctos cinereus") %>% 
  galah_filter(
    stateProvince == "Queensland",
    occurrenceStatus == "PRESENT"
  ) %>% 
  atlas_occurrences()

-----

Clean koala data

# Create a map using ggplot2
ggplot() +
  geom_sf(data = LGA, color = "black") +
  geom_point(data = koala_occurrences,
             aes(x = decimalLongitude,
                 y = decimalLatitude),
             color = "blue", size = 0.5) + # Add points for occurrences
  ggtitle("Koala occurrences across Queensland") +             # Add title
  theme_bw()

Filter by SEQ region

# Check for missing values in decimalLongitude and decimalLatitude
paste0("Number of NAs in 'longitude' ", sum(is.na(koala_occurrences$decimalLongitude)))

[1] "Number of NAs in 'longitude' 215"

paste0("Number of NAs in 'latitude' ", sum(is.na(koala_occurrences$decimalLatitude)))

[1] "Number of NAs in 'latitude' 215"

koala_occ_sf <- koala_occurrences %>% 
  drop_na(decimalLongitude, decimalLatitude) %>% # remove NA values
  st_as_sf(coords = c("decimalLongitude", "decimalLatitude"), 
           crs = 4326) %>%
  st_transform(3112) %>% # Transform to the same CRS as SEQ_extent
  st_intersection(SEQ_extent) %>% # Mask to extent
  distinct() # drop any duplicated records

Plot the filtered data

# Create a map using ggplot2
ggplot() +
  geom_sf(data = SEQ_extent.vect, fill = "purple3", alpha = 0.5, color = "black", size = 0.2) +
  geom_sf(data = koala_occ_sf,                           # Add koala presence locations
          aes(geometry = geometry),
             color = "blue", size = 0.5) +               # Add points for occurrences
  ggtitle("Koala occurrences in South East Queensland") +      # Add title
  theme_bw()

Sample background points

We need to load the environmental covariate grid to use as our template grid for creating background points. We generated the environmental covariates in the ‘ICCB_Environmental_data.qmd’ script.

covs <- rast("Data/Environmental_variables/current_bioclim.tif")

# Make a blank raster of all 1s of the same dimensions as our covariate grid
domain <- covs[[1]]
values(domain) <- 1


# Mask extent by SEQ boundary
domain <- terra::mask(domain, SEQ_extent.vect, updatevalue = NA)

names(domain) <- "SEQ_extent"

plot(domain)

# Set the location and number of background points
# Random sampling at first
# The mask means that any NA (not SEQ) locations do not include background points
background <- predicts::backgroundSample(mask = domain, 
                                         n = 2500)

# Convert to terra SpatVector object
background <- terra::vect(background, crs = "EPSG:3112")

# Convert background points (SpatVector) to data frame
background_df <- as.data.frame(geom(background))

koala_occ.vect <- vect(koala_occ_sf)

# Plot the presences (blue) and the background points (grey)
ggplot() +
  geom_sf(data = SEQ_extent, fill = "purple3", alpha = 0.5, color = "black", size = 0.2) +
  geom_spatvector(data = background,                           # Add koala presence locations
             color = "gray", cex = 0.5) +               # Add points for occurrences
  geom_spatvector(data = koala_occ.vect, 
                  aes(geometry = geometry), 
                  color = "blue", cex = 0.5) + # Add background points
  ggtitle("Koala occurrences (blue) and background points (grey) in South East Queensland") +      # Add title
  theme_bw()

Save koala presences and background points

This ensures quick upload in the future and that these records can be used among models.

writeVector(koala_occ.vect, "Data/Biological_records/SEQ_koala_occurrences.shp", overwrite = T)

writeVector(background, "Data/Biological_records/background_points_2.5k_random.shp", overwrite = T)

Environmental Data

Import packages

library(terra)
library(dplyr)
library(sf)
library(ggplot2)

Load South East Queensland (SEQ) boundary

We start by defining our study area, which is the South East Queensland (SEQ) region. We will use the Local Government Areas (LGA) shapefile to define the extent of SEQ.

https://qldspatial.information.qld.gov.au/catalogue/custom/detail.page?fid={3F3DBD69-647B-4833-B0A5-CC43D5E70699}

# Load the study area shapefile
LGA <- st_read("Data/Environmental_variables/Local_Government_Areas.shp")

Reading layer `Local_Government_Areas' from data source 
  `C:\Users\kimc7\OneDrive - Queensland University of Technology\Outreach\GeospatialShare\geocommunity_blog\posts\2025-04-16-species-distribution-modelling\Data\Environmental_variables\Local_Government_Areas.shp' 
  using driver `ESRI Shapefile'
Simple feature collection with 78 features and 8 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 137.9946 ymin: -29.17927 xmax: 153.5519 ymax: -9.087991
Geodetic CRS:  GDA94

# Check the coordinate reference system (CRS)
st_crs(LGA)

Coordinate Reference System:
  User input: GDA94 
  wkt:
GEOGCRS["GDA94",
    DATUM["Geocentric Datum of Australia 1994",
        ELLIPSOID["GRS 1980",6378137,298.257222101,
            LENGTHUNIT["metre",1]]],
    PRIMEM["Greenwich",0,
        ANGLEUNIT["degree",0.0174532925199433]],
    CS[ellipsoidal,2],
        AXIS["geodetic latitude (Lat)",north,
            ORDER[1],
            ANGLEUNIT["degree",0.0174532925199433]],
        AXIS["geodetic longitude (Lon)",east,
            ORDER[2],
            ANGLEUNIT["degree",0.0174532925199433]],
    USAGE[
        SCOPE["Horizontal component of 3D system."],
        AREA["Australia including Lord Howe Island, Macquarie Island, Ashmore and Cartier Islands, Christmas Island, Cocos (Keeling) Islands, Norfolk Island. All onshore and offshore."],
        BBOX[-60.55,93.41,-8.47,173.34]],
    ID["EPSG",4283]]

# Convert to WGS84
LGA <- LGA %>% st_transform(3112)

# Select local govt. areas for South East Queensland
LGA_SEQ <- LGA %>% 
  filter(lga %in% c("Brisbane City", 
                    "Moreton Bay City", 
                    "Logan City", 
                    "Ipswich City", 
                    "Redland City", 
                    "Scenic Rim Regional", 
                    "Somerset Regional", 
                    "Lockyer Valley Regional", 
                    "Gold Coast City", 
                    "Sunshine Coast Regional", 
                    "Toowoomba Regional", 
                    "Noosa Shire"))

ggplot() +
  geom_sf(data = LGA, color = "black") +
  geom_sf(data = LGA_SEQ, fill = "purple3", alpha = 0.5, color = "black", size = 0.2) +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(title = "Local Government Areas Queensland (SEQ in purple)")

ggplot() +
  geom_sf(data = LGA_SEQ, fill = "purple3", alpha = 0.5, color = "black", size = 0.2) +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(title = "Local Government Areas South East Queensland (SEQ)")

Merge into a single polygon

# Merge the SEQ LGAs into one polygon
SEQ_extent <- st_union(LGA_SEQ)

ggplot() +
  geom_sf(data = SEQ_extent, fill = "purple3", alpha = 0.5, color = "black", size = 0.2) +
  theme_minimal() +
  theme(legend.position = "none") +
  ggtitle("South-East Queensland Spatial Extent") + 
  theme_bw()

Save SEQ extent for other scripts

# Convert our SEQ extent to a SpatExtent object by converting to a SpatVector
SEQ_extent.vect <- terra::vect(SEQ_extent)

writeVector(SEQ_extent.vect, "Data/Environmental_variables/SEQ_extent.shp", overwrite = T)

Load current environmental data

Layers were made available to us by the EcoCommons team and were created by Toombs and Ma (2025):

Toombs, N., and Ma S., 2025, A High-Resolution Dataset of 19 Bioclimatic Indices over Australia, Climate Projections and Services – Queensland Treasury, Brisbane, Queensland. [https://longpaddock.qld.gov.au/qld-future-climate/data-info/tern/]

files <- list.files("Data/Environmental_variables/Current_climate_QLD", 
             pattern = ".tif$", 
             full.names = TRUE)

# Load all bioclim rasters
current_bioclim <- lapply(files, terra::rast) 

# Make into one raster stack
current_bioclim <- rast(current_bioclim)

plot(current_bioclim)

# Examine the resolution
current_bioclim

class       : SpatRaster 
dimensions  : 681, 841, 19  (nrow, ncol, nlyr)
resolution  : 0.05, 0.05  (x, y)
extent      : 111.975, 154.025, -44.025, -9.975  (xmin, xmax, ymin, ymax)
coord. ref. : lon/lat WGS 84 (EPSG:4326) 
sources     : bioclim_01.tif  
              bioclim_02.tif  
              bioclim_03.tif  
              ... and 16 more sources
names       : bioclim_01, bioclim_02, bioclim_03, bioclim_04, bioclim_05, bioclim_06, ...

# Check the CRS
crs(current_bioclim)

[1] "GEOGCRS[\"WGS 84\",\n    ENSEMBLE[\"World Geodetic System 1984 ensemble\",\n        MEMBER[\"World Geodetic System 1984 (Transit)\"],\n        MEMBER[\"World Geodetic System 1984 (G730)\"],\n        MEMBER[\"World Geodetic System 1984 (G873)\"],\n        MEMBER[\"World Geodetic System 1984 (G1150)\"],\n        MEMBER[\"World Geodetic System 1984 (G1674)\"],\n        MEMBER[\"World Geodetic System 1984 (G1762)\"],\n        MEMBER[\"World Geodetic System 1984 (G2139)\"],\n        ELLIPSOID[\"WGS 84\",6378137,298.257223563,\n            LENGTHUNIT[\"metre\",1]],\n        ENSEMBLEACCURACY[2.0]],\n    PRIMEM[\"Greenwich\",0,\n        ANGLEUNIT[\"degree\",0.0174532925199433]],\n    CS[ellipsoidal,2],\n        AXIS[\"geodetic latitude (Lat)\",north,\n            ORDER[1],\n            ANGLEUNIT[\"degree\",0.0174532925199433]],\n        AXIS[\"geodetic longitude (Lon)\",east,\n            ORDER[2],\n            ANGLEUNIT[\"degree\",0.0174532925199433]],\n    USAGE[\n        SCOPE[\"Horizontal component of 3D system.\"],\n        AREA[\"World.\"],\n        BBOX[-90,-180,90,180]],\n    ID[\"EPSG\",4326]]"

# Update CRS 
current_bioclim <- terra::project(current_bioclim, "EPSG:3112")

# Our resolution is now ~5km by 5km
current_bioclim

class       : SpatRaster 
dimensions  : 770, 924, 19  (nrow, ncol, nlyr)
resolution  : 5123.954, 5123.954  (x, y)
extent      : -2477612, 2256922, -5117358, -1171913  (xmin, xmax, ymin, ymax)
coord. ref. : GDA94 / Geoscience Australia Lambert (EPSG:3112) 
source(s)   : memory
names       : bioclim_01, bioclim_02, bioclim_03, bioclim_04, bioclim_05, bioclim_06, ... 
min values  :   5.030256,   5.511227,   34.70108,   105.4365,   16.72381,  -4.999238, ... 
max values  :  29.429586,  16.877110,   61.76879,   680.1939,   42.44056,  23.001247, ...

Mask to SEQ extent

current_bioclim <- terra::mask(current_bioclim, SEQ_extent.vect)

# You can see that this has masked the area but the extent is still the same
plot(current_bioclim[[1]])

current_bioclim <- terra::crop(current_bioclim, SEQ_extent.vect)

plot(current_bioclim)

# Save the current environmental covariates
writeRaster(current_bioclim, 
            filename = "Data/Environmental_variables/current_bioclim.tif",
            overwrite = T)

Load future environmental data

Here we load outputs from a moderate-high emissions shared socio-economic path scenario (SSP 3.70) for the year 2090 (2080 - 2099).

files <- list.files("Data/Environmental_variables/Future_climate_SSP370_2090", 
             pattern = ".tif$", 
             full.names = TRUE)

# Load all bioclim rasters
future_bioclim <- lapply(files, terra::rast) 

# Make into one raster stack
future_bioclim <- rast(future_bioclim)

plot(future_bioclim)

# Examine the resolution
future_bioclim

class       : SpatRaster 
dimensions  : 681, 841, 19  (nrow, ncol, nlyr)
resolution  : 0.05, 0.05  (x, y)
extent      : 111.975, 154.025, -44.025, -9.975  (xmin, xmax, ymin, ymax)
coord. ref. : lon/lat WGS 84 (EPSG:4326) 
sources     : bioclim_01.tif  
              bioclim_02.tif  
              bioclim_03.tif  
              ... and 16 more sources
names       : bioclim_01, bioclim_02, bioclim_03, bioclim_04, bioclim_05, bioclim_06, ...

# Check the CRS
crs(future_bioclim)

[1] "GEOGCRS[\"WGS 84\",\n    ENSEMBLE[\"World Geodetic System 1984 ensemble\",\n        MEMBER[\"World Geodetic System 1984 (Transit)\"],\n        MEMBER[\"World Geodetic System 1984 (G730)\"],\n        MEMBER[\"World Geodetic System 1984 (G873)\"],\n        MEMBER[\"World Geodetic System 1984 (G1150)\"],\n        MEMBER[\"World Geodetic System 1984 (G1674)\"],\n        MEMBER[\"World Geodetic System 1984 (G1762)\"],\n        MEMBER[\"World Geodetic System 1984 (G2139)\"],\n        ELLIPSOID[\"WGS 84\",6378137,298.257223563,\n            LENGTHUNIT[\"metre\",1]],\n        ENSEMBLEACCURACY[2.0]],\n    PRIMEM[\"Greenwich\",0,\n        ANGLEUNIT[\"degree\",0.0174532925199433]],\n    CS[ellipsoidal,2],\n        AXIS[\"geodetic latitude (Lat)\",north,\n            ORDER[1],\n            ANGLEUNIT[\"degree\",0.0174532925199433]],\n        AXIS[\"geodetic longitude (Lon)\",east,\n            ORDER[2],\n            ANGLEUNIT[\"degree\",0.0174532925199433]],\n    USAGE[\n        SCOPE[\"Horizontal component of 3D system.\"],\n        AREA[\"World.\"],\n        BBOX[-90,-180,90,180]],\n    ID[\"EPSG\",4326]]"

# Update CRS 
future_bioclim <- terra::project(future_bioclim, "EPSG:3112")

# Our resolution is now ~5km by 5km
future_bioclim

class       : SpatRaster 
dimensions  : 770, 924, 19  (nrow, ncol, nlyr)
resolution  : 5123.954, 5123.954  (x, y)
extent      : -2477612, 2256922, -5117358, -1171913  (xmin, xmax, ymin, ymax)
coord. ref. : GDA94 / Geoscience Australia Lambert (EPSG:3112) 
source(s)   : memory
names       : bioclim_01, bioclim_02, bioclim_03, bioclim_04, bioclim_05, bioclim_06, ... 
min values  :    0.00000,     0.0000,    0.00000,     0.0000,    0.00000,  -2.320518, ... 
max values  :   32.99174,    16.1037,   66.06882,   706.0898,   46.24387,  25.877970, ...

Mask to SEQ extent

future_bioclim <- terra::subst(from = 0, to = NA, future_bioclim) # Set all values of 0 to NA

future_bioclim <- terra::mask(future_bioclim, SEQ_extent.vect)

# You can see that this has masked the area but the extent is still the same
plot(future_bioclim[[1]])

future_bioclim <- terra::crop(future_bioclim, SEQ_extent.vect)

plot(future_bioclim[[1]])

# Save the future environmental covariates
writeRaster(future_bioclim, 
            filename = "Data/Environmental_variables/future_bioclim.2090.SSP370.tif",
            overwrite = T)

Load future environmental data 2

Here we load outputs from a low emissions shared socio-economic path scenario (SSP 1.26) for the year 2090 (2080 - 2099).

files <- list.files("Data/Environmental_variables/Future_climate_SSP126_2090", 
             pattern = ".tif$", 
             full.names = TRUE)

# Load all bioclim rasters
future_bioclim <- lapply(files, terra::rast) 

# Make into one raster stack
future_bioclim <- rast(future_bioclim)

plot(future_bioclim)

# Examine the resolution
future_bioclim

class       : SpatRaster 
dimensions  : 681, 841, 19  (nrow, ncol, nlyr)
resolution  : 0.05, 0.05  (x, y)
extent      : 111.975, 154.025, -44.025, -9.975  (xmin, xmax, ymin, ymax)
coord. ref. : lon/lat WGS 84 (EPSG:4326) 
sources     : bioclim_01.tif  
              bioclim_02.tif  
              bioclim_03.tif  
              ... and 16 more sources
names       : bioclim_01, bioclim_02, bioclim_03, bioclim_04, bioclim_05, bioclim_06, ... 
min values  :    0.00000,    0.00000,     0.0000,         ? ,         ? ,         ? , ... 
max values  :   30.47033,   16.75114,    63.2751,         ? ,         ? ,         ? , ...

# Check the CRS
crs(future_bioclim)

[1] "GEOGCRS[\"WGS 84\",\n    ENSEMBLE[\"World Geodetic System 1984 ensemble\",\n        MEMBER[\"World Geodetic System 1984 (Transit)\"],\n        MEMBER[\"World Geodetic System 1984 (G730)\"],\n        MEMBER[\"World Geodetic System 1984 (G873)\"],\n        MEMBER[\"World Geodetic System 1984 (G1150)\"],\n        MEMBER[\"World Geodetic System 1984 (G1674)\"],\n        MEMBER[\"World Geodetic System 1984 (G1762)\"],\n        MEMBER[\"World Geodetic System 1984 (G2139)\"],\n        ELLIPSOID[\"WGS 84\",6378137,298.257223563,\n            LENGTHUNIT[\"metre\",1]],\n        ENSEMBLEACCURACY[2.0]],\n    PRIMEM[\"Greenwich\",0,\n        ANGLEUNIT[\"degree\",0.0174532925199433]],\n    CS[ellipsoidal,2],\n        AXIS[\"geodetic latitude (Lat)\",north,\n            ORDER[1],\n            ANGLEUNIT[\"degree\",0.0174532925199433]],\n        AXIS[\"geodetic longitude (Lon)\",east,\n            ORDER[2],\n            ANGLEUNIT[\"degree\",0.0174532925199433]],\n    USAGE[\n        SCOPE[\"Horizontal component of 3D system.\"],\n        AREA[\"World.\"],\n        BBOX[-90,-180,90,180]],\n    ID[\"EPSG\",4326]]"

# Update CRS 
future_bioclim <- terra::project(future_bioclim, "EPSG:3112")

# Our resolution is now ~5km by 5km
future_bioclim

class       : SpatRaster 
dimensions  : 770, 924, 19  (nrow, ncol, nlyr)
resolution  : 5123.954, 5123.954  (x, y)
extent      : -2477612, 2256922, -5117358, -1171913  (xmin, xmax, ymin, ymax)
coord. ref. : GDA94 / Geoscience Australia Lambert (EPSG:3112) 
source(s)   : memory
names       : bioclim_01, bioclim_02, bioclim_03, bioclim_04, bioclim_05, bioclim_06, ... 
min values  :    0.00000,    0.00000,    0.00000,     0.0000,    0.00000,  -4.138016, ... 
max values  :   30.75635,   16.89703,   63.40196,   687.5078,   43.83433,  24.002493, ...

Mask to SEQ extent

future_bioclim <- terra::subst(from = 0, to = NA, future_bioclim) # Set all values of 0 to NA

future_bioclim <- terra::mask(future_bioclim, SEQ_extent.vect)

# You can see that this has masked the area but the extent is still the same
plot(future_bioclim[[1]])

future_bioclim <- terra::crop(future_bioclim, SEQ_extent.vect)

plot(future_bioclim)

# Save the future environmental covariates
writeRaster(future_bioclim, 
            filename = "Data/Environmental_variables/future_bioclim.2090.SSP126.tif",
            overwrite = T)

Modelling and Validation

We wrote this script drawing on some of the following resources:

-Ecocommons Notebooks https://www.ecocommons.org.au/notebooks/

-Damaris Zurell’s SDM Intro https://damariszurell.github.io/SDM-Intro/

https://damariszurell.github.io/EEC-MGC/b4_SDM_eval.html

Import packages

library(dplyr)
library(purrr)
library(ggplot2)
library(terra)
library(sf)
library(predicts)
library(blockCV)
library(ecospat)
library(usdm)
library(randomForest)
library(precrec)
library(corrplot)

# Install the mecofun package, used in the materials at https://damariszurell.github.io/SDM-Intro/
library(devtools)
devtools::install_git("https://gitup.uni-potsdam.de/macroecology/mecofun.git")
# Load the mecofun package
library(mecofun)

Load koala presences and background points

They are loaded as spatvectors, but we also want them as dataframes for model input requirements.

koala_occ <- vect("Data/Biological_records/SEQ_koala_occurrences.shp")
background <- vect("Data/Biological_records/background_points_2.5k_random.shp")

# Make a dataframe of just x, y and presence
koala_occ_df <- koala_occ %>% 
  as.data.frame(geom = "XY") %>% 
  dplyr::select(x,y) %>% 
  mutate(Presence = 1)

head(koala_occ_df)

        x        y Presence
1 1868673 -3265806        1
2 1851054 -3232106        1
3 1846917 -3231913        1
4 1874167 -3262365        1
5 1857487 -3259935        1
6 1874262 -3262395        1

background_df <- background %>% 
  as.data.frame(geom = "XY") %>% 
  dplyr::select(x,y) %>% 
  mutate(Presence = 0)

head(background_df)

        x        y Presence
1 1854692 -3111330        0
2 1859816 -3111330        0
3 1864940 -3111330        0
4 1870064 -3111330        0
5 1875188 -3111330        0
6 1880312 -3111330        0

# Combine to one
pr_bg <- rbind(koala_occ_df, background_df)

Load environmental covariates

Loading current covariate rasters. We formatted these rasters in the same way as the Koala data, so that they are all in the same projection and extent. We did this in the script: ‘ICCB_Environmental_data.qmd’

covs_current <- rast("Data/Environmental_variables/current_bioclim.tif")


# Define the BIOCLIM names for the raster layers
layer_names <- c(
  "BIO1_Annual_Mean_Temp",
  "BIO2_Mean_Diurnal_Temp_Range",
  "BIO3_Isothermality",
  "BIO4_Temperature_Seasonality",
  "BIO5_Max_Temp_Warmest_Month",
  "BIO6_Min_Temp_Coldest_Month",
  "BIO7_Temperature_Annual_Range",
  "BIO8_Mean_Temp_Wettest_Quarter",
  "BIO9_Mean_Temp_Driest_Quarter",
  "BIO10_Mean_Temp_Warmest_Quarter",
  "BIO11_Mean_Temp_Coldest_Quarter",
  "BIO12_Annual_Precipitation",
  "BIO13_Precip_Wettest_Month",
  "BIO14_Precip_Driest_Month",
  "BIO15_Precip_Seasonality",
  "BIO16_Precip_Wettest_Quarter",
  "BIO17_Precip_Driest_Quarter",
  "BIO18_Precip_Warmest_Quarter",
  "BIO19_Precip_Coldest_Quarter")

names(covs_current) <- layer_names

Covariate selection

Option 1. Narrow down potential covariates based on ecological knowledge

For this example, we had advice from CSIRO scientists who conducted an expert elicitation to gather a set of potential covariates that are likely to be important for koalas. We use this knowledge to filter out the key bioclim variables.

We select the following: Bio5 : Max temp of the warmest month (mainly for the northern populations) Bio6 : Min temp of the coldest month (mainly for southern populations, which essentially excludes alpine regions) Bio12 : Annual Precipitation Bio15 : Precipitation seasonality (coefficient of variation)

for(i in 1:nlyr(covs_current)) {
  terra::plot(covs_current[[i]], main = names(covs_current)[[i]])
}

Show the four from expert elicitation the layers

covs_current_expert <- subset(covs_current, names(covs_current) %in% c("BIO5_Max_Temp_Warmest_Month", 
                                                                       "BIO6_Min_Temp_Coldest_Month", 
                                                                       "BIO12_Annual_Precipitation", 
                                                                       "BIO15_Precip_Seasonality"))

for(i in 1:nlyr(covs_current_expert)) {
  terra::plot(covs_current_expert[[i]], main = names(covs_current_expert)[[i]])
}

Extract environmental covariate values from presence and background locations (training locations)

train_PB_covs <- terra::extract(covs_current, pr_bg[,c("x", "y")], xy = T)
train_PB_covs <- cbind(train_PB_covs, pr_bg["Presence"])

# Remove rows where there's values missing from at least one covariate
print(paste0("RECORDS FROM ", nrow(train_PB_covs) - sum(complete.cases(train_PB_covs)), " ROWS IN TRAINING DATA REMOVED DUE TO MISSING COVARIATE VALUES"))

[1] "RECORDS FROM 68 ROWS IN TRAINING DATA REMOVED DUE TO MISSING COVARIATE VALUES"

train_PB_covs <- train_PB_covs[complete.cases(train_PB_covs), ] 
train_PB_covs <- dplyr::select(train_PB_covs, -ID)

head(train_PB_covs)

  BIO1_Annual_Mean_Temp BIO2_Mean_Diurnal_Temp_Range BIO3_Isothermality
1              20.73219                     9.515615           46.85551
2              20.85025                    10.291359           47.88751
3              20.61474                    10.648253           48.39870
4              21.00405                     9.129249           46.28033
5              20.75752                    10.070734           47.49575
6              21.00405                     9.129249           46.28033
  BIO4_Temperature_Seasonality BIO5_Max_Temp_Warmest_Month
1                     367.6388                    29.73501
2                     379.8710                    30.47122
3                     384.8382                    30.53297
4                     363.1416                    29.72843
5                     378.2968                    30.20078
6                     363.1416                    29.72843
  BIO6_Min_Temp_Coldest_Month BIO7_Temperature_Annual_Range
1                    9.386996                      20.34801
2                    8.940289                      21.53093
3                    8.487451                      22.04552
4                    9.958574                      19.76986
5                    8.959633                      21.24114
6                    9.958574                      19.76986
  BIO8_Mean_Temp_Wettest_Quarter BIO9_Mean_Temp_Driest_Quarter
1                       22.98827                      18.36451
2                       23.55152                      18.09120
3                       23.31077                      17.78633
4                       23.01914                      18.89500
5                       23.32381                      18.13480
6                       23.01914                      18.89500
  BIO10_Mean_Temp_Warmest_Quarter BIO11_Mean_Temp_Coldest_Quarter
1                        24.94183                        15.92675
2                        25.18869                        15.87509
3                        24.99932                        15.57214
4                        25.18043                        16.27002
5                        25.08320                        15.80299
6                        25.18043                        16.27002
  BIO12_Annual_Precipitation BIO13_Precip_Wettest_Month
1                   1238.362                   327.7901
2                   1165.235                   327.1804
3                   1161.138                   327.4446
4                   1222.618                   311.3793
5                   1109.354                   305.5583
6                   1222.618                   311.3793
  BIO14_Precip_Driest_Month BIO15_Precip_Seasonality
1                 10.800349                 92.89355
2                  8.330686                 98.80420
3                  8.048150                 99.45329
4                 11.913177                 88.93709
5                  8.327950                 97.22968
6                 11.913177                 88.93709
  BIO16_Precip_Wettest_Quarter BIO17_Precip_Driest_Quarter
1                     606.3058                    94.95189
2                     598.7009                    77.30684
3                     598.9535                    75.86132
4                     583.3614                   100.51526
5                     558.9021                    77.90620
6                     583.3614                   100.51526
  BIO18_Precip_Warmest_Quarter BIO19_Precip_Coldest_Quarter       x        y
1                     449.8074                     189.9305 1870064 -3265048
2                     444.0957                     147.6409 1849568 -3234305
3                     445.4260                     145.7036 1844444 -3234305
4                     448.1904                     200.5528 1875188 -3259924
5                     408.9403                     153.3001 1859816 -3259924
6                     448.1904                     200.5528 1875188 -3259924
  Presence
1        1
2        1
3        1
4        1
5        1
6        1

Thin the koala presence points (for tutorial only)

We now thin the presences to reduce the number of points to a manageable size for plotting and modelling. This is not a recommended step for real data, but is done here to make the tutorial run faster and to make the plots clearer.

train_PB_covs_pres <- train_PB_covs %>% filter(Presence == 1)
train_PB_covs_bg <- train_PB_covs %>% filter(Presence == 0)

# Thin the presences for plotting
train_PB_covs_pres_thin <- train_PB_covs_pres[sample(nrow(train_PB_covs_pres), 10000), ]

# Combine back into both presence and background
train_PB_covs_thinned <- rbind(train_PB_covs_pres_thin, train_PB_covs_bg)

Check correlation and multicollinearity of covariates

Correlation plot

There are several different methods for creating correlation plots.

ecospat.cor.plot(covs_current_expert)

Using the corplots package

For a simple and quick plot.

corplots <- ENMTools::raster.cor.plot(covs_current_expert)
corplots$cor.mds.plot

corplots$cor.heatmap

Here, we use the corrplot package to create a correlation plot of the selected covariates (this is taken from an EcoCommons Australia notebook).

# Select columns by their names
cor_data <- train_PB_covs[, names(train_PB_covs) %in% c("BIO5_Max_Temp_Warmest_Month", 
                                                        "BIO6_Min_Temp_Coldest_Month", 
                                                        "BIO12_Annual_Precipitation", 
                                                        "BIO15_Precip_Seasonality")]

# Check the structure of the numeric data
str(cor_data)

'data.frame':   88075 obs. of  4 variables:
 $ BIO5_Max_Temp_Warmest_Month: num  29.7 30.5 30.5 29.7 30.2 ...
 $ BIO6_Min_Temp_Coldest_Month: num  9.39 8.94 8.49 9.96 8.96 ...
 $ BIO12_Annual_Precipitation : num  1238 1165 1161 1223 1109 ...
 $ BIO15_Precip_Seasonality   : num  92.9 98.8 99.5 88.9 97.2 ...

# Calculate the correlation matrix for the numeric columns
cor_matrix <- cor(cor_data, use = "complete.obs", method = "pearson")


corrplot(cor_matrix,
         method = "color",            # Use colored squares for correlation
         type = "upper",              # Show upper triangle only
         order = "hclust",            # Reorder variables hierarchically
         addCoef.col = "black",       # Show correlation coefficients in black
         number.cex = 0.5,            # Reduce the size of correlation labels
         tl.col = "black",            # Text label color
         tl.srt = 30,                 # Rotate labels slightly for readability
         tl.cex = 0.5,                # Reduce text size of variable labels (set smaller valu)
         cl.cex = 0.8,                # Reduce text size of color legend
         diag = FALSE,                # Hide diagonal
         col = colorRampPalette(c("#11aa96", "#61c6fa", "#f6aa70"))(200),
         sig.level = 0.01, insig = "blank")

Variance Inflation Factor (VIF)

If you find corrplot is hard for you to make decisions, we can use Variance Inflation Factor (VIF). VIF is another statistical measure used to detect multicollinearity in a set of explanatory (independent) variables in a regression model.

Interpretation:

VIF = 1: No correlation
VIF > 1 and <= 5: Moderate correlation; may not require corrective action.
VIF > 5: Indicates high correlation. Multicollinearity may be problematic, and further investigation is recommended.
VIF > 10: Strong multicollinearity. The variable is highly collinear with others, and steps should be taken to address this.

# usdm::vif(covs_current_expert) # just VIF for all covariates
usdm::vifstep(covs_current_expert) # Variance Inflation Factor and test for multicollinearity

No variable from the 4 input variables has collinearity problem. 

The linear correlation coefficients ranges between: 
min correlation ( BIO15_Precip_Seasonality ~ BIO6_Min_Temp_Coldest_Month ):  -0.2756697 
max correlation ( BIO12_Annual_Precipitation ~ BIO6_Min_Temp_Coldest_Month ):  0.8769718 

---------- VIFs of the remained variables -------- 
                    Variables      VIF
1 BIO5_Max_Temp_Warmest_Month 2.751370
2 BIO6_Min_Temp_Coldest_Month 4.528719
3  BIO12_Annual_Precipitation 6.888077
4    BIO15_Precip_Seasonality 1.263385

Exploration of the koala presence and background data

It is good practice to assess where in the environmental space the presence and background points are located. This can help to identify if there are any potential issues with the data, such as a lack of background points in certain areas of environmental space, and should show any patterns in the data that the model should pick up.

# Iterate over all of the variables to create density plots of the background and presence data
for(i in 1:ncol(train_PB_covs_thinned)) {
  
  print(ggplot() +
          geom_density(data = train_PB_covs_thinned, 
                       aes(x = .data[[names(train_PB_covs_thinned)[i]]], fill = as.factor(Presence)), 
                       alpha = 0.5) +
    theme_bw() +
    labs(title = names(train_PB_covs_thinned)[i]))
  
}

First Model: GLM model fitting

# Make a folder to save outputs
dir.create("Outputs/GLM_outputs", showWarnings = F)

Null model

Null model: no explanatory variables or predictors are included.

It is always helpful to create a null model as a benchmark to assess how the inclusion of explanatory variables improves the model.

# Fit a null model with only the intercept
null_model <- glm(Presence ~ 1,
                  data = train_PB_covs,
                  family = binomial(link = "logit"))

# Check the model results
summary(null_model)


Call:
glm(formula = Presence ~ 1, family = binomial(link = "logit"), 
    data = train_PB_covs)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  4.08025    0.02636   154.8   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 14900  on 88074  degrees of freedom
Residual deviance: 14900  on 88074  degrees of freedom
AIC: 14902

Number of Fisher Scoring iterations: 7

GLM Model 1 - expert variables

glm_model_1 <- glm(Presence ~ 
                     BIO5_Max_Temp_Warmest_Month + 
                     BIO6_Min_Temp_Coldest_Month + 
                     BIO12_Annual_Precipitation + 
                     BIO15_Precip_Seasonality,
                   data=train_PB_covs_thinned,
                   family = binomial(link = "logit"))

# Check the model results
summary(glm_model_1)


Call:
glm(formula = Presence ~ BIO5_Max_Temp_Warmest_Month + BIO6_Min_Temp_Coldest_Month + 
    BIO12_Annual_Precipitation + BIO15_Precip_Seasonality, family = binomial(link = "logit"), 
    data = train_PB_covs_thinned)

Coefficients:
                              Estimate Std. Error z value Pr(>|z|)    
(Intercept)                 -4.8956819  1.5630088  -3.132  0.00173 ** 
BIO5_Max_Temp_Warmest_Month -0.6341523  0.0514780 -12.319  < 2e-16 ***
BIO6_Min_Temp_Coldest_Month  0.8653363  0.0284251  30.443  < 2e-16 ***
BIO12_Annual_Precipitation  -0.0030684  0.0002791 -10.992  < 2e-16 ***
BIO15_Precip_Seasonality     0.2437024  0.0106358  22.913  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 8758.5  on 11463  degrees of freedom
Residual deviance: 5758.2  on 11459  degrees of freedom
AIC: 5768.2

Number of Fisher Scoring iterations: 6

# These response curves don't look very helpful
dismo::response(glm_model_1)

GLM Model 2 - expert variables with quadratics

In this model, we include quadratic terms for the covariates. This is a common approach in species distribution modelling to account for non-linear relationships between the predictors and the response variable. This increases the complexity of the model and allows for more flexibility in fitting the data.

glm_model_2 <- glm(Presence ~ 
                     BIO5_Max_Temp_Warmest_Month + I(BIO5_Max_Temp_Warmest_Month^2) + 
                     BIO6_Min_Temp_Coldest_Month + I(BIO6_Min_Temp_Coldest_Month^2) + 
                     BIO12_Annual_Precipitation + I(BIO12_Annual_Precipitation^2) + 
                     BIO15_Precip_Seasonality + I(BIO15_Precip_Seasonality^2), 
                   data=train_PB_covs_thinned,
                   family = binomial(link = "logit"))

# Check the model results
summary(glm_model_2)


Call:
glm(formula = Presence ~ BIO5_Max_Temp_Warmest_Month + I(BIO5_Max_Temp_Warmest_Month^2) + 
    BIO6_Min_Temp_Coldest_Month + I(BIO6_Min_Temp_Coldest_Month^2) + 
    BIO12_Annual_Precipitation + I(BIO12_Annual_Precipitation^2) + 
    BIO15_Precip_Seasonality + I(BIO15_Precip_Seasonality^2), 
    family = binomial(link = "logit"), data = train_PB_covs_thinned)

Coefficients:
                                   Estimate Std. Error z value Pr(>|z|)    
(Intercept)                      -9.304e+01  1.887e+01  -4.930 8.22e-07 ***
BIO5_Max_Temp_Warmest_Month      -6.996e-01  1.229e+00  -0.569 0.569266    
I(BIO5_Max_Temp_Warmest_Month^2)  3.164e-03  1.989e-02   0.159 0.873623    
BIO6_Min_Temp_Coldest_Month      -1.309e+00  1.884e-01  -6.951 3.63e-12 ***
I(BIO6_Min_Temp_Coldest_Month^2)  1.557e-01  1.272e-02  12.244  < 2e-16 ***
BIO12_Annual_Precipitation        4.063e-03  1.942e-03   2.092 0.036403 *  
I(BIO12_Annual_Precipitation^2)  -2.504e-06  6.972e-07  -3.592 0.000329 ***
BIO15_Precip_Seasonality          2.044e+00  2.360e-01   8.660  < 2e-16 ***
I(BIO15_Precip_Seasonality^2)    -9.028e-03  1.261e-03  -7.160 8.08e-13 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 8758.5  on 11463  degrees of freedom
Residual deviance: 5578.7  on 11455  degrees of freedom
AIC: 5596.7

Number of Fisher Scoring iterations: 6

# These response curves don't look very helpful
dismo::response(glm_model_2)

Model effect evaluation

Here we use a function presented in an EcoCommons Australia notebook to evaluate the model performance. The notebook can be found on their GitHub: https://github.com/EcoCommonsAustralia/notebooks/tree/main/notebooks.

# Function to plot effect size graph
plot_effect_size <- function(glm_model) {
  # Check if required libraries are installed
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    stop("Please install the 'ggplot2' package to use this function.")
  }
  library(ggplot2)

  # Extract effect sizes (coefficients) from the model
  coefs <- summary(glm_model)$coefficients
  effect_sizes <- data.frame(
    Variable = rownames(coefs)[-1],  # Exclude the intercept
    Effect_Size = coefs[-1, "Estimate"],
    Std_Error = coefs[-1, "Std. Error"]
  )

  # Sort by effect size
  effect_sizes <- effect_sizes[order(-abs(effect_sizes$Effect_Size)), ]

  # Plot the effect sizes with error bars
  ggplot(effect_sizes, aes(x = reorder(Variable, Effect_Size), y = Effect_Size)) +
    geom_bar(stat = "identity", fill = "#11aa96") +
    geom_errorbar(aes(ymin = Effect_Size - Std_Error, ymax = Effect_Size + Std_Error), width = 0.2) +
    coord_flip() +
    labs(
      title = "Effect Sizes of Variables",
      x = "Variable",
      y = "Effect Size (Coefficient Estimate)"
    ) +
    theme_minimal()
}

Use the function to check the effect sizes

We need to be careful when interpreting the effect sizes of models with quadratic terms however, as the response curve depends on the linear and the quadratic term.

# Example usage of effect size plot
plot_effect_size(glm_model_1)

plot_effect_size(glm_model_2)

Response curves

Again, we can use a function from the EcoCommons notebook to plot the response curves from the model, although for quadratics we need to adjust the function or use something else.

plot_species_response <- function(glm_model, predictors, data) {
  # Check if required libraries are installed
  if (!requireNamespace("ggplot2", quietly = TRUE) || !requireNamespace("gridExtra", quietly = TRUE)) {
    stop("Please install the 'ggplot2' and 'gridExtra' packages to use this function.")
  }
  library(ggplot2)
  library(gridExtra)

  # Create empty list to store response plots
  response_plots <- list()

  # Loop through each predictor variable
  for (predictor in predictors) {
    # Create new data frame to vary predictor while keeping others constant
    pred_range <- seq(
      min(data[[predictor]], na.rm = TRUE),
      max(data[[predictor]], na.rm = TRUE),
      length.out = 100
    )
    const_data <- data[1, , drop = FALSE]  # Use first row to keep other predictors constant
    response_data <- const_data[rep(1, 100), ]  # Duplicate the row
    response_data[[predictor]] <- pred_range

    # Predict probabilities
    predicted_response <- predict(glm_model, newdata = response_data, type = "response")

    # Create data frame for plotting
    plot_data <- data.frame(
      Predictor_Value = pred_range,
      Predicted_Probability = predicted_response
    )

    # Add presence and absence data
    presence_absence_data <- data.frame(
      Predictor_Value = data[[predictor]],
      Presence_Absence = data$Presence
    )

    # Generate the response plot
    p <- ggplot() +
      
      geom_line(data = plot_data, 
                aes(x = Predictor_Value, y = Predicted_Probability), 
                color = "#61c6fa", linewidth = 1) +
      
      geom_point(data = presence_absence_data[presence_absence_data$Presence_Absence == 1, ], 
                 aes(x = Predictor_Value, y = Presence_Absence), 
                 color = "#11aa96", alpha = 0.6) +
      
      geom_point(data = presence_absence_data[presence_absence_data$Presence_Absence == 0, ], 
                 aes(x = Predictor_Value, y = Presence_Absence), 
                 color = "#f6aa70", alpha = 0.6) +
      
      labs(x = predictor, y = NULL) +
      theme_minimal() +
      theme(axis.title.y = element_blank())

    # Store the plot in the list
    response_plots[[predictor]] <- p
  }

  # Arrange all plots in one combined plot with a single shared y-axis label
  grid.arrange(
    grobs = response_plots,
    ncol = 3,
    left = "Predicted Probability / Presence-Absence"
  )
}

Use the response curve function

predictors <- c("BIO5_Max_Temp_Warmest_Month", 
                "BIO6_Min_Temp_Coldest_Month", 
                "BIO12_Annual_Precipitation", 
                "BIO15_Precip_Seasonality")

plot_species_response(glm_model_1, predictors, train_PB_covs_thinned)

Model 1 partial responses

# Plot the partial responses
partial_response(glm_model_1, predictors = train_PB_covs_thinned[,predictors], main='GLM')

# Plot inflated response curves:
inflated_response(glm_model_1, predictors = train_PB_covs_thinned[,predictors], method = "stat3", lwd = 3, main='GLM')

Model 2 partial responses

# Plot the partial responses
partial_response(glm_model_2, predictors = train_PB_covs_thinned[,predictors], main='GLM')

# Plot inflated response curves:
inflated_response(glm_model_2, predictors = train_PB_covs_thinned[,predictors], method = "stat3", lwd = 3, main='GLM')

GLM predictions to current environment

Model 1

# Predict the presence probability across the entire raster extent
predicted_raster_model_1 <- predicts::predict(covs_current_expert, glm_model_1, type = "response")

# Plot the species distribution raster
plot(
  predicted_raster_model_1,
  range = c(0, 1),  # Set min and max values for the color scale
  main = "Relative Probability of Occurrence of Koalas in SEQ - GLM 1"
)

Model 2

# Predict the presence probability across the entire raster extent
predicted_raster_model_2 <- predicts::predict(covs_current_expert, glm_model_2, type = "response")

# Plot the species distribution raster
plot(
  predicted_raster_model_2,
  range = c(0, 1),  # Set min and max values for the color scale
  main = "Relative Probability of Occurrence of Koalas in SEQ - GLM 2"
)

Random forest

# Make a folder to save outputs

dir.create("Outputs/RF_outputs", showWarnings = F)

Data preparation

Calculate the case weights (down/up-weighting)

Because we have a ‘class imbalance’ (uneven number of presences and background points), we are making their ‘weight’ in the model equal.

presNum <- as.numeric(table(train_PB_covs_thinned$Presence)["1"]) # number of presences
bgNum <- as.numeric(table(train_PB_covs_thinned$Presence)["0"]) # number of backgrounds
weight <- ifelse(train_PB_covs_thinned$Presence == 1, 1, presNum / bgNum) # down-weighting

Prepare for fitting the RF model(s)

# If wanting to fit a classification model
# Convert the response to factor for producing class relative likelihood
train_PB_covs_thinned$Presence_Factor <- as.factor(train_PB_covs_thinned$Presence)

# For down-sampling, the number of background (0s) in each bootstrap sample should the same as presences
# (1s). For this, we use sampsize argument to do this.
# We need to choose the SMALLER number out of the two classes
sample_size <- c("0" = bgNum, "1" = bgNum)
sample_size <- c(bgNum)

Random Forest Model - expert covariates

Classification or regression model?

# # Specify the model formula - classification
# model_formula <- as.formula(Presence_Factor ~ 
#                               BIO5_Max_Temp_Warmest_Month + 
#                               BIO6_Min_Temp_Coldest_Month + 
#                               BIO12_Annual_Precipitation + 
#                               BIO15_Precip_Seasonality)

# Specify the model formula - regression
model_formula <- as.formula(Presence ~ 
                              BIO5_Max_Temp_Warmest_Month + 
                              BIO6_Min_Temp_Coldest_Month + 
                              BIO12_Annual_Precipitation + 
                              BIO15_Precip_Seasonality)

Fit the random forest model

This will throw a warning as we only have two unique values (0s and 1s), but in our case that is fine, and you can ignore the warning.

rf_1 <- randomForest::randomForest(formula = model_formula,
                                   data = train_PB_covs_thinned,
                                   weights = weight,
                                   ntree = 1000,
                                   sampsize = sample_size,
                                   replace = T, 
                                   importance=TRUE)

Check the model results

# Model summary
rf_1


Call:
 randomForest(formula = model_formula, data = train_PB_covs_thinned,      weights = weight, ntree = 1000, sampsize = sample_size, replace = T,      importance = TRUE) 
               Type of random forest: regression
                     Number of trees: 1000
No. of variables tried at each split: 1

          Mean of squared residuals: 0.08564252
                    % Var explained: 23.12

# Variable importance:
importance(rf_1)

                              %IncMSE IncNodePurity
BIO5_Max_Temp_Warmest_Month  91.55258      70.67321
BIO6_Min_Temp_Coldest_Month 113.37257     101.99034
BIO12_Annual_Precipitation   79.51614      82.59679
BIO15_Precip_Seasonality     78.85705      47.42284

# Plot variable importance
varImpPlot(rf_1, type = 1)

# Look at single trees:
head(getTree(rf_1,1,T))

  left daughter right daughter                   split var split point status
1             2              3 BIO5_Max_Temp_Warmest_Month   30.558750     -3
2             4              5 BIO6_Min_Temp_Coldest_Month    7.933356     -3
3             6              7 BIO5_Max_Temp_Warmest_Month   33.011837     -3
4             8              9    BIO15_Precip_Seasonality   94.776566     -3
5            10             11 BIO5_Max_Temp_Warmest_Month   29.037033     -3
6            12             13  BIO12_Annual_Precipitation  789.708954     -3
  prediction
1  0.5116120
2  0.7076923
3  0.2439418
4  0.0781250
5  0.8200837
6  0.2903846

Partial dependence plots

# Now, we plot response curves in the same way as we did for GLMs above:
partial_response(rf_1, predictors = train_PB_covs_thinned[,predictors], main='Random Forest')

# Plot inflated response curves:
inflated_response(rf_1, predictors = train_PB_covs_thinned[,predictors], method = "stat3", lwd = 3, main='Random Forest')

Random Forest predictions

# Predict the presence probability across the entire raster extent
predicted_raster_RF_1 <- predicts::predict(covs_current_expert, rf_1, type = "response")

# Plot the species distribution raster
plot(
  predicted_raster_RF_1,
  range = c(0, 1),  # Set min and max values for the color scale
  main = "Relative Probability of Occurrence of Koalas in SEQ - RF 1"
)

Model evaluation with spatial block cross-validation

# Convert training data to sf
train_PB_covs_thinned_sf <- st_as_sf(train_PB_covs_thinned[, c("x", "y", "Presence")], coords = c("x", "y"), crs = "EPSG:3112")

# Generate spatial blocks
spblock <- cv_spatial(x = train_PB_covs_thinned_sf, 
                      column = "Presence",
                      r = NULL,
                      size = 50000, # Size of the blocks in metres
                      k = 5,
                      hexagon = TRUE,
                      selection = "random",
                      iteration = 100, # to find evenly-dispersed folds
                      biomod2 = FALSE)


  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |==================================================                    |  72%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |=======================================================               |  78%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |=========================================================             |  81%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |============================================================          |  85%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |==============================================================        |  88%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |================================================================      |  92%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |==================================================================    |  95%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |===================================================================== |  98%
  |                                                                            
  |===================================================================== |  99%
  |                                                                            
  |======================================================================| 100%
  train_0 train_1 test_0 test_1
1    1163    5134    301   4866
2    1169    9148    295    852
3    1175    9479    289    521
4    1148    8014    316   1986
5    1201    8225    263   1775

cv_plot(cv = spblock,
        x = train_PB_covs_thinned_sf,
        points_alpha = 0.5,
        nrow = 2)

# Extract the folds to save 
spfolds <- spblock$folds_list

# We now have a list of 5 folds, where the first object is the training data, and the second is the testing data
str(spfolds)

List of 5
 $ :List of 2
  ..$ : int [1:6297] 10859 11019 10911 10912 10657 10856 10803 10860 10753 10806 ...
  ..$ : int [1:5167] 10329 10576 10444 10364 10534 10404 10571 10365 10621 10401 ...
 $ :List of 2
  ..$ : int [1:10317] 10859 11019 10911 10912 10657 10856 10803 10860 10753 10806 ...
  ..$ : int [1:1147] 10479 10437 10660 10613 10662 8631 10661 10612 10480 10614 ...
 $ :List of 2
  ..$ : int [1:10654] 10479 10437 10660 10613 10662 8631 10661 10612 10480 10614 ...
  ..$ : int [1:810] 10859 11019 10911 10912 10657 10856 10803 10860 10753 10806 ...
 $ :List of 2
  ..$ : int [1:9162] 10859 11019 10911 10912 10657 10856 10803 10860 10753 10806 ...
  ..$ : int [1:2302] 11271 11083 11228 11181 11085 11231 11185 11270 11138 11232 ...
 $ :List of 2
  ..$ : int [1:9426] 10859 11019 10911 10912 10657 10856 10803 10860 10753 10806 ...
  ..$ : int [1:2038] 10495 10498 3540 2917 10496 10500 10337 1630 9411 10633 ...

Run the model for every fold and evaluate

Model evaluation - metrics

Typically, it helps to evaluate your model with several metrics that describe different features of model performance and prediction. Here, we define a function to feed in a model prediction and calculate several evaluation metrics.

The metrics are:

-Area under the receiver operating characteristic curve (AUC ROC)

Higher values of this (closer to 1) suggest a model is good at distinguishing presence points from the background.

-Continuous boyce index

Higher values of this (closer to 1) suggest a model is good at predicting higher suitability at spots where there were presences.

# Start a dataframe to save results
eval_df <- data.frame(fold = numeric(),
                      ROC = numeric(),
                      boyce = numeric())

for(f in seq_along(spfolds)) {
  
  # Subset the training and testing data (spatial cross validation) (for the fth fold)
  
  train_PB_covs_scv <- train_PB_covs_thinned[spfolds[[f]][[1]], ]
  test_PB_covs_scv <- train_PB_covs_thinned[spfolds[[f]][[2]], ]
  
  glm_model_1 <- glm(Presence ~ 
                     BIO5_Max_Temp_Warmest_Month + 
                     BIO6_Min_Temp_Coldest_Month + 
                     BIO12_Annual_Precipitation + 
                     BIO15_Precip_Seasonality,
                   data=train_PB_covs_scv,
                   family = binomial(link = "logit"))
  
    # Predict to the testing data of fold f
  test_PB_covs_scv$pred <- predict(glm_model_1, newdata = test_PB_covs_scv, type = "response")

  # Evaluate prediction on test set
  ROC = precrec::auc(precrec::evalmod(scores = test_PB_covs_scv$pred, labels = test_PB_covs_scv$Presence))[1,4]
 
  boyce = ecospat::ecospat.boyce(fit = test_PB_covs_scv$pred, 
                                 obs = test_PB_covs_scv$pred[which(test_PB_covs_scv$Presence==1)], 
                                 nclass = 0, # Calculate continuous index
                                 method = "pearson",
                                 PEplot = F)[["cor"]]
  
  # Add results to dataframe
  eval_df <- eval_df %>% add_row(fold = f, ROC = ROC, boyce = boyce)

  
}

Summarise the evaluation metrics

# Mean AUC & boyce
eval_df %>% 
  summarise(mean_AUC = mean(ROC),
            mean_boyce = mean(boyce),
            sd_AUC = sd(ROC),
            sd_boyce = sd(boyce))

   mean_AUC mean_boyce     sd_AUC  sd_boyce
1 0.8581919     0.6092 0.07396525 0.4851667

Model evaluation for Random Forest with spatial block cross-validation

We’re going to use the same spatial folds as before.

Run the RF model for every fold and evaluate

# Start a dataframe to save results
eval_df.RF <- data.frame(fold = numeric(),
                      ROC = numeric(),
                      boyce = numeric())

for(f in seq_along(spfolds)) {
  
  # Subset the training and testing data (spatial cross validation) (for the fth fold)
  
  train_PB_covs_scv <- train_PB_covs_thinned[spfolds[[f]][[1]], ]
  test_PB_covs_scv <- train_PB_covs_thinned[spfolds[[f]][[2]], ]
  
  presNum <- as.numeric(table(train_PB_covs_scv$Presence)["1"]) # number of presences
  bgNum <- as.numeric(table(train_PB_covs_scv$Presence)["0"]) # number of backgrounds
  weight <- ifelse(train_PB_covs_scv$Presence == 1, 1, presNum / bgNum) # down-weighting
  
  sample_size <- c("0" = bgNum, "1" = bgNum)
  sample_size <- c(bgNum)

  
  rf_1 <- randomForest::randomForest(formula = model_formula,
                                   data = train_PB_covs_scv,
                                   weights = weight,
                                   ntree = 1000,
                                   sampsize = sample_size,
                                   replace = T, 
                                   importance=TRUE)
  
  
    # Predict to the testing data of fold f
  test_PB_covs_scv$pred <- predict(rf_1, newdata = test_PB_covs_scv, type = "response")

  # Evaluate prediction on test set
  ROC = precrec::auc(precrec::evalmod(scores = test_PB_covs_scv$pred, labels = test_PB_covs_scv$Presence))[1,4]
 
  boyce = ecospat::ecospat.boyce(fit = test_PB_covs_scv$pred, 
                                 obs = test_PB_covs_scv$pred[which(test_PB_covs_scv$Presence==1)], 
                                 nclass = 0, # Calculate continuous index
                                 method = "pearson",
                                 PEplot = F)[["cor"]]
  
  # Add results to dataframe
  eval_df.RF <- eval_df.RF %>% add_row(fold = f, ROC = ROC, boyce = boyce)

  
}

Summarise the evaluation metrics

# Mean AUC & boyce
eval_df.RF %>% 
  summarise(mean_AUC = mean(ROC),
            mean_boyce = mean(boyce),
            sd_AUC = sd(ROC),
            sd_boyce = sd(boyce))

   mean_AUC mean_boyce    sd_AUC  sd_boyce
1 0.7988609     0.6852 0.1004777 0.2349228

Make predictions to future climates

Load future environmental data

covs_future <- rast("Data/Environmental_variables/future_bioclim.2090.SSP370.tif")
names(covs_future) <- layer_names
covs_future

class       : SpatRaster 
dimensions  : 47, 55, 19  (nrow, ncol, nlyr)
resolution  : 5123.954, 5123.954  (x, y)
extent      : 1621552, 1903370, -3349594, -3108768  (xmin, xmax, ymin, ymax)
coord. ref. : GDA94 / Geoscience Australia Lambert (EPSG:3112) 
source      : future_bioclim.2090.SSP370.tif 
names       : BIO1_~_Temp, BIO2_~Range, BIO3_~ality, BIO4_~ality, BIO5_~Month, BIO6_~Month, ... 
min values  :   0.8177757,   0.3121004,    1.564732,    12.43535,    1.130271,   0.4521889, ... 
max values  :  24.5521488,  14.5486765,   50.989292,   528.63293,   37.389927,  15.7586107, ...

covs_future <- terra::mask(covs_future, covs_current) # Crop to SEQ extent

covs_future_expert <- subset(covs_future, names(covs_future) %in% c("BIO5_Max_Temp_Warmest_Month", 
                                                                    "BIO6_Min_Temp_Coldest_Month", 
                                                                    "BIO12_Annual_Precipitation", 
                                                                    "BIO15_Precip_Seasonality"))

Plot the future rasters

for(i in 1:nlyr(covs_future)) {
  terra::plot(covs_future[[i]], main = names(covs_future)[[i]])
}

# Plot to compare the difference between the current and future rasters

plot(covs_future_expert - covs_current_expert)

Test the environmental distance between current data and future conditions

mess <- predicts::mess(covs_future_expert, 
                       train_PB_covs_thinned[, c("BIO5_Max_Temp_Warmest_Month", 
                                            "BIO6_Min_Temp_Coldest_Month", 
                                            "BIO12_Annual_Precipitation", 
                                            "BIO15_Precip_Seasonality")])

plot(mess, axes = F)

r_mess_mask <- mess < 0
plot(r_mess_mask, axes=F)

Test which areas you might mask out because they are ‘novel’ in environmental space and therefore require model extrapolation.

analog_fut <- predicted_raster_model_1

values(analog_fut)[values(mess)<0] <- NA

plot(analog_fut, 
     range = c(0, 1),  # Set min and max values for the color scale
     main = "Koala relative occurrence in regions with analogue conditions")

novel_fut <- predicted_raster_model_1

values(novel_fut)[values(mess)>0] <- NA

plot(novel_fut, 
     range = c(0, 1),  # Set min and max values for the color scale
     main = "Koala relative occurrence in regions with novel conditions")

GLM future predictions

Model 1

# Predict the presence probability across the entire raster extent
predicted_raster_model_1 <- predict(covs_future_expert, glm_model_1, type = "response")

# Plot the species distribution raster
plot(
  predicted_raster_model_1,
  range = c(0, 1),  # Set min and max values for the color scale
  main = "Relative Probability of Occurrence of Koalas in SEQ - GLM1"
)

Model 2

# Predict the presence probability across the entire raster extent
predicted_raster_model_2 <- predict(covs_future_expert, glm_model_2, type = "response")

# Plot the species distribution raster
plot(
  predicted_raster_model_2,
  range = c(0, 1),  # Set min and max values for the color scale
  main = "Relative Probability of Occurrence of Koalas in SEQ - GLM2"
)

Random Forest future predictions

Model 1

# Predict the presence probability across the entire raster extent
predicted_raster_RF_1 <- predicts::predict(covs_future_expert, rf_1, type = "response", na.rm=TRUE)

# Plot the species distribution raster
plot(
  predicted_raster_RF_1,
  range = c(0, 1),  # Set min and max values for the color scale
  main = "Relative Probability of Occurrence of Koalas in SEQ - RF"
)

Presenting predictions with uncertainty

There are many sources of model uncertainty that should be explored and ideally, presented alongside model predictions.

One that we’ll focus on here is climate scenario uncertainty. We do so by fitting a second model to future climate data from a lower emission shared socioeconomic path scenario (SSP 1.26).

Load future environmental data (SSP 1.26)

covs_future_SSP126 <- rast("Data/Environmental_variables/future_bioclim.2090.SSP126.tif")
names(covs_future_SSP126) <- layer_names
covs_future_SSP126

class       : SpatRaster 
dimensions  : 47, 55, 19  (nrow, ncol, nlyr)
resolution  : 5123.954, 5123.954  (x, y)
extent      : 1621552, 1903370, -3349594, -3108768  (xmin, xmax, ymin, ymax)
coord. ref. : GDA94 / Geoscience Australia Lambert (EPSG:3112) 
source      : future_bioclim.2090.SSP126.tif 
names       : BIO1_~_Temp, BIO2_~Range, BIO3_~ality, BIO4_~ality, BIO5_~Month, BIO6_~Month, ... 
min values  :   0.7471204,   0.3128485,    1.585051,    12.19436,    1.046093,   0.3754203, ... 
max values  :  22.4384842,  14.8543510,   51.254021,   544.83429,   35.546104,  13.5217409, ...

covs_future_SSP126_expert <- subset(covs_future_SSP126, names(covs_future_SSP126) %in% c("BIO5_Max_Temp_Warmest_Month", 
                                                                                          "BIO6_Min_Temp_Coldest_Month", 
                                                                                          "BIO12_Annual_Precipitation", 
                                                                                          "BIO15_Precip_Seasonality"))

Plot to compare the variables across the two scenarios

plot(covs_future_expert)

plot(covs_future_SSP126_expert)

GLM future predictions (SSP 1.26)

Model 1

# Predict the presence probability across the entire raster extent
predicted_raster_model_1_SSP126 <- predict(covs_future_SSP126_expert, glm_model_1, type = "response")

# Plot the species distribution raster
plot(
  predicted_raster_model_1,
  range = c(0,1),
  main = "SSP 3.70"
)

plot(
  predicted_raster_model_1_SSP126,
  range = c(0,1),
  main = "SSP 1.26"
)

Model uncertainty

Another element of uncertainty that can be represented is model uncertainty, or the standard error around the coefficient estimates.

# Extract standard errors of coefficients

coef_se <- summary(glm_model_1)$coefficients[, "Std. Error"]

print(coef_se)

                (Intercept) BIO5_Max_Temp_Warmest_Month 
               1.9611565762                0.0618157348 
BIO6_Min_Temp_Coldest_Month  BIO12_Annual_Precipitation 
               0.0334897580                0.0003411282 
   BIO15_Precip_Seasonality 
               0.0127273370

covs_df <- as.data.frame(covs_future_expert, na.rm = FALSE)

pred_link <- predict(glm_model_1, newdata = covs_df, type = "link", se.fit = TRUE)

# Linear predictor (eta)
eta <- pred_link$fit
se_eta <- pred_link$se.fit

# Confidence intervals (95%)
z <- 1.96
eta_lower <- eta - z * se_eta
eta_upper <- eta + z * se_eta

# Transform back to response scale
linkinv <- glm_model_1$family$linkinv
predicted <- linkinv(eta)
lower_ci <- linkinv(eta_lower)
upper_ci <- linkinv(eta_upper)


# Add to covs_df
covs_df$predicted <- predicted
covs_df$lower_ci <- lower_ci
covs_df$upper_ci <- upper_ci


predicted_r <- setValues(rast(covs_future_expert, nlyr = 1), predicted)
lower_ci_r <- setValues(rast(covs_future_expert, nlyr = 1), lower_ci)
upper_ci_r <- setValues(rast(covs_future_expert, nlyr = 1), upper_ci)

# Step 2: Name the layers
names(predicted_r) <- "predicted"
names(lower_ci_r) <- "lower_CI"
names(upper_ci_r) <- "upper_CI"

prediction_w_uncertainty <- c(predicted_r, lower_ci_r, upper_ci_r)

plot(prediction_w_uncertainty, range = c(0, 1))