# install.packages("galah")
Workshop given in preparation for the International Congress for Conservation Biology held mid-June in Brisbane 2025.
Workshop slides
Species Data
Install and import packages
library(dplyr)
library(purrr)
library(ggplot2)
library(galah)
library(terra)
library(sf)
library(predicts)
library(tidyterra)
To cite the galah
package use the following code.
citation(package = "galah")
To cite galah in publications use:
Westgate M, Kellie D, Stevenson M, Newman P (2025). _galah:
Biodiversity Data from the GBIF Node Network_. R package version
2.1.1, <https://CRAN.R-project.org/package=galah>.
A BibTeX entry for LaTeX users is
@Manual{,
title = {galah: Biodiversity Data from the GBIF Node Network},
author = {Martin Westgate and Dax Kellie and Matilda Stevenson and Peggy Newman},
year = {2025},
note = {R package version 2.1.1},
url = {https://CRAN.R-project.org/package=galah},
}
Load South East Queensland (SEQ) boundary
We made this boundary in the ‘ICCB_Environmental_data.qmd’ script. Also load local government area polygons (LGAs) just for plotting.
<- vect("Data/Environmental_variables/SEQ_extent.shp")
SEQ_extent.vect
# Define an sf object as well
<- st_as_sf(SEQ_extent.vect,
SEQ_extent coords = c("x", "y"),
crs = 3112)
# Load the study area shapefile
<- st_read("Data/Environmental_variables/Local_Government_Areas.shp") LGA
Reading layer `Local_Government_Areas' from data source
`C:\Users\kimc7\OneDrive - Queensland University of Technology\Outreach\GeospatialShare\geocommunity_blog\posts\2025-04-16-species-distribution-modelling\Data\Environmental_variables\Local_Government_Areas.shp'
using driver `ESRI Shapefile'
Simple feature collection with 78 features and 8 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: 137.9946 ymin: -29.17927 xmax: 153.5519 ymax: -9.087991
Geodetic CRS: GDA94
# Check the coordinate reference system (CRS)
st_crs(LGA)
Coordinate Reference System:
User input: GDA94
wkt:
GEOGCRS["GDA94",
DATUM["Geocentric Datum of Australia 1994",
ELLIPSOID["GRS 1980",6378137,298.257222101,
LENGTHUNIT["metre",1]]],
PRIMEM["Greenwich",0,
ANGLEUNIT["degree",0.0174532925199433]],
CS[ellipsoidal,2],
AXIS["geodetic latitude (Lat)",north,
ORDER[1],
ANGLEUNIT["degree",0.0174532925199433]],
AXIS["geodetic longitude (Lon)",east,
ORDER[2],
ANGLEUNIT["degree",0.0174532925199433]],
USAGE[
SCOPE["Horizontal component of 3D system."],
AREA["Australia including Lord Howe Island, Macquarie Island, Ashmore and Cartier Islands, Christmas Island, Cocos (Keeling) Islands, Norfolk Island. All onshore and offshore."],
BBOX[-60.55,93.41,-8.47,173.34]],
ID["EPSG",4283]]
# Convert to WGS84
<- LGA %>% st_transform(3112)
LGA
# Select local govt. areas for South East Queensland
<- LGA %>%
LGA_SEQ filter(lga %in% c("Brisbane City",
"Moreton Bay City",
"Logan City",
"Ipswich City",
"Redland City",
"Scenic Rim Regional",
"Somerset Regional",
"Lockyer Valley Regional",
"Gold Coast City",
"Sunshine Coast Regional",
"Toowoomba Regional",
"Noosa Shire"))
Atlas of Living Australia using “galah” package
To access records from the ALA, you will need to be registered.
There are two registration options:
If you are affiliated with an Australian institution, you should be able to register for ALA through your institution. Find your institution in the list of institutions when you select ‘Login’ on the ALA website. Follow the prompts to enter your institution email and password. If your institution is not listed, you will need to register for an ALA account.
If you are not affiliated with an Australian institution, you will need to register for an ALA account. You can do this by selecting ‘Register’ on the ALA website. Follow the prompts to enter your email and password.
galah_config(atlas = "ALA",
# username = "scott.forrest@hdr.qut.edu.au",
email = "scott.forrest@hdr.qut.edu.au"
# password = "Password"
)
Download koala data from ALA
Usually you would check that the CRS for the occurrences is the same as the shapefile. However, we know that the `galah’ package operates in WGS84.
<- galah_call() %>%
koala_occurrences galah_identify("Phascolarctos cinereus") %>%
galah_filter(
== "Queensland",
stateProvince == "PRESENT"
occurrenceStatus %>%
) atlas_occurrences()
-----
Clean koala data
# Create a map using ggplot2
ggplot() +
geom_sf(data = LGA, color = "black") +
geom_point(data = koala_occurrences,
aes(x = decimalLongitude,
y = decimalLatitude),
color = "blue", size = 0.5) + # Add points for occurrences
ggtitle("Koala occurrences across Queensland") + # Add title
theme_bw()
Filter by SEQ region
# Check for missing values in decimalLongitude and decimalLatitude
paste0("Number of NAs in 'longitude' ", sum(is.na(koala_occurrences$decimalLongitude)))
[1] "Number of NAs in 'longitude' 215"
paste0("Number of NAs in 'latitude' ", sum(is.na(koala_occurrences$decimalLatitude)))
[1] "Number of NAs in 'latitude' 215"
<- koala_occurrences %>%
koala_occ_sf drop_na(decimalLongitude, decimalLatitude) %>% # remove NA values
st_as_sf(coords = c("decimalLongitude", "decimalLatitude"),
crs = 4326) %>%
st_transform(3112) %>% # Transform to the same CRS as SEQ_extent
st_intersection(SEQ_extent) %>% # Mask to extent
distinct() # drop any duplicated records
Plot the filtered data
# Create a map using ggplot2
ggplot() +
geom_sf(data = SEQ_extent.vect, fill = "purple3", alpha = 0.5, color = "black", size = 0.2) +
geom_sf(data = koala_occ_sf, # Add koala presence locations
aes(geometry = geometry),
color = "blue", size = 0.5) + # Add points for occurrences
ggtitle("Koala occurrences in South East Queensland") + # Add title
theme_bw()
Sample background points
We need to load the environmental covariate grid to use as our template grid for creating background points. We generated the environmental covariates in the ‘ICCB_Environmental_data.qmd’ script.
<- rast("Data/Environmental_variables/current_bioclim.tif")
covs
# Make a blank raster of all 1s of the same dimensions as our covariate grid
<- covs[[1]]
domain values(domain) <- 1
# Mask extent by SEQ boundary
<- terra::mask(domain, SEQ_extent.vect, updatevalue = NA)
domain
names(domain) <- "SEQ_extent"
plot(domain)
# Set the location and number of background points
# Random sampling at first
# The mask means that any NA (not SEQ) locations do not include background points
<- predicts::backgroundSample(mask = domain,
background n = 2500)
# Convert to terra SpatVector object
<- terra::vect(background, crs = "EPSG:3112")
background
# Convert background points (SpatVector) to data frame
<- as.data.frame(geom(background))
background_df
<- vect(koala_occ_sf)
koala_occ.vect
# Plot the presences (blue) and the background points (grey)
ggplot() +
geom_sf(data = SEQ_extent, fill = "purple3", alpha = 0.5, color = "black", size = 0.2) +
geom_spatvector(data = background, # Add koala presence locations
color = "gray", cex = 0.5) + # Add points for occurrences
geom_spatvector(data = koala_occ.vect,
aes(geometry = geometry),
color = "blue", cex = 0.5) + # Add background points
ggtitle("Koala occurrences (blue) and background points (grey) in South East Queensland") + # Add title
theme_bw()
Save koala presences and background points
This ensures quick upload in the future and that these records can be used among models.
writeVector(koala_occ.vect, "Data/Biological_records/SEQ_koala_occurrences.shp", overwrite = T)
writeVector(background, "Data/Biological_records/background_points_2.5k_random.shp", overwrite = T)
Environmental Data
Import packages
library(terra)
library(dplyr)
library(sf)
library(ggplot2)
Load South East Queensland (SEQ) boundary
We start by defining our study area, which is the South East Queensland (SEQ) region. We will use the Local Government Areas (LGA) shapefile to define the extent of SEQ.
https://qldspatial.information.qld.gov.au/catalogue/custom/detail.page?fid={3F3DBD69-647B-4833-B0A5-CC43D5E70699}
# Load the study area shapefile
<- st_read("Data/Environmental_variables/Local_Government_Areas.shp") LGA
Reading layer `Local_Government_Areas' from data source
`C:\Users\kimc7\OneDrive - Queensland University of Technology\Outreach\GeospatialShare\geocommunity_blog\posts\2025-04-16-species-distribution-modelling\Data\Environmental_variables\Local_Government_Areas.shp'
using driver `ESRI Shapefile'
Simple feature collection with 78 features and 8 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: 137.9946 ymin: -29.17927 xmax: 153.5519 ymax: -9.087991
Geodetic CRS: GDA94
# Check the coordinate reference system (CRS)
st_crs(LGA)
Coordinate Reference System:
User input: GDA94
wkt:
GEOGCRS["GDA94",
DATUM["Geocentric Datum of Australia 1994",
ELLIPSOID["GRS 1980",6378137,298.257222101,
LENGTHUNIT["metre",1]]],
PRIMEM["Greenwich",0,
ANGLEUNIT["degree",0.0174532925199433]],
CS[ellipsoidal,2],
AXIS["geodetic latitude (Lat)",north,
ORDER[1],
ANGLEUNIT["degree",0.0174532925199433]],
AXIS["geodetic longitude (Lon)",east,
ORDER[2],
ANGLEUNIT["degree",0.0174532925199433]],
USAGE[
SCOPE["Horizontal component of 3D system."],
AREA["Australia including Lord Howe Island, Macquarie Island, Ashmore and Cartier Islands, Christmas Island, Cocos (Keeling) Islands, Norfolk Island. All onshore and offshore."],
BBOX[-60.55,93.41,-8.47,173.34]],
ID["EPSG",4283]]
# Convert to WGS84
<- LGA %>% st_transform(3112)
LGA
# Select local govt. areas for South East Queensland
<- LGA %>%
LGA_SEQ filter(lga %in% c("Brisbane City",
"Moreton Bay City",
"Logan City",
"Ipswich City",
"Redland City",
"Scenic Rim Regional",
"Somerset Regional",
"Lockyer Valley Regional",
"Gold Coast City",
"Sunshine Coast Regional",
"Toowoomba Regional",
"Noosa Shire"))
ggplot() +
geom_sf(data = LGA, color = "black") +
geom_sf(data = LGA_SEQ, fill = "purple3", alpha = 0.5, color = "black", size = 0.2) +
theme_minimal() +
theme(legend.position = "none") +
labs(title = "Local Government Areas Queensland (SEQ in purple)")
ggplot() +
geom_sf(data = LGA_SEQ, fill = "purple3", alpha = 0.5, color = "black", size = 0.2) +
theme_minimal() +
theme(legend.position = "none") +
labs(title = "Local Government Areas South East Queensland (SEQ)")
Merge into a single polygon
# Merge the SEQ LGAs into one polygon
<- st_union(LGA_SEQ)
SEQ_extent
ggplot() +
geom_sf(data = SEQ_extent, fill = "purple3", alpha = 0.5, color = "black", size = 0.2) +
theme_minimal() +
theme(legend.position = "none") +
ggtitle("South-East Queensland Spatial Extent") +
theme_bw()
Save SEQ extent for other scripts
# Convert our SEQ extent to a SpatExtent object by converting to a SpatVector
<- terra::vect(SEQ_extent)
SEQ_extent.vect
writeVector(SEQ_extent.vect, "Data/Environmental_variables/SEQ_extent.shp", overwrite = T)
Load current environmental data
Layers were made available to us by the EcoCommons team and were created by Toombs and Ma (2025):
Toombs, N., and Ma S., 2025, A High-Resolution Dataset of 19 Bioclimatic Indices over Australia, Climate Projections and Services – Queensland Treasury, Brisbane, Queensland. [https://longpaddock.qld.gov.au/qld-future-climate/data-info/tern/]
<- list.files("Data/Environmental_variables/Current_climate_QLD",
files pattern = ".tif$",
full.names = TRUE)
# Load all bioclim rasters
<- lapply(files, terra::rast)
current_bioclim
# Make into one raster stack
<- rast(current_bioclim)
current_bioclim
plot(current_bioclim)
# Examine the resolution
current_bioclim
class : SpatRaster
dimensions : 681, 841, 19 (nrow, ncol, nlyr)
resolution : 0.05, 0.05 (x, y)
extent : 111.975, 154.025, -44.025, -9.975 (xmin, xmax, ymin, ymax)
coord. ref. : lon/lat WGS 84 (EPSG:4326)
sources : bioclim_01.tif
bioclim_02.tif
bioclim_03.tif
... and 16 more sources
names : bioclim_01, bioclim_02, bioclim_03, bioclim_04, bioclim_05, bioclim_06, ...
# Check the CRS
crs(current_bioclim)
[1] "GEOGCRS[\"WGS 84\",\n ENSEMBLE[\"World Geodetic System 1984 ensemble\",\n MEMBER[\"World Geodetic System 1984 (Transit)\"],\n MEMBER[\"World Geodetic System 1984 (G730)\"],\n MEMBER[\"World Geodetic System 1984 (G873)\"],\n MEMBER[\"World Geodetic System 1984 (G1150)\"],\n MEMBER[\"World Geodetic System 1984 (G1674)\"],\n MEMBER[\"World Geodetic System 1984 (G1762)\"],\n MEMBER[\"World Geodetic System 1984 (G2139)\"],\n ELLIPSOID[\"WGS 84\",6378137,298.257223563,\n LENGTHUNIT[\"metre\",1]],\n ENSEMBLEACCURACY[2.0]],\n PRIMEM[\"Greenwich\",0,\n ANGLEUNIT[\"degree\",0.0174532925199433]],\n CS[ellipsoidal,2],\n AXIS[\"geodetic latitude (Lat)\",north,\n ORDER[1],\n ANGLEUNIT[\"degree\",0.0174532925199433]],\n AXIS[\"geodetic longitude (Lon)\",east,\n ORDER[2],\n ANGLEUNIT[\"degree\",0.0174532925199433]],\n USAGE[\n SCOPE[\"Horizontal component of 3D system.\"],\n AREA[\"World.\"],\n BBOX[-90,-180,90,180]],\n ID[\"EPSG\",4326]]"
# Update CRS
<- terra::project(current_bioclim, "EPSG:3112")
current_bioclim
# Our resolution is now ~5km by 5km
current_bioclim
class : SpatRaster
dimensions : 770, 924, 19 (nrow, ncol, nlyr)
resolution : 5123.954, 5123.954 (x, y)
extent : -2477612, 2256922, -5117358, -1171913 (xmin, xmax, ymin, ymax)
coord. ref. : GDA94 / Geoscience Australia Lambert (EPSG:3112)
source(s) : memory
names : bioclim_01, bioclim_02, bioclim_03, bioclim_04, bioclim_05, bioclim_06, ...
min values : 5.030256, 5.511227, 34.70108, 105.4365, 16.72381, -4.999238, ...
max values : 29.429586, 16.877110, 61.76879, 680.1939, 42.44056, 23.001247, ...
Mask to SEQ extent
<- terra::mask(current_bioclim, SEQ_extent.vect)
current_bioclim
# You can see that this has masked the area but the extent is still the same
plot(current_bioclim[[1]])
<- terra::crop(current_bioclim, SEQ_extent.vect)
current_bioclim
plot(current_bioclim)
# Save the current environmental covariates
writeRaster(current_bioclim,
filename = "Data/Environmental_variables/current_bioclim.tif",
overwrite = T)
Load future environmental data
Here we load outputs from a moderate-high emissions shared socio-economic path scenario (SSP 3.70) for the year 2090 (2080 - 2099).
<- list.files("Data/Environmental_variables/Future_climate_SSP370_2090",
files pattern = ".tif$",
full.names = TRUE)
# Load all bioclim rasters
<- lapply(files, terra::rast)
future_bioclim
# Make into one raster stack
<- rast(future_bioclim)
future_bioclim
plot(future_bioclim)
# Examine the resolution
future_bioclim
class : SpatRaster
dimensions : 681, 841, 19 (nrow, ncol, nlyr)
resolution : 0.05, 0.05 (x, y)
extent : 111.975, 154.025, -44.025, -9.975 (xmin, xmax, ymin, ymax)
coord. ref. : lon/lat WGS 84 (EPSG:4326)
sources : bioclim_01.tif
bioclim_02.tif
bioclim_03.tif
... and 16 more sources
names : bioclim_01, bioclim_02, bioclim_03, bioclim_04, bioclim_05, bioclim_06, ...
# Check the CRS
crs(future_bioclim)
[1] "GEOGCRS[\"WGS 84\",\n ENSEMBLE[\"World Geodetic System 1984 ensemble\",\n MEMBER[\"World Geodetic System 1984 (Transit)\"],\n MEMBER[\"World Geodetic System 1984 (G730)\"],\n MEMBER[\"World Geodetic System 1984 (G873)\"],\n MEMBER[\"World Geodetic System 1984 (G1150)\"],\n MEMBER[\"World Geodetic System 1984 (G1674)\"],\n MEMBER[\"World Geodetic System 1984 (G1762)\"],\n MEMBER[\"World Geodetic System 1984 (G2139)\"],\n ELLIPSOID[\"WGS 84\",6378137,298.257223563,\n LENGTHUNIT[\"metre\",1]],\n ENSEMBLEACCURACY[2.0]],\n PRIMEM[\"Greenwich\",0,\n ANGLEUNIT[\"degree\",0.0174532925199433]],\n CS[ellipsoidal,2],\n AXIS[\"geodetic latitude (Lat)\",north,\n ORDER[1],\n ANGLEUNIT[\"degree\",0.0174532925199433]],\n AXIS[\"geodetic longitude (Lon)\",east,\n ORDER[2],\n ANGLEUNIT[\"degree\",0.0174532925199433]],\n USAGE[\n SCOPE[\"Horizontal component of 3D system.\"],\n AREA[\"World.\"],\n BBOX[-90,-180,90,180]],\n ID[\"EPSG\",4326]]"
# Update CRS
<- terra::project(future_bioclim, "EPSG:3112")
future_bioclim
# Our resolution is now ~5km by 5km
future_bioclim
class : SpatRaster
dimensions : 770, 924, 19 (nrow, ncol, nlyr)
resolution : 5123.954, 5123.954 (x, y)
extent : -2477612, 2256922, -5117358, -1171913 (xmin, xmax, ymin, ymax)
coord. ref. : GDA94 / Geoscience Australia Lambert (EPSG:3112)
source(s) : memory
names : bioclim_01, bioclim_02, bioclim_03, bioclim_04, bioclim_05, bioclim_06, ...
min values : 0.00000, 0.0000, 0.00000, 0.0000, 0.00000, -2.320518, ...
max values : 32.99174, 16.1037, 66.06882, 706.0898, 46.24387, 25.877970, ...
Mask to SEQ extent
<- terra::subst(from = 0, to = NA, future_bioclim) # Set all values of 0 to NA
future_bioclim
<- terra::mask(future_bioclim, SEQ_extent.vect)
future_bioclim
# You can see that this has masked the area but the extent is still the same
plot(future_bioclim[[1]])
<- terra::crop(future_bioclim, SEQ_extent.vect)
future_bioclim
plot(future_bioclim[[1]])
# Save the future environmental covariates
writeRaster(future_bioclim,
filename = "Data/Environmental_variables/future_bioclim.2090.SSP370.tif",
overwrite = T)
Load future environmental data 2
Here we load outputs from a low emissions shared socio-economic path scenario (SSP 1.26) for the year 2090 (2080 - 2099).
<- list.files("Data/Environmental_variables/Future_climate_SSP126_2090",
files pattern = ".tif$",
full.names = TRUE)
# Load all bioclim rasters
<- lapply(files, terra::rast)
future_bioclim
# Make into one raster stack
<- rast(future_bioclim)
future_bioclim
plot(future_bioclim)
# Examine the resolution
future_bioclim
class : SpatRaster
dimensions : 681, 841, 19 (nrow, ncol, nlyr)
resolution : 0.05, 0.05 (x, y)
extent : 111.975, 154.025, -44.025, -9.975 (xmin, xmax, ymin, ymax)
coord. ref. : lon/lat WGS 84 (EPSG:4326)
sources : bioclim_01.tif
bioclim_02.tif
bioclim_03.tif
... and 16 more sources
names : bioclim_01, bioclim_02, bioclim_03, bioclim_04, bioclim_05, bioclim_06, ...
min values : 0.00000, 0.00000, 0.0000, ? , ? , ? , ...
max values : 30.47033, 16.75114, 63.2751, ? , ? , ? , ...
# Check the CRS
crs(future_bioclim)
[1] "GEOGCRS[\"WGS 84\",\n ENSEMBLE[\"World Geodetic System 1984 ensemble\",\n MEMBER[\"World Geodetic System 1984 (Transit)\"],\n MEMBER[\"World Geodetic System 1984 (G730)\"],\n MEMBER[\"World Geodetic System 1984 (G873)\"],\n MEMBER[\"World Geodetic System 1984 (G1150)\"],\n MEMBER[\"World Geodetic System 1984 (G1674)\"],\n MEMBER[\"World Geodetic System 1984 (G1762)\"],\n MEMBER[\"World Geodetic System 1984 (G2139)\"],\n ELLIPSOID[\"WGS 84\",6378137,298.257223563,\n LENGTHUNIT[\"metre\",1]],\n ENSEMBLEACCURACY[2.0]],\n PRIMEM[\"Greenwich\",0,\n ANGLEUNIT[\"degree\",0.0174532925199433]],\n CS[ellipsoidal,2],\n AXIS[\"geodetic latitude (Lat)\",north,\n ORDER[1],\n ANGLEUNIT[\"degree\",0.0174532925199433]],\n AXIS[\"geodetic longitude (Lon)\",east,\n ORDER[2],\n ANGLEUNIT[\"degree\",0.0174532925199433]],\n USAGE[\n SCOPE[\"Horizontal component of 3D system.\"],\n AREA[\"World.\"],\n BBOX[-90,-180,90,180]],\n ID[\"EPSG\",4326]]"
# Update CRS
<- terra::project(future_bioclim, "EPSG:3112")
future_bioclim
# Our resolution is now ~5km by 5km
future_bioclim
class : SpatRaster
dimensions : 770, 924, 19 (nrow, ncol, nlyr)
resolution : 5123.954, 5123.954 (x, y)
extent : -2477612, 2256922, -5117358, -1171913 (xmin, xmax, ymin, ymax)
coord. ref. : GDA94 / Geoscience Australia Lambert (EPSG:3112)
source(s) : memory
names : bioclim_01, bioclim_02, bioclim_03, bioclim_04, bioclim_05, bioclim_06, ...
min values : 0.00000, 0.00000, 0.00000, 0.0000, 0.00000, -4.138016, ...
max values : 30.75635, 16.89703, 63.40196, 687.5078, 43.83433, 24.002493, ...
Mask to SEQ extent
<- terra::subst(from = 0, to = NA, future_bioclim) # Set all values of 0 to NA
future_bioclim
<- terra::mask(future_bioclim, SEQ_extent.vect)
future_bioclim
# You can see that this has masked the area but the extent is still the same
plot(future_bioclim[[1]])
<- terra::crop(future_bioclim, SEQ_extent.vect)
future_bioclim
plot(future_bioclim)
# Save the future environmental covariates
writeRaster(future_bioclim,
filename = "Data/Environmental_variables/future_bioclim.2090.SSP126.tif",
overwrite = T)
Modelling and Validation
We wrote this script drawing on some of the following resources:
-Ecocommons Notebooks https://www.ecocommons.org.au/notebooks/
-Damaris Zurell’s SDM Intro https://damariszurell.github.io/SDM-Intro/
https://damariszurell.github.io/EEC-MGC/b4_SDM_eval.html
Import packages
library(dplyr)
library(purrr)
library(ggplot2)
library(terra)
library(sf)
library(predicts)
library(blockCV)
library(ecospat)
library(usdm)
library(randomForest)
library(precrec)
library(corrplot)
# Install the mecofun package, used in the materials at https://damariszurell.github.io/SDM-Intro/
library(devtools)
::install_git("https://gitup.uni-potsdam.de/macroecology/mecofun.git")
devtools# Load the mecofun package
library(mecofun)
Load koala presences and background points
They are loaded as spatvectors, but we also want them as dataframes for model input requirements.
<- vect("Data/Biological_records/SEQ_koala_occurrences.shp")
koala_occ <- vect("Data/Biological_records/background_points_2.5k_random.shp")
background
# Make a dataframe of just x, y and presence
<- koala_occ %>%
koala_occ_df as.data.frame(geom = "XY") %>%
::select(x,y) %>%
dplyrmutate(Presence = 1)
head(koala_occ_df)
x y Presence
1 1868673 -3265806 1
2 1851054 -3232106 1
3 1846917 -3231913 1
4 1874167 -3262365 1
5 1857487 -3259935 1
6 1874262 -3262395 1
<- background %>%
background_df as.data.frame(geom = "XY") %>%
::select(x,y) %>%
dplyrmutate(Presence = 0)
head(background_df)
x y Presence
1 1854692 -3111330 0
2 1859816 -3111330 0
3 1864940 -3111330 0
4 1870064 -3111330 0
5 1875188 -3111330 0
6 1880312 -3111330 0
# Combine to one
<- rbind(koala_occ_df, background_df) pr_bg
Load environmental covariates
Loading current covariate rasters. We formatted these rasters in the same way as the Koala data, so that they are all in the same projection and extent. We did this in the script: ‘ICCB_Environmental_data.qmd’
<- rast("Data/Environmental_variables/current_bioclim.tif")
covs_current
# Define the BIOCLIM names for the raster layers
<- c(
layer_names "BIO1_Annual_Mean_Temp",
"BIO2_Mean_Diurnal_Temp_Range",
"BIO3_Isothermality",
"BIO4_Temperature_Seasonality",
"BIO5_Max_Temp_Warmest_Month",
"BIO6_Min_Temp_Coldest_Month",
"BIO7_Temperature_Annual_Range",
"BIO8_Mean_Temp_Wettest_Quarter",
"BIO9_Mean_Temp_Driest_Quarter",
"BIO10_Mean_Temp_Warmest_Quarter",
"BIO11_Mean_Temp_Coldest_Quarter",
"BIO12_Annual_Precipitation",
"BIO13_Precip_Wettest_Month",
"BIO14_Precip_Driest_Month",
"BIO15_Precip_Seasonality",
"BIO16_Precip_Wettest_Quarter",
"BIO17_Precip_Driest_Quarter",
"BIO18_Precip_Warmest_Quarter",
"BIO19_Precip_Coldest_Quarter")
names(covs_current) <- layer_names
Covariate selection
Option 1. Narrow down potential covariates based on ecological knowledge
For this example, we had advice from CSIRO scientists who conducted an expert elicitation to gather a set of potential covariates that are likely to be important for koalas. We use this knowledge to filter out the key bioclim variables.
We select the following: Bio5 : Max temp of the warmest month (mainly for the northern populations) Bio6 : Min temp of the coldest month (mainly for southern populations, which essentially excludes alpine regions) Bio12 : Annual Precipitation Bio15 : Precipitation seasonality (coefficient of variation)
for(i in 1:nlyr(covs_current)) {
::plot(covs_current[[i]], main = names(covs_current)[[i]])
terra }
Show the four from expert elicitation the layers
<- subset(covs_current, names(covs_current) %in% c("BIO5_Max_Temp_Warmest_Month",
covs_current_expert "BIO6_Min_Temp_Coldest_Month",
"BIO12_Annual_Precipitation",
"BIO15_Precip_Seasonality"))
for(i in 1:nlyr(covs_current_expert)) {
::plot(covs_current_expert[[i]], main = names(covs_current_expert)[[i]])
terra }
Extract environmental covariate values from presence and background locations (training locations)
<- terra::extract(covs_current, pr_bg[,c("x", "y")], xy = T)
train_PB_covs <- cbind(train_PB_covs, pr_bg["Presence"])
train_PB_covs
# Remove rows where there's values missing from at least one covariate
print(paste0("RECORDS FROM ", nrow(train_PB_covs) - sum(complete.cases(train_PB_covs)), " ROWS IN TRAINING DATA REMOVED DUE TO MISSING COVARIATE VALUES"))
[1] "RECORDS FROM 68 ROWS IN TRAINING DATA REMOVED DUE TO MISSING COVARIATE VALUES"
<- train_PB_covs[complete.cases(train_PB_covs), ]
train_PB_covs <- dplyr::select(train_PB_covs, -ID)
train_PB_covs
head(train_PB_covs)
BIO1_Annual_Mean_Temp BIO2_Mean_Diurnal_Temp_Range BIO3_Isothermality
1 20.73219 9.515615 46.85551
2 20.85025 10.291359 47.88751
3 20.61474 10.648253 48.39870
4 21.00405 9.129249 46.28033
5 20.75752 10.070734 47.49575
6 21.00405 9.129249 46.28033
BIO4_Temperature_Seasonality BIO5_Max_Temp_Warmest_Month
1 367.6388 29.73501
2 379.8710 30.47122
3 384.8382 30.53297
4 363.1416 29.72843
5 378.2968 30.20078
6 363.1416 29.72843
BIO6_Min_Temp_Coldest_Month BIO7_Temperature_Annual_Range
1 9.386996 20.34801
2 8.940289 21.53093
3 8.487451 22.04552
4 9.958574 19.76986
5 8.959633 21.24114
6 9.958574 19.76986
BIO8_Mean_Temp_Wettest_Quarter BIO9_Mean_Temp_Driest_Quarter
1 22.98827 18.36451
2 23.55152 18.09120
3 23.31077 17.78633
4 23.01914 18.89500
5 23.32381 18.13480
6 23.01914 18.89500
BIO10_Mean_Temp_Warmest_Quarter BIO11_Mean_Temp_Coldest_Quarter
1 24.94183 15.92675
2 25.18869 15.87509
3 24.99932 15.57214
4 25.18043 16.27002
5 25.08320 15.80299
6 25.18043 16.27002
BIO12_Annual_Precipitation BIO13_Precip_Wettest_Month
1 1238.362 327.7901
2 1165.235 327.1804
3 1161.138 327.4446
4 1222.618 311.3793
5 1109.354 305.5583
6 1222.618 311.3793
BIO14_Precip_Driest_Month BIO15_Precip_Seasonality
1 10.800349 92.89355
2 8.330686 98.80420
3 8.048150 99.45329
4 11.913177 88.93709
5 8.327950 97.22968
6 11.913177 88.93709
BIO16_Precip_Wettest_Quarter BIO17_Precip_Driest_Quarter
1 606.3058 94.95189
2 598.7009 77.30684
3 598.9535 75.86132
4 583.3614 100.51526
5 558.9021 77.90620
6 583.3614 100.51526
BIO18_Precip_Warmest_Quarter BIO19_Precip_Coldest_Quarter x y
1 449.8074 189.9305 1870064 -3265048
2 444.0957 147.6409 1849568 -3234305
3 445.4260 145.7036 1844444 -3234305
4 448.1904 200.5528 1875188 -3259924
5 408.9403 153.3001 1859816 -3259924
6 448.1904 200.5528 1875188 -3259924
Presence
1 1
2 1
3 1
4 1
5 1
6 1
Thin the koala presence points (for tutorial only)
We now thin the presences to reduce the number of points to a manageable size for plotting and modelling. This is not a recommended step for real data, but is done here to make the tutorial run faster and to make the plots clearer.
<- train_PB_covs %>% filter(Presence == 1)
train_PB_covs_pres <- train_PB_covs %>% filter(Presence == 0)
train_PB_covs_bg
# Thin the presences for plotting
<- train_PB_covs_pres[sample(nrow(train_PB_covs_pres), 10000), ]
train_PB_covs_pres_thin
# Combine back into both presence and background
<- rbind(train_PB_covs_pres_thin, train_PB_covs_bg) train_PB_covs_thinned
Check correlation and multicollinearity of covariates
Correlation plot
There are several different methods for creating correlation plots.
ecospat.cor.plot(covs_current_expert)
Using the corplots package
For a simple and quick plot.
<- ENMTools::raster.cor.plot(covs_current_expert)
corplots $cor.mds.plot corplots
$cor.heatmap corplots
Here, we use the corrplot
package to create a correlation plot of the selected covariates (this is taken from an EcoCommons Australia notebook).
# Select columns by their names
<- train_PB_covs[, names(train_PB_covs) %in% c("BIO5_Max_Temp_Warmest_Month",
cor_data "BIO6_Min_Temp_Coldest_Month",
"BIO12_Annual_Precipitation",
"BIO15_Precip_Seasonality")]
# Check the structure of the numeric data
str(cor_data)
'data.frame': 88075 obs. of 4 variables:
$ BIO5_Max_Temp_Warmest_Month: num 29.7 30.5 30.5 29.7 30.2 ...
$ BIO6_Min_Temp_Coldest_Month: num 9.39 8.94 8.49 9.96 8.96 ...
$ BIO12_Annual_Precipitation : num 1238 1165 1161 1223 1109 ...
$ BIO15_Precip_Seasonality : num 92.9 98.8 99.5 88.9 97.2 ...
# Calculate the correlation matrix for the numeric columns
<- cor(cor_data, use = "complete.obs", method = "pearson")
cor_matrix
corrplot(cor_matrix,
method = "color", # Use colored squares for correlation
type = "upper", # Show upper triangle only
order = "hclust", # Reorder variables hierarchically
addCoef.col = "black", # Show correlation coefficients in black
number.cex = 0.5, # Reduce the size of correlation labels
tl.col = "black", # Text label color
tl.srt = 30, # Rotate labels slightly for readability
tl.cex = 0.5, # Reduce text size of variable labels (set smaller valu)
cl.cex = 0.8, # Reduce text size of color legend
diag = FALSE, # Hide diagonal
col = colorRampPalette(c("#11aa96", "#61c6fa", "#f6aa70"))(200),
sig.level = 0.01, insig = "blank")
Variance Inflation Factor (VIF)
If you find corrplot is hard for you to make decisions, we can use Variance Inflation Factor (VIF). VIF is another statistical measure used to detect multicollinearity in a set of explanatory (independent) variables in a regression model.
Interpretation:
- VIF = 1: No correlation
- VIF > 1 and <= 5: Moderate correlation; may not require corrective action.
- VIF > 5: Indicates high correlation. Multicollinearity may be problematic, and further investigation is recommended.
- VIF > 10: Strong multicollinearity. The variable is highly collinear with others, and steps should be taken to address this.
# usdm::vif(covs_current_expert) # just VIF for all covariates
::vifstep(covs_current_expert) # Variance Inflation Factor and test for multicollinearity usdm
No variable from the 4 input variables has collinearity problem.
The linear correlation coefficients ranges between:
min correlation ( BIO15_Precip_Seasonality ~ BIO6_Min_Temp_Coldest_Month ): -0.2756697
max correlation ( BIO12_Annual_Precipitation ~ BIO6_Min_Temp_Coldest_Month ): 0.8769718
---------- VIFs of the remained variables --------
Variables VIF
1 BIO5_Max_Temp_Warmest_Month 2.751370
2 BIO6_Min_Temp_Coldest_Month 4.528719
3 BIO12_Annual_Precipitation 6.888077
4 BIO15_Precip_Seasonality 1.263385
Exploration of the koala presence and background data
It is good practice to assess where in the environmental space the presence and background points are located. This can help to identify if there are any potential issues with the data, such as a lack of background points in certain areas of environmental space, and should show any patterns in the data that the model should pick up.
# Iterate over all of the variables to create density plots of the background and presence data
for(i in 1:ncol(train_PB_covs_thinned)) {
print(ggplot() +
geom_density(data = train_PB_covs_thinned,
aes(x = .data[[names(train_PB_covs_thinned)[i]]], fill = as.factor(Presence)),
alpha = 0.5) +
theme_bw() +
labs(title = names(train_PB_covs_thinned)[i]))
}
First Model: GLM model fitting
# Make a folder to save outputs
dir.create("Outputs/GLM_outputs", showWarnings = F)
Null model
Null model: no explanatory variables or predictors are included.
It is always helpful to create a null model as a benchmark to assess how the inclusion of explanatory variables improves the model.
# Fit a null model with only the intercept
<- glm(Presence ~ 1,
null_model data = train_PB_covs,
family = binomial(link = "logit"))
# Check the model results
summary(null_model)
Call:
glm(formula = Presence ~ 1, family = binomial(link = "logit"),
data = train_PB_covs)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 4.08025 0.02636 154.8 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 14900 on 88074 degrees of freedom
Residual deviance: 14900 on 88074 degrees of freedom
AIC: 14902
Number of Fisher Scoring iterations: 7
GLM Model 1 - expert variables
<- glm(Presence ~
glm_model_1 +
BIO5_Max_Temp_Warmest_Month +
BIO6_Min_Temp_Coldest_Month +
BIO12_Annual_Precipitation
BIO15_Precip_Seasonality,data=train_PB_covs_thinned,
family = binomial(link = "logit"))
# Check the model results
summary(glm_model_1)
Call:
glm(formula = Presence ~ BIO5_Max_Temp_Warmest_Month + BIO6_Min_Temp_Coldest_Month +
BIO12_Annual_Precipitation + BIO15_Precip_Seasonality, family = binomial(link = "logit"),
data = train_PB_covs_thinned)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -4.8956819 1.5630088 -3.132 0.00173 **
BIO5_Max_Temp_Warmest_Month -0.6341523 0.0514780 -12.319 < 2e-16 ***
BIO6_Min_Temp_Coldest_Month 0.8653363 0.0284251 30.443 < 2e-16 ***
BIO12_Annual_Precipitation -0.0030684 0.0002791 -10.992 < 2e-16 ***
BIO15_Precip_Seasonality 0.2437024 0.0106358 22.913 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 8758.5 on 11463 degrees of freedom
Residual deviance: 5758.2 on 11459 degrees of freedom
AIC: 5768.2
Number of Fisher Scoring iterations: 6
# These response curves don't look very helpful
::response(glm_model_1) dismo
GLM Model 2 - expert variables with quadratics
In this model, we include quadratic terms for the covariates. This is a common approach in species distribution modelling to account for non-linear relationships between the predictors and the response variable. This increases the complexity of the model and allows for more flexibility in fitting the data.
<- glm(Presence ~
glm_model_2 + I(BIO5_Max_Temp_Warmest_Month^2) +
BIO5_Max_Temp_Warmest_Month + I(BIO6_Min_Temp_Coldest_Month^2) +
BIO6_Min_Temp_Coldest_Month + I(BIO12_Annual_Precipitation^2) +
BIO12_Annual_Precipitation + I(BIO15_Precip_Seasonality^2),
BIO15_Precip_Seasonality data=train_PB_covs_thinned,
family = binomial(link = "logit"))
# Check the model results
summary(glm_model_2)
Call:
glm(formula = Presence ~ BIO5_Max_Temp_Warmest_Month + I(BIO5_Max_Temp_Warmest_Month^2) +
BIO6_Min_Temp_Coldest_Month + I(BIO6_Min_Temp_Coldest_Month^2) +
BIO12_Annual_Precipitation + I(BIO12_Annual_Precipitation^2) +
BIO15_Precip_Seasonality + I(BIO15_Precip_Seasonality^2),
family = binomial(link = "logit"), data = train_PB_covs_thinned)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -9.304e+01 1.887e+01 -4.930 8.22e-07 ***
BIO5_Max_Temp_Warmest_Month -6.996e-01 1.229e+00 -0.569 0.569266
I(BIO5_Max_Temp_Warmest_Month^2) 3.164e-03 1.989e-02 0.159 0.873623
BIO6_Min_Temp_Coldest_Month -1.309e+00 1.884e-01 -6.951 3.63e-12 ***
I(BIO6_Min_Temp_Coldest_Month^2) 1.557e-01 1.272e-02 12.244 < 2e-16 ***
BIO12_Annual_Precipitation 4.063e-03 1.942e-03 2.092 0.036403 *
I(BIO12_Annual_Precipitation^2) -2.504e-06 6.972e-07 -3.592 0.000329 ***
BIO15_Precip_Seasonality 2.044e+00 2.360e-01 8.660 < 2e-16 ***
I(BIO15_Precip_Seasonality^2) -9.028e-03 1.261e-03 -7.160 8.08e-13 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 8758.5 on 11463 degrees of freedom
Residual deviance: 5578.7 on 11455 degrees of freedom
AIC: 5596.7
Number of Fisher Scoring iterations: 6
# These response curves don't look very helpful
::response(glm_model_2) dismo
Model effect evaluation
Here we use a function presented in an EcoCommons Australia notebook to evaluate the model performance. The notebook can be found on their GitHub: https://github.com/EcoCommonsAustralia/notebooks/tree/main/notebooks.
# Function to plot effect size graph
<- function(glm_model) {
plot_effect_size # Check if required libraries are installed
if (!requireNamespace("ggplot2", quietly = TRUE)) {
stop("Please install the 'ggplot2' package to use this function.")
}library(ggplot2)
# Extract effect sizes (coefficients) from the model
<- summary(glm_model)$coefficients
coefs <- data.frame(
effect_sizes Variable = rownames(coefs)[-1], # Exclude the intercept
Effect_Size = coefs[-1, "Estimate"],
Std_Error = coefs[-1, "Std. Error"]
)
# Sort by effect size
<- effect_sizes[order(-abs(effect_sizes$Effect_Size)), ]
effect_sizes
# Plot the effect sizes with error bars
ggplot(effect_sizes, aes(x = reorder(Variable, Effect_Size), y = Effect_Size)) +
geom_bar(stat = "identity", fill = "#11aa96") +
geom_errorbar(aes(ymin = Effect_Size - Std_Error, ymax = Effect_Size + Std_Error), width = 0.2) +
coord_flip() +
labs(
title = "Effect Sizes of Variables",
x = "Variable",
y = "Effect Size (Coefficient Estimate)"
+
) theme_minimal()
}
Use the function to check the effect sizes
We need to be careful when interpreting the effect sizes of models with quadratic terms however, as the response curve depends on the linear and the quadratic term.
# Example usage of effect size plot
plot_effect_size(glm_model_1)
plot_effect_size(glm_model_2)
Response curves
Again, we can use a function from the EcoCommons notebook to plot the response curves from the model, although for quadratics we need to adjust the function or use something else.
<- function(glm_model, predictors, data) {
plot_species_response # Check if required libraries are installed
if (!requireNamespace("ggplot2", quietly = TRUE) || !requireNamespace("gridExtra", quietly = TRUE)) {
stop("Please install the 'ggplot2' and 'gridExtra' packages to use this function.")
}library(ggplot2)
library(gridExtra)
# Create empty list to store response plots
<- list()
response_plots
# Loop through each predictor variable
for (predictor in predictors) {
# Create new data frame to vary predictor while keeping others constant
<- seq(
pred_range min(data[[predictor]], na.rm = TRUE),
max(data[[predictor]], na.rm = TRUE),
length.out = 100
)<- data[1, , drop = FALSE] # Use first row to keep other predictors constant
const_data <- const_data[rep(1, 100), ] # Duplicate the row
response_data <- pred_range
response_data[[predictor]]
# Predict probabilities
<- predict(glm_model, newdata = response_data, type = "response")
predicted_response
# Create data frame for plotting
<- data.frame(
plot_data Predictor_Value = pred_range,
Predicted_Probability = predicted_response
)
# Add presence and absence data
<- data.frame(
presence_absence_data Predictor_Value = data[[predictor]],
Presence_Absence = data$Presence
)
# Generate the response plot
<- ggplot() +
p
geom_line(data = plot_data,
aes(x = Predictor_Value, y = Predicted_Probability),
color = "#61c6fa", linewidth = 1) +
geom_point(data = presence_absence_data[presence_absence_data$Presence_Absence == 1, ],
aes(x = Predictor_Value, y = Presence_Absence),
color = "#11aa96", alpha = 0.6) +
geom_point(data = presence_absence_data[presence_absence_data$Presence_Absence == 0, ],
aes(x = Predictor_Value, y = Presence_Absence),
color = "#f6aa70", alpha = 0.6) +
labs(x = predictor, y = NULL) +
theme_minimal() +
theme(axis.title.y = element_blank())
# Store the plot in the list
<- p
response_plots[[predictor]]
}
# Arrange all plots in one combined plot with a single shared y-axis label
grid.arrange(
grobs = response_plots,
ncol = 3,
left = "Predicted Probability / Presence-Absence"
) }
Use the response curve function
<- c("BIO5_Max_Temp_Warmest_Month",
predictors "BIO6_Min_Temp_Coldest_Month",
"BIO12_Annual_Precipitation",
"BIO15_Precip_Seasonality")
plot_species_response(glm_model_1, predictors, train_PB_covs_thinned)
Model 1 partial responses
# Plot the partial responses
partial_response(glm_model_1, predictors = train_PB_covs_thinned[,predictors], main='GLM')
# Plot inflated response curves:
inflated_response(glm_model_1, predictors = train_PB_covs_thinned[,predictors], method = "stat3", lwd = 3, main='GLM')
Model 2 partial responses
# Plot the partial responses
partial_response(glm_model_2, predictors = train_PB_covs_thinned[,predictors], main='GLM')
# Plot inflated response curves:
inflated_response(glm_model_2, predictors = train_PB_covs_thinned[,predictors], method = "stat3", lwd = 3, main='GLM')
GLM predictions to current environment
Model 1
# Predict the presence probability across the entire raster extent
<- predicts::predict(covs_current_expert, glm_model_1, type = "response")
predicted_raster_model_1
# Plot the species distribution raster
plot(
predicted_raster_model_1,range = c(0, 1), # Set min and max values for the color scale
main = "Relative Probability of Occurrence of Koalas in SEQ - GLM 1"
)
Model 2
# Predict the presence probability across the entire raster extent
<- predicts::predict(covs_current_expert, glm_model_2, type = "response")
predicted_raster_model_2
# Plot the species distribution raster
plot(
predicted_raster_model_2,range = c(0, 1), # Set min and max values for the color scale
main = "Relative Probability of Occurrence of Koalas in SEQ - GLM 2"
)
Random forest
# Make a folder to save outputs
dir.create("Outputs/RF_outputs", showWarnings = F)
Data preparation
Calculate the case weights (down/up-weighting)
Because we have a ‘class imbalance’ (uneven number of presences and background points), we are making their ‘weight’ in the model equal.
<- as.numeric(table(train_PB_covs_thinned$Presence)["1"]) # number of presences
presNum <- as.numeric(table(train_PB_covs_thinned$Presence)["0"]) # number of backgrounds
bgNum <- ifelse(train_PB_covs_thinned$Presence == 1, 1, presNum / bgNum) # down-weighting weight
Prepare for fitting the RF model(s)
# If wanting to fit a classification model
# Convert the response to factor for producing class relative likelihood
$Presence_Factor <- as.factor(train_PB_covs_thinned$Presence)
train_PB_covs_thinned
# For down-sampling, the number of background (0s) in each bootstrap sample should the same as presences
# (1s). For this, we use sampsize argument to do this.
# We need to choose the SMALLER number out of the two classes
<- c("0" = bgNum, "1" = bgNum)
sample_size <- c(bgNum) sample_size
Random Forest Model - expert covariates
Classification or regression model?
# # Specify the model formula - classification
# model_formula <- as.formula(Presence_Factor ~
# BIO5_Max_Temp_Warmest_Month +
# BIO6_Min_Temp_Coldest_Month +
# BIO12_Annual_Precipitation +
# BIO15_Precip_Seasonality)
# Specify the model formula - regression
<- as.formula(Presence ~
model_formula +
BIO5_Max_Temp_Warmest_Month +
BIO6_Min_Temp_Coldest_Month +
BIO12_Annual_Precipitation BIO15_Precip_Seasonality)
Fit the random forest model
This will throw a warning as we only have two unique values (0s and 1s), but in our case that is fine, and you can ignore the warning.
<- randomForest::randomForest(formula = model_formula,
rf_1 data = train_PB_covs_thinned,
weights = weight,
ntree = 1000,
sampsize = sample_size,
replace = T,
importance=TRUE)
Check the model results
# Model summary
rf_1
Call:
randomForest(formula = model_formula, data = train_PB_covs_thinned, weights = weight, ntree = 1000, sampsize = sample_size, replace = T, importance = TRUE)
Type of random forest: regression
Number of trees: 1000
No. of variables tried at each split: 1
Mean of squared residuals: 0.08564252
% Var explained: 23.12
# Variable importance:
importance(rf_1)
%IncMSE IncNodePurity
BIO5_Max_Temp_Warmest_Month 91.55258 70.67321
BIO6_Min_Temp_Coldest_Month 113.37257 101.99034
BIO12_Annual_Precipitation 79.51614 82.59679
BIO15_Precip_Seasonality 78.85705 47.42284
# Plot variable importance
varImpPlot(rf_1, type = 1)
# Look at single trees:
head(getTree(rf_1,1,T))
left daughter right daughter split var split point status
1 2 3 BIO5_Max_Temp_Warmest_Month 30.558750 -3
2 4 5 BIO6_Min_Temp_Coldest_Month 7.933356 -3
3 6 7 BIO5_Max_Temp_Warmest_Month 33.011837 -3
4 8 9 BIO15_Precip_Seasonality 94.776566 -3
5 10 11 BIO5_Max_Temp_Warmest_Month 29.037033 -3
6 12 13 BIO12_Annual_Precipitation 789.708954 -3
prediction
1 0.5116120
2 0.7076923
3 0.2439418
4 0.0781250
5 0.8200837
6 0.2903846
Partial dependence plots
# Now, we plot response curves in the same way as we did for GLMs above:
partial_response(rf_1, predictors = train_PB_covs_thinned[,predictors], main='Random Forest')
# Plot inflated response curves:
inflated_response(rf_1, predictors = train_PB_covs_thinned[,predictors], method = "stat3", lwd = 3, main='Random Forest')
Random Forest predictions
# Predict the presence probability across the entire raster extent
<- predicts::predict(covs_current_expert, rf_1, type = "response")
predicted_raster_RF_1
# Plot the species distribution raster
plot(
predicted_raster_RF_1,range = c(0, 1), # Set min and max values for the color scale
main = "Relative Probability of Occurrence of Koalas in SEQ - RF 1"
)
Model evaluation with spatial block cross-validation
# Convert training data to sf
<- st_as_sf(train_PB_covs_thinned[, c("x", "y", "Presence")], coords = c("x", "y"), crs = "EPSG:3112")
train_PB_covs_thinned_sf
# Generate spatial blocks
<- cv_spatial(x = train_PB_covs_thinned_sf,
spblock column = "Presence",
r = NULL,
size = 50000, # Size of the blocks in metres
k = 5,
hexagon = TRUE,
selection = "random",
iteration = 100, # to find evenly-dispersed folds
biomod2 = FALSE)
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|=== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 23%
|
|================= | 24%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 70%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 80%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 90%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|===================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 100%
train_0 train_1 test_0 test_1
1 1163 5134 301 4866
2 1169 9148 295 852
3 1175 9479 289 521
4 1148 8014 316 1986
5 1201 8225 263 1775
cv_plot(cv = spblock,
x = train_PB_covs_thinned_sf,
points_alpha = 0.5,
nrow = 2)
# Extract the folds to save
<- spblock$folds_list
spfolds
# We now have a list of 5 folds, where the first object is the training data, and the second is the testing data
str(spfolds)
List of 5
$ :List of 2
..$ : int [1:6297] 10859 11019 10911 10912 10657 10856 10803 10860 10753 10806 ...
..$ : int [1:5167] 10329 10576 10444 10364 10534 10404 10571 10365 10621 10401 ...
$ :List of 2
..$ : int [1:10317] 10859 11019 10911 10912 10657 10856 10803 10860 10753 10806 ...
..$ : int [1:1147] 10479 10437 10660 10613 10662 8631 10661 10612 10480 10614 ...
$ :List of 2
..$ : int [1:10654] 10479 10437 10660 10613 10662 8631 10661 10612 10480 10614 ...
..$ : int [1:810] 10859 11019 10911 10912 10657 10856 10803 10860 10753 10806 ...
$ :List of 2
..$ : int [1:9162] 10859 11019 10911 10912 10657 10856 10803 10860 10753 10806 ...
..$ : int [1:2302] 11271 11083 11228 11181 11085 11231 11185 11270 11138 11232 ...
$ :List of 2
..$ : int [1:9426] 10859 11019 10911 10912 10657 10856 10803 10860 10753 10806 ...
..$ : int [1:2038] 10495 10498 3540 2917 10496 10500 10337 1630 9411 10633 ...
Run the model for every fold and evaluate
Model evaluation - metrics
Typically, it helps to evaluate your model with several metrics that describe different features of model performance and prediction. Here, we define a function to feed in a model prediction and calculate several evaluation metrics.
The metrics are:
-Area under the receiver operating characteristic curve (AUC ROC)
Higher values of this (closer to 1) suggest a model is good at distinguishing presence points from the background.
-Continuous boyce index
Higher values of this (closer to 1) suggest a model is good at predicting higher suitability at spots where there were presences.
# Start a dataframe to save results
<- data.frame(fold = numeric(),
eval_df ROC = numeric(),
boyce = numeric())
for(f in seq_along(spfolds)) {
# Subset the training and testing data (spatial cross validation) (for the fth fold)
<- train_PB_covs_thinned[spfolds[[f]][[1]], ]
train_PB_covs_scv <- train_PB_covs_thinned[spfolds[[f]][[2]], ]
test_PB_covs_scv
<- glm(Presence ~
glm_model_1 +
BIO5_Max_Temp_Warmest_Month +
BIO6_Min_Temp_Coldest_Month +
BIO12_Annual_Precipitation
BIO15_Precip_Seasonality,data=train_PB_covs_scv,
family = binomial(link = "logit"))
# Predict to the testing data of fold f
$pred <- predict(glm_model_1, newdata = test_PB_covs_scv, type = "response")
test_PB_covs_scv
# Evaluate prediction on test set
= precrec::auc(precrec::evalmod(scores = test_PB_covs_scv$pred, labels = test_PB_covs_scv$Presence))[1,4]
ROC
= ecospat::ecospat.boyce(fit = test_PB_covs_scv$pred,
boyce obs = test_PB_covs_scv$pred[which(test_PB_covs_scv$Presence==1)],
nclass = 0, # Calculate continuous index
method = "pearson",
PEplot = F)[["cor"]]
# Add results to dataframe
<- eval_df %>% add_row(fold = f, ROC = ROC, boyce = boyce)
eval_df
}
Summarise the evaluation metrics
# Mean AUC & boyce
%>%
eval_df summarise(mean_AUC = mean(ROC),
mean_boyce = mean(boyce),
sd_AUC = sd(ROC),
sd_boyce = sd(boyce))
mean_AUC mean_boyce sd_AUC sd_boyce
1 0.8581919 0.6092 0.07396525 0.4851667
Model evaluation for Random Forest with spatial block cross-validation
We’re going to use the same spatial folds as before.
Run the RF model for every fold and evaluate
# Start a dataframe to save results
<- data.frame(fold = numeric(),
eval_df.RF ROC = numeric(),
boyce = numeric())
for(f in seq_along(spfolds)) {
# Subset the training and testing data (spatial cross validation) (for the fth fold)
<- train_PB_covs_thinned[spfolds[[f]][[1]], ]
train_PB_covs_scv <- train_PB_covs_thinned[spfolds[[f]][[2]], ]
test_PB_covs_scv
<- as.numeric(table(train_PB_covs_scv$Presence)["1"]) # number of presences
presNum <- as.numeric(table(train_PB_covs_scv$Presence)["0"]) # number of backgrounds
bgNum <- ifelse(train_PB_covs_scv$Presence == 1, 1, presNum / bgNum) # down-weighting
weight
<- c("0" = bgNum, "1" = bgNum)
sample_size <- c(bgNum)
sample_size
<- randomForest::randomForest(formula = model_formula,
rf_1 data = train_PB_covs_scv,
weights = weight,
ntree = 1000,
sampsize = sample_size,
replace = T,
importance=TRUE)
# Predict to the testing data of fold f
$pred <- predict(rf_1, newdata = test_PB_covs_scv, type = "response")
test_PB_covs_scv
# Evaluate prediction on test set
= precrec::auc(precrec::evalmod(scores = test_PB_covs_scv$pred, labels = test_PB_covs_scv$Presence))[1,4]
ROC
= ecospat::ecospat.boyce(fit = test_PB_covs_scv$pred,
boyce obs = test_PB_covs_scv$pred[which(test_PB_covs_scv$Presence==1)],
nclass = 0, # Calculate continuous index
method = "pearson",
PEplot = F)[["cor"]]
# Add results to dataframe
<- eval_df.RF %>% add_row(fold = f, ROC = ROC, boyce = boyce)
eval_df.RF
}
Summarise the evaluation metrics
# Mean AUC & boyce
%>%
eval_df.RF summarise(mean_AUC = mean(ROC),
mean_boyce = mean(boyce),
sd_AUC = sd(ROC),
sd_boyce = sd(boyce))
mean_AUC mean_boyce sd_AUC sd_boyce
1 0.7988609 0.6852 0.1004777 0.2349228
Make predictions to future climates
Load future environmental data
<- rast("Data/Environmental_variables/future_bioclim.2090.SSP370.tif")
covs_future names(covs_future) <- layer_names
covs_future
class : SpatRaster
dimensions : 47, 55, 19 (nrow, ncol, nlyr)
resolution : 5123.954, 5123.954 (x, y)
extent : 1621552, 1903370, -3349594, -3108768 (xmin, xmax, ymin, ymax)
coord. ref. : GDA94 / Geoscience Australia Lambert (EPSG:3112)
source : future_bioclim.2090.SSP370.tif
names : BIO1_~_Temp, BIO2_~Range, BIO3_~ality, BIO4_~ality, BIO5_~Month, BIO6_~Month, ...
min values : 0.8177757, 0.3121004, 1.564732, 12.43535, 1.130271, 0.4521889, ...
max values : 24.5521488, 14.5486765, 50.989292, 528.63293, 37.389927, 15.7586107, ...
<- terra::mask(covs_future, covs_current) # Crop to SEQ extent
covs_future
<- subset(covs_future, names(covs_future) %in% c("BIO5_Max_Temp_Warmest_Month",
covs_future_expert "BIO6_Min_Temp_Coldest_Month",
"BIO12_Annual_Precipitation",
"BIO15_Precip_Seasonality"))
Plot the future rasters
for(i in 1:nlyr(covs_future)) {
::plot(covs_future[[i]], main = names(covs_future)[[i]])
terra }
# Plot to compare the difference between the current and future rasters
plot(covs_future_expert - covs_current_expert)
Test the environmental distance between current data and future conditions
<- predicts::mess(covs_future_expert,
mess c("BIO5_Max_Temp_Warmest_Month",
train_PB_covs_thinned[, "BIO6_Min_Temp_Coldest_Month",
"BIO12_Annual_Precipitation",
"BIO15_Precip_Seasonality")])
plot(mess, axes = F)
<- mess < 0
r_mess_mask plot(r_mess_mask, axes=F)
Test which areas you might mask out because they are ‘novel’ in environmental space and therefore require model extrapolation.
<- predicted_raster_model_1
analog_fut
values(analog_fut)[values(mess)<0] <- NA
plot(analog_fut,
range = c(0, 1), # Set min and max values for the color scale
main = "Koala relative occurrence in regions with analogue conditions")
<- predicted_raster_model_1
novel_fut
values(novel_fut)[values(mess)>0] <- NA
plot(novel_fut,
range = c(0, 1), # Set min and max values for the color scale
main = "Koala relative occurrence in regions with novel conditions")
GLM future predictions
Model 1
# Predict the presence probability across the entire raster extent
<- predict(covs_future_expert, glm_model_1, type = "response")
predicted_raster_model_1
# Plot the species distribution raster
plot(
predicted_raster_model_1,range = c(0, 1), # Set min and max values for the color scale
main = "Relative Probability of Occurrence of Koalas in SEQ - GLM1"
)
Model 2
# Predict the presence probability across the entire raster extent
<- predict(covs_future_expert, glm_model_2, type = "response")
predicted_raster_model_2
# Plot the species distribution raster
plot(
predicted_raster_model_2,range = c(0, 1), # Set min and max values for the color scale
main = "Relative Probability of Occurrence of Koalas in SEQ - GLM2"
)
Random Forest future predictions
Model 1
# Predict the presence probability across the entire raster extent
<- predicts::predict(covs_future_expert, rf_1, type = "response", na.rm=TRUE)
predicted_raster_RF_1
# Plot the species distribution raster
plot(
predicted_raster_RF_1,range = c(0, 1), # Set min and max values for the color scale
main = "Relative Probability of Occurrence of Koalas in SEQ - RF"
)
Presenting predictions with uncertainty
There are many sources of model uncertainty that should be explored and ideally, presented alongside model predictions.
One that we’ll focus on here is climate scenario uncertainty. We do so by fitting a second model to future climate data from a lower emission shared socioeconomic path scenario (SSP 1.26).
Load future environmental data (SSP 1.26)
<- rast("Data/Environmental_variables/future_bioclim.2090.SSP126.tif")
covs_future_SSP126 names(covs_future_SSP126) <- layer_names
covs_future_SSP126
class : SpatRaster
dimensions : 47, 55, 19 (nrow, ncol, nlyr)
resolution : 5123.954, 5123.954 (x, y)
extent : 1621552, 1903370, -3349594, -3108768 (xmin, xmax, ymin, ymax)
coord. ref. : GDA94 / Geoscience Australia Lambert (EPSG:3112)
source : future_bioclim.2090.SSP126.tif
names : BIO1_~_Temp, BIO2_~Range, BIO3_~ality, BIO4_~ality, BIO5_~Month, BIO6_~Month, ...
min values : 0.7471204, 0.3128485, 1.585051, 12.19436, 1.046093, 0.3754203, ...
max values : 22.4384842, 14.8543510, 51.254021, 544.83429, 35.546104, 13.5217409, ...
<- subset(covs_future_SSP126, names(covs_future_SSP126) %in% c("BIO5_Max_Temp_Warmest_Month",
covs_future_SSP126_expert "BIO6_Min_Temp_Coldest_Month",
"BIO12_Annual_Precipitation",
"BIO15_Precip_Seasonality"))
Plot to compare the variables across the two scenarios
plot(covs_future_expert)
plot(covs_future_SSP126_expert)
GLM future predictions (SSP 1.26)
Model 1
# Predict the presence probability across the entire raster extent
<- predict(covs_future_SSP126_expert, glm_model_1, type = "response")
predicted_raster_model_1_SSP126
# Plot the species distribution raster
plot(
predicted_raster_model_1,range = c(0,1),
main = "SSP 3.70"
)
plot(
predicted_raster_model_1_SSP126,range = c(0,1),
main = "SSP 1.26"
)
Model uncertainty
Another element of uncertainty that can be represented is model uncertainty, or the standard error around the coefficient estimates.
# Extract standard errors of coefficients
<- summary(glm_model_1)$coefficients[, "Std. Error"]
coef_se
print(coef_se)
(Intercept) BIO5_Max_Temp_Warmest_Month
1.9611565762 0.0618157348
BIO6_Min_Temp_Coldest_Month BIO12_Annual_Precipitation
0.0334897580 0.0003411282
BIO15_Precip_Seasonality
0.0127273370
<- as.data.frame(covs_future_expert, na.rm = FALSE)
covs_df
<- predict(glm_model_1, newdata = covs_df, type = "link", se.fit = TRUE)
pred_link
# Linear predictor (eta)
<- pred_link$fit
eta <- pred_link$se.fit
se_eta
# Confidence intervals (95%)
<- 1.96
z <- eta - z * se_eta
eta_lower <- eta + z * se_eta
eta_upper
# Transform back to response scale
<- glm_model_1$family$linkinv
linkinv <- linkinv(eta)
predicted <- linkinv(eta_lower)
lower_ci <- linkinv(eta_upper)
upper_ci
# Add to covs_df
$predicted <- predicted
covs_df$lower_ci <- lower_ci
covs_df$upper_ci <- upper_ci
covs_df
<- setValues(rast(covs_future_expert, nlyr = 1), predicted)
predicted_r <- setValues(rast(covs_future_expert, nlyr = 1), lower_ci)
lower_ci_r <- setValues(rast(covs_future_expert, nlyr = 1), upper_ci)
upper_ci_r
# Step 2: Name the layers
names(predicted_r) <- "predicted"
names(lower_ci_r) <- "lower_CI"
names(upper_ci_r) <- "upper_CI"
<- c(predicted_r, lower_ci_r, upper_ci_r)
prediction_w_uncertainty
plot(prediction_w_uncertainty, range = c(0, 1))