This script generates the pressure incurred from invasive species for each OHI region.
Reference: Pagad, S., Genovesi, P., Carnevali, L. et al. Introducing the Global Register of Introduced and Invasive Species. Sci Data 5, 170202 (2018). https://doi.org/10.1038/sdata.2017.202
Downloaded: 2023-07-03
Description:
Harmonised, representative data on the state of biological invasions
remain inadequate at country and global scales, particularly for taxa
that affect biodiversity and ecosystems. Information is not readily
available in a form suitable for policy and reporting. The Global
Register of Introduced and Invasive Species (GRIIS) provides the first
country-wise checklists of introduced (naturalised) and invasive
species. GRIIS was conceived to provide a sustainable platform for
information delivery to support national governments. We outline the
rationale and methods underpinning GRIIS, to facilitate transparent,
repeatable analysis and reporting. Twenty country checklists are
presented as exemplars; GRIIS Checklists for close to all countries
globally will be submitted through the same process shortly. Over 11000
species records are currently in the 20 country exemplars alone, with
environmental impact evidence for just over 20% of these. GRIIS provides
significant support for countries to identify and prioritise invasive
alien species, and establishes national and global baselines. In future
this will enable a global system for sustainable monitoring of trends in
biological invasions that affect the environment.
Time range: 2018-2020, 2022 (all unique eventDates from binded_df below)
Download link: https://griis.org/download - There is a download script in this file, making manual download unnecessary.
Variables: The most important variable names are:
scientific_name
: the species name, sometimes with
taxonomic source
habitat
: the species habitat (marine, brackish,
terrestrial, freshwater, or a combination of them)
is_invasive
: whether or not the species is
considered invasive (harmful) or simply alien (introduced, but not
harmful)
Download the data and clean it up
Identify habitat for species with none listed
Match listed countries to OHI regions
Gapfill by regional averages
Calulate the pressure score for each country
Save prepared data and associated gapfilling datasets
Check how pressure scores have changed from previous assesments
knitr::opts_chunk$set(message = FALSE, warning = FALSE, eval = FALSE, echo = TRUE)
if (!require(librarian)){install.packages("librarian")}
if (!require(ohicore)){devtools::install_github('ohi-science/ohicore@dev')}
librarian::shelf(
tidyverse,
here,
janitor,
sf,
plotly,
countrycode,
ohicore,
rvest,
httr,
tictoc,
rgbif
)
### directory paths and relevant files
current_year <- 2023
prev_year <- current_year - 1
version_year <- paste0("v", current_year)
data_year <- paste0("d", current_year)
source(here::here('workflow', 'R', 'common.R'))
### Mazu
dir_here <- here::here('globalprep', 'prs_alien', version_year)
dir_here_prev <- here::here('globalprep', 'prs_alien', paste0('v', prev_year))
dir_data <- file.path(dir_M, 'git-annex', 'globalprep', '_raw_data', 'griis', data_year)
# rgns <- ohicore::georegion_labels # Kiribati is broke v2022
rgns <- read.csv("https://raw.githubusercontent.com/OHI-Science/ohiprep/master/globalprep/spatial/v2017/output/georegion_labels.csv")
rgns_eez <- read.csv("https://raw.githubusercontent.com/OHI-Science/ohiprep_v2022/gh-pages/globalprep/spatial/v2017/output/rgn_labels.csv") %>%
dplyr::filter(type == "eez") %>%
dplyr::select(-type, rgn_name = label)
# Set up directory for the data download and go into the working directory
mkdir /home/shares/ohi/git-annex/globalprep/_raw_data/griis/d2023 && cd $_ # UPDATE TO LATEST YEAR; v2023 had to manually make it and cd due to access issues
# Set url of page desired to get content of
url <- "https://cloud.gbif.org/griis/inventory/dataset"
# Read/parse webpage
page <- read_html(url)
# Grab the page content
content <- page %>%
html_nodes("body") %>%
html_text()
# Get matches associated with the data we want
matches <- regmatches(content, gregexpr("archive\\.do\\?r=([^\" ]+)", content))
# Remove the "archive.do?r=" part from each match
matches <- unlist(lapply(matches, function(match) gsub("archive\\.do\\?r=", "", match)))
base_url <- "https://cloud.gbif.org/griis/archive.do?r="
# Loop through the matches and get the content
for (match in matches) {
url_to_download <- paste0(base_url, match)
file_path <- paste0(dir_data, "/", match, ".zip")
GET(url_to_download, write_disk(file_path, overwrite = TRUE))
unzip(file_path, exdir = file.path(dir_data, match), overwrite = TRUE)
file.remove(file_path)
}
Read in country codes from the countrycode
package.
codes <- countrycode::codelist %>%
janitor::clean_names() %>%
dplyr::select(country = country_name_en, country_code = iso2c) %>%
tidyr::drop_na()
Loop through the raw data files and join them into one large file.
# Get folder paths (containing 3 data files, by region)
folders <- list.files(dir_data, full.names = TRUE)
# Set up empty list to add dfs to
joined_subsets <- list()
# Loop through the folders and combine the data files for each region
for (folder in folders) {
#print(paste("Starting", folder))
file_subset <- list.files(path = folder, pattern = "\\.txt$", full.names = TRUE)
distribution_path <- grep("distribution.txt", file_subset, value = TRUE)
species_profile_path <- grep("speciesprofile.txt", file_subset, value = TRUE)
taxon_path <- grep("taxon.txt", file_subset, value = TRUE)
distribution_df <- read_delim(distribution_path, col_types = cols(.default = "c"))
species_profile_df <- read_delim(species_profile_path, col_types = cols(.default = "c"))
taxon_df <- read_delim(taxon_path, col_types = cols(.default = "c"))
joined_subset_partial <- full_join(distribution_df, species_profile_df, by = "id")
joined_subset_full <- full_join(joined_subset_partial, taxon_df, by = "id")
joined_subsets <- append(joined_subsets, list(joined_subset_full))
#print(paste("Done with", folder))
}
# Set up function to get all dfs to have the same columns
make_all_same_columns <- function(df_list) {
# Find all column name options in df list
all_columns <- unique(unlist(lapply(df_list, colnames)))
# For each dataframe, add missing columns
for (i in seq_along(df_list)) {
missing_columns <- setdiff(all_columns, colnames(df_list[[i]]))
if (length(missing_columns) > 0) {
for (column in missing_columns) {
df_list[[i]][[column]] <- NA
}
}
# Reorder columns so all have same order
df_list[[i]] <- df_list[[i]][, all_columns, drop = FALSE]
}
return(df_list)
}
# Run the function and save to new variable
joined_subsets_same_cols <- make_all_same_columns(joined_subsets)
# After they all have the same columns (unused ones filled with NAs), bind rows into one df
binded_df <- bind_rows(joined_subsets_same_cols)
Make the data consistent and tidy. Some of this is unnecessary but facilitates richer data exploration.
clean_output <- binded_df %>%
janitor::clean_names() %>%
dplyr::select(
id, taxon_id, scientific_name, kingdom, phylum, class, order,
family, taxon_rank, taxonomic_status, is_invasive, habitat,
location_id, country_code, occurrence_status, establishment_means) %>%
dplyr::left_join(codes) %>%
dplyr::mutate(
habitat = tolower(habitat),
habitat = str_replace_all(habitat, "/", "|"),
habitat = str_replace_all(habitat, "terrestriali", "terrestrial\\|"),
habitat = str_replace_all(habitat, "terrestrial,", "terrestrial"),
habitat = str_replace_all(habitat, "terrestrial\\| freshwater", "terrestrial\\|freshwater"),
habitat = str_replace_all(habitat, "terrestrialfreshwater", "terrestrial\\|freshwater"),
habitat = str_replace_all(habitat, "freshwatetr", "freshwater"),
habitat = str_replace_all(habitat, "terrestrial \\|freshwater \\|brackish", "terrestrial\\|freshwater\\|brackish"),
habitat = str_replace_all(habitat, "freshwater\\|marine", "marine\\|freshwater"),
habitat = str_replace_all(habitat, "brackish\\|marine\\|freshwater", "marine\\|freshwater\\|brackish"),
habitat = str_replace_all(habitat, "freshhwater\\|brackish\\|marine", "marine\\|freshwater\\|brackish"),
habitat = str_replace_all(habitat, "freshwater\\|brackish\\|marine", "marine\\|freshwater\\|brackish"),
habitat = str_replace_all(habitat, "freshwater\\|brackish", "brackish\\|freshwater"),
establishment_means = case_when(establishment_means == "Present" ~ occurrence_status, T ~ establishment_means),
establishment_means = tolower(establishment_means),
establishment_means = str_replace_all(establishment_means, "/", "|"),
establishment_means = str_replace_all(establishment_means, "uncerain", "uncertain"),
establishment_means = str_replace_all(establishment_means, "cryptogenic\\|", ""),
establishment_means = str_replace_all(establishment_means, "native\\|", ""),
establishment_means = replace_na(establishment_means, "uncertain"),
is_invasive = tolower(is_invasive),
is_invasive = replace_na(is_invasive, "0"),
is_invasive = str_replace_all(is_invasive, "null", "0"),
is_invasive = str_replace_all(is_invasive, "false", "0"),
is_invasive = str_replace_all(is_invasive, "to be evaluated", "0"),
is_invasive = str_replace_all(is_invasive, "not evaluated", "0"),
is_invasive = str_replace_all(is_invasive, "not specified", "0"),
is_invasive = str_replace_all(is_invasive, "yes", "1"),
is_invasive = str_replace_all(is_invasive, "true", "1"),
is_invasive = str_replace_all(is_invasive, "invasive", "1"),
is_invasive = str_replace_all(is_invasive, "1\\?", "1"),
is_invasive = str_replace_all(is_invasive, "1 \\?", "1"),
is_invasive = str_replace_all(is_invasive, "1 in the north of the island \\(122\\)\\.", "1"),
is_invasive = as.numeric(is_invasive),
country_code = toupper(country_code),
country_code = case_when(location_id == "Namibia" ~ "NA", T ~ country_code),
country = str_replace_all(country, "St. Lucia", "Saint Lucia"),
country = str_replace_all(country, "&", "and"),
country = str_replace_all(country, "Congo - Kinshasa", "Democratic Republic of the Congo"),
country = str_replace_all(country, "Côte d’Ivoire", "Ivory Coast"),
country = str_replace_all(country, "Curaçao", "Curacao"),
country = str_replace_all(country, "Myanmar \\(Burma\\)", "Myanmar"),
country = str_replace_all(country, "Saint Martin \\(French part\\)", "Northern Saint-Martin"),
country = str_replace_all(country, "São Tomé and Príncipe", "Sao Tome and Principe"),
country = str_replace_all(country, "St. Vincent and Grenadines", "Saint Vincent and the Grenadines"),
country = str_replace_all(country, "Svalbard & Jan Mayen", "Jan Mayen"),
country = case_when(
country_code == "EC-W" ~ "Ecuador", # NEEDS to be first
is.na(country) & !is.na(location_id) ~ location_id,
country_code == "BQ-BO" ~ "Bonaire",
country_code == "BQ-SE" ~ "Sint Eustatius",
country_code == "BQ-SA" ~ "Saba",
country_code == "FM-PNI" ~ "Micronesia",
country_code == "FM-TRK" ~ "Micronesia",
country_code == "YE-SU" ~ "Yemen",
T ~ country))
Some species from above do not come with an associated habitat type. This chunk queries GBIF and returns species names and their associated habitat.
na_df <- dplyr::filter(clean_output, is.na(habitat))
na_species <- unique(na_df$scientific_name)
spp_out <- tibble()
pb = txtProgressBar(min = 0, max = length(na_species), initial = 0, style = 3, width = 60)
tic()
for (spp in seq_along(na_species)) {
setTxtProgressBar(pb, spp)
data <- rgbif::name_lookup(query = na_species[spp])$data %>%
janitor::clean_names()
if (length(data) == 0) {
int <- tibble(scientific_name = na_species[spp], habitat_na = NA)
} else {
int <- data %>%
tidyr::drop_na(habitats) %>%
dplyr::select(scientific_name, habitat_na = habitats) %>%
dplyr::mutate(habitat_na = tolower(habitat_na)) %>%
dplyr::distinct()
}
spp_out <- rbind(int, spp_out)
}
toc()
# v2023: 1317.999 sec elapsed
close(pb)
The above chunk returns many duplicate entries, with many messy habitat type options. This matches species names exactly, unlike the above query, and then declares any species with marine habitat, a marine species, even if it has more than one listed.
spp_out_marine <- spp_out %>%
dplyr::filter(scientific_name %in% na_species) %>%
dplyr::mutate(
habitat_na = dplyr::case_when(
stringr::str_detect(habitat_na, 'marine') ~ 'marine',
stringr::str_detect(habitat_na, 'freshwater') ~ 'freshwater',
stringr::str_detect(habitat_na, 'terrestrial') ~ 'terrestrial')) %>%
dplyr::distinct() %>%
dplyr::filter(habitat_na == 'marine')
We then join these names to the data of only NA habitat species and assign the marine species a marine habitat.
marine_spp <- na_df %>%
dplyr::left_join(spp_out_marine) %>%
dplyr::distinct() %>%
dplyr::filter(habitat_na == 'marine') %>%
dplyr::mutate(habitat = habitat_na) %>%
dplyr::select(-habitat_na)
This binds the marine species back to the full dataset and does some final cleanup.
df <- clean_output %>%
rbind(marine_spp) %>%
dplyr::filter(!habitat %in% c("terrestrial", "freshwater", "terrestrial|freshwater", "freshwater|terrestrial", "host", NA)) %>%
dplyr::select(-c(location_id)) %>%
dplyr::left_join(rgns, by = c('country' = 'rgn_label')) %>%
dplyr::select(-rgn_id) %>%
dplyr::mutate(country = ifelse(country_code == "IC", "Canary Islands", country),
country = ifelse(country == "Congo - Brazzaville", "Republique du Congo", country)
)
df_ohi <- ohicore::name_2_rgn(df_in = df,
fld_name = "country",
flds_unique = c("id")) # v2023 - added this so unnecessary duplicates don't show up
# v2022
# These data were removed for not having any match in the lookup tables:
# Eswatini (Landlocked), French Southern Territories (will be split up later), North Macedonia (landlocked), and St. Barthélemy (IDK lol)
# v2023
# These data were removed for not having any match in the lookup tables:
# Congo - Brazzaville (this one is new and was added in as Republique du Congo -- this was done in the code chunk above), Eswatini (see above), French Southern Territories (see above), North Macedonia (see above), and St. Barthélemy (see above)
Break up French Southern Territories into 8 regions and Kiribati into 2, each repeating the data of the larger region.
### Report these regions at higher spatial resolution:
french_territories <- c(
'Glorioso Islands', 'Juan de Nova Island', 'Bassas da India',
'Ile Europa', 'Ile Tromelin', 'Crozet Islands',
'Amsterdam Island and Saint Paul Island', 'Kerguelen Islands')
kiribati <- c(
"Line Islands (Kiribati)", "Phoenix Islands (Kiribati)", "Gilbert Islands (Kiribati)")
country_split_data <- dplyr::tibble(
country = c(rep("French Southern Territories", 8),
rep("Kiribati", 3)),
region = c(french_territories, kiribati)) %>%
dplyr::left_join(df) %>%
dplyr::select(-c(country, r0_label, r1_label, r2_label)) %>%
dplyr::rename(country = region) %>%
dplyr::left_join(rgns, by = c('country' = 'rgn_label')) %>%
dplyr::mutate(rgn_name = country)
Bind French Southern Territories and Kiribati back to the full dataset.
alien_sp_df <- rbind(df_ohi, country_split_data) %>%
dplyr::select(-country) %>%
dplyr::filter(rgn_name != 'Kiribati') %>%
dplyr::distinct()
Summarize the number of alien/invasive species to each region and fill missing regions with NA values.
Gapfill values from regional averages.
First try “r2 regions” (i.e. Caribbean countries filled with Caribbean islands mean).
gf_step_1 <- alien_sp_summary %>%
dplyr::mutate(
r0_spp = mean(spp_count, na.rm = T),
r0_inv = mean(invasive_count, na.rm = T)) %>%
dplyr::group_by(r1_label) %>%
dplyr::mutate(
r1_spp = mean(spp_count, na.rm = T),
r1_inv = mean(invasive_count, na.rm = T)) %>%
dplyr::ungroup() %>%
dplyr::group_by(r2_label) %>%
dplyr::mutate(
r2_spp = mean(spp_count, na.rm = T),
r2_inv = mean(invasive_count, na.rm = T)) %>%
dplyr::ungroup() %>%
dplyr::mutate(
has_data = ifelse(is.na(spp_count), 0, 1),
gapfilled = ifelse(is.na(spp_count), NA, "no"),
method = ifelse(is.na(spp_count), NA, "not gapfilled"),
spp_count = ifelse(is.na(spp_count), r2_spp, spp_count),
invasive_count = ifelse(is.na(invasive_count), r2_inv, invasive_count),
gapfilled = ifelse(has_data == 0 & !is.na(spp_count), "yes", gapfilled),
method = ifelse(has_data == 0 & !is.na(spp_count), "used r2 average", method))
In v2022 and v2023, everything was filled with step 1. Step 2 and 3 were written just in case step 1 stops filling everything.
Then try “r1 regions” (i.e. “Oceania” or “Asia”).
gf_step_2 <- gf_step_1 %>%
dplyr::mutate(
spp_count = ifelse(is.na(spp_count), r1_spp, spp_count),
invasive_count = ifelse(is.na(invasive_count), r1_inv, invasive_count),
gapfilled = ifelse(has_data == 0 & !is.na(spp_count) & is.na(gapfilled), "yes", gapfilled),
method = ifelse(has_data == 0 & !is.na(spp_count) & is.na(method), "used r1 average", method))
If there remain any that can’t be gapfilled in one of the regions, gapfill with the world score.
Note: This is an aggressive strategy but also an unlikely scenario that will ensure complete data
gf_step_3 <- gf_step_2 %>%
dplyr::mutate(
spp_count = ifelse(is.na(spp_count), r0_spp, spp_count),
invasive_count = ifelse(is.na(invasive_count), r0_inv, invasive_count),
gapfilled = ifelse(has_data == 0 & !is.na(spp_count) & is.na(gapfilled), "yes", gapfilled),
method = ifelse(has_data == 0 & !is.na(spp_count) & is.na(method), "used r0 average", method),
year = 2022) # match to latest data year; ie. this is for v2023. v2022 data was changed to use 2019 now.
It was decided, somewhat arbitrarily to use a weighted avergae where species listed as invasive have a 90% weight, and alien species get a 10% weight. This allows us to account for species not listed as invasive (harmful), but that could still be harmful, despite the data. There may be a better way to determine weights, and this is an area that could be improved upon in future iterations.
min_spp_count <- min(gf_step_3$spp_count)
min_inv_count <- min(gf_step_3$invasive_count)
max_spp_count <- max(gf_step_3$spp_count)
max_inv_count <- max(gf_step_3$invasive_count)
spp_weight <- 0.1
inv_weight <- 0.9
pressure <- gf_step_3 %>%
dplyr::group_by(rgn_id, year, rgn_name) %>%
dplyr::summarise(
pressure_score = (((spp_count-min_spp_count)/(max_spp_count-min_spp_count)) * spp_weight) +
(((invasive_count-min_inv_count)/(max_inv_count-min_inv_count)) * inv_weight)) %>%
dplyr::ungroup()
# Save gapfilling flags
gf_step_3 %>%
dplyr::select(rgn_id, rgn_name, year, gapfilled, method) %>%
readr::write_csv(here::here(dir_here, "output", "prs_alien_gf.csv"))
# Save pressure score dataset
pressure %>%
dplyr::select(rgn_id, year, pressure_score) %>%
readr::write_csv(here::here(dir_here, "output", "prs_alien.csv"))
# v2023: bind the previous version's data to the new data
old_data_to_bind <- here::here(dir_here_prev, "output", "prs_alien.csv") %>%
readr::read_csv()
old_data_to_bind_gf <- here::here(dir_here_prev, "output", "prs_alien_gf.csv") %>%
readr::read_csv()
new_data_to_bind <- here::here(dir_here, "output", "prs_alien.csv") %>%
readr::read_csv()
new_data_to_bind_gf <- here::here(dir_here, "output", "prs_alien_gf.csv") %>%
readr::read_csv()
data_to_write <- rbind(old_data_to_bind, new_data_to_bind)
data_to_write_gf <- rbind(old_data_to_bind_gf, new_data_to_bind_gf)
data_to_write %>% readr::write_csv(here::here(dir_here, "output", "prs_alien.csv"))
data_to_write_gf %>% readr::write_csv(here::here(dir_here, "output", "prs_alien_gf.csv"))
Let’s compare to 2013 and 2022 scores.
# Compare new (after changes) vs. old data (before changes)
new_data <- here::here(dir_here, "output", "prs_alien.csv") %>%
readr::read_csv() %>%
dplyr::left_join(rgns_eez)
old_data <- here::here('globalprep', 'prs_alien', 'v2013', 'data', 'p_sp_alien_2013a.csv') %>%
readr::read_csv() %>%
dplyr::rename(pressure_score = pressures.score) %>%
dplyr::left_join(rgns_eez)
compare <- new_data %>%
dplyr::bind_rows(old_data) %>%
tidyr::pivot_wider(names_from = year, values_from = pressure_score)
plot_diff <-
ggplot2::ggplot(
compare, ggplot2::aes(x = `2022`, y = `2008`, text = rgn_name, label = rgn_id), color = "black") + # make sure to update data year here
ggplot2::geom_jitter(width = 0.025, height = .025) +
ggplot2::geom_abline() +
ggplot2::labs(title = "Invasive species pressure (comparsion to before change in data)") +
ggplot2::theme_bw()
plotly::ggplotly(plot_diff, tooltip = c("rgn_id", "rgn_name", "x", "y"))
# Compare to previous year
plot_diff_prev_yr <-
ggplot2::ggplot(
compare, ggplot2::aes(x = `2022`, y = `2019`, text = rgn_name, label = rgn_id), color = "black") + # make sure to update data years here
ggplot2::geom_jitter(width = 0.025, height = .025) +
ggplot2::geom_abline() +
ggplot2::labs(title = "Invasive species pressure (comparsion to previous year)") +
ggplot2::theme_bw()
plotly::ggplotly(plot_diff_prev_yr, tooltip = c("rgn_id", "rgn_name", "x", "y"))