This script downloads WGI data and prepares it for a pressures (1 - WGI) and resilience data layer.
Reference: http://info.worldbank.org/governance/wgi/index.aspx#home
Downloaded:
August 28, 2024 (source data updated Sep 2023)
Description:
The Worldwide Governance Indicators (WGI) project reports aggregate and
individual governance indicators for 215 economies over the period
1996–2022, for six dimensions of governance:
Time range: 1996-2022
library(ohicore) # devtools::install_github('ohi-science/ohicore@dev')
library(tools)
library(dplyr)
library(tidyr)
library(WDI) # install.packages('WDI') # used to extract World Development Indicator (World Bank) data
library(stringr)
library(here)
library(ggplot2)
library(plotly)
library(readr)
library(janitor)
library(strex)
# check website to see what years are available: http://info.worldbank.org/governance/wgi/index.aspx#home
yr_start = 1996
yr_end = 2022
scen_year <- "2024"
Download each of the 6 WGI indicators. Note: This didn’t work in 2023 for some reason. For the 2024 assessment, check to see if it will work again. Instead. I manually downlaoded the data from here: https://databank.worldbank.org/source/worldwide-governance-indicators#. Use the uncommented chunk below for this.
## access data through the WDI package
## get description of variables:
indicators <- data.frame(WDI_data[[1]])
indicators[grep("VA.EST", indicators$indicator), ]
# Voice and Accountability: Estimate
indicators[grep("PV.EST", indicators$indicator), ]
# Political Stability and Absence of Violence/Terrorism: Estimate
indicators[grep("GE.EST", indicators$indicator), ]
# Government Effectiveness: Estimate
indicators[grep("RQ.EST", indicators$indicator), ]
# Regulatory Quality: Estimate
indicators[grep("RL.EST", indicators$indicator), ]
# Rule of Law: Estimate
indicators[grep("CC.EST", indicators$indicator), ]
# Control of Corruption: Estimate
# identify the six indicators
# WDIsearch('violence')# general search
key_voice = WDI(
indicator = WDIsearch('Voice and Accountability: Estimate', field='name')['indicator'],
country = 'all', start = yr_start, end=yr_end)
key_polst = WDI(
WDIsearch('Political Stability and Absence of Violence/Terrorism: Estimate', field='name')['indicator'],
country='all',start = yr_start, end=yr_end)
key_gvtef = WDI(
WDIsearch('Government Effectiveness: Estimate', field='name')['indicator'],
country='all',start = yr_start, end=yr_end)
key_regqt = WDI(
WDIsearch('Regulatory Quality: Estimate', field='name')['indicator'],
country='all',start = yr_start, end=yr_end)
key_rolaw = WDI(
WDIsearch('Rule of Law: Estimate', field='name')['indicator'],
country='all',start = yr_start, end=yr_end)
key_corrp = WDI(
WDIsearch('Control of Corruption: Estimate', field='name')['indicator'],
country='all',start = yr_start, end=yr_end)
Combine the indicators into a single table, with a column for each indicator, and rows for each country-year pair.
wgi_raw <- read.csv(here::here("globalprep/prs_res_wgi/v2023/raw/P_Data_Extract_From_Worldwide_Governance_Indicators.csv"))
d <- wgi_raw %>%
clean_names() %>%
rename(iso3c = country_code, country = country_name) %>%
filter(str_detect(series_code, "EST")) %>%
dplyr::select(-series_name) %>%
pivot_longer(cols = starts_with("x"), values_to = "est", names_to = "year") %>%
mutate(year = as.numeric(str_after_last(year, "yr"))) %>%
mutate(est = ifelse(est == "..", NA, est)) %>%
pivot_wider(values_from = "est", names_from = "series_code")
Combine the indicators into a single table, with a column for each indicator, and rows for each country-year pair.
d = key_voice %>%
select(country, year, iso3c, indicator) %>%
left_join(key_polst %>% select(-iso2c), by=(c('country', 'iso3c', 'year'))) %>%
left_join(key_gvtef %>% select(-iso2c), by=(c('country', 'iso3c', 'year'))) %>%
left_join(key_regqt %>% select(-iso2c), by=(c('country', 'iso3c', 'year'))) %>%
left_join(key_rolaw %>% select(-iso2c), by=(c('country', 'iso3c', 'year'))) %>%
left_join(key_corrp %>% select(-iso2c), by=(c('country', 'iso3c', 'year'))); head(d); summary(d); sapply(d, class)
colnames(d) <- c("country", "year", "iso3c", "VA.EST", "PV.EST", "GE.EST", "RQ.EST", "RL.EST", "CC.EST")
Uncomment the code chunk lines when updating WGI data, this will most likely occur when calculating for new assessment year:
The first gapfilling occurs when we use the average of previous years data within each region/indicator. This occurs when a region has data for an indicator, but not for all years.
Read in WGI data - change appended date in file name to reflect the most recent version of the saved WGI data:
d <- read.csv(here(paste0('globalprep/prs_res_wgi/v', scen_year, '/raw/worldbank_wgi_from_wdi_api_2024-05-24.csv'))) # change appended date to most recent version
d <- d %>%
pivot_longer(cols = CC.EST:VA.EST, names_to = "indicator", values_to = "value")
## each country has 20 years of data
d_gap_fill <- d %>%
group_by(country, iso3c, year) %>%
mutate(NA_count_c_y = sum(is.na(value))) %>% # gf record: NA values within a region/year prior to gapfilling, max value is 6 (meaning that a country has no data)
ungroup() %>%
group_by(country, iso3c, indicator) %>% # gapfill missing data with mean of values across years within the same region/indicator
mutate(ind_mean_c_i = mean(value, na.rm=TRUE)) %>%
ungroup() %>%
mutate(value = ifelse(is.na(value), ind_mean_c_i, value)) %>%
group_by(country, iso3c, year) %>%
mutate(NA_count_post_gf1 = sum(is.na(value))) # gf record: NA values within a region/year after within region/indicator gapfilling (i.e. indicator is gapfilled by other years of data), used to cut regions <4 indicators (below)
Once gapfilling is complete, the WGI scores are calculated as an average of the 6 indicators. However, if a country is missing 4 or more of the indicators within a year the average would be very biased. In these cases, a different method should be used to gapfill these data
(NOTE: for the 2024 assessment all regions had at least 4 of the 6 indicators).
countries_no_data <- d_gap_fill %>%
filter(NA_count_post_gf1 > 3)
countries_no_data <- unique(countries_no_data$country)
countries_no_data
## In this case, the countries with minimal data (< 4 indicators ever calculated) are deleted.
## These will be gap-filled later on if they are deleted now.
d_gap_fill <- d_gap_fill %>%
filter(!(country %in% countries_no_data))
This involves:
d_calcs <- d_gap_fill %>%
group_by(country, iso3c, year) %>%
summarize(score_wgi_scale = mean(value, na.rm=T),
NA_start = mean(NA_count_c_y), # initial mean number of NA across indicators, pre-gapfill
NA_post_gf_1 = mean(NA_count_post_gf1)) %>% # number of NA across indicators, post-gapfill across year gapfill within region/indicator
ungroup()
scores_wgi_scale
fall within the wgi range specified
below:summary(d_calcs) # checking to make sure NA values make sense
## v2024: NA_start max = 6, NA_post_gf_1 max = 0
## v2024: score_wgi_scale min = -2.409, max = 1.947... perfect
wgi_range = c(-2.5, 2.5) # historic values have been between -2.5 and 2.5
d_calcs <- d_calcs %>%
mutate(score = (score_wgi_scale - wgi_range[1]) / (wgi_range[2] - wgi_range[1])) %>%
ungroup(); head(d_calcs); summary(d_calcs)
# converts the scores between 0 and 1, using min/max to scale the data
# document gapfilling
d_calcs <- d_calcs %>%
mutate(gap_fill = NA_start - NA_post_gf_1, # if there are values in NA_post_gf_1, it means these weren't gapfilled
gap_fill = ifelse(is.na(score), 0, gap_fill)) %>% # number of values that were gapfilled
select(-NA_start, -NA_post_gf_1)
## We report these regions at a greater spatial resolution:
## Aruba is part of the Netherlands Antilles, but it is reported separately
country_split_1 <- data.frame(country = "Netherlands Antilles", region = c('Bonaire', 'Curacao', 'Saba', 'Sint Maarten', 'Sint Eustatius'))
country_split_2 <- data.frame(country = "Jersey, Channel Islands", region = c('Jersey', 'Guernsey'))
country_split <- rbind(country_split_1, country_split_2)
country_split_data <- country_split %>%
left_join(d_calcs) %>%
select(-country) %>%
rename(country = region)
d_calcs <- d_calcs %>%
filter(!(country %in% c("Netherlands Antilles", "Jersey, Channel Islands"))) %>%
rbind(country_split_data) %>%
mutate(country = as.character(country)) %>%
dplyr::select(-iso3c)
d_calcs$country[grep("Korea, Dem.", d_calcs$country)] <- "North Korea"
d_calcs$country[grep("Turkiye", d_calcs$country)] <- "Turkey"
# Maybe in future update package with country synonym list
## Function to convert to OHI region ID
d_calcs_rgn <- name_2_rgn(df_in = d_calcs,
fld_name='country',
flds_unique=c('year'))
## Combine the duplicate regions (we report these at lower resolution)
## In this case, we take the weighted average
population_weights <- data.frame(country = c("Virgin Islands (U.S.)", "Puerto Rico",
"China", "Hong Kong SAR, China", "Macao SAR, China"),
population = c(104917, 3205691,
1410710000, 7356100, 704149))
# updated population values on 30 August 2024 (source: World Bank website, 2023 values); https://databank.worldbank.org/source/population-estimates-and-projections
d_calcs_rgn <- d_calcs_rgn %>%
left_join(population_weights, by="country") %>%
mutate(population = ifelse(is.na(population), 1, population)) %>%
group_by(rgn_id, year) %>%
summarize(score = weighted.mean(score, population),
gapfill_within_rgn = weighted.mean(gap_fill, population)) %>%
ungroup() %>%
filter(rgn_id <= 250)
summary(d_calcs_rgn)
Assigning territorial region value to be the mean of parent country
value and territorial regions with data (using same
sov_id
).
## data that describes territories of countries
territory = rgn_master %>%
select(rgn_id = rgn_id_2013,
sov_id) %>%
group_by(rgn_id) %>% # remove duplicated countries from this rgn_id list
summarize(sov_id = mean(sov_id, na.rm=T)) %>% # duplicates always have the same sov_id (r2 value)
filter(rgn_id <= 250, rgn_id != 213)
## expand to include all years of data
territory <- data.frame(year=yr_start:yr_end) %>%
merge(territory, by=NULL)
## assign territories the values of their sovereign country
d_sovs = d_calcs_rgn %>%
full_join(territory, by = c('rgn_id', 'year')) %>%
group_by(sov_id, year) %>%
mutate(score_gf_territory = mean(score, na.rm=TRUE),
gapfill_within_rgn = mean(gapfill_within_rgn, na.rm=TRUE))%>%
filter(!is.na(gapfill_within_rgn)) %>%
ungroup()
# filter(d_sovs, rgn_id %in% c(1,2,3,16) & year == 2018) %>% View() # check in console to make sure Australia and territories have the same score and sov_id
Define new data object from d_sovs which includes gapfill method and gapfilled scores:
d_gf2 <- d_sovs %>%
mutate(gapfill_territory = ifelse(is.na(score) & !is.na(score_gf_territory), "territory", "NA")) %>%
mutate(score = ifelse(is.na(score), score_gf_territory, score)) %>%
select(rgn_id, year, score, gapfill_within_rgn, gapfill_territory)
summary(d_gf2)
Add region names and clean the region data, and make sure we have all the regions:
# get region names
regions <- rgn_master %>%
filter(rgn_typ == "eez") %>%
filter(rgn_id_2013 <= 250) %>% # > 250 are either FAO or a disputed region
filter(rgn_id_2013 != 213) %>% # 213 is Antarctica
select(rgn_id = rgn_id_2013, rgn_name=rgn_nam_2013) %>%
unique() %>%
arrange(rgn_id)
d_gf2 <- regions %>%
left_join(d_gf2)
Comparing this year’s values against last year’s. These should be the same unless there have been updates to WGI source data or a change to methods.
Plot most recent shared year between last year and this year’s data, and look for a relationship close to a 1:1 relationship. If data are significantly off the line, look at the original (raw) data to investigate.
# Most recent shared year should be three years before prior to the assessment year (scen_year)
recent_shared_year <- as.numeric(scen_year) - 2
prev_assessment_year <- as.numeric(scen_year) - 1
new_2022_scores <- d_gf2 %>% # rename variable with respective year for clarity
filter(year == recent_shared_year) %>%
select(rgn_id, score)
old_2022_scores <- read.csv(here(paste0('globalprep/prs_res_wgi/v', prev_assessment_year, '/output/wgi_res.csv'))) %>% # rename variable
filter(year == recent_shared_year) %>%
select(rgn_id, old_score=resilience_score)
score_compare_2022 <- old_2022_scores %>%
full_join(new_2022_scores)
score_compare_plot <- ggplot(score_compare_2022, aes(x = old_score, y = score, text = rgn_id)) +
geom_point() +
geom_abline(slope=1, intercept = 0, col = "red")
score_compare_plot
# v2024: looks good
Look at top/bottom 10 regions to make sure these seem reasonable:
## Top/Bottom 10 scorers:
tmp <- d_gf2 %>%
filter(year==2022) %>%
arrange(score) %>%
select(rgn_id, score) %>%
left_join(regions)
tmp[1:10, ] # makes sense.. mostly African countries or countries with authoritarian regimes
tmp[211:220, ] # makes sense... mostly countries with democratic socialist policies or small islands. No surprise Nordic countries are the best
hist(tmp$score)
Look at a summary to confirm scores are between 0 and 1, there are 220 regions, and there are no NAs (for this particular dataset):
Save gapfilling and data for this assessment year.
tmp_data_res <- d_gf2 %>%
select(rgn_id, year, resilience_score = score)
write.csv(tmp_data_res, here(paste0("globalprep/prs_res_wgi/v", scen_year, "/output/wgi_res.csv")), row.names=FALSE)
tmp_data_prs <- d_gf2 %>%
mutate(score = 1 - score) %>%
select(rgn_id, year, pressure_score = score)
write.csv(tmp_data_prs, here(paste0("globalprep/prs_res_wgi/v", scen_year, "/output/wgi_prs.csv")), row.names=FALSE)
# gapfilled data
tmp_gf <- d_gf2 %>%
select(rgn_id, year, gapfill_within_rgn, gapfill_territory) %>%
mutate(gapfill_within_rgn = ifelse(gapfill_within_rgn == 0, NA, gapfill_within_rgn)) %>%
mutate(gapfill_within_rgn = ifelse(!is.na(gapfill_within_rgn),
paste0("gapfill_within_rgn: ", gapfill_within_rgn), NA)) %>%
mutate(gapfill_territory = ifelse(gapfill_territory == "territory", "territory, mean of admin countries", NA)) %>%
mutate(method = paste(gapfill_within_rgn, gapfill_territory, sep="; ")) %>%
mutate(method = gsub("NA; ", "", method)) %>%
mutate(method = gsub("; NA", "", method)) %>%
mutate(method = gsub("NA", NA, method)) %>%
mutate(gapfilled = ifelse(is.na(method), 0, 1)) %>%
select(rgn_id, year, gapfilled, method)
write.csv(tmp_gf, here(paste0("globalprep/prs_res_wgi/v", scen_year, "/output/wgi_gf.csv")), row.names=FALSE)