ohi logo
OHI Science | Citation policy

REFERENCE RMD FILE

1 Summary

This script downloads WGI data and prepares it for a pressures (1 - WGI) and resilience data layer.

2 Updates from previous assessment


3 Data Source

Reference: http://info.worldbank.org/governance/wgi/index.aspx#home

Downloaded:

August 28, 2024 (source data updated Sep 2023)

Description:
The Worldwide Governance Indicators (WGI) project reports aggregate and individual governance indicators for 215 economies over the period 1996–2022, for six dimensions of governance:

  • Voice and Accountability
  • Political Stability and Absence of Violence
  • Government Effectiveness
  • Regulatory Quality
  • Rule of Law
  • Control of Corruption

Time range: 1996-2022


library(ohicore) # devtools::install_github('ohi-science/ohicore@dev')
library(tools)
library(dplyr)
library(tidyr)
library(WDI) # install.packages('WDI')  # used to extract World Development Indicator (World Bank) data 
library(stringr)
library(here)
library(ggplot2)
library(plotly)
library(readr)
library(janitor)
library(strex)


# check website to see what years are available: http://info.worldbank.org/governance/wgi/index.aspx#home
yr_start = 1996
yr_end   = 2022

scen_year <- "2024"

4 Obtain the WGI data

Download each of the 6 WGI indicators. Note: This didn’t work in 2023 for some reason. For the 2024 assessment, check to see if it will work again. Instead. I manually downlaoded the data from here: https://databank.worldbank.org/source/worldwide-governance-indicators#. Use the uncommented chunk below for this.

## access data through the WDI package

## get description of variables:
indicators <-  data.frame(WDI_data[[1]])
indicators[grep("VA.EST", indicators$indicator), ] 
# Voice and Accountability: Estimate
indicators[grep("PV.EST", indicators$indicator), ] 
# Political Stability and Absence of Violence/Terrorism: Estimate
indicators[grep("GE.EST", indicators$indicator), ] 
# Government Effectiveness: Estimate
indicators[grep("RQ.EST", indicators$indicator), ] 
# Regulatory Quality: Estimate
indicators[grep("RL.EST", indicators$indicator), ] 
# Rule of Law: Estimate
indicators[grep("CC.EST", indicators$indicator), ] 
# Control of Corruption: Estimate

# identify the six indicators
# WDIsearch('violence')# general search
key_voice = WDI(
  indicator = WDIsearch('Voice and Accountability: Estimate', field='name')['indicator'],
  country = 'all', start = yr_start, end=yr_end)

key_polst = WDI(
  WDIsearch('Political Stability and Absence of Violence/Terrorism: Estimate', field='name')['indicator'],
  country='all',start = yr_start, end=yr_end)

key_gvtef = WDI(
  WDIsearch('Government Effectiveness: Estimate', field='name')['indicator'],
  country='all',start = yr_start, end=yr_end)

key_regqt = WDI(
  WDIsearch('Regulatory Quality: Estimate', field='name')['indicator'],
  country='all',start = yr_start, end=yr_end)

key_rolaw = WDI(
  WDIsearch('Rule of Law: Estimate', field='name')['indicator'],
  country='all',start = yr_start, end=yr_end)

key_corrp = WDI(
  WDIsearch('Control of Corruption: Estimate', field='name')['indicator'],
  country='all',start = yr_start, end=yr_end)

Combine the indicators into a single table, with a column for each indicator, and rows for each country-year pair.

wgi_raw <- read.csv(here::here("globalprep/prs_res_wgi/v2023/raw/P_Data_Extract_From_Worldwide_Governance_Indicators.csv"))

d <- wgi_raw %>%
  clean_names() %>%
  rename(iso3c = country_code, country = country_name) %>%
  filter(str_detect(series_code, "EST")) %>%
  dplyr::select(-series_name) %>%
  pivot_longer(cols = starts_with("x"), values_to = "est", names_to = "year") %>%
  mutate(year = as.numeric(str_after_last(year, "yr"))) %>%
  mutate(est = ifelse(est == "..", NA, est)) %>%
  pivot_wider(values_from = "est", names_from = "series_code") 

Combine the indicators into a single table, with a column for each indicator, and rows for each country-year pair.

d = key_voice %>%
  select(country, year, iso3c, indicator) %>%
  left_join(key_polst %>% select(-iso2c), by=(c('country', 'iso3c', 'year'))) %>%
  left_join(key_gvtef %>% select(-iso2c), by=(c('country', 'iso3c', 'year'))) %>%
  left_join(key_regqt %>% select(-iso2c), by=(c('country', 'iso3c', 'year'))) %>%
  left_join(key_rolaw %>% select(-iso2c), by=(c('country', 'iso3c', 'year'))) %>%
  left_join(key_corrp %>% select(-iso2c), by=(c('country', 'iso3c', 'year'))); head(d); summary(d); sapply(d, class)

colnames(d) <- c("country", "year", "iso3c", "VA.EST", "PV.EST", "GE.EST",  "RQ.EST",   "RL.EST", "CC.EST")

4.1 Save a record of any new raw data for archival purposes

Uncomment the code chunk lines when updating WGI data, this will most likely occur when calculating for new assessment year:

date <- Sys.Date()
write.csv(d, here(sprintf('globalprep/prs_res_wgi/v%s/raw/worldbank_wgi_from_wdi_api_%s.csv', scen_year, date)), row.names=FALSE)


# This dataset currently has non-OHI regions included

5 Gapfill, part 1: filling missing years of data for indicators, within countries

The first gapfilling occurs when we use the average of previous years data within each region/indicator. This occurs when a region has data for an indicator, but not for all years.

Read in WGI data - change appended date in file name to reflect the most recent version of the saved WGI data:

d <- read.csv(here(paste0('globalprep/prs_res_wgi/v', scen_year, '/raw/worldbank_wgi_from_wdi_api_2024-05-24.csv'))) # change appended date to most recent version
d <- d %>% 
  pivot_longer(cols = CC.EST:VA.EST, names_to = "indicator", values_to = "value")


## each country has 20 years of data
d_gap_fill  <- d %>%
  group_by(country, iso3c, year) %>%
  mutate(NA_count_c_y = sum(is.na(value))) %>% # gf record: NA values within a region/year prior to gapfilling, max value is 6 (meaning that a country has no data)
  ungroup() %>%
  group_by(country, iso3c, indicator) %>% # gapfill missing data with mean of values across years within the same region/indicator
  mutate(ind_mean_c_i = mean(value, na.rm=TRUE)) %>%
  ungroup() %>%
  mutate(value = ifelse(is.na(value), ind_mean_c_i, value)) %>%
  group_by(country, iso3c, year) %>% 
  mutate(NA_count_post_gf1 = sum(is.na(value))) # gf record: NA values within a region/year after within region/indicator gapfilling (i.e. indicator is gapfilled by other years of data), used to cut regions <4 indicators (below)    

5.1 Safeguard: cut regions with < 4 indicators (if any) to calculate score.

Once gapfilling is complete, the WGI scores are calculated as an average of the 6 indicators. However, if a country is missing 4 or more of the indicators within a year the average would be very biased. In these cases, a different method should be used to gapfill these data

(NOTE: for the 2024 assessment all regions had at least 4 of the 6 indicators).

countries_no_data <- d_gap_fill %>%
  filter(NA_count_post_gf1 > 3)

countries_no_data <- unique(countries_no_data$country)
countries_no_data

## In this case, the countries with minimal data (< 4 indicators ever calculated) are deleted.  
## These will be gap-filled later on if they are deleted now.
d_gap_fill <- d_gap_fill %>%
  filter(!(country %in% countries_no_data))

6 Calculate overall WGI score for each country

This involves:

  • taking the average of the 6 indicators (assuming there are at least 4 of the 6 indicators)
  • rescaling the data from 0 to 1
d_calcs <- d_gap_fill %>%
  group_by(country, iso3c, year) %>%
  summarize(score_wgi_scale = mean(value, na.rm=T),
            NA_start = mean(NA_count_c_y), # initial mean number of NA across indicators, pre-gapfill 
            NA_post_gf_1 = mean(NA_count_post_gf1)) %>% # number of NA across indicators, post-gapfill across year gapfill within region/indicator
  ungroup() 

6.1 Check that the values in scores_wgi_scale fall within the wgi range specified below:

summary(d_calcs) # checking to make sure NA values make sense
  ## v2024: NA_start max = 6, NA_post_gf_1 max = 0

## v2024: score_wgi_scale min = -2.409, max = 1.947... perfect
wgi_range = c(-2.5, 2.5) # historic values have been between -2.5 and 2.5
d_calcs <- d_calcs %>%
  mutate(score =  (score_wgi_scale - wgi_range[1]) / (wgi_range[2] - wgi_range[1])) %>%
  ungroup(); head(d_calcs); summary(d_calcs)
# converts the scores between 0 and 1, using min/max to scale the data

# document gapfilling
d_calcs <- d_calcs %>%
  mutate(gap_fill = NA_start - NA_post_gf_1,   # if there are values in NA_post_gf_1, it means these weren't gapfilled
         gap_fill = ifelse(is.na(score), 0, gap_fill)) %>% # number of values that were gapfilled
  select(-NA_start, -NA_post_gf_1)

6.2 Explore & check intermediate d_calcs data table

d_calcs[d_calcs$gap_fill>0, ] %>% View()
d_calcs[d_calcs$country == "Niue", ] %>% View() # should have gap-fill values between 0-6
d_calcs[d_calcs$country == "American Samoa",] %>% View() # should have gap-fill values between 0-6

6.3 Save intermediate file

## save intermediate file of wgi scores pre-gapfilling (for OHI+ use)
write.csv(d_calcs %>%
            select(country, iso3c, year, score_wgi_scale, score_ohi_scale = score, gap_fill), 
          here(paste0('globalprep/prs_res_wgi/v', scen_year, '/intermediate/wgi_combined_scores_by_country.csv')),
          row.names = FALSE)

7 Convert country names to ohi regions

## We report these regions at a greater spatial resolution:

## Aruba is part of the Netherlands Antilles, but it is reported separately
country_split_1 <- data.frame(country = "Netherlands Antilles", region = c('Bonaire', 'Curacao', 'Saba', 'Sint Maarten', 'Sint Eustatius'))
country_split_2 <- data.frame(country = "Jersey, Channel Islands", region = c('Jersey', 'Guernsey'))
country_split <- rbind(country_split_1, country_split_2)

country_split_data <- country_split %>%
  left_join(d_calcs) %>%
  select(-country) %>%
  rename(country = region)

d_calcs <- d_calcs %>%
  filter(!(country %in% c("Netherlands Antilles", "Jersey, Channel Islands"))) %>%
  rbind(country_split_data)  %>%
  mutate(country = as.character(country)) %>%
  dplyr::select(-iso3c)

d_calcs$country[grep("Korea, Dem.", d_calcs$country)] <- "North Korea"

d_calcs$country[grep("Turkiye", d_calcs$country)] <- "Turkey"
# Maybe in future update package with country synonym list


## Function to convert to OHI region ID
d_calcs_rgn <- name_2_rgn(df_in = d_calcs, 
                       fld_name='country', 
                       flds_unique=c('year'))


## Combine the duplicate regions (we report these at lower resolution)
## In this case, we take the weighted average
population_weights <- data.frame(country = c("Virgin Islands (U.S.)", "Puerto Rico",
                                             "China", "Hong Kong SAR, China", "Macao SAR, China"),
                                 population = c(104917, 3205691,
                                         1410710000, 7356100, 704149))
# updated population values on 30 August 2024 (source: World Bank website, 2023 values); https://databank.worldbank.org/source/population-estimates-and-projections


d_calcs_rgn <- d_calcs_rgn %>%
  left_join(population_weights, by="country") %>%
  mutate(population = ifelse(is.na(population), 1, population)) %>% 
  group_by(rgn_id, year) %>%
  summarize(score = weighted.mean(score, population),
            gapfill_within_rgn = weighted.mean(gap_fill, population)) %>%
  ungroup() %>%
  filter(rgn_id <= 250)

summary(d_calcs_rgn)

8 Gapfill, part 2: Filling in missing territorial region value

Assigning territorial region value to be the mean of parent country value and territorial regions with data (using same sov_id).

## data that describes territories of countries
territory = rgn_master %>% 
  select(rgn_id = rgn_id_2013,
         sov_id) %>%               
  group_by(rgn_id) %>% # remove duplicated countries from this rgn_id list
  summarize(sov_id = mean(sov_id, na.rm=T)) %>% # duplicates always have the same sov_id (r2 value)
  filter(rgn_id <= 250, rgn_id != 213)

    
## expand to include all years of data
territory <- data.frame(year=yr_start:yr_end) %>% 
  merge(territory, by=NULL) 


## assign territories the values of their sovereign country
d_sovs = d_calcs_rgn %>% 
  full_join(territory, by = c('rgn_id', 'year')) %>%
  group_by(sov_id, year) %>%
  mutate(score_gf_territory = mean(score, na.rm=TRUE),
         gapfill_within_rgn = mean(gapfill_within_rgn, na.rm=TRUE))%>%
   filter(!is.na(gapfill_within_rgn)) %>%
  ungroup()

# filter(d_sovs, rgn_id %in% c(1,2,3,16) & year == 2018) %>% View() # check in console to make sure Australia and territories have the same score and sov_id

Define new data object from d_sovs which includes gapfill method and gapfilled scores:

d_gf2 <- d_sovs %>%
  mutate(gapfill_territory = ifelse(is.na(score) & !is.na(score_gf_territory), "territory", "NA")) %>%
  mutate(score = ifelse(is.na(score), score_gf_territory, score)) %>%
  select(rgn_id, year, score, gapfill_within_rgn, gapfill_territory)

summary(d_gf2)

Add region names and clean the region data, and make sure we have all the regions:

# get region names
regions <- rgn_master %>%
  filter(rgn_typ == "eez") %>%
  filter(rgn_id_2013 <= 250) %>% # > 250 are either FAO or a disputed region
  filter(rgn_id_2013 != 213) %>% # 213 is Antarctica
  select(rgn_id = rgn_id_2013, rgn_name=rgn_nam_2013) %>%
  unique() %>%
  arrange(rgn_id) 

d_gf2 <- regions %>%
  left_join(d_gf2)

8.1 Look at data table for the territories (gapfilled)

head(d_sovs)
summary(d_sovs) 
  ## v2021: score has 1029 NAs
  ## v2022: 1127 NAs
  ## v2024: 1176 NAs

8.2 Look at table with scores and gapfilling methods

## check for NA values within "score" variable
## if so, need to gapfill using UN geopolitical regions
summary(d_gf2) 
d_gf2[is.na(d_gf2$score), ]
  ## v2021: score has 0 NAs
  ## v2022: 0 NAs
  ## v2023: 0 NAs
  ## v2024: 0 NAs! 

9 Check data

Comparing this year’s values against last year’s. These should be the same unless there have been updates to WGI source data or a change to methods.

Plot most recent shared year between last year and this year’s data, and look for a relationship close to a 1:1 relationship. If data are significantly off the line, look at the original (raw) data to investigate.

# Most recent shared year should be three years before prior to the assessment year (scen_year)
recent_shared_year <- as.numeric(scen_year) - 2 
prev_assessment_year <- as.numeric(scen_year) - 1

new_2022_scores <- d_gf2 %>% # rename variable with respective year for clarity
  filter(year == recent_shared_year) %>% 
  select(rgn_id, score)

old_2022_scores <- read.csv(here(paste0('globalprep/prs_res_wgi/v', prev_assessment_year, '/output/wgi_res.csv'))) %>% # rename variable
  filter(year == recent_shared_year) %>%
  select(rgn_id, old_score=resilience_score)

score_compare_2022 <- old_2022_scores %>% 
  full_join(new_2022_scores)


score_compare_plot <- ggplot(score_compare_2022, aes(x = old_score, y = score, text = rgn_id)) +
  geom_point() +
  geom_abline(slope=1, intercept = 0, col = "red")
   
score_compare_plot
# v2024: looks good

Look at top/bottom 10 regions to make sure these seem reasonable:

## Top/Bottom 10 scorers:

tmp <- d_gf2 %>%
  filter(year==2022) %>%
  arrange(score) %>%
  select(rgn_id, score) %>%
  left_join(regions)

tmp[1:10, ] # makes sense.. mostly African countries or countries with authoritarian regimes
tmp[211:220, ] # makes sense... mostly countries with democratic socialist policies or small islands. No surprise Nordic countries are the best

hist(tmp$score)

Look at a summary to confirm scores are between 0 and 1, there are 220 regions, and there are no NAs (for this particular dataset):

summary(d_gf2) # scores are between 0-1, gapfilled info present, no NAs
length(unique(d_gf2$rgn_id)) # 220 regions
c(min(d_gf2$score), max(d_gf2$score)) # checking for score min and max; making sure they're between 0-1

10 Save the data

Save gapfilling and data for this assessment year.

tmp_data_res <- d_gf2 %>%
  select(rgn_id, year, resilience_score = score)
write.csv(tmp_data_res, here(paste0("globalprep/prs_res_wgi/v", scen_year, "/output/wgi_res.csv")), row.names=FALSE)

tmp_data_prs <- d_gf2 %>%
  mutate(score = 1 - score) %>%
  select(rgn_id, year, pressure_score = score)
write.csv(tmp_data_prs, here(paste0("globalprep/prs_res_wgi/v", scen_year, "/output/wgi_prs.csv")), row.names=FALSE)

# gapfilled data
tmp_gf <- d_gf2 %>%
  select(rgn_id, year, gapfill_within_rgn, gapfill_territory) %>%
  mutate(gapfill_within_rgn = ifelse(gapfill_within_rgn == 0, NA, gapfill_within_rgn)) %>%
  mutate(gapfill_within_rgn = ifelse(!is.na(gapfill_within_rgn), 
                                            paste0("gapfill_within_rgn: ", gapfill_within_rgn), NA)) %>%
  mutate(gapfill_territory = ifelse(gapfill_territory == "territory", "territory, mean of admin countries", NA)) %>%
  mutate(method = paste(gapfill_within_rgn, gapfill_territory, sep="; ")) %>%
  mutate(method = gsub("NA; ", "", method)) %>%
  mutate(method = gsub("; NA", "", method)) %>%
  mutate(method = gsub("NA", NA, method)) %>%
  mutate(gapfilled = ifelse(is.na(method), 0, 1)) %>%
  select(rgn_id, year, gapfilled, method)


write.csv(tmp_gf, here(paste0("globalprep/prs_res_wgi/v", scen_year, "/output/wgi_gf.csv")), row.names=FALSE)