This script downloads WGI data and prepares it for a pressures (1 - WGI) and resilience data layer.
No methods updates; additional year of data added.
Potential improvement for future assessments: Explore a better way to gapfill… based on a linear model (~line 166)
Reference: http://info.worldbank.org/governance/wgi/index.aspx#home
Downloaded:
October 3, 2022 (source data updated Sep 2022)
Description:
The Worldwide Governance Indicators (WGI) project reports aggregate and
individual governance indicators for 215 economies over the period
1996–2021, for six dimensions of governance:
Time range: 1996-2021
library(ohicore) # devtools::install_github('ohi-science/ohicore@dev')
library(tools)
library(dplyr)
library(tidyr)
library(WDI) # install.packages('WDI') # used to extract World Development Indicator (World Bank) data
library(stringr)
library(here)
library(ggplot2)
library(plotly)
library(readr)
# check website to see what years are available: http://info.worldbank.org/governance/wgi/index.aspx#home
= 1996
yr_start = 2021
yr_end
<- "2022" scen_year
Download each of the 6 WGI indicators:
## access data through the WDI package
## get description of variables:
<- data.frame(WDI_data[[1]])
indicators grep("VA.EST", indicators$indicator), ]
indicators[grep("PV.EST", indicators$indicator), ]
indicators[grep("GE.EST", indicators$indicator), ]
indicators[grep("RQ.EST", indicators$indicator), ]
indicators[grep("RL.EST", indicators$indicator), ]
indicators[grep("CC.EST", indicators$indicator), ]
indicators[
# identify the six indicators
# WDIsearch('violence')# general search
= WDI(
key_voice indicator = WDIsearch('Voice and Accountability: Estimate', field='name')['indicator'],
country = 'all', start = yr_start, end=yr_end)
= WDI(
key_polst WDIsearch('Political Stability and Absence of Violence/Terrorism: Estimate', field='name')['indicator'],
country='all',start = yr_start, end=yr_end)
= WDI(
key_gvtef WDIsearch('Government Effectiveness: Estimate', field='name')['indicator'],
country='all',start = yr_start, end=yr_end)
= WDI(
key_regqt WDIsearch('Regulatory Quality: Estimate', field='name')['indicator'],
country='all',start = yr_start, end=yr_end)
= WDI(
key_rolaw WDIsearch('Rule of Law: Estimate', field='name')['indicator'],
country='all',start = yr_start, end=yr_end)
= WDI(
key_corrp WDIsearch('Control of Corruption: Estimate', field='name')['indicator'],
country='all',start = yr_start, end=yr_end)
Combine the indicators into a single table, with a column for each indicator, and rows for each country-year pair.
= key_voice %>%
d select(country, year, indicator) %>%
left_join(key_polst %>% select(-iso2c), by=(c('country', 'year'))) %>%
left_join(key_gvtef %>% select(-iso2c), by=(c('country', 'year'))) %>%
left_join(key_regqt %>% select(-iso2c), by=(c('country', 'year'))) %>%
left_join(key_rolaw %>% select(-iso2c), by=(c('country', 'year'))) %>%
left_join(key_corrp %>% select(-iso2c), by=(c('country', 'year'))); head(d); summary(d); sapply(d, class)
colnames(d) <- c("country", "year", "VA.EST", "PV.EST", "GE.EST", "RQ.EST", "RL.EST", "CC.EST")
Uncomment the code chunk lines when updating WGI data, this will most likely occur when calculating for new assessment year:
<- Sys.Date()
date write.csv(d, here(sprintf('globalprep/prs_res_wgi/v%s/raw/worldbank_wgi_from_wdi_api_%s.csv', scen_year, date)), row.names=FALSE)
# This dataset currently has non-OHI regions included
The first gapfilling occurs when we use the average of previous years data within each region/indicator. This occurs when a region has data for an indicator, but not for all years.
Read in WGI data - change appended date in file name to reflect the most recent version of the saved WGI data:
<- read.csv(here(paste0('globalprep/prs_res_wgi/v', scen_year, '/raw/worldbank_wgi_from_wdi_api_2022-10-03.csv'))) # change appended date to most recent version d
<- d %>%
d pivot_longer(cols = VA.EST:CC.EST, names_to = "indicator", values_to = "value")
## each country has 20 years of data
<- d %>%
d_gap_fill group_by(country, year) %>%
mutate(NA_count_c_y = sum(is.na(value))) %>% # gf record: NA values within a region/year prior to gapfilling, max value is 6 (meaning that a country has no data)
ungroup() %>%
group_by(country, indicator) %>% # gapfill missing data with mean of values across years within the same region/indicator
mutate(ind_mean_c_i = mean(value, na.rm=TRUE)) %>%
ungroup() %>%
mutate(value = ifelse(is.na(value), ind_mean_c_i, value)) %>%
group_by(country, year) %>%
mutate(NA_count_post_gf1 = sum(is.na(value))) # gf record: NA values within a region/year after within region/indicator gapfilling (i.e. indicator is gapfilled by other years of data), used to cut regions <4 indicators (below)
Once gapfilling is complete, the WGI scores are calculated as an average of the 6 indicators. However, if a country is missing 4 or more of the indicators within a year the average would be very biased. In these cases, a different method should be used to gapfill these data
(NOTE: for the 2022 assessment all regions had at least 4 of the 6 indicators).
<- d_gap_fill %>%
countries_no_data filter(NA_count_post_gf1 > 3)
<- unique(countries_no_data$country)
countries_no_data
countries_no_data
## In this case, the countries with minimal data (< 4 indicators ever calculated) are deleted.
## These will be gap-filled later on if they are deleted now.
<- d_gap_fill %>%
d_gap_fill filter(!(country %in% countries_no_data))
This involves:
<- d_gap_fill %>%
d_calcs group_by(country, year) %>%
summarize(score_wgi_scale = mean(value, na.rm=T),
NA_start = mean(NA_count_c_y), # initial mean number of NA across indicators, pre-gapfill
NA_post_gf_1 = mean(NA_count_post_gf1)) %>% # number of NA across indicators, post-gapfill across year gapfill within region/indicator
ungroup()
scores_wgi_scale
fall within the wgi range specified
below:summary(d_calcs) # checking to make sure NA values make sense
## v2022: NA_start max = 6, NA_post_gf_1 = 3
## v2022: score_wgi_sclae min = -2.45, ma = 1.97... perfect
= c(-2.5, 2.5) # historic values have been between -2.5 and 2.5 wgi_range
<- d_calcs %>%
d_calcs mutate(score = (score_wgi_scale - wgi_range[1]) / (wgi_range[2] - wgi_range[1])) %>%
ungroup(); head(d_calcs); summary(d_calcs)
# converts the scores between 0 and 1, using min/max to scale the data
# document gapfilling
<- d_calcs %>%
d_calcs mutate(gap_fill = NA_start - NA_post_gf_1, # if there are values in NA_post_gf_1, it means these weren't gapfilled
gap_fill = ifelse(is.na(score), 0, gap_fill)) %>% # number of values that were gapfilled
select(-NA_start, -NA_post_gf_1)
$gap_fill>0, ] %>% View()
d_calcs[d_calcs$country == "Niue", ] %>% View() # should have gap-fill values between 0-6
d_calcs[d_calcs$country == "American Samoa",] %>% View() # should have gap-fill values between 0-6 d_calcs[d_calcs
## save intermediate file of wgi scores pre-gapfilling (for OHI+ use)
write.csv(d_calcs %>%
select(country, year, score_wgi_scale, score_ohi_scale = score),
here(paste0('globalprep/prs_res_wgi/v', scen_year, '/intermediate/wgi_combined_scores_by_country.csv')),
row.names = FALSE)
## We report these regions at a greater spatial resolution:
## Aruba is part of the Netherlands Antilles, but it is reported separately
<- data.frame(country = "Netherlands Antilles", region = c('Bonaire', 'Curacao', 'Saba', 'Sint Maarten', 'Sint Eustatius'))
country_split_1 <- data.frame(country = "Jersey, Channel Islands", region = c('Jersey', 'Guernsey'))
country_split_2 <- rbind(country_split_1, country_split_2)
country_split
<- country_split %>%
country_split_data left_join(d_calcs) %>%
select(-country) %>%
rename(country = region)
<- d_calcs %>%
d_calcs filter(!(country %in% c("Netherlands Antilles", "Jersey, Channel Islands"))) %>%
rbind(country_split_data) %>%
mutate(country = as.character(country))
$country[grep("Korea, Dem.", d_calcs$country)] <- "North Korea"
d_calcs
$country[grep("Turkiye", d_calcs$country)] <- "Turkey"
d_calcs# Maybe in future update package with country synonym list
## Function to convert to OHI region ID
<- name_2_rgn(df_in = d_calcs,
d_calcs_rgn fld_name='country',
flds_unique=c('year'))
## Combine the duplicate regions (we report these at lower resolution)
## In this case, we take the weighted average
<- data.frame(country = c("Virgin Islands (U.S.)", "Puerto Rico",
population_weights "China", "Hong Kong SAR, China", "Macao SAR, China"),
population = c(106290, 3281538,
1410929360, 7481800, 649342))
# updated population values on 29 June 2022 (source: World Bank website, 2020 values)
<- d_calcs_rgn %>%
d_calcs_rgn left_join(population_weights, by="country") %>%
mutate(population = ifelse(is.na(population), 1, population)) %>%
group_by(rgn_id, year) %>%
summarize(score = weighted.mean(score, population),
gapfill_within_rgn = weighted.mean(gap_fill, population)) %>%
ungroup() %>%
filter(rgn_id <= 250)
summary(d_calcs_rgn)
Assigning territorial region value to be the mean of parent country
value and territorial regions with data (using same
sov_id
).
## data that describes territories of countries
= rgn_master %>%
territory select(rgn_id = rgn_id_2013,
%>%
sov_id) group_by(rgn_id) %>% # remove duplicated countries from this rgn_id list
summarize(sov_id = mean(sov_id, na.rm=T)) %>% # duplicates always have the same sov_id (r2 value)
filter(rgn_id <= 250, rgn_id != 213)
## expand to include all years of data
<- data.frame(year=yr_start:yr_end) %>%
territory merge(territory, by=NULL)
## assign territories the values of their sovereign country
= d_calcs_rgn %>%
d_sovs full_join(territory, by = c('rgn_id', 'year')) %>%
group_by(sov_id, year) %>%
mutate(score_gf_territory = mean(score, na.rm=TRUE),
gapfill_within_rgn = mean(gapfill_within_rgn, na.rm=TRUE))%>%
filter(!is.na(gapfill_within_rgn)) %>%
ungroup()
# filter(d_sovs, rgn_id %in% c(1,2,3,16) & year == 2018) %>% View() # check in console to make sure Australia and territories have the same score and sov_id
Define new data object from d_sovs which includes gapfill method and gapfilled scores:
<- d_sovs %>%
d_gf2 mutate(gapfill_territory = ifelse(is.na(score) & !is.na(score_gf_territory), "territory", "NA")) %>%
mutate(score = ifelse(is.na(score), score_gf_territory, score)) %>%
select(rgn_id, year, score, gapfill_within_rgn, gapfill_territory)
summary(d_gf2)
Add region names and clean the region data, and make sure we have all the regions:
# get region names
<- rgn_master %>%
regions filter(rgn_typ == "eez") %>%
filter(rgn_id_2013 <= 250) %>% # > 250 are either FAO or a disputed region
filter(rgn_id_2013 != 213) %>% # 213 is Antarctica
select(rgn_id = rgn_id_2013, rgn_name=rgn_nam_2013) %>%
unique() %>%
arrange(rgn_id)
<- regions %>%
d_gf2 left_join(d_gf2)
head(d_sovs)
summary(d_sovs)
## v2021: score has 1029 NAs
## v2022: 1127 NAs
## check for NA values within "score" variable
## if so, need to gapfill using UN geopolitical regions
summary(d_gf2)
is.na(d_gf2$score), ]
d_gf2[## v2021: score has 0 NAs
## v2022: 0 NAs
Comparing this year’s values against last year’s. These should be the same unless there have been updates to WGI source data or a change to methods. For this year (2019), there was a small change that affected a few territorial regions. In the past, we used the sovereign country value, but in this case, we averaged the sovereign country and the available territorial values.
Plot most recent shared year between last year and this year’s data, and look for a relationship close to a 1:1 relationship. If data are significantly off the line, look at the original (raw) data to investigate.
# Most recent shared year should be three years before prior to the assessment year (scen_year)
<- as.numeric(scen_year) - 3
recent_shared_year <- as.numeric(scen_year) - 1
prev_assessment_year
<- d_gf2 %>% # rename variable with respective year for clarity
new_2019_scores filter(year == recent_shared_year) %>%
select(rgn_id, score)
<- read.csv(here(paste0('globalprep/prs_res_wgi/v', prev_assessment_year, '/output/wgi_res.csv'))) %>% # rename variable
old_2019_scores filter(year == recent_shared_year) %>%
select(rgn_id, old_score=resilience_score)
<- old_2019_scores %>%
score_compare_2019 full_join(new_2019_scores)
<- ggplot(score_compare_2019, aes(x = old_score, y = score, text = rgn_id)) +
score_compare_plot geom_point() +
geom_abline(slope=1, intercept = 0, col = "red")
ggplotly(score_compare_plot) # looks good (no outliers), no real need for comparison
Look at top/bottom 10 regions to make sure these seem reasonable:
## Top/Bottom 10 scorers:
<- d_gf2 %>%
tmp filter(year==2019) %>%
arrange(score) %>%
select(rgn_id, score) %>%
left_join(regions)
1:10, ] # makes sense.. mostly African countries or countries with authoritarian regimes
tmp[211:220, ] # makes sense... mostly countries with democratic socialist policies or small islands. No surprise New Zealand is the best
tmp[
hist(tmp$score)
Look at a summary to confirm scores are between 0 and 1, there are 220 regions, and there are no NAs (for this particular dataset):
summary(d_gf2) # scores are between 0-1, gapfilled info present, no NAs
length(unique(d_gf2$rgn_id)) # 220 regions
c(min(d_gf2$score), max(d_gf2$score)) # checking for score min and max; making sure they're between 0-1
Save gapfilling and data for this assessment year.
<- d_gf2 %>%
tmp_data_res select(rgn_id, year, resilience_score = score)
write.csv(tmp_data_res, here(paste0("globalprep/prs_res_wgi/v", scen_year, "/output/wgi_res.csv")), row.names=FALSE)
<- d_gf2 %>%
tmp_data_prs mutate(score = 1 - score) %>%
select(rgn_id, year, pressure_score = score)
write.csv(tmp_data_prs, here(paste0("globalprep/prs_res_wgi/v", scen_year, "/output/wgi_prs.csv")), row.names=FALSE)
# gapfilled data
<- d_gf2 %>%
tmp_gf select(rgn_id, year, gapfill_within_rgn, gapfill_territory) %>%
mutate(gapfill_within_rgn = ifelse(gapfill_within_rgn == 0, NA, gapfill_within_rgn)) %>%
mutate(gapfill_within_rgn = ifelse(!is.na(gapfill_within_rgn),
paste0("gapfill_within_rgn: ", gapfill_within_rgn), NA)) %>%
mutate(gapfill_territory = ifelse(gapfill_territory == "territory", "territory, mean of admin countries", NA)) %>%
mutate(method = paste(gapfill_within_rgn, gapfill_territory, sep="; ")) %>%
mutate(method = gsub("NA; ", "", method)) %>%
mutate(method = gsub("; NA", "", method)) %>%
mutate(method = gsub("NA", NA, method)) %>%
mutate(gapfilled = ifelse(is.na(method), 0, 1)) %>%
select(rgn_id, year, gapfilled, method)
write.csv(tmp_gf, here(paste0("globalprep/prs_res_wgi/v", scen_year, "/output/wgi_gf.csv")), row.names=FALSE)
Checking on outliers in 2022 assessment:
## Do this after updating in global
<- read_csv(paste0("globalprep/prs_res_wgi/v", scen_year, "/raw/worldbank_wgi_from_wdi_api_2022-10-03.csv"))
worldbank_wgi_from_wdi_api
<- worldbank_wgi_from_wdi_api %>%
worldbank_2018 filter(year == 2018) %>%
filter(country == "New Zealand")
# Compare to data from 2021:
<- read_csv(paste0("globalprep/prs_res_wgi/v", prev_assessment_year, "/raw/worldbank_wgi_from_wdi_api_2021-04-29.csv"))
worldbank_wgi_from_wdi_api_2021
<- worldbank_wgi_from_wdi_api_2021 %>%
worldbank_2018_old filter(year == 2018) %>%
filter(country == "New Zealand")