ohi logo
OHI Science | Citation policy

REFERENCE RMD FILE

1 Summary

This script downloads WGI data and prepares it for a pressures (1 - WGI) and resilience data layer.

2 Updates from previous assessment

No methods updates; additional year of data added.

Consider this improvement for future assessments: create a linear model to estimate missing data rather than just taking averages of years with data (~ line 166).


3 Data Source

Reference: http://info.worldbank.org/governance/wgi/index.aspx#home

Downloaded: June 23 2020 (data updated Sep 20 2019)

Description:
The Worldwide Governance Indicators (WGI) project reports aggregate and individual governance indicators for 215 economies over the period 1996–2018, for six dimensions of governance:

  • Voice and Accountability
  • Political Stability and Absence of Violence
  • Government Effectiveness
  • Regulatory Quality
  • Rule of Law
  • Control of Corruption

Time range: 1996-2018


library(ohicore) # devtools::install_github('ohi-science/ohicore@dev')
library(tools)
library(dplyr)
library(tidyr)
library(WDI) # install.packages('WDI')  # used to extract World Development Indicator (World Bank) data 
library(stringr)
library(here)
library(ggplot2)
library(plotly)


# check website to see what years are available: http://info.worldbank.org/governance/wgi/index.aspx#home
yr_start = 1996
yr_end   = 2018

4 Obtain the WGI data

Download each of the 6 WGI indicators:

## access data through the WDI package

## get description of variables:
indicators <-  data.frame(WDI_data[[1]])
indicators[grep("VA.EST", indicators$indicator), ]
indicators[grep("PV.EST", indicators$indicator), ]
indicators[grep("GE.EST", indicators$indicator), ]
indicators[grep("RQ.EST", indicators$indicator), ]
indicators[grep("RL.EST", indicators$indicator), ]
indicators[grep("CC.EST", indicators$indicator), ]

# identify the six indicators
# WDIsearch('violence')# general search
key_voice = WDI(
  indicator = WDIsearch('Voice and Accountability: Estimate', field='name')['indicator'],
  country = 'all', start = yr_start, end=yr_end)

key_polst = WDI(
  WDIsearch('Political Stability and Absence of Violence/Terrorism: Estimate', field='name')['indicator'],
  country='all',start = yr_start, end=yr_end)

key_gvtef = WDI(
  WDIsearch('Government Effectiveness: Estimate', field='name')['indicator'],
  country='all',start = yr_start, end=yr_end)

key_regqt = WDI(
  WDIsearch('Regulatory Quality: Estimate', field='name')['indicator'],
  country='all',start = yr_start, end=yr_end)

key_rolaw = WDI(
  WDIsearch('Rule of Law: Estimate', field='name')['indicator'],
  country='all',start = yr_start, end=yr_end)

key_corrp = WDI(
  WDIsearch('Control of Corruption: Estimate', field='name')['indicator'],
  country='all',start = yr_start, end=yr_end)

Combine the indicators into a single table, with a column for each indicator, and rows for each country-year pair.

d = key_voice %>% 
  select(country, year, indicator) %>%
  left_join(key_polst %>% select(-iso2c), by=(c('country', 'year'))) %>%
  left_join(key_gvtef %>% select(-iso2c), by=(c('country', 'year'))) %>%
  left_join(key_regqt %>% select(-iso2c), by=(c('country', 'year'))) %>%
  left_join(key_rolaw %>% select(-iso2c), by=(c('country', 'year'))) %>%
  left_join(key_corrp %>% select(-iso2c), by=(c('country', 'year'))); head(d); summary(d); sapply(d, class)

colnames(d) <- c("country", "year", "VA.EST", "PV.EST", "GE.EST",   "RQ.EST",   "RL.EST", "CC.EST")

4.1 Save a record of any new raw data for archival purposes

Uncomment the code chunk lines when updating WGI data, this will most likely occur when calculating for new assessment year:

date <- Sys.Date()
write.csv(d, here(sprintf('globalprep/prs_res_wgi/v2020/raw/worldbank_wgi_from_wdi_api_%s.csv', date)), row.names=FALSE)


# This dataset currently has non-OHI regions included

5 Gapfill, part 1: filling missing years of data for indicators, within countries

The first gapfilling occurs when we use the average of previous years data within each region/indicator. This occurs when a region has data for an indicator, but not for all years.

Read in WGI data - change appended date in file name to reflect the most recent version of the saved WGI data:

d <- read.csv(here('globalprep/prs_res_wgi/v2020/raw/worldbank_wgi_from_wdi_api_2020-06-23.csv')) # change appended date so that it only changes if you do so actively
d <- gather(d, "indicator", "value", VA.EST:CC.EST)


## each country has 20 years of data
d_gap_fill  <- d %>%
  group_by(country, year) %>%
  mutate(NA_count_c_y = sum(is.na(value))) %>% # gf record: NA values within a region/year prior to gapfilling, max value is 6 (meaning that a country has no data)
  ungroup() %>%
  group_by(country, indicator) %>% # gapfill missing data with mean of values across years within the same region/indicator
  mutate(ind_mean_c_i = mean(value, na.rm=TRUE)) %>%
  ungroup() %>%
  mutate(value = ifelse(is.na(value), ind_mean_c_i, value)) %>%
  group_by(country, year) %>% 
  mutate(NA_count_post_gf1 = sum(is.na(value))) # gf record: NA values within a region/year after within region/indicator gapfilling (i.e. indicator is gapfilled by other years of data), used to cut regions <4 indicators (below)    

Explore creating a linear model to estimate missing data rather than just taking averages of years with data (~line 166)

## explore creating a linear model to estimate missing data rather than just taking averages of years with data (~line 166)


## each country has 20 years of data
# d_gap_fill_lm  <- d %>%
#   group_by(country, year) %>%
#   mutate(NA_count_c_y = sum(is.na(value))) %>% # gf record: NA values within a region/year prior to gapfilling, max value is 6 (meaning that a country has no data)
#   ungroup() %>%
#   group_by(country, indicator) # gapfill missing data with mean of values across years within the same region/indicator
# 
# ## linear model 
# ind_lm_c_i <- lm(value ~ ., data = d, na.action = na.omit)
# summary(ind_lm_c_i)
# 
# 
#   mutate(ind_lm_c_i = lm(value ~ ., data = d)) %>%
#   ungroup() %>%
#   mutate(value = ifelse(is.na(value), ind_lm_c_i, value)) %>%
#   group_by(country, year) %>% 
#   mutate(NA_count_post_gf1 = sum(is.na(value))) # gf record: NA values within a region/year after within region/indicator gapfilling (i.e. indicator is gapfilled by other years of data), used to cut regions <4 indicators (below)

5.1 Safeguard: cut regions with < 4 indicators (if any) to calculate score.

Once gapfilling is complete, the WGI scores are calculated as an average of the 6 indicators. However, if a country is missing 4 or more of the indicators within a year the average would be very biased. In these cases, a different method should be used to gapfill these data

(NOTE: for the 2020 assessment all regions had at least 4 of the 6 indicators).

countries_no_data <- d_gap_fill %>%
  filter(NA_count_post_gf1 > 3)

countries_no_data <- unique(countries_no_data$country)
countries_no_data

## In this case, the countries with minimal data (< 4 indicators ever calculated) are deleted.  
## These will be gap-filled later on if they are deleted now.
d_gap_fill <- d_gap_fill %>%
  filter(!(country %in% countries_no_data))

6 Calculate overall WGI score for each country

This involves:

  • taking the average of the 6 indicators (assuming there are at least 4 of the 6 indicators)
  • rescaling the data from 0 to 1
d_calcs <- d_gap_fill %>%
  group_by(country, year) %>%
  summarize(score_wgi_scale = mean(value, na.rm=T),
            NA_start = mean(NA_count_c_y), # initial mean number of NA across indicators, pre-gapfill 
            NA_post_gf_1 = mean(NA_count_post_gf1)) %>% # number of NA across indicators, post-gapfill across year gapfill within region/indicator
  ungroup() 

6.1 Check that the values in scores_wgi_scale fall within the wgi range specified below:

# summary(d_calcs) - checking to make sure NA values make sense
  ## v2020: NA_start max = 6, NA_post_gf_1 = 3
wgi_range = c(-2.5, 2.5) # historic values have been between -2.5 and 2.5
d_calcs <- d_calcs %>%
  mutate(score =  (score_wgi_scale - wgi_range[1]) / (wgi_range[2] - wgi_range[1])) %>%
  ungroup(); head(d_calcs); summary(d_calcs)
# converts the scores between 0 and 1, using min/max to scale the data

# document gapfilling
d_calcs <- d_calcs %>%
  mutate(gap_fill = NA_start - NA_post_gf_1,   # if there are values in NA_post_gf_1, it means these weren't gapfilled
         gap_fill = ifelse(is.na(score), 0, gap_fill)) %>% # number of values that were gapfilled
  select(-NA_start, -NA_post_gf_1)

6.2 Explore & check intermediate d_calcs data table

d_calcs[d_calcs$gap_fill>0, ]
d_calcs[d_calcs$country == "New Caledonia", ]  # no data, was deleted earlier
d_calcs[d_calcs$country == "Niue", ] # should have gap-fill values between 0-6
d_calcs[d_calcs$country == "American Samoa",] # should have gap-fill values between 0-6

6.3 Save intermediate file

## save intermediate file of wgi scores pre-gapfilling (for OHI+ use)
write.csv(d_calcs %>%
            select(country, year, score_wgi_scale, score_ohi_scale = score), 
          here('globalprep/prs_res_wgi/v2020/intermediate/wgi_combined_scores_by_country.csv'),
          row.names = FALSE)

7 Convert country names to ohi regions

## We report these regions at a greater spatial resolution:

## Aruba is part of the Netherlands Antilles, but it is reported separately
country_split_1 <- data.frame(country = "Netherlands Antilles", region = c('Bonaire', 'Curacao', 'Saba', 'Sint Maarten', 'Sint Eustatius'))
country_split_2 <- data.frame(country = "Jersey, Channel Islands", region = c('Jersey', 'Guernsey'))
country_split <- rbind(country_split_1, country_split_2)

country_split_data <- country_split %>%
  left_join(d_calcs) %>%
  select(-country) %>%
  rename(country = region)

d_calcs <- d_calcs %>%
  filter(!(country %in% c("Netherlands Antilles", "Jersey, Channel Islands"))) %>%
  rbind(country_split_data)  %>%
  mutate(country = as.character(country))

d_calcs$country[grep("Korea, Dem.", d_calcs$country)] <- "North Korea"
# Maybe in future update package with country synonym list


## Function to convert to OHI region ID
d_calcs_rgn <- name_2_rgn(df_in = d_calcs, 
                       fld_name='country', 
                       flds_unique=c('year'))
# Eswatini is a landlocked country (aka Swaziland) in Southern Africa

## Combine the duplicate regions (we report these at lower resolution)
## In this case, we take the weighted average
population_weights <- data.frame(country = c("Virgin Islands (U.S.)", "Puerto Rico",
                                             "China", "Hong Kong SAR, China", "Macao SAR, China"),
                                 population = c(106977, 3195153,
                                         1392730000, 7451000, 631636))
# updated population values on 23 June 2020 (source: World Bank website, 2018 values)


d_calcs_rgn <- d_calcs_rgn %>%
  left_join(population_weights, by="country") %>%
  mutate(population = ifelse(is.na(population), 1, population)) %>% 
  group_by(rgn_id, year) %>%
  summarize(score = weighted.mean(score, population),
            gapfill_within_rgn = weighted.mean(gap_fill, population)) %>%
  ungroup() %>%
  filter(rgn_id <= 250)

summary(d_calcs_rgn)

8 Gapfill, part 2: Filling in missing territorial region value

Assigning territorial region value to be the mean of parent country value and territorial regions with data (using same sov_id).

## data that describes territories of countries
territory = rgn_master %>% 
  select(rgn_id = rgn_id_2013,
         sov_id) %>%               
  group_by(rgn_id) %>% # remove duplicated countries from this rgn_id list
  summarize(sov_id = mean(sov_id, na.rm=T)) %>% # duplicates always have the same sov_id (r2 value)
  filter(rgn_id <= 250, rgn_id != 213)

    
## expand to include all years of data
territory <- data.frame(year=yr_start:yr_end) %>% 
  merge(territory, by=NULL) 


## assign territories the values of their sovereign country
d_sovs = d_calcs_rgn %>% 
  full_join(territory, by = c('rgn_id', 'year')) %>%
  group_by(sov_id, year) %>%
  mutate(score_gf_territory = mean(score, na.rm=TRUE),
         gapfill_within_rgn = mean(gapfill_within_rgn, na.rm=TRUE))%>%
   filter(!is.na(gapfill_within_rgn)) %>%
  ungroup()

# filter(d_sovs, rgn_id %in% c(1,2,3,16) & year == 2018) check in console to make sure Australia and territories have the same score and sov_id

Define new data object from d_sovs which includes gapfill method and gapfilled scores:

d_gf2 <- d_sovs %>%
  mutate(gapfill_territory = ifelse(is.na(score) & !is.na(score_gf_territory), "territory", "NA")) %>%
  mutate(score = ifelse(is.na(score), score_gf_territory, score)) %>%
  select(rgn_id, year, score, gapfill_within_rgn, gapfill_territory)

Add region names and clean the region data, and make sure we have all the regions:

# get region names
regions <- rgn_master %>%
  filter(rgn_typ == "eez") %>%
  filter(rgn_id_2013 <= 250) %>% # > 250 are either FAO or a disputed region
  filter(rgn_id_2013 != 213) %>% # 213 is Antarctica
  select(rgn_id = rgn_id_2013, rgn_name=rgn_nam_2013) %>%
  unique() %>%
  arrange(rgn_id) 

d_gf2 <- regions %>%
  left_join(d_gf2)

8.1 Look at data table for the territories (gapfilled)

head(d_sovs)
summary(d_sovs) 
  ## v2020: score has 980 NAs

8.2 Look at table with scores and gapfilling methods

## check for NA values within "score" variable
## if so, need to gapfill using UN geopolitical regions
summary(d_gf2) 
  ## v2020: score has 0 NAs

9 Check data

Comparing this year’s values against last year’s. These should be the same unless there have been updates to WGI source data or a change to methods. For this year (2019), there was a small change that affected a few territorial regions. In the past, we used the sovereign country value, but in this case, we averaged the sovereign country and the available territorial values.

Plot most recent shared year between last year and this year’s data, and look for a relationship close to a 1:1 relationship. If data are significantly off the line, look at the original (raw) data to investigate.

new2017 <- d_gf2 %>%
  filter(year==2017) %>% # update to most recent shared year for comparison
  select(rgn_id, score)

old2017 <- read.csv(here('globalprep/prs_res_wgi/v2019/output/wgi_res.csv')) %>% # change filepath to the previous assessment year's, rename variable for clarity
  filter(year == 2017) %>%
  select(rgn_id, old_score=resilience_score) %>%
  full_join(new2017)


score_compare_plot <- ggplot(old2017, aes(x = old_score, y = score, text = rgn_id)) +
  geom_point() +
  geom_abline(slope=1, intercept = 0, col = "red")
   
ggplotly(score_compare_plot) # looks good (no outliers), no real need for comparison

Look at top/bottom 10 regions to make sure these seem reasonable:

## Top/Bottom 10 scorers:

tmp <- d_gf2 %>%
  filter(year==2018) %>%
  arrange(score) %>%
  select(rgn_id, score) %>%
  left_join(regions)

tmp[1:10, ]
tmp[211:220, ]

hist(tmp$score)

Look at a summary to confirm scores are between 0 and 1, there are 220 regions, and there are no NAs (for this particular dataset):

summary(d_gf2) # scores are between 0-1, gapfilled info present, no NAs
length(unique(d_gf2$rgn_id)) # 220 regions
c(min(d_gf2$score), max(d_gf2$score)) # checking for score min and max; making sure they're between 0-1

10 Save the data

Save gapfilling and data for this assessment year.

tmp_data_res <- d_gf2 %>%
  select(rgn_id, year, resilience_score = score)
write.csv(tmp_data_res, here("globalprep/prs_res_wgi/v2020/output/wgi_res.csv"), row.names=FALSE)

tmp_data_prs <- d_gf2 %>%
  mutate(score = 1 - score) %>%
  select(rgn_id, year, pressure_score = score)
write.csv(tmp_data_prs, here("globalprep/prs_res_wgi/v2020/output/wgi_prs.csv"), row.names=FALSE)

# gapfilled data
tmp_gf <- d_gf2 %>%
  select(rgn_id, year, gapfill_within_rgn, gapfill_territory) %>%
  mutate(gapfill_within_rgn = ifelse(gapfill_within_rgn == 0, NA, gapfill_within_rgn)) %>%
  mutate(gapfill_within_rgn = ifelse(!is.na(gapfill_within_rgn), 
                                            paste0("gapfill_within_rgn: ", gapfill_within_rgn), NA)) %>%
  mutate(gapfill_territory = ifelse(gapfill_territory == "territory", "territory, mean of admin countries", NA)) %>%
  mutate(method = paste(gapfill_within_rgn, gapfill_territory, sep="; ")) %>%
  mutate(method = gsub("NA; ", "", method)) %>%
  mutate(method = gsub("; NA", "", method)) %>%
  mutate(method = gsub("NA", NA, method)) %>%
  mutate(gapfilled = ifelse(is.na(method), 0, 1)) %>%
  select(rgn_id, year, gapfilled, method)


write.csv(tmp_gf, here("globalprep/prs_res_wgi/v2020/output/wgi_gf.csv"), row.names=FALSE)
  # change format to match data SOP, add method column

Checking on outliers in 2019 assessment:

## Do this after updating in global

worldbank_wgi_from_wdi_api <- read_csv("globalprep/prs_res_wgi/v2019/raw/worldbank_wgi_from_wdi_api_2019-04-15.csv")

worldbank_2016 <- worldbank_wgi_from_wdi_api %>% 
  filter(year == 2016) %>% 
  filter(country == "Australia" | country == "Marshall Islands")

# Compare to data from 2018:
worldbank_wgi_from_wdi_api_2018 <- read_csv("globalprep/prs_res_wgi/v2018/raw/worldbank_wgi_from_wdi_api_2018-04-13.csv")

worldbank_old <- worldbank_wgi_from_wdi_api_2018 %>% 
  filter(year == 2016) %>% 
  filter(country == "Australia" | country == "Marshall Islands")