ohi logo
OHI Science | Citation policy

REFERENCE RMD FILE

1 Summary

This document describes the steps for obtaining and wrangling Global Competitiveness Index data to create the li_gci layer, used in resilience calculations for multiple goals for the global assessment. The general data preparation calculations are summarized here. For in-depth explanation and context see the livelihoods and economies subgoals model summaries.

2 Updates from Previous Assessments

Slight change to gapfilling for North Korea. Score was previously assigned to be 2.8, which was the minimum score calculated in the 2014 assessment. Now score is assigned to be the minimum value from recent 6 years of global data (non-gapfilled data). No other updates to the methodology were made.

3 Data Sources

Reference: https://www.weforum.org/reports/the-global-competitiveness-report-2017-2018

Downloaded: May 4, 2018

Description: A weighted index based on 12 socioeconomic factors relevant to economic competitiveness and productivity: institutions, infrastructure, macroeconomic environment, health and primary education, higher education and training, goods market efficiency, labor market efficiency, financial market development, technological readiness, market size, business sophistication, and innovation. See interactive map. Report on the 2017-2018 global competitiveness index is located here, and data can be downloaded here.

Native data resolution: Country scores

Time range: 2007-2017

Format: Excel file


4 Setup

4.1 Load Libraries

Load all relevant libraries, source common.R, and define frequently used pathnames. Manually change scenario and data years in file pathnames code chunk to reflect the most recent data and current assessment year.

## set options for all chunks in code
knitr::opts_chunk$set(warning=FALSE, message=FALSE, fig.width = 6, fig.height = 4, fig.path = 'figs/')

## UPDATE THESE!
scenario_yr <- "v2018" # change to reflect assessment year
data_yr_gci <- "d2018" # change to reflect year of most recently downloaded data

## comment out when knitting
# setwd(paste0("globalprep/res_gci/", scenario_yr))
## source common.R
source("../../../src/R/common.R")

## define commonly used file paths
path_raw_data <- file.path(dir_M, "git-annex/globalprep/_raw_data")
if(!requireNamespace("ohicore", quietly = TRUE))
  devtools::install_github("ohi-science/ohicore")
if(!requireNamespace("dplyr", quietly = TRUE))
  install.packages("tidyverse")

pkgs <- c("ohicore", "dplyr", "tidyr", "stringr", "readr")
lapply(pkgs, require, character.only = TRUE)

4.2 Import Raw Data

dir_wef  <- list.files(file.path(path_raw_data, sprintf("WEF-Economics/%s", data_yr_gci)), 
                       pattern = "GCI_Dataset\\S{10}.csv$", 
                       full.names = TRUE)
wef_raw <- read.csv(dir_wef, skip = 3, check.names = FALSE, stringsAsFactors = FALSE)

dir_wef # check reading the correct year of downloaded data

5 Methods and Calculations

5.1 Wrangle + Tidy

wef <- wef_raw[ , names(wef_raw) != ""] # eliminate columns without names

wef <- wef %>%
  filter(Series == "Global Competitiveness Index") %>%
  filter(Attribute == "Value") %>%
  select(-(1:2), -(4:8), year = Edition) %>%
  gather(country, value, -year) %>%
  mutate(score = as.numeric(value)) %>% # global competitiveness index score
  mutate(year = as.numeric(as.character(substring(year, 1, 4)))) %>%
  select(year, country, score) %>%
  filter(year >= 2010)

5.2 Gapfilling

Gapfilling for this resilience layer consists of three main steps:

  1. If a region has only one value from 2010 onward, use that value to gapfill other years
  2. Calculate and apply linear models to gapfill years if there is > 1 missing year for a region
  3. Use UN georegions to gapfill if there are no values for the OHI region

5.2.1 Gapfill Part I - Using Single Value or LM

wef_gf <- wef %>%
  group_by(country) %>%
  mutate(N = sum(!is.na(score))) %>%
  mutate(gf_mean = mean(score, na.rm=TRUE)) %>% # fill with single value
  ungroup()

table(wef_gf$N) # how many rows in each category of 1-8 missing scores?
filter(wef_gf, N==1) # Belize is only region with just 1 year of data (2011)

wef_gf <- wef_gf %>%
  group_by(country) %>%
  do({
    mod <- lm(score ~ year, data = .)
    gf_lm <- predict(mod, newdata = .[c('year')])
    data.frame(., gf_lm) # fill with linear model if more than 1 missing year for a region
  }) %>%
  ungroup() # warning message bc some linear models predicted on just 1 year of data, but that's OK

filter(wef_gf, N==4) # check the data: filter by a few different N and countries

5.2.2 Document Gapfilling Part I

N_max <- max(wef_gf$N) # assumes at least one region with all years of data

wef_gf <- wef_gf %>%
  mutate(gapfill = ifelse(N < N_max & is.na(score), 1, 0)) %>%
  mutate(method = ifelse(gapfill == 1 & N == 1, "within region: one value all years", NA)) %>%
  mutate(method = ifelse(gapfill == 1 & N < N_max & N > 1, "within region: lm", method)) %>%
  mutate(score = ifelse(gapfill == 1 & N == 1, gf_mean, score)) %>%
  mutate(score = ifelse(gapfill == 1 & N < N_max & N > 1, gf_lm, score)) %>%
  select(year, country, score, gapfill, method)

summary(wef_gf)
length(unique(wef_gf$country)) # 159 countries (2018 assessment)

5.2.3 To OHI Regions

wef <- wef_gf %>%
  mutate(country = as.character(country)) %>%
  mutate(country = ifelse(country == "Congo, Democratic Rep.", "Democratic Republic of the Congo", country)) %>%
  mutate(country = ifelse(str_detect(country, "C\\Ste d'Ivoire"), "Ivory Coast", country))

wef_rgn <- name_2_rgn(df_in = wef, 
                       fld_name='country', 
                       flds_unique=c('country','year'))

table(wef_rgn$rgn_id)
filter(wef_rgn, rgn_id == 209) # region 209 is China

weight_data <- data.frame(country = c("China", "Hong Kong SAR"),
                          population = c(1379000000, 7347000))

wef_rgn <- wef_rgn %>%
  arrange(country, year) %>%
  left_join(weight_data, by = "country") %>%
  mutate(population = ifelse(is.na(population), 1, population)) %>%
  group_by(rgn_id, rgn_name, gapfill, method, year) %>%
  summarize(score = weighted.mean(score, population)) %>%
  select(rgn_id, rgn_name, year, gapfill, method, score)

head(wef_rgn, 10) # check
summary(wef_rgn)
setdiff(seq(1, 250), unique(wef_rgn$rgn_id)) # missing regions...

5.2.4 Gapfill Part II - Using UN Georegions

years <- data.frame(year = min(wef_rgn$year) : max(wef_rgn$year))

wef_rgn_gf_un <- georegions %>% # georegions sourced from common.R
  merge(years) %>%
  left_join(wef_rgn) # this joining adds in missing regions; will be gapfilled by methods below

## compare models to select a gapfilling method (which set of georegions to use)
mod1 <- lm(score ~ as.factor(r2), data = wef_rgn_gf_un)
mod2 <- lm(score ~ as.factor(r2) + year, data = wef_rgn_gf_un)
mod3 <- lm(score ~ as.factor(r1), data = wef_rgn_gf_un)

summary(mod1)
summary(mod2)
summary(mod3)

## include since some R2 regions have no data...
r2_regions <- unique(wef_rgn_gf_un$r2[!is.na(wef_rgn_gf_un$score)])
wef_rgn_gf_un$r2 <- ifelse(wef_rgn_gf_un$r2 %in% r2_regions, wef_rgn_gf_un$r2, NA)
r1_regions <- unique(wef_rgn_gf_un$r1[!is.na(wef_rgn_gf_un$score)])
wef_rgn_gf_un$r1 <- ifelse(wef_rgn_gf_un$r1 %in% r1_regions, wef_rgn_gf_un$r1, NA)

## estimate missing data and predict gapfilling-by-un regions scores
mod_gf_r2 <- lm(score ~ as.factor(r2), data = wef_rgn_gf_un, na.action = na.exclude)
wef_rgn_gf_un$score_pred_r2 <- predict(mod_gf_r2, newdata = data.frame(r2 = wef_rgn_gf_un$r2))
mod_gf_r1 <- lm(score ~ as.factor(r1), data = wef_rgn_gf_un, na.action = na.exclude)
wef_rgn_gf_un$score_pred_r1 <- predict(mod_gf_r1, newdata = data.frame(r1 = wef_rgn_gf_un$r1))

head(wef_rgn_gf_un) # checks
summary(wef_rgn_gf_un)

5.3 Organize and Save

5.3.1 Organize the Data

## incorporate gapfilling part II (with un georegions) including documentation
## running this line by line is helpful and view between each line; esp after last mutate(method = ...)
wef_gf2 <- wef_rgn_gf_un %>%
  mutate(gapfill = ifelse((is.na(score) & !(is.na(score_pred_r1))), 1, gapfill)) %>%
  mutate(method = ifelse((is.na(score) & !(is.na(score_pred_r2))), "UN r2 georegion", method)) %>%
  mutate(method = ifelse((is.na(score) & is.na(score_pred_r2) & !is.na(score_pred_r1)), "UN r1 georegion", method)) %>%
  mutate(score = ifelse(is.na(score), score_pred_r2, score)) %>%
  mutate(score = ifelse(is.na(score), score_pred_r1, score)) %>%
  select(rgn_id, year, gapfill, method, score)
  
summary(wef_gf2)
length(unique(wef_gf2$rgn_id)) # checks; should be 220 regions

5.3.2 Uninhabited Regions

These regions will receive an NA for their score (when established population is < 100 people).

uninhab <- read.csv("../../../src/LookupTables/rgn_uninhabited_islands.csv") %>%
  filter(is.na(est_population) | est_population < 100)

wef <- wef_gf2 %>%
  mutate(score = ifelse(rgn_id %in% uninhab$rgn_id, NA, score)) %>%
  mutate(gapfill = ifelse(rgn_id %in% uninhab$rgn_id, NA, gapfill)) %>%
  mutate(method = ifelse(rgn_id %in% uninhab$rgn_id, NA, method))

unique(wef[is.na(wef$score),"rgn_id"]) # regions missing scores
uninhab$rgn_id # check this set and above set of rgn_ids equal

5.3.3 Gapfill North Korea

## appears to have been calculated at one time with real data; fill with the old value of 2.8
## gapfilled using un regions r2 above, but replace score (special case)
nk_gf <- wef %>% 
  filter(gapfill == 0) %>% 
  filter(year >= max(year)-5) %>%
  summarize(score = min(score))

wef <- wef[-which(wef$rgn_id == 21),]
nk <- data.frame(rgn_id = 21, year = min(wef$year):max(wef$year), gapfill = 1, 
                 method = "min from recent 6 years of global data", score = nk_gf)
wef <- rbind(wef, nk)

5.4 Save Data

res_data <- wef %>%
  select(rgn_id, year, resilience_score = score) %>%
  mutate(resilience_score = resilience_score/7) # score ranges from 0-7, normalize to range of 0-1
write.csv(res_data, "output/gci_res.csv", row.names = FALSE)

gf_data <- wef %>%
  select(rgn_id, year, gapfill, method)
write.csv(gf_data, "output/gci_gf.csv", row.names = FALSE)

5.5 Additional Data Checks and/or Meta-Analysis

5.5.1 Compare with Previous Years

previous_yr <- as.numeric(substr(scenario_yr, 2, 5)) - 1
old <- read.csv(sprintf("../v%s/output/gci_res.csv", previous_yr)) 
old <- old %>%
  rename(old_score = resilience_score) %>%
  left_join(res_data, by = c("rgn_id", "year"))
plot(old$old_score, old$resilience_score)
# identify(old$old_score, old$resilience_score, labels = old$rgn_id) # to look at outliers
abline(0,1, col="red")