This script gapfills the Social Progress Index (SPI) data and formats it for the OHI global assessment.
Now 11 years of data included in SPI. Created updated spi_categories.csv in Mazu. See the 2021 Methodology Report cited below for a detailed description of the SPI changes from 2020 to 2021. This is saved in Mazu and can also be downloaded here.
Citation: http://www.socialprogress.org/
Stern, S., Harmacek, J., Htitich, M., & Krylova, P.: 2021 Social Progress Index Methodology Summary. Social Progress Imperative. Washington, DC.
Source information: https://www.socialprogress.org/download https://www.socialprogress.org/index/global/methodology
Date Downloaded: 11/02/2021
Time range: 2011-2021
Native data resolution: country scores
Format: Excel file
Description: Social Progress Index scores and components for countries.
# load libraries, set directories
library(ohicore) #devtools::install_github('ohi-science/ohicore@dev')
library(dplyr)
library(stringr)
library(tidyr)
library(Hmisc)
library(here)
library(validate)
library(tidyverse)
library(readxl)
## comment out when knitting
setwd(here::here("globalprep","prs_res_spi","v2021"))
### Load FAO-specific user-defined functions
source('../../../workflow/R/common.R') # directory locations
set.seed(227)
WGI is a couple years behind the SPI, so we will use 2018 WGI for the 2019, 2020, and 2021 WGI There is a strong correlation betwen the WGI and SPI indicators.
<- read.csv('../../prs_res_wgi/v2020/output/wgi_res.csv') %>%
wgi select(rgn_id, year, wgi_score = resilience_score)
length(unique(wgi$rgn_id))
<- wgi %>%
wgi_2019 filter(year == 2018) %>%
mutate(year = 2019)
<- wgi %>%
wgi_2020 filter(year == 2018) %>%
mutate(year = 2020)
<- wgi %>%
wgi_2021 filter(year == 2018) %>%
mutate(year = 2021)
<- rbind(wgi, wgi_2019, wgi_2020, wgi_2021)
wgi
<- wgi %>%
wgi_spi left_join(spi_rgn, by=c("rgn_id", "year"))
plot(wgi_spi$wgi_score*100, wgi_spi$score, ylab="Social Progress Index, score", xlab="Worldwide Governance Score")
abline(0,1, col="red")
<- lm(score ~ wgi_score, data=wgi_spi)
mod summary(mod)
In this case, UN geopolitical regions and WGI scores are used to estimate regions with no data. Based on this analysis, a model that includes WGI data and r2 UN geopolitical regions is the best model to predict missing SPI values.
<- data.frame(year = min(spi_rgn$year):max(spi_rgn$year))
years
<- georegions %>%
rgns_gf_un merge(years) %>%
left_join(spi_rgn, by = c("rgn_id","year")) %>%
left_join(wgi, by = c("rgn_id", "year")) %>%
mutate(r2 = as.factor(r2)) %>%
mutate(r1 = as.factor(r1))
## Compare models to select a gapfilling method
<- lm(score ~ r2, data = rgns_gf_un, na.action="na.exclude")
mod1 <- lm(score ~ r2 + wgi_score, data = rgns_gf_un, na.action="na.exclude")
mod2 <- lm(score ~ r1, data = rgns_gf_un, na.action="na.exclude")
mod3 <- lm(score ~ r1 + wgi_score, data = rgns_gf_un, na.action="na.exclude")
mod4 <- lm(score ~ wgi_score, data = rgns_gf_un, na.action="na.exclude")
mod5
## the lowest AIC score is likely the best model
AIC(mod1, mod2, mod3, mod4, mod5)
## plot the models with the two lowest AIC scores
plot(predict(mod2), rgns_gf_un$score)
abline(0,1)
## Estimate missing data and gapfill
## need to add this because some R2 regions have no data
<- unique(rgns_gf_un$r2[!is.na(rgns_gf_un$score)])
r2_regions $r2 <- ifelse(rgns_gf_un$r2 %in% r2_regions, rgns_gf_un$r2, NA)
rgns_gf_un
<- unique(rgns_gf_un$r1[!is.na(rgns_gf_un$score)])
r1_regions $r1 <- ifelse(rgns_gf_un$r1 %in% r1_regions, rgns_gf_un$r1, NA)
rgns_gf_un
## Predict scores using r2 and wgi scores
<- lm(score ~ r2 + wgi_score, data = rgns_gf_un, na.action = "na.exclude")
mod_gf_r2 $score_pred_r2 <- predict(mod_gf_r2, newdata = rgns_gf_un[, c("r2", "wgi_score")])
rgns_gf_un## Predict scores using r1 and wgi scores
<- lm(score ~ r1 + wgi_score, data = rgns_gf_un, na.action = na.exclude)
mod_gf_r1 $score_pred_r1 <- predict(mod_gf_r1, newdata = rgns_gf_un[, c("r1", "wgi_score")])
rgns_gf_un## Predict scores just using wgi scores
<- lm(score ~ wgi_score, data = rgns_gf_un, na.action = na.exclude)
mod_gf_wgi $score_pred_wgi <- predict(mod_gf_wgi, newdata = data.frame(wgi_score = rgns_gf_un$wgi_score))
rgns_gf_un
## Record gapfill methods
## Combine scores with predicted model scores
<- rgns_gf_un %>%
rgns_gf mutate(method = ifelse(is.na(score), "UN georgn & WGI", method)) %>%
mutate(gapfill = ifelse(is.na(score), "1", gapfill)) %>%
mutate(score = ifelse(is.na(score), score_pred_r2, score)) %>%
mutate(score = ifelse(is.na(score), score_pred_r1, score)) %>%
mutate(score = ifelse(is.na(score), score_pred_wgi, score)) %>%
select(rgn_id, year, score, method, gapfill) %>%
mutate(score = ifelse(score > 100, 100, score),
score = ifelse(score < 0, 0, score))
summary(rgns_gf) # should be no NA values
length(unique(rgns_gf$rgn_id)) # should be 220 regions
These regions will receive an NA for their score (when established population is < 3000 people).
low_pop()
<- low_pop %>%
low_pop filter(est_population < 3000 | is.na(est_population))
<- rgns_gf %>%
rgns_gf_uninhab mutate(score = ifelse(rgn_id %in% low_pop$rgn_id, NA, score)) %>%
mutate(gapfill = ifelse(rgn_id %in% low_pop$rgn_id, NA, gapfill))
#gf_2019 <- read_csv(file.path("../v2019/output/spi_gf.csv"))
<- rgns_gf_uninhab %>%
gf_data select(rgn_id, year, gapfill, method) %>%
mutate(gapfill = ifelse(is.na(method), 0, 1))
write.csv(gf_data, "output/spi_gf.csv", row.names=FALSE)
<- rgns_gf_uninhab %>%
res_data select(rgn_id, year, resilience_score=score) %>%
mutate(resilience_score = resilience_score/100)
write.csv(res_data, "output/spi_res.csv", row.names=FALSE)
<- rgns_gf_uninhab %>%
prs_data select(rgn_id, year, pressure_score=score) %>%
mutate(pressure_score = 1 - (pressure_score/100))
write.csv(prs_data, "output/spi_prs.csv", row.names=FALSE)
## Resilience comparison, filtered for 2020
<- read_csv(file.path("output/spi_res.csv")) %>%
new_spi_res filter(year == 2020) %>%
select(rgn_id, new_resilience_score = resilience_score)
<- read_csv("../v2020/output/spi_res.csv") %>%
old_new filter(year == 2020) %>%
left_join(new_spi_res, by = 'rgn_id')
plot(old_new$new_resilience_score, old_new$resilience_score)
abline(0, 1, col="red")
## Pressure score comparison, filtered for 2020
<- read_csv(file.path("output/spi_prs.csv")) %>%
new_spi_prs filter(year == 2020) %>%
select(rgn_id, new_pressure_score = pressure_score)
<- read_csv("../v2020/output/spi_prs.csv") %>%
old_new filter(year == 2020) %>%
left_join(new_spi_prs, by = 'rgn_id')
plot(old_new$new_pressure_score, old_new$pressure_score)
abline(0, 1, col="red")
## Pressure score comparison, All years
<- read_csv(file.path("output/spi_prs.csv")) %>%
new_spi_prs #filter(year == 2018) %>%
select(rgn_id, new_pressure_score = pressure_score)
<- read_csv("../v2019/output/spi_prs.csv") %>%
old_new #filter(year == 2018) %>%
left_join(new_spi_prs, by = 'rgn_id')
plot(old_new$new_pressure_score, old_new$pressure_score)
abline(0, 1, col="red")
Social Progress Index data
Organize data and gapfill missing countries that have incomplete data. This index is comprised of 3 indicators, which are each comprised of 4 subindicators, which are comprised of several datasets. If one of the subindicators are missing, the SPI is not calculated. The first round of gapfilling involves using relationships between the the subindicators to gapfill missing data. If a region is missing all subindicator data, then a second round of gapfilling is performed using relationships between UN geopolitical regions and the World Governance Indicator to gapfill the SPI score.
The following gets all years of data.
Gapfilling: Step 1
In this case, we use relationships between the subindicators to estimate missing data.
Gapfill Basic Human Need (bhn) indicator
Gapfill Opportunity (op) indicator
Gapfill Foundations of Wellbeing (fw) indicator
Combine data to get Social Progress Index
The next step averages the 3 indicators and identifies gapfilling. All the rows where all four indicators are missing
NA_tot = 4
will have an NA in thescore
column.Assign countries to OHI regions