1 Summary

This script gapfills the Social Progress Index (SPI) data and formats it for the OHI global assessment.

2 Updates

Six years of data are now included in the SPI. Created an updated spi_categories.csv in Mazu. See the 2019 Methodology Report cited below for a detailed description of the SPI changes from 2018 to 2019. This is saved in Mazu and can also be downloaded here.

Citation: http://www.socialprogress.org/

Stern, S., A. Wares and T. Epner. 2019. Social Progress Index: 2019 Methodology Report.

Source information: https://www.socialprogress.org/download https://www.socialprogress.org/index/global/methodology

Date Downloaded: 9/27/2019

Time range: 2014-2019

Native data resolution: country scores

Format: Excel file

Description: Social Progress Index scores and components for countries.

# load libraries, set directories
library(ohicore)  #devtools::install_github('ohi-science/ohicore@dev')
library(dplyr)
library(stringr)
library(tidyr)
library(Hmisc)
library(here)
library(validate)
library(tidyverse)

## comment out when knitting
setwd(here::here("globalprep","prs_res_spi","v2019"))


### Load the common OHI workflow helpers (directory locations and region lookup functions)
source('../../../workflow/R/common.R') # directory locations

## This file makes it easier to process data for the OHI global assessment
##  by creating the following objects:
## 
##  * dir_M = identifies correct file path to Mazu (internal server) based on your operating system
##  * mollCRS = the crs code for the mollweide coordinate reference system we use in the global assessment
##  * regions_shape() = function to load global shapefile for land/eez/high seas/antarctica regions
##  * ohi_rasters() = function to load two rasters: global eez regions and ocean region
##  * region_data() = function to load 2 dataframes describing global regions 
##  * rgn_syns() = function to load dataframe of region synonyms (used to convert country names to OHI regions)
##  * low_pop() = function to load dataframe of regions with low and no human population
##  * UNgeorgn = function to load dataframe of UN geopolitical designations used to gapfill missing data

## Fix the random seed so the random steps later in the script
## (e.g. the bootstrap-based imputation) give repeatable results
set.seed(227)

3 Social Progress Index data

Organize data and gapfill missing countries that have incomplete data. This index is comprised of 3 indicators, which are each comprised of 4 subindicators, which are comprised of several datasets. If one of the subindicators is missing, the SPI is not calculated. The first round of gapfilling involves using relationships between the subindicators to gapfill missing data. If a region is missing all subindicator data, then a second round of gapfilling is performed using relationships between UN geopolitical regions and the World Governance Indicator to gapfill the SPI score.

The following gets all years of data (currently in separate files).

ToleranceandInclusion and AccesstoInformationandCommunications

## Lookup table of the SPI subcategories used in this script. Spaces are removed
## from the subcategory names so they match the space-free column names built below.
cats <- read.csv(file.path(dir_M, 'git-annex/globalprep/_raw_data/SocialProgressIndex/d2019/spi_categories.csv')) %>%
  mutate(subcategory = gsub(" ", "", subcategory))

## One raw file per SPI year (file names look like spi_<digits>)
files <- list.files(file.path(dir_M, 
          "git-annex/globalprep/_raw_data/SocialProgressIndex/d2019"),
          pattern = "spi_(\\d+)", full.names = TRUE)

## Collect each year's long-format data in a preallocated list and bind once at
## the end, rather than growing a data frame inside the loop.
spi_by_file <- vector("list", length(files))

for (i in seq_along(files)) {   #i = 6

  spi <- files[i]

  spi_data <- read.csv(spi, check.names = FALSE, stringsAsFactors = FALSE)

  ## strip spaces from the column names so they can be matched to cats$subcategory
  names(spi_data) <- gsub(" ", "", names(spi_data))

  ## the year is the first run of digits in the file name
  yr <- str_extract(basename(spi), "(\\d+)")

  spi_data <- spi_data %>%
    dplyr::select(-1, -Code, - Status) %>% ## have to deselect the first column using the term '1' because each file has a different first column name... i.e. 2014ranking, 2015ranking, etc... 
    filter(Country != "World") %>% ## don't include the 'World' ranking, as we are only interested in each country. 
    mutate(year = yr) %>%
    gather('subcategory', 'score', -Country, -year) %>%
    filter(subcategory %in% cats$subcategory) %>%
    left_join(cats, by = "subcategory")

  spi_by_file[[i]] <- spi_data

}

## Starting from an empty data frame keeps the result a data frame even if no files were found
all_spi <- do.call(rbind, c(list(data.frame()), spi_by_file))

3.1 Gapfilling: Step 1

In this case, we use relationships between the subindicators to estimate missing data.

3.1.1 Gapfill Basic Human Needs (bhn) indicator

## Reset the seed so the imputation that follows is repeatable
set.seed(227)

## Wide table of the four bhn subindicators: one row per Country/year
bhn_subs <- all_spi %>%
  dplyr::filter(category %in% c("bhn")) %>%
  spread(subcategory, score)

## Attach the published bhn score to the subindicator values
bhn_tmp <- all_spi %>%
  dplyr::filter(category %in% c("bhn_score")) %>%
  select(Country, year, bhn_score=score) %>%
  left_join(bhn_subs, by=c("Country", "year"))

## Number of missing subindicators per row (0-4). is.na() is vectorized, so the
## count is a plain column sum; no rowwise() grouping is needed or left behind.
bhn_tmp <- bhn_tmp %>%
  dplyr::mutate(NA_tot = is.na(NutritionandBasicMedicalCare) +
                  is.na(PersonalSafety) +
                  is.na(Shelter) +
                  is.na(WaterandSanitation))

## Ideally most NA_tot values are 0
hist(bhn_tmp$NA_tot)

table(bhn_tmp$NA_tot)

## 
##    0    1    2    3    4 
## 1002   48   12   84  282

## Optional: inspect the missing-data pattern with the mice package
# library(mice)
# md.pattern(select(bhn_tmp, -(1:4)))

## Number of imputed data sets to generate
imputes <- 50

## Impute the missing bhn subindicators from one another
bhn_gf <- aregImpute(
  ~ WaterandSanitation + Shelter + NutritionandBasicMedicalCare + PersonalSafety,
  data = bhn_tmp,
  n.impute = imputes,
  type = "regression"
)

## Iteration 1 
Iteration 2 
Iteration 3 
Iteration 4 
Iteration 5 
Iteration 6 
Iteration 7 
Iteration 8 
Iteration 9 
Iteration 10 
Iteration 11 
Iteration 12 
Iteration 13 
Iteration 14 
Iteration 15 
Iteration 16 
Iteration 17 
Iteration 18 
Iteration 19 
Iteration 20 
Iteration 21 
Iteration 22 
Iteration 23 
Iteration 24 
Iteration 25 
Iteration 26 
Iteration 27 
Iteration 28 
Iteration 29 
Iteration 30 
Iteration 31 
Iteration 32 
Iteration 33 
Iteration 34 
Iteration 35 
Iteration 36 
Iteration 37 
Iteration 38 
Iteration 39 
Iteration 40 
Iteration 41 
Iteration 42 
Iteration 43 
Iteration 44 
Iteration 45 
Iteration 46 
Iteration 47 
Iteration 48 
Iteration 49 
Iteration 50 
Iteration 51 
Iteration 52 
Iteration 53

## Print the imputation summary: number of NAs and R-squared for each subindicator
bhn_gf

## 
## Multiple Imputation using Bootstrap and PMM
## 
## aregImpute(formula = ~WaterandSanitation + Shelter + NutritionandBasicMedicalCare + 
##     PersonalSafety, data = bhn_tmp, n.impute = imputes, type = "regression")
## 
## n: 1428  p: 4    Imputations: 50     nk: 3 
## 
## Number of NAs:
##           WaterandSanitation                      Shelter 
##                          324                          378 
## NutritionandBasicMedicalCare               PersonalSafety 
##                          360                          390 
## 
##                              type d.f.
## WaterandSanitation              s    2
## Shelter                         s    2
## NutritionandBasicMedicalCare    s    2
## PersonalSafety                  s    1
## 
## Transformation of Target Variables Forced to be Linear
## 
## R-squares for Predicting Non-Missing Values for Each Variable
## Using Last Imputations of Predictors
##           WaterandSanitation                      Shelter 
##                        0.907                        0.910 
## NutritionandBasicMedicalCare               PersonalSafety 
##                        0.904                        0.515

# to get mean and sd of all imputations
subcat_cols <- c("WaterandSanitation", "Shelter",
                 "NutritionandBasicMedicalCare", "PersonalSafety")

## Store each imputation in a preallocated list and bind once afterwards,
## instead of growing a data frame with rbind() on every iteration.
impute_list <- vector("list", imputes)

for (imp in seq_len(imputes)) {
  #imp = 1

  ## subindicator values with the NAs filled in for imputation number `imp`
  imputed <- impute.transcan(bhn_gf, imputation = imp, data = bhn_tmp,
                             list.out = TRUE, pr = FALSE, check = FALSE)

  subcat_data <- data.frame(imputed)

  ## stringsAsFactors = FALSE keeps Country and year as character, so the join
  ## further down does not have to coerce factors (and warn about it)
  impute_scores <- data.frame(Country = bhn_tmp$Country,
                              year = bhn_tmp$year,
                              imputation = imp,
                              stringsAsFactors = FALSE)

  impute_scores <- cbind(impute_scores, subcat_data)

  impute_list[[imp]] <- impute_scores
}

impute_scores_all <- do.call(rbind, impute_list)

## Convert class of value columns from `impute` to `numeric` to avoid "Warning message: attributes are not identical across measure variables; they will be dropped" when gathering columns
impute_scores_all[subcat_cols] <- lapply(impute_scores_all[subcat_cols], as.numeric)

## Mean and standard deviation across imputations for each Country/year/subcategory
impute_scores_summary <- impute_scores_all %>%
  gather("subcategory", "score", -(1:3)) %>%
  group_by(Country, year, subcategory) %>%
  dplyr::summarize(score_predict = mean(score),
            sd_score_predict = sd(score)) %>% 
  ungroup()

## Long table of the original subindicator scores with the predictions alongside
bhn_tmp_long <- bhn_tmp %>%
  select(Country, year, bhn_score, NA_tot, NutritionandBasicMedicalCare, 
         PersonalSafety, Shelter, WaterandSanitation) %>%
  gather("subcategory", "score", -(1:4)) %>%
  left_join(impute_scores_summary, by=c("Country", "year", "subcategory"))

## Warning: Column `Country` joining character vector and factor, coercing
## into character vector

## Warning: Column `year` joining character vector and factor, coercing into
## character vector

## Sanity check: there should be no NAs in column sd_score_predict or score_predict
## (NAs are only expected in the original bhn_score and score columns)
summary(bhn_tmp_long)

##    Country              year             bhn_score         NA_tot     
##  Length:5712        Length:5712        Min.   :21.79   Min.   :0.000  
##  Class :character   Class :character   1st Qu.:58.16   1st Qu.:0.000  
##  Mode  :character   Mode  :character   Median :80.75   Median :0.000  
##                                        Mean   :73.85   Mean   :1.017  
##                                        3rd Qu.:89.29   3rd Qu.:3.000  
##                                        Max.   :98.44   Max.   :4.000  
##                                        NA's   :1704                   
##  subcategory            score        score_predict    sd_score_predict
##  Length:5712        Min.   :  9.09   Min.   :  9.09   Min.   : 0.000  
##  Class :character   1st Qu.: 58.53   1st Qu.: 58.14   1st Qu.: 0.000  
##  Mode  :character   Median : 79.50   Median : 73.54   Median : 0.000  
##                     Mean   : 74.47   Mean   : 72.68   Mean   : 3.214  
##                     3rd Qu.: 93.13   3rd Qu.: 91.02   3rd Qu.: 3.838  
##                     Max.   :100.00   Max.   :100.00   Max.   :27.645  
##                     NA's   :1452

## Fill missing subindicator scores with the mean imputed value, except for rows
## where all four subindicators are missing (NA_tot == 4); those stay NA.
## The indicator score is the mean of its four subindicators.
bhn_filled <- bhn_tmp_long %>%
  mutate(use_prediction = is.na(score) & NA_tot < 4,
         score = ifelse(use_prediction, score_predict, score))

bhn_data <- bhn_filled %>%
  group_by(Country, year) %>%
  dplyr::summarize( # a mean of the original bhn_score can be added here to compare against the recalculated score
            score = mean(score),
            NA_tot = mean(NA_tot)) %>%
  ungroup() %>% 
  mutate(category = "bhn")

3.1.2 Gapfill Opportunity (op) indicator

## Reset the seed so the imputation that follows is repeatable
set.seed(227)

## Wide table of the four op subindicators: one row per Country/year
op_subs <- all_spi %>%
  dplyr::filter(category %in% c("op")) %>%
  spread(subcategory, score)

## Attach the published op score to the subindicator values
op_tmp <- all_spi %>%
  dplyr::filter(category %in% c("op_score")) %>%
  select(Country, year, op_score=score) %>%
  left_join(op_subs, by=c("Country", "year"))

## Number of missing subindicators per row (0-4). is.na() is vectorized, so the
## count is a plain column sum; no rowwise() grouping is needed or left behind.
op_tmp <- op_tmp %>%
  dplyr::mutate(NA_tot = is.na(AccesstoAdvancedEducation) +
                  is.na(PersonalFreedomandChoice) +
                  is.na(PersonalRights) +
                  is.na(Inclusiveness))

## Ideally most NA_tot values are 0
hist(op_tmp$NA_tot)

table(op_tmp$NA_tot)

## 
##   0   1   2   3   4 
## 996  36  18  42 336

## Optional: inspect the missing-data pattern with the mice package
# md.pattern(select(op_tmp, -(1:4)))

## Number of imputed data sets to generate
imputes <- 50

## Impute the missing op subindicators from one another
op_gf <- aregImpute(
  ~ AccesstoAdvancedEducation + PersonalFreedomandChoice + PersonalRights + Inclusiveness,
  data = op_tmp,
  n.impute = imputes,
  type = "regression"
)

## Iteration 1 
Iteration 2 
Iteration 3 
Iteration 4 
Iteration 5 
Iteration 6 
Iteration 7 
Iteration 8 
Iteration 9 
Iteration 10 
Iteration 11 
Iteration 12 
Iteration 13 
Iteration 14 
Iteration 15 
Iteration 16 
Iteration 17 
Iteration 18 
Iteration 19 
Iteration 20 
Iteration 21 
Iteration 22 
Iteration 23 
Iteration 24 
Iteration 25 
Iteration 26 
Iteration 27 
Iteration 28 
Iteration 29 
Iteration 30 
Iteration 31 
Iteration 32 
Iteration 33 
Iteration 34 
Iteration 35 
Iteration 36 
Iteration 37 
Iteration 38 
Iteration 39 
Iteration 40 
Iteration 41 
Iteration 42 
Iteration 43 
Iteration 44 
Iteration 45 
Iteration 46 
Iteration 47 
Iteration 48 
Iteration 49 
Iteration 50 
Iteration 51 
Iteration 52 
Iteration 53

## Print the imputation summary: number of NAs and R-squared for each subindicator
op_gf

## 
## Multiple Imputation using Bootstrap and PMM
## 
## aregImpute(formula = ~AccesstoAdvancedEducation + PersonalFreedomandChoice + 
##     PersonalRights + Inclusiveness, data = op_tmp, n.impute = imputes, 
##     type = "regression")
## 
## n: 1428  p: 4    Imputations: 50     nk: 3 
## 
## Number of NAs:
## AccesstoAdvancedEducation  PersonalFreedomandChoice 
##                       378                       378 
##            PersonalRights             Inclusiveness 
##                       378                       408 
## 
##                           type d.f.
## AccesstoAdvancedEducation    s    2
## PersonalFreedomandChoice     s    2
## PersonalRights               s    2
## Inclusiveness                s    1
## 
## Transformation of Target Variables Forced to be Linear
## 
## R-squares for Predicting Non-Missing Values for Each Variable
## Using Last Imputations of Predictors
## AccesstoAdvancedEducation  PersonalFreedomandChoice 
##                     0.726                     0.753 
##            PersonalRights             Inclusiveness 
##                     0.692                     0.786

# to get mean and sd of all imputations
subcat_cols <- c("AccesstoAdvancedEducation", "PersonalFreedomandChoice",
                 "PersonalRights", "Inclusiveness")

## Store each imputation in a preallocated list and bind once afterwards,
## instead of growing a data frame with rbind() on every iteration.
impute_list <- vector("list", imputes)

for (imp in seq_len(imputes)) {
  #imp = 1

  ## The imputed score values from 0 to 100
  imputed <- impute.transcan(op_gf, imputation = imp, data = op_tmp,
                             list.out = TRUE, pr = FALSE, check = FALSE)

  subcat_data <- data.frame(imputed)

  ## stringsAsFactors = FALSE keeps Country and year as character, so the join
  ## further down does not have to coerce factors (and warn about it)
  impute_scores <- data.frame(Country = op_tmp$Country,
                              year = op_tmp$year,
                              imputation = imp,
                              stringsAsFactors = FALSE)

  impute_scores <- cbind(impute_scores, subcat_data)

  impute_list[[imp]] <- impute_scores
}

impute_scores_all <- do.call(rbind, impute_list)

## Convert columns from impute to numeric just to be safe and to prevent warning message from gathering
impute_scores_all[subcat_cols] <- lapply(impute_scores_all[subcat_cols], as.numeric)

## Mean and standard deviation across imputations for each Country/year/subcategory
impute_scores_summary <- impute_scores_all %>%
  gather("subcategory", "score", -(1:3)) %>%
  group_by(Country, year, subcategory) %>%
  dplyr::summarize(score_predict = mean(score),
            sd_score_predict = sd(score)) %>% 
  ungroup()


## Long table of the original subindicator scores with the predictions alongside
op_tmp_long <- op_tmp %>%
  select(Country, year, op_score, NA_tot, AccesstoAdvancedEducation, 
         PersonalFreedomandChoice, PersonalRights, Inclusiveness) %>%
  gather("subcategory", "score", -(1:4)) %>%
  left_join(impute_scores_summary, by=c("Country", "year", "subcategory"))

## Warning: Column `Country` joining character vector and factor, coercing
## into character vector

## Warning: Column `year` joining character vector and factor, coercing into
## character vector

## Sanity check: should have no NAs in score_predict and sd_score_predict
## (NAs are only expected in the original op_score and score columns)
summary(op_tmp_long)

##    Country              year              op_score         NA_tot    
##  Length:5712        Length:5712        Min.   :14.36   Min.   :0.00  
##  Class :character   Class :character   1st Qu.:39.62   1st Qu.:0.00  
##  Mode  :character   Mode  :character   Median :49.09   Median :0.00  
##                                        Mean   :51.60   Mean   :1.08  
##                                        3rd Qu.:64.63   3rd Qu.:3.00  
##                                        Max.   :84.03   Max.   :4.00  
##                                        NA's   :1728                  
##  subcategory            score       score_predict   sd_score_predict
##  Length:5712        Min.   : 1.10   Min.   : 1.10   Min.   : 0.000  
##  Class :character   1st Qu.:32.74   1st Qu.:33.67   1st Qu.: 0.000  
##  Mode  :character   Median :51.90   Median :51.63   Median : 0.000  
##                     Mean   :51.45   Mean   :50.71   Mean   : 4.097  
##                     3rd Qu.:70.15   3rd Qu.:66.99   3rd Qu.: 9.660  
##                     Max.   :98.67   Max.   :98.67   Max.   :26.745  
##                     NA's   :1542

## Fill missing subindicator scores with the mean imputed value, except for rows
## where all four subindicators are missing (NA_tot == 4); those stay NA.
## The indicator score is the mean of its four subindicators.
op_filled <- op_tmp_long %>%
  mutate(use_prediction = is.na(score) & NA_tot < 4,
         score = ifelse(use_prediction, score_predict, score))

op_data <- op_filled %>%
  group_by(Country, year) %>%
  dplyr::summarize( # a mean of the original op_score can be added here to compare against the recalculated score
            score = mean(score),
            NA_tot = mean(NA_tot)) %>%
  ungroup() %>% 
  mutate(category = "op")

3.1.3 Gapfill Foundations of Wellbeing (fw) indicator

## Reset the seed so the imputation that follows is repeatable
set.seed(227)

## Wide table of the four fw subindicators: one row per Country/year
fw_subs <- all_spi %>%
  dplyr::filter(category %in% c("fw")) %>%
  spread(subcategory, score)

## Attach the published fw score to the subindicator values
fw_tmp <- all_spi %>%
  dplyr::filter(category %in% c("fw_score")) %>%
  select(Country, year, fw_score=score) %>%
  left_join(fw_subs, by=c("Country", "year"))

## Number of missing subindicators per row (0-4). is.na() is vectorized, so the
## count is a plain column sum; no rowwise() grouping is needed or left behind.
fw_tmp <- fw_tmp %>%
  dplyr::mutate(NA_tot = is.na(AccesstoBasicKnowledge) +
                  is.na(AccesstoInformationandCommunications) +
                  is.na(EnvironmentalQuality) +
                  is.na(HealthandWellness))

## most values should be 0
hist(fw_tmp$NA_tot)

table(fw_tmp$NA_tot)

## 
##   0   1   2   3   4 
## 942  96   6  72 312

## Optional: inspect the missing-data pattern with the mice package
# md.pattern(select(fw_tmp, -(1:4)))

## Number of imputed data sets to generate
imputes <- 50

## Impute the missing fw subindicators from one another
fw_gf <- aregImpute(
  ~ AccesstoBasicKnowledge + AccesstoInformationandCommunications + EnvironmentalQuality + HealthandWellness,
  data = fw_tmp,
  n.impute = imputes,
  type = "regression"
)

## Iteration 1 
Iteration 2 
Iteration 3 
Iteration 4 
Iteration 5 
Iteration 6 
Iteration 7 
Iteration 8 
Iteration 9 
Iteration 10 
Iteration 11 
Iteration 12 
Iteration 13 
Iteration 14 
Iteration 15 
Iteration 16 
Iteration 17 
Iteration 18 
Iteration 19 
Iteration 20 
Iteration 21 
Iteration 22 
Iteration 23 
Iteration 24 
Iteration 25 
Iteration 26 
Iteration 27 
Iteration 28 
Iteration 29 
Iteration 30 
Iteration 31 
Iteration 32 
Iteration 33 
Iteration 34 
Iteration 35 
Iteration 36 
Iteration 37 
Iteration 38 
Iteration 39 
Iteration 40 
Iteration 41 
Iteration 42 
Iteration 43 
Iteration 44 
Iteration 45 
Iteration 46 
Iteration 47 
Iteration 48 
Iteration 49 
Iteration 50 
Iteration 51 
Iteration 52 
Iteration 53

## Print the imputation summary: number of NAs and R-squared for each subindicator
fw_gf

## 
## Multiple Imputation using Bootstrap and PMM
## 
## aregImpute(formula = ~AccesstoBasicKnowledge + AccesstoInformationandCommunications + 
##     EnvironmentalQuality + HealthandWellness, data = fw_tmp, 
##     n.impute = imputes, type = "regression")
## 
## n: 1428  p: 4    Imputations: 50     nk: 3 
## 
## Number of NAs:
##               AccesstoBasicKnowledge AccesstoInformationandCommunications 
##                                  486                                  384 
##                 EnvironmentalQuality                    HealthandWellness 
##                                  324                                  378 
## 
##                                      type d.f.
## AccesstoBasicKnowledge                  s    2
## AccesstoInformationandCommunications    s    2
## EnvironmentalQuality                    s    2
## HealthandWellness                       s    1
## 
## Transformation of Target Variables Forced to be Linear
## 
## R-squares for Predicting Non-Missing Values for Each Variable
## Using Last Imputations of Predictors
##               AccesstoBasicKnowledge AccesstoInformationandCommunications 
##                                0.686                                0.734 
##                 EnvironmentalQuality                    HealthandWellness 
##                                0.337                                0.791

# to get mean and sd of all imputations
subcat_cols <- c("AccesstoBasicKnowledge", "AccesstoInformationandCommunications",
                 "EnvironmentalQuality", "HealthandWellness")

## Store each imputation in a preallocated list and bind once afterwards,
## instead of growing a data frame with rbind() on every iteration.
impute_list <- vector("list", imputes)

for (imp in seq_len(imputes)) {
  #imp = 1

  ## subindicator values with the NAs filled in for imputation number `imp`
  imputed <- impute.transcan(fw_gf, imputation = imp, data = fw_tmp,
                             list.out = TRUE, pr = FALSE, check = FALSE)

  subcat_data <- data.frame(imputed)

  ## stringsAsFactors = FALSE keeps Country and year as character, so the join
  ## further down does not have to coerce factors (and warn about it)
  impute_scores <- data.frame(Country = fw_tmp$Country,
                              year = fw_tmp$year,
                              imputation = imp,
                              stringsAsFactors = FALSE)

  impute_scores <- cbind(impute_scores, subcat_data)

  impute_list[[imp]] <- impute_scores
}

impute_scores_all <- do.call(rbind, impute_list)

## Convert columns from impute to numeric just to be safe and to prevent the
## warning message from gathering (printed impute values can carry a * marker)
impute_scores_all[subcat_cols] <- lapply(impute_scores_all[subcat_cols], as.numeric)

## Mean and standard deviation across imputations for each Country/year/subcategory
impute_scores_summary <- impute_scores_all %>%
  gather("subcategory", "score", -(1:3)) %>%
  group_by(Country, year, subcategory) %>%
  dplyr::summarize(score_predict = mean(score),
            sd_score_predict = sd(score)) %>% 
  ungroup()


## Long table of the original subindicator scores with the predictions alongside
fw_tmp_long <- fw_tmp %>%
  select(Country, year, fw_score, NA_tot, AccesstoBasicKnowledge, 
         AccesstoInformationandCommunications, EnvironmentalQuality, HealthandWellness) %>%
  gather("subcategory", "score", -(1:4)) %>%
  left_join(impute_scores_summary, by=c("Country", "year", "subcategory"))

## Warning: Column `Country` joining character vector and factor, coercing
## into character vector

## Warning: Column `year` joining character vector and factor, coercing into
## character vector

## Sanity check: there should be no NAs in score_predict and sd_score_predict
## (NAs are only expected in the original fw_score and score columns)
summary(fw_tmp_long)

##    Country              year              fw_score         NA_tot     
##  Length:5712        Length:5712        Min.   :26.28   Min.   :0.000  
##  Class :character   Class :character   1st Qu.:55.69   1st Qu.:0.000  
##  Mode  :character   Mode  :character   Median :67.39   Median :0.000  
##                                        Mean   :66.51   Mean   :1.101  
##                                        3rd Qu.:78.34   3rd Qu.:3.000  
##                                        Max.   :92.32   Max.   :4.000  
##                                        NA's   :1944                   
##  subcategory            score       score_predict   sd_score_predict
##  Length:5712        Min.   : 2.42   Min.   : 2.42   Min.   : 0.000  
##  Class :character   1st Qu.:52.42   1st Qu.:54.19   1st Qu.: 0.000  
##  Mode  :character   Median :67.42   Median :64.75   Median : 0.000  
##                     Mean   :65.47   Mean   :64.92   Mean   : 3.875  
##                     3rd Qu.:80.03   3rd Qu.:76.34   3rd Qu.: 9.829  
##                     Max.   :98.86   Max.   :98.86   Max.   :25.828  
##                     NA's   :1572

## Fill missing subindicator scores with the mean imputed value, except for rows
## where all four subindicators are missing (NA_tot == 4); those stay NA.
## The indicator score is the mean of its four subindicators.
fw_filled <- fw_tmp_long %>%
  mutate(use_prediction = is.na(score) & NA_tot < 4,
         score = ifelse(use_prediction, score_predict, score))

fw_data <- fw_filled %>%
  group_by(Country, year) %>%
  dplyr::summarize( # a mean of the original fw_score can be added here to compare against the recalculated score
            score = mean(score),
            NA_tot = mean(NA_tot)) %>%
  ungroup() %>% 
  mutate(category = "fw")

3.2 Combine data to get Social Progress Index

The next step averages the 3 indicators and identifies gapfilling. Any Country/year where all four subindicators of an indicator are missing (NA_tot = 4) will have an NA in the score column.

## Stack the three indicator tables, label each row with "<category>_<number of
## gapfilled subindicators>", and keep scores inside the 0-100 range
indicator_scores <- rbind(bhn_data, op_data, fw_data) %>%
  mutate(gapfill = paste(category, NA_tot, sep = "_"),
         score = pmin(pmax(score, 0), 100))

## SPI = mean of the three indicator scores for each Country/year.
## `method` lists the per-indicator labels; `gapfill` becomes a 0/1 flag.
spi_calc <- indicator_scores %>%
  group_by(Country, year) %>%
  dplyr::summarize(
            score = mean(score),
            method = paste(unique(gapfill), collapse=", "),
            gapfill = sum(NA_tot)) %>%
  ungroup() %>% 
  mutate(method = ifelse(gapfill == 0, NA, method), # no method recorded when nothing was gapfilled
         gapfill = as.numeric(gapfill >= 1)) %>% # change values to just yes or no
  data.frame()

## Some scores might still be NA because no values exist for that country

# check: compare the published SPI score with the recalculated score.
# all_spi is used (not spi_data) because spi_data is left over from the
# file-reading loop and only holds the last file that was read, so it would
# check a single year.
check <- all_spi %>%
  filter(subcategory=="SocialProgressIndex") %>%
  dplyr::select(Country, year, true_score=score) %>%
  left_join(spi_calc, by = c("Country", "year"))

## Points should fall on the red 1:1 line
plot(check$true_score, check$score)
abline(0,1, col="red")

## Save the country-level scores as an intermediate file
write.csv(spi_calc, "int/Country_spi.csv", row.names=FALSE)

3.3 Assign countries to OHI regions

spi_calc <- read.csv("int/Country_spi.csv", stringsAsFactors = FALSE)

## Standardize country names so they can be matched to OHI regions.
## The accented names are matched with byte-wise patterns: `.{1,4}` stands in
## for the accented letter whether it is stored as one byte (latin1), two bytes
## (UTF-8) or as an escaped form such as "<f4>". Exact comparisons against the
## escaped spelling never matched the real names, so these countries were
## being dropped by name_2_rgn().
spi_calc <- spi_calc %>%
  mutate(Country = case_when(
    grepl("^C.{1,4}te d'Ivoire$", Country, useBytes = TRUE) ~ "Cote d'Ivoire",
    grepl("^R.{1,4}union$", Country, useBytes = TRUE) ~ "Reunion",
    grepl("^Cura.{1,4}ao$", Country, useBytes = TRUE) ~ "Curacao",
    Country=="Korea, Democratic Republic of" ~ "North Korea",
    Country=="St. Helena" ~ "Saint Helena",
    Country=="St. Martin" ~ "Northern Saint-Martin",
    TRUE ~ Country # Everything else, leave it be
    ))

# Channel Islands are Jersey and Guernsey, but these are already in the data

## Convert country names to OHI regions; rows are unique by Country and year
spi_rgn <- name_2_rgn(df_in = spi_calc, 
                       fld_name='Country',
                      flds_unique = c("year"))

## 
## These data were removed for not having any match in the lookup tables:
## 
##                Channel Islands                  Côte d'Ivoire 
##                              1                              1 
##                        Curaçao                       Eswatini 
##                              1                              1 
##                    Isle of Man                   North Cyprus 
##                              1                              1 
##    Republic of North Macedonia                        Réunion 
##                              1                              1 
## Svalbard and Jan Mayen Islands 
##                              1 
## 
## These data were removed for not being of the proper rgn_type (eez,ohi_region) or mismatching region names in the lookup tables:
##                           tmp_type
## tmp_name                   disputed landlocked
##   Afghanistan                     0          6
##   Andorra                         0          6
##   Armenia                         0          6
##   Austria                         0          6
##   Belarus                         0          6
##   Bhutan                          0          6
##   Bolivia                         0          6
##   Botswana                        0          6
##   Burkina Faso                    0          6
##   Burundi                         0          6
##   Central African Republic        0          6
##   Chad                            0          6
##   Czech Republic                  0          6
##   Ethiopia                        0          6
##   Hungary                         0          6
##   Kazakhstan                      0          6
##   Kosovo                          0          6
##   Kyrgyzstan                      0          6
##   Laos                            0          6
##   Lesotho                         0          6
##   Liechtenstein                   0          6
##   Luxembourg                      0          6
##   Malawi                          0          6
##   Mali                            0          6
##   Moldova                         0          6
##   Mongolia                        0          6
##   Nepal                           0          6
##   Niger                           0          6
##   Paraguay                        0          6
##   Rwanda                          0          6
##   San Marino                      0          6
##   Serbia                          0          6
##   Slovakia                        0          6
##   South Sudan                     0          6
##   Switzerland                     0          6
##   Tajikistan                      0          6
##   Turkmenistan                    0          6
##   Uganda                          0          6
##   Uzbekistan                      0          6
##   West Bank and Gaza              6          0
##   Zambia                          0          6
##   Zimbabwe                        0          6

## 
## DUPLICATES found. Consider using collapse2rgn to collapse duplicates (function in progress).

##  [1] "China"                    "Guadeloupe"              
##  [3] "Guam"                     "Hong Kong"               
##  [5] "Macao"                    "Martinique"              
##  [7] "Northern Mariana Islands" "Puerto Rico"             
##  [9] "Tanzania"                 "Virgin Islands (U.S.)"   
## [11] "Zanzibar"

# Weight the following duplicates by population
# China, Guadeloupe, Guam, Hong Kong, Macao, Martinique, Northern Mariana Islands, Puerto Rico, Virgin Islands (U.S.)... just use wikipedia populations 
# NOTE(review): Tanzania and Zanzibar are also reported as duplicates but have no
# entry here, so they each get the default weight of 1 (a simple average) - confirm this is intended.

## stringsAsFactors = FALSE keeps Country as character so the join below does not coerce a factor
pop_weights <- data.frame(Country = c("China", "Hong Kong", "Macao",
                 "Guadeloupe", "Martinique",
                 "Guam", "Northern Mariana Islands",
                 "Puerto Rico", "Virgin Islands (U.S.)"),
                 pop = c(1379113306, 7482500, 622567,
                         395700, 376480,
                         164229, 55144,
                         3195153, 106405),
                 stringsAsFactors = FALSE)

## Collapse countries that map to the same OHI region into one row per region and year,
## using population-weighted means (weight 1 for countries without a listed population)
spi_rgn <- spi_rgn %>%
  left_join(pop_weights, by = "Country") %>%
  mutate(pop = ifelse(is.na(pop), 1, pop)) %>%
  group_by(rgn_id, rgn_name, year) %>%
  dplyr::summarize(score = weighted.mean(score, pop, na.rm=TRUE),
            ## keep a missing method as NA rather than pasting it into the text "NA"
            method = if (all(is.na(method))) NA_character_ else paste(unique(method[!is.na(method)]), collapse=" "),
            gapfill = weighted.mean(gapfill, pop, na.rm=TRUE))%>%
  ungroup() %>% 
  dplyr::mutate(year = as.numeric(year))

## Warning: Column `Country` joining character vector and factor, coercing
## into character vector

4 Compare to WGI

WGI is a couple of years behind the SPI, so the 2016 WGI scores are reused for the 2018 and 2019 SPI years. There is a strong correlation between the WGI and SPI indicators.

## Read the WGI resilience scores and keep the region, year and score columns
wgi_raw <- read.csv('../../prs_res_wgi/v2019/output/wgi_res.csv')
wgi <- dplyr::select(wgi_raw, rgn_id, year, wgi_score = resilience_score)

## Number of regions with WGI data
length(unique(wgi$rgn_id))

## [1] 220

## Carry the 2016 WGI scores forward as stand-ins for 2018 and 2019
## NOTE(review): 2016 is hard-coded as the year carried forward - confirm it is
## the intended (most recent) WGI year in wgi_res.csv
wgi_2016 <- dplyr::filter(wgi, year == 2016)
wgi_2018 <- dplyr::mutate(wgi_2016, year = 2018)
wgi_2019 <- dplyr::mutate(wgi_2016, year = 2019)

wgi <- rbind(wgi, wgi_2018, wgi_2019)

## Pair each WGI score with the SPI score for the same region and year
wgi_spi <- left_join(wgi, spi_rgn, by=c("rgn_id", "year"))

## WGI is multiplied by 100 to put it on the same axis range as the SPI; red line is 1:1
plot(wgi_spi$wgi_score*100, wgi_spi$score, ylab="Social Progress Index, score", xlab="Worldwide Governance Score")
abline(0,1, col="red")

## Linear relationship between the two indices
mod <- lm(score ~ wgi_score, data=wgi_spi)
summary(mod)

## 
## Call:
## lm(formula = score ~ wgi_score, data = wgi_spi)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.5595  -4.7862   0.2932   4.5554  20.8061 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  30.0042     0.7175   41.82   <2e-16 ***
## wgi_score    72.0805     1.3651   52.80   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.161 on 820 degrees of freedom
##   (3798 observations deleted due to missingness)
## Multiple R-squared:  0.7727, Adjusted R-squared:  0.7725 
## F-statistic:  2788 on 1 and 820 DF,  p-value: < 2.2e-16

5 Second round of gapfilling

In this case, UN geopolitical regions and WGI scores are used to estimate regions with no data. Based on this analysis, a model that includes WGI data and r2 UN geopolitical regions is the best model to predict missing SPI values.

## Every year covered by the SPI data
spi_year_range <- range(spi_rgn$year)
years <- data.frame(year = spi_year_range[1]:spi_year_range[2])

## One row per region and year (merge() without shared columns gives all
## combinations), with the SPI and WGI scores attached where available.
## r1 and r2 are converted to factors so they enter the models as categories.
rgns_gf_un <- georegions %>%
  merge(years) %>%
  left_join(spi_rgn, by = c("rgn_id","year")) %>%
  left_join(wgi, by = c("rgn_id", "year")) %>%
  mutate(r2 = as.factor(r2),
         r1 = as.factor(r1))


## Compare models to select a gapfilling method
## Helper: fit score ~ <predictors> on rgns_gf_un; na.exclude keeps the
## predictions aligned with the rows of rgns_gf_un (NA where data are missing)
fit_spi_model <- function(predictors) {
  lm(reformulate(predictors, response = "score"),
     data = rgns_gf_un, na.action = na.exclude)
}

mod1 <- fit_spi_model("r2")
mod2 <- fit_spi_model(c("r2", "wgi_score"))
mod3 <- fit_spi_model("r1")
mod4 <- fit_spi_model(c("r1", "wgi_score"))
mod5 <- fit_spi_model("wgi_score")
mod6 <- fit_spi_model(c("r1", "poly(wgi_score, 2)"))
mod7 <- fit_spi_model(c("r1", "poly(wgi_score, 3)"))


## the lowest AIC score is likely the best model
AIC(mod1, mod2, mod3, mod4, mod5, mod6, mod7)

##      df      AIC
## mod1 21 5916.654
## mod2 22 4805.127
## mod3  7 6168.161
## mod4  8 5187.071
## mod5  3 5573.209
## mod6  9 5189.037
## mod7 10 5174.475

## plot the models with the two lowest AIC scores
## Each panel is predicted score (x) against observed score (y); abline(0,1) adds
## the 1:1 line, so points on the line are perfectly predicted.
## mod2 and mod7 were fit with na.action = "na.exclude", so predict() returns one
## value per row of rgns_gf_un (NA where the fit dropped the row) and lines up
## with rgns_gf_un$score.
plot(predict(mod2), rgns_gf_un$score)
abline(0,1)

plot(predict(mod7), rgns_gf_un$score)
abline(0,1)

## Estimate missing data and gapfill
## Some georegions have no data to estimate a coefficient from, so their r2/r1
## value is set to NA; predict() then returns NA for those rows and the next,
## coarser model can be used instead.
##
## Two details matter here:
##  * r2/r1 must stay factors. ifelse() on a factor returns its integer codes,
##    which would make lm() treat the georegion as a single numeric slope
##    instead of one coefficient per region. Indexed assignment keeps the factor.
##  * A level is only usable if at least one row has both score and wgi_score
##    (the rows lm() can actually fit); otherwise predict() would stop with a
##    "factor has new levels" error.
has_fit_data <- !is.na(rgns_gf_un$score) & !is.na(rgns_gf_un$wgi_score)

r2_regions <- unique(rgns_gf_un$r2[has_fit_data])
rgns_gf_un$r2[!(rgns_gf_un$r2 %in% r2_regions)] <- NA
rgns_gf_un$r2 <- droplevels(rgns_gf_un$r2)

r1_regions <- unique(rgns_gf_un$r1[has_fit_data])
rgns_gf_un$r1[!(rgns_gf_un$r1 %in% r1_regions)] <- NA
rgns_gf_un$r1 <- droplevels(rgns_gf_un$r1)


## Three gapfilling models, from most to least specific. Each prediction is
## stored as its own column so the best available one can be picked per row.

## (1) r2 georegion + WGI score
mod_gf_r2 <- lm(score ~ r2 + wgi_score, data = rgns_gf_un, na.action = "na.exclude")
rgns_gf_un[["score_pred_r2"]] <- predict(
  mod_gf_r2,
  newdata = rgns_gf_un[, c("r2", "wgi_score")]
)

## (2) r1 georegion + WGI score
mod_gf_r1 <- lm(score ~ r1 + wgi_score, data = rgns_gf_un, na.action = na.exclude)
rgns_gf_un[["score_pred_r1"]] <- predict(
  mod_gf_r1,
  newdata = rgns_gf_un[, c("r1", "wgi_score")]
)

## (3) WGI score only
mod_gf_wgi <- lm(score ~ wgi_score, data = rgns_gf_un, na.action = na.exclude)
rgns_gf_un[["score_pred_wgi"]] <- predict(
  mod_gf_wgi,
  newdata = data.frame(wgi_score = rgns_gf_un$wgi_score)
)

## Record gapfill methods, then fill missing scores from the model predictions
rgns_gf <- rgns_gf_un %>%
  mutate(
    ## label rows first, while `score` is still NA for the rows to be gapfilled
    method  = ifelse(is.na(score), "UN georgn & WGI", method),
    gapfill = ifelse(is.na(score), "1", gapfill),
    ## fill in order of preference: r2 model, then r1 model, then WGI-only model
    score = ifelse(is.na(score), score_pred_r2, score),
    score = ifelse(is.na(score), score_pred_r1, score),
    score = ifelse(is.na(score), score_pred_wgi, score)
  ) %>%
  select(rgn_id, year, score, method, gapfill) %>%
  ## keep predictions inside the 0-100 range
  mutate(score = pmax(pmin(score, 100), 0))

## Sanity check on the gapfilled table: summary() lists an "NA's" count for a
## numeric column only when that column contains NAs, so none should appear
## for `score`.
summary(rgns_gf)  # should be no NA values

##      rgn_id            year          score          method         
##  Min.   :  1.00   Min.   :2014   Min.   :28.36   Length:1320       
##  1st Qu.: 58.75   1st Qu.:2015   1st Qu.:62.44   Class :character  
##  Median :116.50   Median :2016   Median :71.75   Mode  :character  
##  Mean   :117.64   Mean   :2016   Mean   :69.93                     
##  3rd Qu.:176.25   3rd Qu.:2018   3rd Qu.:80.67                     
##  Max.   :250.00   Max.   :2019   Max.   :93.27                     
##    gapfill         
##  Length:1320       
##  Class :character  
##  Mode  :character  
##                    
##                    
##

## Sanity check: count the distinct region ids present after gapfilling
length(unique(rgns_gf$rgn_id)) # should be 220 regions

## [1] 220

6 Uninhabited regions

These regions will receive an NA for their score (when the estimated population is < 3000 people or is unknown).

low_pop()

## uninhabited and low population regions

## Keep only regions whose estimated population is below 3000 or missing
low_pop <- filter(low_pop, est_population < 3000 | is.na(est_population))

## Blank out the score and the gapfill flag for those regions
rgns_gf_uninhab <- rgns_gf %>%
  mutate(
    uninhab = rgn_id %in% low_pop$rgn_id,
    score   = ifelse(uninhab, NA, score),
    gapfill = ifelse(uninhab, NA, gapfill)
  ) %>%
  select(-uninhab)

7 Save final data

## Last year's gapfilling record, loaded for reference
gf_2018 <- read_csv("../v2018/output/spi_gf.csv")

## Parsed with column specification:
## cols(
##   rgn_id = col_double(),
##   year = col_double(),
##   gapfill = col_double(),
##   method = col_character()
## )

## Gapfilling record: gapfill is 1 when a method is recorded for the row, else 0
## NOTE(review): this overwrites the incoming `gapfill` column, including the NA
## assigned to low-population regions -- confirm that is intended
gf_data <- rgns_gf_uninhab %>%
  transmute(
    rgn_id,
    year,
    gapfill = as.numeric(!is.na(method)),
    method
  )
write.csv(gf_data, file = "output/spi_gf.csv", row.names = FALSE)


## Resilience layer: score divided by 100
res_data <- rgns_gf_uninhab %>%
  transmute(rgn_id, year, resilience_score = score / 100)
write.csv(res_data, file = "output/spi_res.csv", row.names = FALSE)

## Pressure layer: one minus the score divided by 100
prs_data <- rgns_gf_uninhab %>%
  transmute(rgn_id, year, pressure_score = 1 - (score / 100))
write.csv(prs_data, file = "output/spi_prs.csv", row.names = FALSE)

8 Data check

## Resilience comparison, filtered for 2018
## Re-read the file just written and keep the 2018 scores under a "new_" name
new_spi_res <- read_csv("output/spi_res.csv") %>%
  filter(year == 2018) %>%
  transmute(rgn_id, new_resilience_score = resilience_score)

## Parsed with column specification:
## cols(
##   rgn_id = col_double(),
##   year = col_double(),
##   resilience_score = col_double()
## )

## Previous assessment's 2018 resilience scores, with this year's 2018 scores
## attached per region
old_new <- left_join(
  filter(read_csv("../v2018/output/spi_res.csv"), year == 2018),
  new_spi_res,
  by = "rgn_id"
)

## Parsed with column specification:
## cols(
##   rgn_id = col_double(),
##   year = col_double(),
##   resilience_score = col_double()
## )

## New resilience score (x) against the previous assessment's score (y);
## points on the red 1:1 line are unchanged between assessments
plot(old_new$new_resilience_score, old_new$resilience_score)
abline(0, 1, col="red")

## Pressure score comparison, filtered for 2018
## Re-read the file just written and keep the 2018 scores under a "new_" name
new_spi_prs <- read_csv("output/spi_prs.csv") %>%
  filter(year == 2018) %>%
  transmute(rgn_id, new_pressure_score = pressure_score)

## Parsed with column specification:
## cols(
##   rgn_id = col_double(),
##   year = col_double(),
##   pressure_score = col_double()
## )

## Previous assessment's 2018 pressure scores, with this year's 2018 scores
## attached per region
old_new <- left_join(
  filter(read_csv("../v2018/output/spi_prs.csv"), year == 2018),
  new_spi_prs,
  by = "rgn_id"
)

## Parsed with column specification:
## cols(
##   rgn_id = col_double(),
##   year = col_double(),
##   pressure_score = col_double()
## )

## New pressure score (x) against the previous assessment's score (y);
## points on the red 1:1 line are unchanged between assessments
plot(old_new$new_pressure_score, old_new$pressure_score)
abline(0, 1, col="red")

## Pressure score comparison, All years
## `year` is kept in the new data and used as a join key, so each old score is
## paired with the new score for the same region AND year. Joining on rgn_id
## alone would pair every old region-year row with every new year of that
## region and plot mismatched years against each other.
new_spi_prs <- read_csv(file.path("output/spi_prs.csv")) %>%
  select(rgn_id, year, new_pressure_score = pressure_score)

## Parsed with column specification:
## cols(
##   rgn_id = col_double(),
##   year = col_double(),
##   pressure_score = col_double()
## )

## Years present only in the old file keep an NA new_pressure_score
old_new <- read_csv("../v2018/output/spi_prs.csv") %>%
  left_join(new_spi_prs, by = c("rgn_id", "year"))

## Parsed with column specification:
## cols(
##   rgn_id = col_double(),
##   year = col_double(),
##   pressure_score = col_double()
## )

## New pressure score (x) against old pressure score (y); red line is 1:1
plot(old_new$new_pressure_score, old_new$pressure_score)
abline(0, 1, col="red")

OHI: Social Progress Index: Exploring gapfilling

Compiled on Wed Oct 2 16:01:01 2019 by sgclawson