1 Summary

This script gapfills the social progress index data and formats it for the OHI global assessment.

2 Updates

Previously, only one year of data were available, but now there are 4 years of data. Updated method of gapfilling. Now use Hmisc’s functions to gapfill missing data. This method is used to gapfill subcategories with 1 or more subsubcategories of data. We use the mean of 50 permutations using the “regression” method.

Citation: http://www.socialprogressimperative.org/

Stern, S., A. Wares and T. Epner. 2017. Social Progress Index: 2017 Methodology Report.

Source information: http://www.socialprogressimperative.org/global-index/ –> Export

Date: 8/10/2017

Time range: 2014-2017

Native data resolution: country scores

Format: Excel file

Description: Social Progress Index scores and components for countries.

# load libraries, set directories
library(ohicore)  #devtools::install_github('ohi-science/ohicore@dev')
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)
library(tidyr)
library(Hmisc)

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## Loading required package: ggplot2

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     combine, src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units

## comment out when knitting
#setwd("globalprep/prs_res_spi/v2017")


### Load FAO-specific user-defined functions
source('../../../src/R/common.R') # directory locations

set.seed(227)

3 Social Progress Index data

Organize data and gapfill missing countries that have incomplete data. This index is comprised of 3 indicators, which are each comprised of 4 subindicators, which are comprised of several datasets. If one of the subindicators are missing, the SPI is not calculated. The first round of gapfilling involves using relationships between the the subindicators to gapfill missing data. If a region is missing all subindicator data, then a second round of gapfilling is performed using relationships between UN geopolitical regions and the World Governance Indicator to gapfill the SPI score.

The following gets all years of data (currently in separate files).

cats <- read.csv(file.path(dir_M, 'git-annex/globalprep/_raw_data/SocialProgressIndex/d2016/spi_categories.csv')) %>%
  mutate(category = gsub(" ", "", category))

files <- list.files(file.path(dir_M, 
          "git-annex/globalprep/_raw_data/SocialProgressIndex/d2017"),
          pattern = "spi_", full = TRUE)

all_spi <- data.frame()

for(spi in files){ #spi = files[1]

spi_data <- read.csv(spi, check.names=FALSE, stringsAsFactors=FALSE) 

names(spi_data) <- gsub(" ", "", names(spi_data))

yr <- substring(basename(spi), 5, 8)

spi_data <- spi_data %>%
  dplyr::select(-CountryCode) %>%
  mutate(year = yr) %>%
  gather('category', 'score', -Country, -year) %>%
  filter(category %in% cats$category) %>%
  left_join(cats, by = "category")

all_spi <- rbind(all_spi, spi_data)

}

3.1 Gapfilling: Step 1

In this case, we use relationships between the subindicators to estimate missing data.

3.1.1 Gapfill bhn indicator

set.seed(227)
bhn_subs <- all_spi %>%
  dplyr::filter(subcategory %in% c("bhn")) %>%
  spread(category, score)

bhn_tmp <- all_spi %>%
  dplyr::filter(subcategory %in% c("bhn_score")) %>%
  select(Country, year, bhn_score=score) %>%
  left_join(bhn_subs, by=c("Country", "year"))


bhn_tmp <- bhn_tmp %>%
rowwise() %>%
  mutate(NA_tot = sum(is.na(NutritionandBasicMedicalCare), is.na(NutritionandBasicMedicalCare), is.na(Shelter),
             is.na(WaterandSanitation)))
hist(bhn_tmp$NA_tot)

table(bhn_tmp$NA_tot)

## 
##   0   1   2   3   4 
## 595  37  35 112 165

#md.pattern(select(bhn_tmp, -(1:4))) # mice package

imputes <- 50
bhn_gf <- aregImpute(~ WaterandSanitation + Shelter + NutritionandBasicMedicalCare + PersonalSafety, 
                         data = bhn_tmp, type = "regression", n.impute = imputes)

## Iteration 1 
Iteration 2 
Iteration 3 
Iteration 4 
Iteration 5 
Iteration 6 
Iteration 7 
Iteration 8 
Iteration 9 
Iteration 10 
Iteration 11 
Iteration 12 
Iteration 13 
Iteration 14 
Iteration 15 
Iteration 16 
Iteration 17 
Iteration 18 
Iteration 19 
Iteration 20 
Iteration 21 
Iteration 22 
Iteration 23 
Iteration 24 
Iteration 25 
Iteration 26 
Iteration 27 
Iteration 28 
Iteration 29 
Iteration 30 
Iteration 31 
Iteration 32 
Iteration 33 
Iteration 34 
Iteration 35 
Iteration 36 
Iteration 37 
Iteration 38 
Iteration 39 
Iteration 40 
Iteration 41 
Iteration 42 
Iteration 43 
Iteration 44 
Iteration 45 
Iteration 46 
Iteration 47 
Iteration 48 
Iteration 49 
Iteration 50 
Iteration 51 
Iteration 52 
Iteration 53

bhn_gf

## 
## Multiple Imputation using Bootstrap and PMM
## 
## aregImpute(formula = ~WaterandSanitation + Shelter + NutritionandBasicMedicalCare + 
##     PersonalSafety, data = bhn_tmp, n.impute = imputes, type = "regression")
## 
## n: 944   p: 4    Imputations: 50     nk: 3 
## 
## Number of NAs:
##           WaterandSanitation                      Shelter 
##                          174                          311 
## NutritionandBasicMedicalCare               PersonalSafety 
##                          309                          322 
## 
##                              type d.f.
## WaterandSanitation              s    2
## Shelter                         s    2
## NutritionandBasicMedicalCare    s    2
## PersonalSafety                  s    1
## 
## Transformation of Target Variables Forced to be Linear
## 
## R-squares for Predicting Non-Missing Values for Each Variable
## Using Last Imputations of Predictors
##           WaterandSanitation                      Shelter 
##                        0.877                        0.901 
## NutritionandBasicMedicalCare               PersonalSafety 
##                        0.848                        0.527

# to get mean and sd of all imputations
impute_scores_all <- data.frame()

for (imp in 1:imputes){ #imp = 1
  
  imputed <- impute.transcan(bhn_gf, imputation=imp, data=bhn_tmp, list.out=TRUE,
                           pr=FALSE, check=FALSE)
  
      subcat_data <- data.frame(imputed)
  
      impute_scores <- data.frame(Country = bhn_tmp$Country, 
                                  year = bhn_tmp$year,
                                  imputation = imp)
      
      impute_scores <- cbind(impute_scores, subcat_data)
    
      impute_scores_all <- rbind(impute_scores_all, impute_scores)
    }

impute_scores_summary <- impute_scores_all %>%
  gather("subcategory", "score", -(1:3)) %>%
  group_by(Country, year, subcategory) %>%
  dplyr::summarize(score_predict = mean(score),
            sd_score_predict = sd(score))

## Warning: attributes are not identical across measure variables;
## they will be dropped

bhn_tmp_long <- bhn_tmp %>%
  select(Country, year, bhn_score, NA_tot, NutritionandBasicMedicalCare, 
         PersonalSafety, Shelter, WaterandSanitation) %>%
  gather("subcategory", "score", -(1:4)) %>%
  left_join(impute_scores_summary, by=c("Country", "year", "subcategory"))

## Warning: Column `Country` joining character vector and factor, coercing
## into character vector

## Warning: Column `year` joining character vector and factor, coercing into
## character vector

summary(bhn_tmp_long)

##    Country              year             bhn_score         NA_tot     
##  Length:3776        Length:3776        Min.   :27.82   Min.   :0.000  
##  Class :character   Class :character   1st Qu.:60.84   1st Qu.:0.000  
##  Mode  :character   Mode  :character   Median :78.43   Median :0.000  
##                                        Mean   :74.37   Mean   :1.168  
##                                        3rd Qu.:88.62   3rd Qu.:3.000  
##                                        Max.   :96.79   Max.   :4.000  
##                                        NA's   :1492                   
##  subcategory            score        score_predict    sd_score_predict
##  Length:3776        Min.   :  6.96   Min.   :  6.96   Min.   : 0.000  
##  Class :character   1st Qu.: 62.45   1st Qu.: 64.29   1st Qu.: 0.000  
##  Mode  :character   Median : 78.86   Median : 77.22   Median : 0.000  
##                     Mean   : 74.80   Mean   : 74.66   Mean   : 3.153  
##                     3rd Qu.: 92.74   3rd Qu.: 90.36   3rd Qu.: 5.971  
##                     Max.   :100.00   Max.   :100.00   Max.   :28.223  
##                     NA's   :1116

bhn_data <- bhn_tmp_long %>%
  mutate(score = ifelse(is.na(score) & NA_tot < 4, score_predict, score)) %>%
  group_by(Country, year) %>%
  dplyr::summarize( #bhn_score_old = mean(bhn_score), # used this to test to make sure all is well
            score = mean(score),
            NA_tot = mean(NA_tot)) %>%
  mutate(subcategory = "bhn")

3.1.2 Gapfill op indicator

set.seed(227)
op_subs <- all_spi %>%
  dplyr::filter(subcategory %in% c("op")) %>%
  spread(category, score)

op_tmp <- all_spi %>%
  dplyr::filter(subcategory %in% c("op_score")) %>%
  select(Country, year, op_score=score) %>%
  left_join(op_subs, by=c("Country", "year"))


op_tmp <- op_tmp %>%
rowwise() %>%
  mutate(NA_tot = sum(is.na(AccesstoAdvancedEducation), is.na(PersonalFreedomandChoice), is.na(PersonalRights),
             is.na(ToleranceandInclusion)))
hist(op_tmp$NA_tot)

table(op_tmp$NA_tot)

## 
##   0   1   2   3   4 
## 585  41  18  30 270

#md.pattern(select(op_tmp, -(1:4))) # mice package

imputes <- 50
op_gf <- aregImpute(~ AccesstoAdvancedEducation + PersonalFreedomandChoice + PersonalRights + ToleranceandInclusion, 
                         data = op_tmp, type = "regression", n.impute = imputes)

## Iteration 1 
Iteration 2 
Iteration 3 
Iteration 4 
Iteration 5 
Iteration 6 
Iteration 7 
Iteration 8 
Iteration 9 
Iteration 10 
Iteration 11 
Iteration 12 
Iteration 13 
Iteration 14 
Iteration 15 
Iteration 16 
Iteration 17 
Iteration 18 
Iteration 19 
Iteration 20 
Iteration 21 
Iteration 22 
Iteration 23 
Iteration 24 
Iteration 25 
Iteration 26 
Iteration 27 
Iteration 28 
Iteration 29 
Iteration 30 
Iteration 31 
Iteration 32 
Iteration 33 
Iteration 34 
Iteration 35 
Iteration 36 
Iteration 37 
Iteration 38 
Iteration 39 
Iteration 40 
Iteration 41 
Iteration 42 
Iteration 43 
Iteration 44 
Iteration 45 
Iteration 46 
Iteration 47 
Iteration 48 
Iteration 49 
Iteration 50 
Iteration 51 
Iteration 52 
Iteration 53

op_gf

## 
## Multiple Imputation using Bootstrap and PMM
## 
## aregImpute(formula = ~AccesstoAdvancedEducation + PersonalFreedomandChoice + 
##     PersonalRights + ToleranceandInclusion, data = op_tmp, n.impute = imputes, 
##     type = "regression")
## 
## n: 944   p: 4    Imputations: 50     nk: 3 
## 
## Number of NAs:
## AccesstoAdvancedEducation  PersonalFreedomandChoice 
##                       301                       313 
##            PersonalRights     ToleranceandInclusion 
##                       298                       335 
## 
##                           type d.f.
## AccesstoAdvancedEducation    s    2
## PersonalFreedomandChoice     s    2
## PersonalRights               s    2
## ToleranceandInclusion        s    1
## 
## Transformation of Target Variables Forced to be Linear
## 
## R-squares for Predicting Non-Missing Values for Each Variable
## Using Last Imputations of Predictors
## AccesstoAdvancedEducation  PersonalFreedomandChoice 
##                     0.591                     0.761 
##            PersonalRights     ToleranceandInclusion 
##                     0.512                     0.640

# to get mean and sd of all imputations
impute_scores_all <- data.frame()

for (imp in 1:imputes){ #imp = 1
  
  imputed <- impute.transcan(op_gf, imputation=imp, data=op_tmp, list.out=TRUE,
                           pr=FALSE, check=FALSE)
  
      subcat_data <- data.frame(imputed)
  
      impute_scores <- data.frame(Country = op_tmp$Country, 
                                  year = op_tmp$year,
                                  imputation = imp)
      
      impute_scores <- cbind(impute_scores, subcat_data)
    
      impute_scores_all <- rbind(impute_scores_all, impute_scores)
    }

impute_scores_summary <- impute_scores_all %>%
  gather("subcategory", "score", -(1:3)) %>%
  group_by(Country, year, subcategory) %>%
  dplyr::summarize(score_predict = mean(score),
            sd_score_predict = sd(score))

## Warning: attributes are not identical across measure variables;
## they will be dropped

op_tmp_long <- op_tmp %>%
  select(Country, year, op_score, NA_tot, AccesstoAdvancedEducation, 
         PersonalFreedomandChoice, PersonalRights, ToleranceandInclusion) %>%
  gather("subcategory", "score", -(1:4)) %>%
  left_join(impute_scores_summary, by=c("Country", "year", "subcategory"))

## Warning: Column `Country` joining character vector and factor, coercing
## into character vector

## Warning: Column `year` joining character vector and factor, coercing into
## character vector

summary(op_tmp_long)

##    Country              year              op_score         NA_tot     
##  Length:3776        Length:3776        Min.   :21.90   Min.   :0.000  
##  Class :character   Class :character   1st Qu.:40.69   1st Qu.:0.000  
##  Mode  :character   Mode  :character   Median :50.11   Median :0.000  
##                                        Mean   :53.36   Mean   :1.321  
##                                        3rd Qu.:63.62   3rd Qu.:4.000  
##                                        Max.   :88.83   Max.   :4.000  
##                                        NA's   :1436                   
##  subcategory            score       score_predict   sd_score_predict
##  Length:3776        Min.   : 3.45   Min.   : 3.45   Min.   : 0.000  
##  Class :character   1st Qu.:38.74   1st Qu.:40.06   1st Qu.: 0.000  
##  Mode  :character   Median :53.13   Median :51.54   Median : 0.000  
##                     Mean   :52.65   Mean   :51.51   Mean   : 4.936  
##                     3rd Qu.:67.96   3rd Qu.:62.77   3rd Qu.:11.090  
##                     Max.   :97.89   Max.   :97.89   Max.   :27.051  
##                     NA's   :1247

op_data <- op_tmp_long %>%
  mutate(score = ifelse(is.na(score) & NA_tot < 4, score_predict, score)) %>%
  group_by(Country, year) %>%
  dplyr::summarize( #op_score_old = mean(op_score), # used this to test to make sure all is well
            score = mean(score),
            NA_tot = mean(NA_tot)) %>%
  mutate(subcategory = "op")

3.1.3 Gapfill fw indicator

set.seed(227)
fw_subs <- all_spi %>%
  dplyr::filter(subcategory %in% c("fw")) %>%
  spread(category, score)

fw_tmp <- all_spi %>%
  dplyr::filter(subcategory %in% c("fw_score")) %>%
  select(Country, year, fw_score=score) %>%
  left_join(fw_subs, by=c("Country", "year"))


fw_tmp <- fw_tmp %>%
rowwise() %>%
  mutate(NA_tot = sum(is.na(AccesstoBasicKnowledge), is.na(AccesstoInformationandCommunications),
                      is.na(EnvironmentalQuality),is.na(HealthandWellness)))
hist(fw_tmp$NA_tot)

table(fw_tmp$NA_tot)

## 
##   0   1   2   3   4 
## 609  60  25  43 207

#md.pattern(select(fw_tmp, -(1:4))) # mice package

imputes <- 50
fw_gf <- aregImpute(~ AccesstoBasicKnowledge + AccesstoInformationandCommunications + EnvironmentalQuality + HealthandWellness, 
                         data = fw_tmp, type = "regression", n.impute = imputes)

## Iteration 1 
Iteration 2 
Iteration 3 
Iteration 4 
Iteration 5 
Iteration 6 
Iteration 7 
Iteration 8 
Iteration 9 
Iteration 10 
Iteration 11 
Iteration 12 
Iteration 13 
Iteration 14 
Iteration 15 
Iteration 16 
Iteration 17 
Iteration 18 
Iteration 19 
Iteration 20 
Iteration 21 
Iteration 22 
Iteration 23 
Iteration 24 
Iteration 25 
Iteration 26 
Iteration 27 
Iteration 28 
Iteration 29 
Iteration 30 
Iteration 31 
Iteration 32 
Iteration 33 
Iteration 34 
Iteration 35 
Iteration 36 
Iteration 37 
Iteration 38 
Iteration 39 
Iteration 40 
Iteration 41 
Iteration 42 
Iteration 43 
Iteration 44 
Iteration 45 
Iteration 46 
Iteration 47 
Iteration 48 
Iteration 49 
Iteration 50 
Iteration 51 
Iteration 52 
Iteration 53

fw_gf

## 
## Multiple Imputation using Bootstrap and PMM
## 
## aregImpute(formula = ~AccesstoBasicKnowledge + AccesstoInformationandCommunications + 
##     EnvironmentalQuality + HealthandWellness, data = fw_tmp, 
##     n.impute = imputes, type = "regression")
## 
## n: 944   p: 4    Imputations: 50     nk: 3 
## 
## Number of NAs:
##               AccesstoBasicKnowledge AccesstoInformationandCommunications 
##                                  285                                  256 
##                 EnvironmentalQuality                    HealthandWellness 
##                                  297                                  229 
## 
##                                      type d.f.
## AccesstoBasicKnowledge                  s    2
## AccesstoInformationandCommunications    s    2
## EnvironmentalQuality                    s    2
## HealthandWellness                       s    1
## 
## Transformation of Target Variables Forced to be Linear
## 
## R-squares for Predicting Non-Missing Values for Each Variable
## Using Last Imputations of Predictors
##               AccesstoBasicKnowledge AccesstoInformationandCommunications 
##                                0.535                                0.630 
##                 EnvironmentalQuality                    HealthandWellness 
##                                0.766                                0.553

# to get mean and sd of all imputations
impute_scores_all <- data.frame()

for (imp in 1:imputes){ #imp = 1
  
  imputed <- impute.transcan(fw_gf, imputation=imp, data=fw_tmp, list.out=TRUE,
                           pr=FALSE, check=FALSE)
  
      subcat_data <- data.frame(imputed)
  
      impute_scores <- data.frame(Country = fw_tmp$Country, 
                                  year = fw_tmp$year,
                                  imputation = imp)
      
      impute_scores <- cbind(impute_scores, subcat_data)
    
      impute_scores_all <- rbind(impute_scores_all, impute_scores)
    }

impute_scores_summary <- impute_scores_all %>%
  gather("subcategory", "score", -(1:3)) %>%
  group_by(Country, year, subcategory) %>%
  dplyr::summarize(score_predict = mean(score),
            sd_score_predict = sd(score))

## Warning: attributes are not identical across measure variables;
## they will be dropped

fw_tmp_long <- fw_tmp %>%
  select(Country, year, fw_score, NA_tot, AccesstoBasicKnowledge, 
         AccesstoInformationandCommunications, EnvironmentalQuality, HealthandWellness) %>%
  gather("subcategory", "score", -(1:4)) %>%
  left_join(impute_scores_summary, by=c("Country", "year", "subcategory"))

## Warning: Column `Country` joining character vector and factor, coercing
## into character vector

## Warning: Column `year` joining character vector and factor, coercing into
## character vector

summary(fw_tmp_long)

##    Country              year              fw_score         NA_tot    
##  Length:3776        Length:3776        Min.   :35.18   Min.   :0.00  
##  Class :character   Class :character   1st Qu.:59.48   1st Qu.:0.00  
##  Mode  :character   Mode  :character   Median :72.52   Median :0.00  
##                                        Mean   :70.50   Mean   :1.13  
##                                        3rd Qu.:80.67   3rd Qu.:3.00  
##                                        Max.   :91.75   Max.   :4.00  
##                                        NA's   :1340                  
##  subcategory            score       score_predict   sd_score_predict
##  Length:3776        Min.   : 7.99   Min.   : 7.99   Min.   : 0.000  
##  Class :character   1st Qu.:57.59   1st Qu.:58.24   1st Qu.: 0.000  
##  Mode  :character   Median :70.36   Median :67.62   Median : 0.000  
##                     Mean   :70.10   Mean   :69.24   Mean   : 3.313  
##                     3rd Qu.:83.65   3rd Qu.:81.55   3rd Qu.: 8.639  
##                     Max.   :99.86   Max.   :99.86   Max.   :19.815  
##                     NA's   :1067

fw_data <- fw_tmp_long %>%
  mutate(score = ifelse(is.na(score) & NA_tot < 4, score_predict, score)) %>%
  group_by(Country, year) %>%
  dplyr::summarize( #fw_score_old = mean(fw_score), # used this to test to make sure all is well
            score = mean(score),
            NA_tot = mean(NA_tot)) %>%
  mutate(subcategory = "fw")

3.2 Combine data to get Social Progress Index

The next step averages the 3 indicators and identifies gapfilling.

spi_calc <- rbind(bhn_data, op_data, fw_data) %>%
  mutate(gapfill = paste(subcategory, NA_tot, sep = "_")) %>%
  mutate(score = ifelse(score > 100, 100, score),
         score = ifelse(score < 0, 0, score)) %>%
  group_by(Country, year) %>%
  dplyr::summarize(score = mean(score),
            method = paste(unique(gapfill), collapse=", "),
            gapfill = sum(NA_tot)) %>%
  mutate(method = ifelse(gapfill == 0, NA, method),
         gapfill = ifelse(gapfill >= 1, 1, 0)) %>%
  data.frame()

# check:
check <- spi_data %>%
  filter(category=="SocialProgressIndex") %>%
  dplyr::select(Country, year, true_score=score) %>%
  left_join(spi_calc)

## Joining, by = c("Country", "year")

plot(check$true_score, check$score)
abline(0,1, col="red")

write.csv(spi_calc, "int/Country_spi.csv", row.names=FALSE)

3.3 Assign countries to OHI regions

spi_calc <- read.csv("int/Country_spi.csv", stringsAsFactors = FALSE)
spi_calc <- spi_calc %>%
  mutate(Country = ifelse(Country=="C<f4>te d'Ivoire", "Cote d'Ivoire", Country),
         Country = ifelse(Country=="R<e9>union", "Reunion", Country),
         Country = ifelse(Country=="Cura<e7>ao", "Curacao", Country),
         Country = ifelse(Country=="Korea, Democratic Republic of", "North Korea", Country),
         Country = ifelse(Country=="St. Helena", "Saint Helena", Country),
         Country = ifelse(Country=="St. Martin", "Northern Saint-Martin", Country))

# Channel Islands are Jersey and Guernsey, but these are already in the data

        
spi_rgn <- name_2_rgn(df_in = spi_calc, 
                       fld_name='Country',
                      flds_unique = c("year"))

## 
## These data were removed for not having any match in the lookup tables:
## 
## Channel Islands    North Cyprus      Somaliland    Vatican City 
##               1               1               1               1 
## 
## These data were removed for not being of the proper rgn_type (eez,ohi_region) or mismatching region names in the lookup tables:
##                           tmp_type
## tmp_name                   disputed landlocked
##   Afghanistan                     0          4
##   Andorra                         0          4
##   Armenia                         0          4
##   Austria                         0          4
##   Belarus                         0          4
##   Bhutan                          0          4
##   Bolivia                         0          4
##   Botswana                        0          4
##   Burkina Faso                    0          4
##   Burundi                         0          4
##   Central African Republic        0          4
##   Chad                            0          4
##   Czech Republic                  0          4
##   Ethiopia                        0          4
##   Hungary                         0          4
##   Kazakhstan                      0          4
##   Kosovo                          0          4
##   Kyrgyzstan                      0          4
##   Laos                            0          4
##   Lesotho                         0          4
##   Liechtenstein                   0          4
##   Luxembourg                      0          4
##   Macedonia                       0          4
##   Malawi                          0          4
##   Mali                            0          4
##   Moldova                         0          4
##   Mongolia                        0          4
##   Nepal                           0          4
##   Niger                           0          4
##   Paraguay                        0          4
##   Rwanda                          0          4
##   San Marino                      0          4
##   Serbia                          0          4
##   Slovakia                        0          4
##   South Sudan                     0          4
##   Swaziland                       0          4
##   Switzerland                     0          4
##   Tajikistan                      0          4
##   Turkmenistan                    0          4
##   Uganda                          0          4
##   Uzbekistan                      0          4
##   West Bank and Gaza              4          0
##   Zambia                          0          4
##   Zimbabwe                        0          4

## 
## DUPLICATES found. Consider using collapse2rgn to collapse duplicates (function in progress).

## [1] "China"                    "Guadeloupe"              
## [3] "Guam"                     "Hong Kong"               
## [5] "Macao"                    "Martinique"              
## [7] "Northern Mariana Islands" "Puerto Rico"             
## [9] "Virgin Islands (U.S.)"

# Weight the following duplicates by population
# Northern Mariana Islands and Guam
# China, Macao, Hong Kong
# Puerto Rico and Virgin Islands (U.S.)
# Guadeloupe and Martinique

pop_weights <- data.frame(Country = c("China", "Hong Kong", "Macao",
                 "Guadeloupe", "Martinique",
                 "Guam", "Northern Mariana Islands",
                 "Puerto Rico", "Virgin Islands (U.S.)"),
                 pop = c(1389315824, 7184000, 614500,
                         404394, 399637,
                         159358, 53833,
                         3411000, 102951))
spi_rgn <- spi_rgn %>%
  left_join(pop_weights, by = "Country") %>%
  mutate(pop = ifelse(is.na(pop), 1, pop)) %>%
  group_by(rgn_id, rgn_name, year) %>%
  dplyr::summarize(score = weighted.mean(score, pop, na.rm=TRUE),
            method = paste(unique(method), collapse=" "),
            gapfill = weighted.mean(gapfill, pop, na.rm=TRUE))%>%
  dplyr::mutate(year = as.numeric(as.character(year)))

## Warning: Column `Country` joining character vector and factor, coercing
## into character vector

4 Compare to WGI

WGI is a couple years behind the SPI, so we will use 2015 WGI for the 2016 and 2017 SPI. There is a strong correlation betwen the WGI and SPI indicators.

wgi <- read.csv('../../prs_res_wgi/v2017/output/wgi_res.csv') %>%
  select(rgn_id, year, wgi_score =resilience_score)

length(unique(wgi$rgn_id))

## [1] 220

wgi_2016 <- wgi %>%
  filter(year == 2015) %>%
  mutate(year = 2016)

wgi_2017 <- wgi %>%
  filter(year == 2015) %>%
  mutate(year = 2017)

wgi <- rbind(wgi, wgi_2016, wgi_2017)

wgi_spi <- wgi %>%
  left_join(spi_rgn, by=c("rgn_id", "year"))

plot(wgi_spi$wgi_score*100, wgi_spi$score, ylab="Social Progress Index, score", xlab="Worldwide Governance Score")
abline(0,1, col="red")

mod <- lm(score ~ wgi_score, data=wgi_spi)
summary(mod)

## 
## Call:
## lm(formula = score ~ wgi_score, data = wgi_spi)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.0691  -3.6754   0.3509   3.8445  18.9330 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  31.9488     0.8011   39.88   <2e-16 ***
## wgi_score    70.6996     1.4979   47.20   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.01 on 503 degrees of freedom
##   (3675 observations deleted due to missingness)
## Multiple R-squared:  0.8158, Adjusted R-squared:  0.8154 
## F-statistic:  2228 on 1 and 503 DF,  p-value: < 2.2e-16

5 Second round of gapfilling

In this case, UN geopolitical regions and WGI scores are used to estimate regions with no data. Based on this analysis, a model that includes third degree polynomial of WGI data and r1 UN geopolitical regions is the best model to predict missing SPI values.

years <- data.frame(year = min(spi_rgn$year):max(spi_rgn$year))

rgns_gf_un <-  georegions %>%
  merge(years) %>%
  left_join(spi_rgn) %>%
  left_join(wgi) %>%
  mutate(r2 = as.factor(r2)) %>%
  mutate(r1 = as.factor(r1))

## Joining, by = c("rgn_id", "year")
## Joining, by = c("rgn_id", "year")

## Compare models to select a gapfilling method
mod1 <- lm(score ~ r2, data = rgns_gf_un, na.action="na.exclude")
mod2 <- lm(score ~ r2 + wgi_score, data = rgns_gf_un, na.action="na.exclude")

mod3 <- lm(score ~ r1, data = rgns_gf_un)
mod4 <- lm(score ~ r1 + wgi_score, data = rgns_gf_un, na.action="na.exclude")

mod5 <- lm(score ~ wgi_score, data = rgns_gf_un)

mod6 <- lm(score ~ r1 + poly(wgi_score, 2), data = rgns_gf_un, na.action = "na.exclude")
mod7 <- lm(score ~ r1 + poly(wgi_score, 3), data = rgns_gf_un, na.action = "na.exclude")

plot(predict(mod2), rgns_gf_un$score)
abline(0,1)

plot(predict(mod4), rgns_gf_un$score)
abline(0,1)

AIC(mod1, mod2, mod3, mod4, mod5, mod6, mod7)

##      df      AIC
## mod1 20 3504.106
## mod2 21 2751.701
## mod3  7 3697.777
## mod4  8 3010.485
## mod5  3 3248.505
## mod6  9 3012.075
## mod7 10 3003.764

## Estimate missing data and gapfill
## need to add this because some R2 regions have no data
r2_regions <- unique(rgns_gf_un$r2[!is.na(rgns_gf_un$score)])
rgns_gf_un$r2 <- ifelse(rgns_gf_un$r2 %in% r2_regions, rgns_gf_un$r2, NA)

r1_regions <- unique(rgns_gf_un$r1[!is.na(rgns_gf_un$score)])
rgns_gf_un$r1 <- ifelse(rgns_gf_un$r1 %in% r1_regions, rgns_gf_un$r1, NA)


mod_gf_r2 <- lm(score ~ r2 + wgi_score, data = rgns_gf_un, na.action = na.exclude)
rgns_gf_un$score_pred_r2 <- predict(mod_gf_r2, newdata = rgns_gf_un[, c("r2", "wgi_score")])

mod_gf_r1 <- lm(score ~ r1 + wgi_score, data = rgns_gf_un, na.action = na.exclude)
rgns_gf_un$score_pred_r1 <- predict(mod_gf_r1, newdata = rgns_gf_un[, c("r1", "wgi_score")])

mod_gf_wgi <- lm(score ~ wgi_score, data = rgns_gf_un, na.action = na.exclude)
rgns_gf_un$score_pred_wgi <- predict(mod_gf_wgi, newdata = data.frame(wgi_score = rgns_gf_un$wgi_score))


rgns_gf <- rgns_gf_un %>%
  mutate(method = ifelse(is.na(score), "UN georgn & WGI", method)) %>%
  mutate(gapfill = ifelse(is.na(score), "1", gapfill)) %>%
  mutate(score = ifelse(is.na(score), score_pred_r2, score)) %>%
  mutate(score = ifelse(is.na(score), score_pred_r1, score)) %>%
  mutate(score = ifelse(is.na(score), score_pred_wgi, score)) %>%
  select(rgn_id, year, score, method, gapfill) %>%
  mutate(score = ifelse(score > 100, 100, score),
         score = ifelse(score < 0, 0, score))

summary(rgns_gf)  # should be no NA values

##      rgn_id            year          score          method         
##  Min.   :  1.00   Min.   :2014   Min.   :36.22   Length:880        
##  1st Qu.: 58.75   1st Qu.:2015   1st Qu.:63.49   Class :character  
##  Median :116.50   Median :2016   Median :72.82   Mode  :character  
##  Mean   :117.64   Mean   :2016   Mean   :70.82                     
##  3rd Qu.:176.25   3rd Qu.:2016   3rd Qu.:80.83                     
##  Max.   :250.00   Max.   :2017   Max.   :93.18                     
##    gapfill         
##  Length:880        
##  Class :character  
##  Mode  :character  
##                    
##                    
##

length(unique(rgns_gf$rgn_id)) # should be 220 regions

## [1] 220

6 Uninhabited regions

These regions will receive an NA for their score (when established population is < 100 people).

uninhab <- read.csv('../../../src/LookupTables/rgn_uninhabited_islands.csv') %>%
  filter(is.na(est_population) | est_population < 100)

rgns_gf_uninhab <- rgns_gf %>%
  mutate(score = ifelse(rgn_id %in% uninhab$rgn_id, NA, score)) %>%
  mutate(gapfill = ifelse(rgn_id %in% uninhab$rgn_id, NA, gapfill))

7 Save final data

gf_data <- rgns_gf_uninhab %>%
  select(rgn_id, year, gapfill, method) %>%
  mutate(gapfill = ifelse(is.na(method), 0, 1))
write.csv(gf_data, "output/spi_gf.csv", row.names=FALSE)


res_data <- rgns_gf_uninhab %>%
  select(rgn_id, year, resilience_score=score) %>%
  mutate(resilience_score = resilience_score/100)
write.csv(res_data, "output/spi_res.csv", row.names=FALSE)

prs_data <- rgns_gf_uninhab %>%
  select(rgn_id, year, pressure_score=score) %>%
  mutate(pressure_score = 1 - (pressure_score/100))
write.csv(prs_data, "output/spi_prs.csv", row.names=FALSE)

OHI: Social Progress Index: Exploring gapfilling