Summary
This script gapfills the social progress index data and formats it for the OHI global assessment.
Updates
Previously, only one year of data were available, but now there are 4 years of data. Updated method of gapfilling. Now use Hmisc’s functions to gapfill missing data. This method is used to gapfill subcategories with 1 or more subsubcategories of data. We use the mean of 50 permutations using the “regression” method.
Citation: http://www.socialprogressimperative.org/
Stern, S., A. Wares and T. Epner. 2017. Social Progress Index: 2017 Methodology Report.
Source information: http://www.socialprogressimperative.org/global-index/ –> Export
Date: 8/10/2017
Time range: 2014-2017
Native data resolution: country scores
Format: Excel file
Description: Social Progress Index scores and components for countries.
# load libraries, set directories
library(ohicore) #devtools::install_github('ohi-science/ohicore@dev')
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidyr)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## combine, src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
## comment out when knitting
#setwd("globalprep/prs_res_spi/v2017")
### Load FAO-specific user-defined functions
source('../../../src/R/common.R') # directory locations
set.seed(227)
Social Progress Index data
Organize data and gapfill missing countries that have incomplete data. This index is comprised of 3 indicators, which are each comprised of 4 subindicators, which are comprised of several datasets. If one of the subindicators are missing, the SPI is not calculated. The first round of gapfilling involves using relationships between the the subindicators to gapfill missing data. If a region is missing all subindicator data, then a second round of gapfilling is performed using relationships between UN geopolitical regions and the World Governance Indicator to gapfill the SPI score.
The following gets all years of data (currently in separate files).
cats <- read.csv(file.path(dir_M, 'git-annex/globalprep/_raw_data/SocialProgressIndex/d2016/spi_categories.csv')) %>%
mutate(category = gsub(" ", "", category))
files <- list.files(file.path(dir_M,
"git-annex/globalprep/_raw_data/SocialProgressIndex/d2017"),
pattern = "spi_", full = TRUE)
all_spi <- data.frame()
for(spi in files){ #spi = files[1]
spi_data <- read.csv(spi, check.names=FALSE, stringsAsFactors=FALSE)
names(spi_data) <- gsub(" ", "", names(spi_data))
yr <- substring(basename(spi), 5, 8)
spi_data <- spi_data %>%
dplyr::select(-CountryCode) %>%
mutate(year = yr) %>%
gather('category', 'score', -Country, -year) %>%
filter(category %in% cats$category) %>%
left_join(cats, by = "category")
all_spi <- rbind(all_spi, spi_data)
}
Gapfilling: Step 1
In this case, we use relationships between the subindicators to estimate missing data.
Gapfill bhn indicator
set.seed(227)
bhn_subs <- all_spi %>%
dplyr::filter(subcategory %in% c("bhn")) %>%
spread(category, score)
bhn_tmp <- all_spi %>%
dplyr::filter(subcategory %in% c("bhn_score")) %>%
select(Country, year, bhn_score=score) %>%
left_join(bhn_subs, by=c("Country", "year"))
bhn_tmp <- bhn_tmp %>%
rowwise() %>%
mutate(NA_tot = sum(is.na(NutritionandBasicMedicalCare), is.na(NutritionandBasicMedicalCare), is.na(Shelter),
is.na(WaterandSanitation)))
hist(bhn_tmp$NA_tot)

##
## 0 1 2 3 4
## 595 37 35 112 165
#md.pattern(select(bhn_tmp, -(1:4))) # mice package
imputes <- 50
bhn_gf <- aregImpute(~ WaterandSanitation + Shelter + NutritionandBasicMedicalCare + PersonalSafety,
data = bhn_tmp, type = "regression", n.impute = imputes)
## Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
Iteration 10
Iteration 11
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
Iteration 17
Iteration 18
Iteration 19
Iteration 20
Iteration 21
Iteration 22
Iteration 23
Iteration 24
Iteration 25
Iteration 26
Iteration 27
Iteration 28
Iteration 29
Iteration 30
Iteration 31
Iteration 32
Iteration 33
Iteration 34
Iteration 35
Iteration 36
Iteration 37
Iteration 38
Iteration 39
Iteration 40
Iteration 41
Iteration 42
Iteration 43
Iteration 44
Iteration 45
Iteration 46
Iteration 47
Iteration 48
Iteration 49
Iteration 50
Iteration 51
Iteration 52
Iteration 53
##
## Multiple Imputation using Bootstrap and PMM
##
## aregImpute(formula = ~WaterandSanitation + Shelter + NutritionandBasicMedicalCare +
## PersonalSafety, data = bhn_tmp, n.impute = imputes, type = "regression")
##
## n: 944 p: 4 Imputations: 50 nk: 3
##
## Number of NAs:
## WaterandSanitation Shelter
## 174 311
## NutritionandBasicMedicalCare PersonalSafety
## 309 322
##
## type d.f.
## WaterandSanitation s 2
## Shelter s 2
## NutritionandBasicMedicalCare s 2
## PersonalSafety s 1
##
## Transformation of Target Variables Forced to be Linear
##
## R-squares for Predicting Non-Missing Values for Each Variable
## Using Last Imputations of Predictors
## WaterandSanitation Shelter
## 0.877 0.901
## NutritionandBasicMedicalCare PersonalSafety
## 0.848 0.527
# to get mean and sd of all imputations
impute_scores_all <- data.frame()
for (imp in 1:imputes){ #imp = 1
imputed <- impute.transcan(bhn_gf, imputation=imp, data=bhn_tmp, list.out=TRUE,
pr=FALSE, check=FALSE)
subcat_data <- data.frame(imputed)
impute_scores <- data.frame(Country = bhn_tmp$Country,
year = bhn_tmp$year,
imputation = imp)
impute_scores <- cbind(impute_scores, subcat_data)
impute_scores_all <- rbind(impute_scores_all, impute_scores)
}
impute_scores_summary <- impute_scores_all %>%
gather("subcategory", "score", -(1:3)) %>%
group_by(Country, year, subcategory) %>%
dplyr::summarize(score_predict = mean(score),
sd_score_predict = sd(score))
## Warning: attributes are not identical across measure variables;
## they will be dropped
bhn_tmp_long <- bhn_tmp %>%
select(Country, year, bhn_score, NA_tot, NutritionandBasicMedicalCare,
PersonalSafety, Shelter, WaterandSanitation) %>%
gather("subcategory", "score", -(1:4)) %>%
left_join(impute_scores_summary, by=c("Country", "year", "subcategory"))
## Warning: Column `Country` joining character vector and factor, coercing
## into character vector
## Warning: Column `year` joining character vector and factor, coercing into
## character vector
## Country year bhn_score NA_tot
## Length:3776 Length:3776 Min. :27.82 Min. :0.000
## Class :character Class :character 1st Qu.:60.84 1st Qu.:0.000
## Mode :character Mode :character Median :78.43 Median :0.000
## Mean :74.37 Mean :1.168
## 3rd Qu.:88.62 3rd Qu.:3.000
## Max. :96.79 Max. :4.000
## NA's :1492
## subcategory score score_predict sd_score_predict
## Length:3776 Min. : 6.96 Min. : 6.96 Min. : 0.000
## Class :character 1st Qu.: 62.45 1st Qu.: 64.29 1st Qu.: 0.000
## Mode :character Median : 78.86 Median : 77.22 Median : 0.000
## Mean : 74.80 Mean : 74.66 Mean : 3.153
## 3rd Qu.: 92.74 3rd Qu.: 90.36 3rd Qu.: 5.971
## Max. :100.00 Max. :100.00 Max. :28.223
## NA's :1116
bhn_data <- bhn_tmp_long %>%
mutate(score = ifelse(is.na(score) & NA_tot < 4, score_predict, score)) %>%
group_by(Country, year) %>%
dplyr::summarize( #bhn_score_old = mean(bhn_score), # used this to test to make sure all is well
score = mean(score),
NA_tot = mean(NA_tot)) %>%
mutate(subcategory = "bhn")
Gapfill op indicator
set.seed(227)
op_subs <- all_spi %>%
dplyr::filter(subcategory %in% c("op")) %>%
spread(category, score)
op_tmp <- all_spi %>%
dplyr::filter(subcategory %in% c("op_score")) %>%
select(Country, year, op_score=score) %>%
left_join(op_subs, by=c("Country", "year"))
op_tmp <- op_tmp %>%
rowwise() %>%
mutate(NA_tot = sum(is.na(AccesstoAdvancedEducation), is.na(PersonalFreedomandChoice), is.na(PersonalRights),
is.na(ToleranceandInclusion)))
hist(op_tmp$NA_tot)

##
## 0 1 2 3 4
## 585 41 18 30 270
#md.pattern(select(op_tmp, -(1:4))) # mice package
imputes <- 50
op_gf <- aregImpute(~ AccesstoAdvancedEducation + PersonalFreedomandChoice + PersonalRights + ToleranceandInclusion,
data = op_tmp, type = "regression", n.impute = imputes)
## Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
Iteration 10
Iteration 11
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
Iteration 17
Iteration 18
Iteration 19
Iteration 20
Iteration 21
Iteration 22
Iteration 23
Iteration 24
Iteration 25
Iteration 26
Iteration 27
Iteration 28
Iteration 29
Iteration 30
Iteration 31
Iteration 32
Iteration 33
Iteration 34
Iteration 35
Iteration 36
Iteration 37
Iteration 38
Iteration 39
Iteration 40
Iteration 41
Iteration 42
Iteration 43
Iteration 44
Iteration 45
Iteration 46
Iteration 47
Iteration 48
Iteration 49
Iteration 50
Iteration 51
Iteration 52
Iteration 53
##
## Multiple Imputation using Bootstrap and PMM
##
## aregImpute(formula = ~AccesstoAdvancedEducation + PersonalFreedomandChoice +
## PersonalRights + ToleranceandInclusion, data = op_tmp, n.impute = imputes,
## type = "regression")
##
## n: 944 p: 4 Imputations: 50 nk: 3
##
## Number of NAs:
## AccesstoAdvancedEducation PersonalFreedomandChoice
## 301 313
## PersonalRights ToleranceandInclusion
## 298 335
##
## type d.f.
## AccesstoAdvancedEducation s 2
## PersonalFreedomandChoice s 2
## PersonalRights s 2
## ToleranceandInclusion s 1
##
## Transformation of Target Variables Forced to be Linear
##
## R-squares for Predicting Non-Missing Values for Each Variable
## Using Last Imputations of Predictors
## AccesstoAdvancedEducation PersonalFreedomandChoice
## 0.591 0.761
## PersonalRights ToleranceandInclusion
## 0.512 0.640
# to get mean and sd of all imputations
impute_scores_all <- data.frame()
for (imp in 1:imputes){ #imp = 1
imputed <- impute.transcan(op_gf, imputation=imp, data=op_tmp, list.out=TRUE,
pr=FALSE, check=FALSE)
subcat_data <- data.frame(imputed)
impute_scores <- data.frame(Country = op_tmp$Country,
year = op_tmp$year,
imputation = imp)
impute_scores <- cbind(impute_scores, subcat_data)
impute_scores_all <- rbind(impute_scores_all, impute_scores)
}
impute_scores_summary <- impute_scores_all %>%
gather("subcategory", "score", -(1:3)) %>%
group_by(Country, year, subcategory) %>%
dplyr::summarize(score_predict = mean(score),
sd_score_predict = sd(score))
## Warning: attributes are not identical across measure variables;
## they will be dropped
op_tmp_long <- op_tmp %>%
select(Country, year, op_score, NA_tot, AccesstoAdvancedEducation,
PersonalFreedomandChoice, PersonalRights, ToleranceandInclusion) %>%
gather("subcategory", "score", -(1:4)) %>%
left_join(impute_scores_summary, by=c("Country", "year", "subcategory"))
## Warning: Column `Country` joining character vector and factor, coercing
## into character vector
## Warning: Column `year` joining character vector and factor, coercing into
## character vector
## Country year op_score NA_tot
## Length:3776 Length:3776 Min. :21.90 Min. :0.000
## Class :character Class :character 1st Qu.:40.69 1st Qu.:0.000
## Mode :character Mode :character Median :50.11 Median :0.000
## Mean :53.36 Mean :1.321
## 3rd Qu.:63.62 3rd Qu.:4.000
## Max. :88.83 Max. :4.000
## NA's :1436
## subcategory score score_predict sd_score_predict
## Length:3776 Min. : 3.45 Min. : 3.45 Min. : 0.000
## Class :character 1st Qu.:38.74 1st Qu.:40.06 1st Qu.: 0.000
## Mode :character Median :53.13 Median :51.54 Median : 0.000
## Mean :52.65 Mean :51.51 Mean : 4.936
## 3rd Qu.:67.96 3rd Qu.:62.77 3rd Qu.:11.090
## Max. :97.89 Max. :97.89 Max. :27.051
## NA's :1247
op_data <- op_tmp_long %>%
mutate(score = ifelse(is.na(score) & NA_tot < 4, score_predict, score)) %>%
group_by(Country, year) %>%
dplyr::summarize( #op_score_old = mean(op_score), # used this to test to make sure all is well
score = mean(score),
NA_tot = mean(NA_tot)) %>%
mutate(subcategory = "op")
Gapfill fw indicator
set.seed(227)
fw_subs <- all_spi %>%
dplyr::filter(subcategory %in% c("fw")) %>%
spread(category, score)
fw_tmp <- all_spi %>%
dplyr::filter(subcategory %in% c("fw_score")) %>%
select(Country, year, fw_score=score) %>%
left_join(fw_subs, by=c("Country", "year"))
fw_tmp <- fw_tmp %>%
rowwise() %>%
mutate(NA_tot = sum(is.na(AccesstoBasicKnowledge), is.na(AccesstoInformationandCommunications),
is.na(EnvironmentalQuality),is.na(HealthandWellness)))
hist(fw_tmp$NA_tot)

##
## 0 1 2 3 4
## 609 60 25 43 207
#md.pattern(select(fw_tmp, -(1:4))) # mice package
imputes <- 50
fw_gf <- aregImpute(~ AccesstoBasicKnowledge + AccesstoInformationandCommunications + EnvironmentalQuality + HealthandWellness,
data = fw_tmp, type = "regression", n.impute = imputes)
## Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
Iteration 10
Iteration 11
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
Iteration 17
Iteration 18
Iteration 19
Iteration 20
Iteration 21
Iteration 22
Iteration 23
Iteration 24
Iteration 25
Iteration 26
Iteration 27
Iteration 28
Iteration 29
Iteration 30
Iteration 31
Iteration 32
Iteration 33
Iteration 34
Iteration 35
Iteration 36
Iteration 37
Iteration 38
Iteration 39
Iteration 40
Iteration 41
Iteration 42
Iteration 43
Iteration 44
Iteration 45
Iteration 46
Iteration 47
Iteration 48
Iteration 49
Iteration 50
Iteration 51
Iteration 52
Iteration 53
##
## Multiple Imputation using Bootstrap and PMM
##
## aregImpute(formula = ~AccesstoBasicKnowledge + AccesstoInformationandCommunications +
## EnvironmentalQuality + HealthandWellness, data = fw_tmp,
## n.impute = imputes, type = "regression")
##
## n: 944 p: 4 Imputations: 50 nk: 3
##
## Number of NAs:
## AccesstoBasicKnowledge AccesstoInformationandCommunications
## 285 256
## EnvironmentalQuality HealthandWellness
## 297 229
##
## type d.f.
## AccesstoBasicKnowledge s 2
## AccesstoInformationandCommunications s 2
## EnvironmentalQuality s 2
## HealthandWellness s 1
##
## Transformation of Target Variables Forced to be Linear
##
## R-squares for Predicting Non-Missing Values for Each Variable
## Using Last Imputations of Predictors
## AccesstoBasicKnowledge AccesstoInformationandCommunications
## 0.535 0.630
## EnvironmentalQuality HealthandWellness
## 0.766 0.553
# to get mean and sd of all imputations
impute_scores_all <- data.frame()
for (imp in 1:imputes){ #imp = 1
imputed <- impute.transcan(fw_gf, imputation=imp, data=fw_tmp, list.out=TRUE,
pr=FALSE, check=FALSE)
subcat_data <- data.frame(imputed)
impute_scores <- data.frame(Country = fw_tmp$Country,
year = fw_tmp$year,
imputation = imp)
impute_scores <- cbind(impute_scores, subcat_data)
impute_scores_all <- rbind(impute_scores_all, impute_scores)
}
impute_scores_summary <- impute_scores_all %>%
gather("subcategory", "score", -(1:3)) %>%
group_by(Country, year, subcategory) %>%
dplyr::summarize(score_predict = mean(score),
sd_score_predict = sd(score))
## Warning: attributes are not identical across measure variables;
## they will be dropped
fw_tmp_long <- fw_tmp %>%
select(Country, year, fw_score, NA_tot, AccesstoBasicKnowledge,
AccesstoInformationandCommunications, EnvironmentalQuality, HealthandWellness) %>%
gather("subcategory", "score", -(1:4)) %>%
left_join(impute_scores_summary, by=c("Country", "year", "subcategory"))
## Warning: Column `Country` joining character vector and factor, coercing
## into character vector
## Warning: Column `year` joining character vector and factor, coercing into
## character vector
## Country year fw_score NA_tot
## Length:3776 Length:3776 Min. :35.18 Min. :0.00
## Class :character Class :character 1st Qu.:59.48 1st Qu.:0.00
## Mode :character Mode :character Median :72.52 Median :0.00
## Mean :70.50 Mean :1.13
## 3rd Qu.:80.67 3rd Qu.:3.00
## Max. :91.75 Max. :4.00
## NA's :1340
## subcategory score score_predict sd_score_predict
## Length:3776 Min. : 7.99 Min. : 7.99 Min. : 0.000
## Class :character 1st Qu.:57.59 1st Qu.:58.24 1st Qu.: 0.000
## Mode :character Median :70.36 Median :67.62 Median : 0.000
## Mean :70.10 Mean :69.24 Mean : 3.313
## 3rd Qu.:83.65 3rd Qu.:81.55 3rd Qu.: 8.639
## Max. :99.86 Max. :99.86 Max. :19.815
## NA's :1067
fw_data <- fw_tmp_long %>%
mutate(score = ifelse(is.na(score) & NA_tot < 4, score_predict, score)) %>%
group_by(Country, year) %>%
dplyr::summarize( #fw_score_old = mean(fw_score), # used this to test to make sure all is well
score = mean(score),
NA_tot = mean(NA_tot)) %>%
mutate(subcategory = "fw")
Combine data to get Social Progress Index
The next step averages the 3 indicators and identifies gapfilling.
spi_calc <- rbind(bhn_data, op_data, fw_data) %>%
mutate(gapfill = paste(subcategory, NA_tot, sep = "_")) %>%
mutate(score = ifelse(score > 100, 100, score),
score = ifelse(score < 0, 0, score)) %>%
group_by(Country, year) %>%
dplyr::summarize(score = mean(score),
method = paste(unique(gapfill), collapse=", "),
gapfill = sum(NA_tot)) %>%
mutate(method = ifelse(gapfill == 0, NA, method),
gapfill = ifelse(gapfill >= 1, 1, 0)) %>%
data.frame()
# check:
check <- spi_data %>%
filter(category=="SocialProgressIndex") %>%
dplyr::select(Country, year, true_score=score) %>%
left_join(spi_calc)
## Joining, by = c("Country", "year")
plot(check$true_score, check$score)
abline(0,1, col="red")

write.csv(spi_calc, "int/Country_spi.csv", row.names=FALSE)
Assign countries to OHI regions
spi_calc <- read.csv("int/Country_spi.csv", stringsAsFactors = FALSE)
spi_calc <- spi_calc %>%
mutate(Country = ifelse(Country=="C<f4>te d'Ivoire", "Cote d'Ivoire", Country),
Country = ifelse(Country=="R<e9>union", "Reunion", Country),
Country = ifelse(Country=="Cura<e7>ao", "Curacao", Country),
Country = ifelse(Country=="Korea, Democratic Republic of", "North Korea", Country),
Country = ifelse(Country=="St. Helena", "Saint Helena", Country),
Country = ifelse(Country=="St. Martin", "Northern Saint-Martin", Country))
# Channel Islands are Jersey and Guernsey, but these are already in the data
spi_rgn <- name_2_rgn(df_in = spi_calc,
fld_name='Country',
flds_unique = c("year"))
##
## These data were removed for not having any match in the lookup tables:
##
## Channel Islands North Cyprus Somaliland Vatican City
## 1 1 1 1
##
## These data were removed for not being of the proper rgn_type (eez,ohi_region) or mismatching region names in the lookup tables:
## tmp_type
## tmp_name disputed landlocked
## Afghanistan 0 4
## Andorra 0 4
## Armenia 0 4
## Austria 0 4
## Belarus 0 4
## Bhutan 0 4
## Bolivia 0 4
## Botswana 0 4
## Burkina Faso 0 4
## Burundi 0 4
## Central African Republic 0 4
## Chad 0 4
## Czech Republic 0 4
## Ethiopia 0 4
## Hungary 0 4
## Kazakhstan 0 4
## Kosovo 0 4
## Kyrgyzstan 0 4
## Laos 0 4
## Lesotho 0 4
## Liechtenstein 0 4
## Luxembourg 0 4
## Macedonia 0 4
## Malawi 0 4
## Mali 0 4
## Moldova 0 4
## Mongolia 0 4
## Nepal 0 4
## Niger 0 4
## Paraguay 0 4
## Rwanda 0 4
## San Marino 0 4
## Serbia 0 4
## Slovakia 0 4
## South Sudan 0 4
## Swaziland 0 4
## Switzerland 0 4
## Tajikistan 0 4
## Turkmenistan 0 4
## Uganda 0 4
## Uzbekistan 0 4
## West Bank and Gaza 4 0
## Zambia 0 4
## Zimbabwe 0 4
##
## DUPLICATES found. Consider using collapse2rgn to collapse duplicates (function in progress).
## [1] "China" "Guadeloupe"
## [3] "Guam" "Hong Kong"
## [5] "Macao" "Martinique"
## [7] "Northern Mariana Islands" "Puerto Rico"
## [9] "Virgin Islands (U.S.)"
# Weight the following duplicates by population
# Northern Mariana Islands and Guam
# China, Macao, Hong Kong
# Puerto Rico and Virgin Islands (U.S.)
# Guadeloupe and Martinique
pop_weights <- data.frame(Country = c("China", "Hong Kong", "Macao",
"Guadeloupe", "Martinique",
"Guam", "Northern Mariana Islands",
"Puerto Rico", "Virgin Islands (U.S.)"),
pop = c(1389315824, 7184000, 614500,
404394, 399637,
159358, 53833,
3411000, 102951))
spi_rgn <- spi_rgn %>%
left_join(pop_weights, by = "Country") %>%
mutate(pop = ifelse(is.na(pop), 1, pop)) %>%
group_by(rgn_id, rgn_name, year) %>%
dplyr::summarize(score = weighted.mean(score, pop, na.rm=TRUE),
method = paste(unique(method), collapse=" "),
gapfill = weighted.mean(gapfill, pop, na.rm=TRUE))%>%
dplyr::mutate(year = as.numeric(as.character(year)))
## Warning: Column `Country` joining character vector and factor, coercing
## into character vector
Compare to WGI
WGI is a couple years behind the SPI, so we will use 2015 WGI for the 2016 and 2017 SPI. There is a strong correlation betwen the WGI and SPI indicators.
wgi <- read.csv('../../prs_res_wgi/v2017/output/wgi_res.csv') %>%
select(rgn_id, year, wgi_score =resilience_score)
length(unique(wgi$rgn_id))
## [1] 220
wgi_2016 <- wgi %>%
filter(year == 2015) %>%
mutate(year = 2016)
wgi_2017 <- wgi %>%
filter(year == 2015) %>%
mutate(year = 2017)
wgi <- rbind(wgi, wgi_2016, wgi_2017)
wgi_spi <- wgi %>%
left_join(spi_rgn, by=c("rgn_id", "year"))
plot(wgi_spi$wgi_score*100, wgi_spi$score, ylab="Social Progress Index, score", xlab="Worldwide Governance Score")
abline(0,1, col="red")

mod <- lm(score ~ wgi_score, data=wgi_spi)
summary(mod)
##
## Call:
## lm(formula = score ~ wgi_score, data = wgi_spi)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.0691 -3.6754 0.3509 3.8445 18.9330
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.9488 0.8011 39.88 <2e-16 ***
## wgi_score 70.6996 1.4979 47.20 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.01 on 503 degrees of freedom
## (3675 observations deleted due to missingness)
## Multiple R-squared: 0.8158, Adjusted R-squared: 0.8154
## F-statistic: 2228 on 1 and 503 DF, p-value: < 2.2e-16
Second round of gapfilling
In this case, UN geopolitical regions and WGI scores are used to estimate regions with no data. Based on this analysis, a model that includes third degree polynomial of WGI data and r1 UN geopolitical regions is the best model to predict missing SPI values.
years <- data.frame(year = min(spi_rgn$year):max(spi_rgn$year))
rgns_gf_un <- georegions %>%
merge(years) %>%
left_join(spi_rgn) %>%
left_join(wgi) %>%
mutate(r2 = as.factor(r2)) %>%
mutate(r1 = as.factor(r1))
## Joining, by = c("rgn_id", "year")
## Joining, by = c("rgn_id", "year")
## Compare models to select a gapfilling method
mod1 <- lm(score ~ r2, data = rgns_gf_un, na.action="na.exclude")
mod2 <- lm(score ~ r2 + wgi_score, data = rgns_gf_un, na.action="na.exclude")
mod3 <- lm(score ~ r1, data = rgns_gf_un)
mod4 <- lm(score ~ r1 + wgi_score, data = rgns_gf_un, na.action="na.exclude")
mod5 <- lm(score ~ wgi_score, data = rgns_gf_un)
mod6 <- lm(score ~ r1 + poly(wgi_score, 2), data = rgns_gf_un, na.action = "na.exclude")
mod7 <- lm(score ~ r1 + poly(wgi_score, 3), data = rgns_gf_un, na.action = "na.exclude")
plot(predict(mod2), rgns_gf_un$score)
abline(0,1)

plot(predict(mod4), rgns_gf_un$score)
abline(0,1)

AIC(mod1, mod2, mod3, mod4, mod5, mod6, mod7)
## df AIC
## mod1 20 3504.106
## mod2 21 2751.701
## mod3 7 3697.777
## mod4 8 3010.485
## mod5 3 3248.505
## mod6 9 3012.075
## mod7 10 3003.764
## Estimate missing data and gapfill
## need to add this because some R2 regions have no data
r2_regions <- unique(rgns_gf_un$r2[!is.na(rgns_gf_un$score)])
rgns_gf_un$r2 <- ifelse(rgns_gf_un$r2 %in% r2_regions, rgns_gf_un$r2, NA)
r1_regions <- unique(rgns_gf_un$r1[!is.na(rgns_gf_un$score)])
rgns_gf_un$r1 <- ifelse(rgns_gf_un$r1 %in% r1_regions, rgns_gf_un$r1, NA)
mod_gf_r2 <- lm(score ~ r2 + wgi_score, data = rgns_gf_un, na.action = na.exclude)
rgns_gf_un$score_pred_r2 <- predict(mod_gf_r2, newdata = rgns_gf_un[, c("r2", "wgi_score")])
mod_gf_r1 <- lm(score ~ r1 + wgi_score, data = rgns_gf_un, na.action = na.exclude)
rgns_gf_un$score_pred_r1 <- predict(mod_gf_r1, newdata = rgns_gf_un[, c("r1", "wgi_score")])
mod_gf_wgi <- lm(score ~ wgi_score, data = rgns_gf_un, na.action = na.exclude)
rgns_gf_un$score_pred_wgi <- predict(mod_gf_wgi, newdata = data.frame(wgi_score = rgns_gf_un$wgi_score))
rgns_gf <- rgns_gf_un %>%
mutate(method = ifelse(is.na(score), "UN georgn & WGI", method)) %>%
mutate(gapfill = ifelse(is.na(score), "1", gapfill)) %>%
mutate(score = ifelse(is.na(score), score_pred_r2, score)) %>%
mutate(score = ifelse(is.na(score), score_pred_r1, score)) %>%
mutate(score = ifelse(is.na(score), score_pred_wgi, score)) %>%
select(rgn_id, year, score, method, gapfill) %>%
mutate(score = ifelse(score > 100, 100, score),
score = ifelse(score < 0, 0, score))
summary(rgns_gf) # should be no NA values
## rgn_id year score method
## Min. : 1.00 Min. :2014 Min. :36.22 Length:880
## 1st Qu.: 58.75 1st Qu.:2015 1st Qu.:63.49 Class :character
## Median :116.50 Median :2016 Median :72.82 Mode :character
## Mean :117.64 Mean :2016 Mean :70.82
## 3rd Qu.:176.25 3rd Qu.:2016 3rd Qu.:80.83
## Max. :250.00 Max. :2017 Max. :93.18
## gapfill
## Length:880
## Class :character
## Mode :character
##
##
##
length(unique(rgns_gf$rgn_id)) # should be 220 regions
## [1] 220
Uninhabited regions
These regions will receive an NA for their score (when established population is < 100 people).
uninhab <- read.csv('../../../src/LookupTables/rgn_uninhabited_islands.csv') %>%
filter(is.na(est_population) | est_population < 100)
rgns_gf_uninhab <- rgns_gf %>%
mutate(score = ifelse(rgn_id %in% uninhab$rgn_id, NA, score)) %>%
mutate(gapfill = ifelse(rgn_id %in% uninhab$rgn_id, NA, gapfill))
Save final data
gf_data <- rgns_gf_uninhab %>%
select(rgn_id, year, gapfill, method) %>%
mutate(gapfill = ifelse(is.na(method), 0, 1))
write.csv(gf_data, "output/spi_gf.csv", row.names=FALSE)
res_data <- rgns_gf_uninhab %>%
select(rgn_id, year, resilience_score=score) %>%
mutate(resilience_score = resilience_score/100)
write.csv(res_data, "output/spi_res.csv", row.names=FALSE)
prs_data <- rgns_gf_uninhab %>%
select(rgn_id, year, pressure_score=score) %>%
mutate(pressure_score = 1 - (pressure_score/100))
write.csv(prs_data, "output/spi_prs.csv", row.names=FALSE)
3 Social Progress Index data
Organize data and gapfill missing countries that have incomplete data. This index is comprised of 3 indicators, which are each comprised of 4 subindicators, which are comprised of several datasets. If one of the subindicators are missing, the SPI is not calculated. The first round of gapfilling involves using relationships between the the subindicators to gapfill missing data. If a region is missing all subindicator data, then a second round of gapfilling is performed using relationships between UN geopolitical regions and the World Governance Indicator to gapfill the SPI score.
The following gets all years of data (currently in separate files).
3.1 Gapfilling: Step 1
In this case, we use relationships between the subindicators to estimate missing data.
3.1.1 Gapfill bhn indicator
3.1.2 Gapfill op indicator
3.1.3 Gapfill fw indicator
3.2 Combine data to get Social Progress Index
The next step averages the 3 indicators and identifies gapfilling.
3.3 Assign countries to OHI regions