This script generates the “need” layer for the artisanal opportunities goal.
One more year of data.
Downloaded: 08/04/2021
Description:
GDP adjusted per capita by PPP (ppppcgdp) http://data.worldbank.org/indicator/NY.GDP.PCAP.PP.KD Reported at country scale.
GDP per capita based on purchasing power parity (PPP). PPP GDP is gross domestic product converted to international dollars using purchasing power parity rates. An international dollar has the same purchasing power over GDP as the U.S. dollar has in the United States. GDP at purchaser’s prices is the sum of gross value added by all resident producers in the economy plus any product taxes and minus any subsidies not included in the value of the products. It is calculated without making deductions for depreciation of fabricated assets or for depletion and degradation of natural resources. Data are in constant international dollars based on the 2011 ICP round.
Data is available directly to R through the WDI package.
Time range: 1990-2020
::opts_chunk$set(fig.width = 6, fig.height = 4, fig.path = 'figs/', message = FALSE, warning = FALSE)
knitr
library(ohicore) # devtools::install_github('ohi-science/ohicore@dev')
library(dplyr)
library(tidyr)
library(WDI) # install.packages('WDI') ## if the data has been updated you will need to re install the WDI package to get the updated data
library(stringr)
library(readr)
library(here)
library(tidyverse)
library(plotly)
# directory paths and relevant files
source(here('workflow/R/common.R'))
Skip if you have already downloaded data.
# check website to see what years are available
= 1990
yr_start = 2020
yr_end
# get description of variables (NOTE: these descriptions appear out of date, they aren't in sync with the definitions of the World Bank):
<- data.frame(WDI_data[[1]])
indicators
head(indicators)
head(WDI_data)
str(WDI_data)
grep("NY.GDP.PCAP.PP.CD", indicators$indicator), ] # current dollars (influenced by inflation, not being used)
indicators[grep("NY.GDP.PCAP.PP.KD", indicators$indicator), ] # constant dollars. grep helps identify rows to select based on a string. (used this data)
indicators[
# download the data using the WDI package - This is the data we are going to work with. Create a variable for the data frame
<- WDI(country = "all",
gdppcppp_raw indicator = "NY.GDP.PCAP.PP.KD",
start = yr_start, end=yr_end)
summary(gdppcppp_raw)
# check if 'raw', 'intermediate', and 'final' folders exist in the current assessment folder, if not, then create the folder in the working directory
if (!file.exists(here("globalprep/ao/v2021/raw"))){
dir.create(here("globalprep/ao/v2021/raw"))
}
if (!file.exists(here("globalprep/ao/v2021/intermediate"))){
dir.create(here("globalprep/ao/v2021/intermediate"))
}
if (!file.exists(here("globalprep/ao/v2021/output"))){
dir.create(here("globalprep/ao/v2021/output"))
}
<- Sys.Date()
date
#Save the file into the raw folder
write_csv(gdppcppp_raw, here(sprintf('globalprep/ao/v2021/raw/raw_gdppcppp_%s.csv', date))) # Save file with date, as WDI data changes over even short periods of time. For instance, the Mauritania GDP data changed by an order of magnitude over the course of a week. We want to preserve the date it was downloaded so that data is not being overwritten everytime we run the script.
<- "2021-08-04" # update when new data are downloaded.
last_saved_date
<- read_csv(here(sprintf('globalprep/ao/v2021/raw/raw_gdppcppp_%s.csv', last_saved_date)))
new
<- read_csv(here('globalprep/ao/v2020/raw/raw_gdppcppp_2020-07-02.csv')) %>%
old select(country, old_value = NY.GDP.PCAP.PP.KD, year)
<- left_join(new, old, by = c("country", "year")) %>%
compare filter(year==2019)
ggplot(compare, aes(x=NY.GDP.PCAP.PP.KD, y=old_value)) +
geom_point() +
geom_abline(slope = 1, intercept = 0, color="red")
When a country has some values (but not a complete series), a within-country regression model is used to predict the missing values.
# Reorganize to create cells for countries that have missing values for some years
<- "2021-08-04" # update when new data are downloaded.
last_saved_date
<- read_csv(here(sprintf('globalprep/ao/v2021/raw/raw_gdppcppp_%s.csv', last_saved_date))) %>% # change date here also to match filename above
gdppcppp_clean ::select(country, value=NY.GDP.PCAP.PP.KD, year) %>%
dplyr::filter(year >= 2005) %>%
dplyr::spread(year, value) %>%
tidyr# spread to fill in potentially missing values with NA
data.frame() %>% # this will add an X in front of the column names, allowing us to gather the values
::gather(year, value, starts_with("X")) %>%
tidyr::mutate(year = gsub("X", "", year)) %>% #substitute X for "" (nothing) in the column year
dplyr::mutate(year = as.numeric(year)) #convert the year column into a numeric format
dplyr
head(gdppcppp_clean)
summary(gdppcppp_clean) # 441 NAs
# For the first case, if there is only one value use this value for all years
# This is not ideal, but likely better than other forms of gapfilling
<- gdppcppp_clean %>%
gdppcppp_val ::group_by(country) %>%
dplyr::mutate(value_num = sum(!is.na(value))) %>% # counts the numbers of non-missing values for each country (logical TRUEs regarded as one)
dplyr::filter(value_num > 0) %>% # filter out the countries with no data between 2005 and 2019
dplyr::mutate(value_num_gf = ifelse(value_num==1, mean(value, na.rm=TRUE), NA)) %>% # mean() function is used on regions with one year of data, applies that single value to all NAs for that region
dplyr::ungroup() %>%
dplyr::mutate(value = ifelse(is.na(value), value_num_gf, value)) %>% # if no value is missing, leave it, otherwise gapfill; still have NAs where 13 > value_num > 1
dplyr::select(country, year, value, value_num) # select just these columns; to eliminate extraneous value_num_gf column
dplyr
head(gdppcppp_val)
summary(gdppcppp_val)
# Predict values using a linear regression with 'year' as an independent variable
# Create new column with these predicted values
<- gdppcppp_val %>%
gdppcppp_gf ::group_by(country) %>%
dplyr::do({
dplyr<- lm(value ~ year, data =.)
mod <- predict(mod, newdata =.[c('year')]) # value_pred = country-grouped mod$fitted.values?
value_pred data.frame(., value_pred) # do loop applies the model fitting and prediction to each country group
%>%
}) ::ungroup()
dplyr
summary(gdppcppp_gf)
head(gdppcppp_gf)
# Fill in the remaining NA values using the predicted values
<- gdppcppp_gf %>%
gdppcppp_gf2 ::ungroup() %>%
dplyr::mutate(gapfilled = ifelse(is.na(value), 1, 0)) %>% # Create column 'gapfilled', if value is currently NA, it will be gapfilled, indicated by a 1 in the gapfill column
dplyr::mutate(gapfilled = ifelse(value_num == 1, 1, gapfilled)) %>% # if value_num is 1 it was gapfilled previously and gets a 1 in the gapfill column
dplyr::mutate(value = ifelse(is.na(value), value_pred, value)) %>% # if NA in value column, input the value in value_pred column
dplyr::mutate(method = ifelse(gapfilled==1, paste("lm based on N years data:", value_num, sep=" "), NA)) %>% # Create column 'method' that indicates method of gapfilling; this puts message "lm based..." even in some rows gapfilled with one year of data
dplyr::mutate(method = ifelse(value_num == 1, "gapfilled using one year of data", method)) # this overwrites/corrects method "lm based..." for rows actually gapfilled with "one-year of data"" method
dplyr
summary(gdppcppp_gf2) # no more NAs because everything has been gap-filled.
This is performed by taking the natural log of each value and then dividing by the 95th quantile of values across all years (from 2005 to current data year).
# Values at the 95th Quantile or greater are given a rescaled score of '1' (the highest value)
<- gdppcppp_gf2 %>%
gdppcppp_rescale ::mutate(quantile_95 = quantile(value, probs=0.95)) %>% # gives a single value - the 95th quant (v2020=57245.33)
dplyr::mutate(value_stand = value/quantile_95) %>% # where does value scale relative to 95th quantile
dplyr::mutate(value_stand = ifelse(value_stand > 1, 1, value_stand)) %>%
dplyr::select(country, year, value, score=value_stand, gapfilled, method) # rename value_stand 'score'
dplyr
summary(gdppcppp_rescale)
head(gdppcppp_rescale)
# Check to see if scores make sense - anything above reference point (quant_95) should = 1, everything below it should have a value between 0 and 1
<- ggplot(gdppcppp_rescale, aes(x =value , y = score, label = country)) +
rescale_vis geom_point()
ggplotly(rescale_vis)
# Function to add OHI region ID based on country name
<- name_2_rgn(df_in = gdppcppp_rescale,
d_stand_rgn fld_name='country',
flds_unique=c('year'))
## v2021: Lots of error messages about missing regions from lookup table; lots of them are broad areas (e.g. "Arab World" and "fragile regions"), some are landlocked areas like N. Macedonia and Eswatini. Check to make sure there aren't any regions that need to be reported at different scales
## China, Hong Kong and Macao are all reported separately, combine into one
data.frame(filter(d_stand_rgn, rgn_id == 209))
##This should match the duplicate regions
# Combine the duplicate regions (we report these at lower resolution)
# In this case, we take the average score weighted by population.
# I updated this population data set for 2020 using https://population.un.org/wpp/Download/Standard/Population/
<- read_csv(file.path(here::here(), "globalprep/supplementary_information/v2021/pop_weights.csv"))
population_weights
# Weight the `score`, `value`, and `gapfilled` column by population
<- d_stand_rgn %>%
d_stand_rgn ::left_join(population_weights, by="country") %>% # does it make sense to backfill population data with static data?
dplyr::mutate(population = ifelse(is.na(population), 1, population)) %>% # If no value available, input 1 (these values will not change)
dplyr::group_by(rgn_id, year, method, gapfilled) %>%
dplyr::summarize(score = weighted.mean(score, population), # weight the single score value by pop.
dplyrvalue = weighted.mean(value, population)) %>%
ungroup()
# check again:
data.frame(filter(d_stand_rgn, rgn_id == 209))
# Removed `Azerbaijan` (255) because the adjacent body of water is a sea not the ocean - is this not done in names2region?
<- d_stand_rgn %>%
d_stand_rgn filter(rgn_id <= 250)
summary(d_stand_rgn) # no NAs
# save the cleaned gdppcppp for other goals
<- d_stand_rgn %>%
gdppcppp_data select(rgn_id, year, value)
write_csv(gdppcppp_data, here("globalprep/ao/v2021/intermediate/gdppcppp_ohi.csv"))
In this case, we gapfill regions with no data using means based on UN-designated geopolitical levels.
UNgeorgn() # how is this different from Mel's georegion function in ohicore?
head(UNgeorgn)
summary(UNgeorgn)
# Create dataframe pairing each UN geopolitical region id with a year from 2005 to current
# Assign georegion labels to each region for each level (r0, r1, r2)
<- data.frame(year=min(d_stand_rgn$year):max(d_stand_rgn$year)) %>%
d_stand_gf merge(UNgeorgn, by=NULL)
summary(d_stand_gf)
head(d_stand_gf)
# Combine the two data frames by region id and year
# Calculate means across increasing geopolitical levels (e.g. r2, r1), using the highest resolution possible
<- d_stand_gf %>%
d_stand_gf left_join(d_stand_rgn, by = c("rgn_id", "year")) %>%
group_by(r2_label, year) %>%
mutate(r2_value = mean(score, na.rm=TRUE)) %>%
ungroup() %>%
group_by(r1_label, year) %>%
mutate(r1_value = mean(score, na.rm=TRUE)) %>%
ungroup() %>%
group_by(r0_label, year) %>%
mutate(r0_value = mean(score, na.rm=TRUE)) %>%
ungroup()
summary(d_stand_gf)
low_pop()
summary(low_pop)
<- low_pop %>%
low_pop filter(est_population < 3000 | is.na(est_population)) #filter out regions that have populations > 3000 and keep NA values
summary(low_pop)
<- c(low_pop$rgn_id) #make a vector of low population areas
low_pop_vector
# For `score` cells that still have NA values (still several hundred):
# Check to see if r2 has a value, if so use that to gapfill `score`, otherwise use r1, otherwise use r0
<- d_stand_gf %>%
d_stand_gf mutate(gapfilled = ifelse(is.na(score) & !is.na(r2_value), "1", gapfilled)) %>%
mutate(method = ifelse(is.na(score) & !is.na(r2_value), "UN_geopolitical region avg, r2", method)) %>%
mutate(score = ifelse(is.na(score), r2_value, score)) %>%
mutate(gapfilled = ifelse(is.na(score) & !is.na(r1_value), "1", gapfilled)) %>%
mutate(method = ifelse(is.na(score) & !is.na(r1_value), "UN_geopolitical region avg, r1", method)) %>%
mutate(score = ifelse(is.na(score), r1_value, score)) %>%
mutate(gapfilled = ifelse(is.na(score) & !is.na(r0_value), "1", gapfilled)) %>%
mutate(method = ifelse(is.na(score) & !is.na(r0_value), "UN_geopolitical region avg, r0", method)) %>%
mutate(score = ifelse(is.na(score), r0_value, score))
$score[d_stand_gf$rgn_id %in% low_pop_vector] <- NA
d_stand_gf#should now have NA values in score column for low popuation areas
summary(d_stand_gf)
<- d_stand_gf %>%
test filter(rgn_id == 44)
# Save dataframe with adjusted, gapfilled, and rescaled score information
<- d_stand_gf %>%
final select(rgn_id, year, value=score)
write_csv(final, here("globalprep/ao/v2021/output/wb_gdppcppp_rescaled.csv"))
# Save dataframe with gapfilled method and status information
<- d_stand_gf %>%
final_gf select(rgn_id, year, gapfilled, method)
write_csv(final_gf, here("globalprep/ao/v2021/output/wb_gdppcppp_rescaled_gf.csv"))
Use most recent data year shared by current and previous assessment.
<- read.csv(here("globalprep/ao/v2020/output/wb_gdppcppp_rescaled.csv")) %>%
old_gdppcppp ::rename(old_value=value) %>%
dplyr::filter(year == 2019)
dplyrsummary(old_gdppcppp) # 20 NAs
<- read.csv(here("globalprep/ao/v2021/output/wb_gdppcppp_rescaled.csv")) %>%
compare ::filter(year == 2019) %>%
dplyr::left_join(old_gdppcppp, by="rgn_id") %>%
dplyrselect(rgn_id, value, old_value) %>%
mutate(difference = value - old_value)
summary(compare) # still 20 NAs - this is because of converting unpopulated/low population regions to NAs
ggplotly(ggplot(compare, aes(x = value, y = old_value, labels = rgn_id)) +
geom_point() +
geom_abline(slope = 1, intercept = 0, color = "red")) # if anything was off of abline then something went wrong; however in v2019 data were positively skewed and we attributed this to source data changes.
####### Investigating outlier (v2021)
<- old_gdppcppp %>%
old_108 filter(rgn_id == 108)
<- d_stand_gf %>%
new_108 filter(rgn_id == 108) %>%
filter(year < 2019) %>%
rename(new_value = score) %>%
select(rgn_id, year, gapfilled, method, new_value)
<- read_csv(here("globalprep/ao/v2020/output/wb_gdppcppp_rescaled_gf.csv")) %>%
old_gf_108 filter(rgn_id == 108) %>%
left_join(old_108, by = c("rgn_id", "year"))
#### Result: looks like v2020 gapfilled all years of data for some reason, however, this year none of it was gapfilled.
####### Investigating outlier (v2020)
<- old_gdppcppp %>%
old_44 filter(rgn_id == 44)
<- d_stand_gf %>%
new_44 filter(rgn_id == 44) %>%
filter(year < 2020) %>%
rename(new_value = score) %>%
select(rgn_id, year, gapfilled, method, new_value)
<- read_csv(here("globalprep/ao/v2020/output/wb_gdppcppp_rescaled_gf.csv")) %>%
old_gf_44 filter(rgn_id == 44) %>%
left_join(old_44, by = c("rgn_id", "year"))