1 Summary

This script generates the “need” layer for the artisanal opportunities goal. The “access” layer, which is not updated due to a lack of a data source, is located here: globalprep/res_mora_ao/v2013/data/r_mora_s4_2013a.csv.

1.1 Updates from previous assessment

One more year of data

1.2 Data Source

Downloaded: 7/23/2018

Description:
GDP adjusted per capita by PPP (ppppcgdp) http://data.worldbank.org/indicator/NY.GDP.PCAP.PP.KD Reported at country scale.

GDP per capita based on purchasing power parity (PPP). PPP GDP is gross domestic product converted to international dollars using purchasing power parity rates. An international dollar has the same purchasing power over GDP as the U.S. dollar has in the United States. GDP at purchaser’s prices is the sum of gross value added by all resident producers in the economy plus any product taxes and minus any subsidies not included in the value of the products. It is calculated without making deductions for depreciation of fabricated assets or for depletion and degradation of natural resources. Data are in constant international dollars based on the 2011 ICP round.

Data is available directly to R throught the WDI package.

Time range: 1990-2017

2 Methods

2.1 Setup

knitr::opts_chunk$set(fig.width = 6, fig.height = 4, fig.path = 'figs/', message = FALSE, warning = FALSE)

# setting up provenance
# devtools::install_github('oharac/provRmd')
# library(provRmd)
# prov_setup()

library(ohicore) # devtools::install_github('ohi-science/ohicore@dev')
library(dplyr)
library(tidyr)
library(WDI) # install.packages('WDI')
library(stringr)
library(readr)

# comment out when knitting:
# setwd("globalprep/ao/v2018")

# directory paths
source('https://rawgit.com/OHI-Science/ohiprep_v2018/master/src/R/common.R')

2.2 Download and save data

# check website to see what years are available
yr_start = 1990
yr_end   = 2017


# get description of variables (NOTE: these descriptions appear out of date, they aren't in sync with the definitions of the World Bank):
indicators <-  data.frame(WDI_data[[1]])

head(indicators)

# indicators[grep("NY.GDP.PCAP.PP.CD", indicators$indicator), ]  # current dollars (influenced by inflation, not being used)
indicators[grep("NY.GDP.PCAP.PP.KD", indicators$indicator), ]  # constant dollars. grep helps identify rows to select based on a string. (used this data)


# download the data using the WDI package - This is the data we are going to work with. Create a variable for the data frame

gdppcppp_raw <-  WDI(country = "all",
               indicator = "NY.GDP.PCAP.PP.KD", 
               start = yr_start, end=yr_end)


# check if 'raw', 'intermediate', and 'final' exists, if not, then create the folder in the working directory
if (!file.exists("raw")){
  dir.create("raw")
} 

if (!file.exists("intermediate")){
  dir.create("intermediate")
} 

if (!file.exists("output")){
  dir.create("output")
} 

#Save the file into the raw folder
write.csv(gdppcppp_raw, 'raw/raw_gdppcppp.csv', row.names=FALSE) # once you've made sure the 'raw' folder exists, you can save your file

2.3 Gapfilling 1: Linear Regression within a country’s data

When a country has some values (but not a complete series), a within country regression model is used to predict the missing values.

# Reorganize to create cells for countries that have missing values for some years
d <- read.csv('raw/raw_gdppcppp.csv') %>%
  dplyr::select(country, value=NY.GDP.PCAP.PP.KD, year) %>%
  dplyr::filter(year >= 2005) %>%
  tidyr::spread(year, value) %>%
    # spread to fill in potentially missing values with NA
  data.frame() %>% # this will add an X in front of the column names, allowing us to gather the values
  tidyr::gather(year, value, starts_with("X")) %>%
  dplyr::mutate(year = gsub("X", "", year)) %>% #substitute X for "" (nothing) in the column year
  dplyr::mutate(year = as.numeric(year)) #convert the year column into a numeric format
  

# For the first case, if there is only one value use this value for all years
# This is not ideal, but likely better than other forms of gapfilling
d <- d %>%
  dplyr::group_by(country) %>%
  dplyr::mutate(value_num = sum(!is.na(value))) %>% # counts the numbers of non-missing values for each country (logical TRUEs regarded as one)
  dplyr::filter(value_num > 0) %>%    # filter out the countries with no data between 2005 and 2017
  dplyr::mutate(value_num_gf = ifelse(value_num==1, mean(value, na.rm=TRUE), NA)) %>%  # mean() function is just used to get the single value and remove all NAs in that group
  dplyr::ungroup() %>%
  dplyr::mutate(value = ifelse(is.na(value), value_num_gf, value)) %>% # if no value is missing, leave it, otherwise gapfill; still have NAs where 13 > value_num > 1
  dplyr::select(country, year, value, value_num)  # select just these columns; to eliminate extraneous value_num_gf column
 
  
  
# Predict values using a linear regression with 'year' as an independent variable 
# Create new column with these predicted values
d_gf <- d %>%
  dplyr::group_by(country) %>%
  dplyr::do({ 
    mod <- lm(value ~ year, data =.)
    value_pred <- predict(mod, newdata =.[c('year')]) # value_pred = country-grouped mod$fitted.values?
    data.frame(., value_pred) # do loop applies the model fitting and prediction to each country group
  }) %>% 
  dplyr::ungroup()


# Fill in the remaining NA values using the predicted values

d_gf <- d_gf %>%
  dplyr::ungroup() %>%
  dplyr::mutate(gapfilled = ifelse(is.na(value), 1, 0)) %>% # Create column 'gapfilled', if value is currently NA, it will be gapfilled, indicated by a 1 in the gapfill column
  dplyr::mutate(gapfilled = ifelse(value_num == 1, 1, gapfilled)) %>% # if value_num is 1 it was gapfilled previously and gets a 1 in the gapfill column
  dplyr::mutate(value = ifelse(is.na(value), value_pred, value)) %>% # if NA in value column, input the value in value_pred column
  dplyr::mutate(method = ifelse(gapfilled==1, paste("lm based on N years data:", value_num, sep=" "), NA)) %>% # Create column 'method' that indicates method of gapfilling; this puts message "lm based..." even in some rows gapfilled with one year of data
  dplyr::mutate(method = ifelse(value_num == 1, "gapfilled using one year of data", method)) # this overwrites/corrects method "lm based..." for rows actually gapfilled with "one-year of data"" method

2.4 Calculate rescaled values

This is performed by taking the natural log of each value and then dividing by the 95th quantile of values across all years (from 2005 to 2017).

# Values at the 95th Quantile or greater are given a rescaled score of '1' (the highest value)
d_rescale <- d_gf %>%
  dplyr::mutate(quantile_95 = quantile(value, probs=0.95)) %>% # gives a single value - the 95th quant
  dplyr::mutate(value_stand = value/quantile_95) %>% # where does value scale relative to 95th quantile
  dplyr::mutate(value_stand = ifelse(value_stand > 1, 1, value_stand)) %>% 
  dplyr::select(country, year, value, score=value_stand, gapfilled, method) # rename value_stand 'score'


##check the data with some plots
# plotData <- d_rescale %>%
#   dplyr::mutate(need = 1-score) %>%
#   dplyr::select(country, year, need)
# 
# library(googleVis)
# 
# Motion = gvisMotionChart(plotData,idvar="country", timevar="year")
# 
# plot(Motion)
# print(Motion, file = file.path('ao_need_95q.html'))

2.5 Convert country names to ohi regions

# Function to add OHI region ID based on country name
d_stand_rgn <- name_2_rgn(df_in = d_rescale, 
                       fld_name='country', 
                       flds_unique=c('year')) # cannot load look up tables. 

##Check if removed countries/regions are acording to what is expected.


# Combine the duplicate regions (we report these at lower resolution)
# In this case, we take the average score weighted by population
population_weights <- read.csv('../../../../ohiprep_v2018/src/LookupTables/Pop_weight_ChinaSAR_USVIslPRico.csv')


# Weight the `score`, `value`, and `gapfilled` column by population
d_stand_rgn <- d_stand_rgn %>%
  dplyr::left_join(population_weights, by="country") %>%
  dplyr::mutate(population = ifelse(is.na(population), 1, population)) %>% # If no value available, input 1 (these values will not change)
  dplyr::group_by(rgn_id, year, method, gapfilled) %>% 
  dplyr::summarize(score = weighted.mean(score, population), # weight the single score value by pop.
            value = weighted.mean(value, population)) %>%
  ungroup() 


# Removed `Azerbaijan` (255) because the adjacent body of water is a sea not the ocean 
d_stand_rgn <- d_stand_rgn %>%
  filter(rgn_id <= 250)

summary(d_stand_rgn)


# save the cleaned gdppcppp for other goals
gdppcppp_data <- d_stand_rgn %>%
  select(rgn_id, year, value)

write_csv(gdppcppp_data, "intermediate/gdppcppp_ohi.csv")

2.6 Gapfilling: part 2

In this case, we gapfill regions with no data using UN geopolitical means.

# Create dataframe pairing each UN geopolitical region id with a year from 2005 to current

georegions <- ohicore::georegions
d_stand_gf <- data.frame(year=min(d_stand_rgn$year):max(d_stand_rgn$year)) %>% 
  merge(georegions, by=NULL) 


# Combine the two data frames by region id and year
# Calculate means across different geopolitical levels (e.g. r2, r1)
d_stand_gf <- d_stand_gf %>%  
  left_join(d_stand_rgn, by = c("rgn_id", "year")) %>%
  group_by(r2, year) %>%
  mutate(r2_value = mean(score, na.rm=TRUE)) %>%
  ungroup() %>%
  group_by(r1, year) %>%
  mutate(r1_value = mean(score, na.rm=TRUE)) %>%
  ungroup() %>%
  group_by(r0, year) %>%
  mutate(r0_value = mean(score, na.rm=TRUE)) %>%
  ungroup()


# For `score` cells that still have NA values (still several hundred):
# Check to see if r2 has a value, if so use that to gapfill `score`, otherwise use r1, otherwise use r0
d_stand_gf <- d_stand_gf %>%
  mutate(gapfilled = ifelse(is.na(score) & !is.na(r2_value), "1", gapfilled)) %>%
  mutate(method = ifelse(is.na(score) & !is.na(r2_value), "UN_geopolitical region avg, r2", method)) %>%
  mutate(score = ifelse(is.na(score), r2_value, score)) %>%
  mutate(gapfilled = ifelse(is.na(score) & !is.na(r1_value), "1", gapfilled)) %>%
  mutate(method = ifelse(is.na(score) & !is.na(r1_value), "UN_geopolitical region avg, r1", method)) %>%
  mutate(score = ifelse(is.na(score), r1_value, score)) %>%
  mutate(gapfilled = ifelse(is.na(score) & !is.na(r0_value), "1", gapfilled)) %>%
  mutate(method = ifelse(is.na(score) & !is.na(r0_value), "UN_geopolitical region avg, r0", method)) %>%
  mutate(score = ifelse(is.na(score), r0_value, score))

# should no longer have NAs in the score column!
summary(d_stand_gf) #No NA!!

2.7 Save the data

# Save dataframe with adjusted, gapfilled, and rescaled score information
final <- d_stand_gf %>%
  select(rgn_id, year, value=score)

write.csv(final, "output/wb_gdppcppp_rescaled.csv", row.names=FALSE)


# Save dataframe with gapfilled method and status information
final_gf <- d_stand_gf %>%
  select(rgn_id, year, gapfilled, method)

write.csv(final_gf, "output/wb_gdppcppp_rescaled_gf.csv", row.names=FALSE)

2.8 Compare data to previous year

old_gdppppc <- read.csv("../v2017/output/wb_gdppcppp_rescaled.csv") %>% 
  dplyr::rename(old_value=value) %>% 
  dplyr::filter(year == 2016)

new <- read.csv("output/wb_gdppcppp_rescaled.csv") %>%
  dplyr::filter(year == 2016) %>% 
  dplyr::left_join(old_gdppppc, by="rgn_id") #by=c("rgn_id", "year")

plot(new$value, new$old_value)
abline(0,1, col="red")
identify(new$value, new$old_value, labels = new$rgn_id)

#identify(old$old_pressure, old$pressure_score, labels = old$rgn_id)

OHI 2018 - Artisanal Opportunities: Preparing need data

Compiled on Thu Sep 13 14:29:25 2018 by cvargas