This script generates the “access” layer for the artisanal opportunities goal. This prep uses UN Sustainable Development Goal indicator 14.b.1, “Degree of application of a legal/regulatory/policy/institutional framework which recognizes and protects access rights for small-scale fisheries (level of implementation: 1 lowest to 5 highest)”. We will rescale the scores to be between 0 and 1, match them to OHI regions, and gapfill based on larger regions within the data.
Link: https://www.fao.org/sustainable-development-goals/indicators/14b1/en/
New data source, SDG 14.b.1.
Reference: Food and Agriculture Organization of the United Nations, 2020. Progress in the degree of implementation of international instruments to promote and protect small-scale fisheries, 2020.
Downloaded: 6/25/2021
Description: Progress by countries in the degree of application of a legal/regulatory/policy/institutional framework which recognizes and protects access rights for small-scale fisheries. It is a composite indicator based on FAO member country responses to the Code of Conduct for Responsible Fisheries (CCRF) survey questionnaire which is circulated by FAO every two years to members and IGOs and INGOs. This indicator is calculated on the basis of the efforts being made by countries to implement selected key provisions of the Voluntary Guidelines for Securing Sustainable Small-Scale Fisheries in the Context of Food Security and Poverty Eradication (SSF Guidelines), as reported in a given year of the survey.
Time range: 2018, 2020
Download link: https://unstats.un.org/sdgs/UNSDG/IndDatabasePage ; Click “select indicators and country or area” and select indicator 14.b. Download all data for this indicator.
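The rescaling mentioned in the introduction is a linear mapping of the five implementation levels onto 0–1 (score = level / 5). A toy sketch of the idea (illustration only, not part of the prep itself):

library(dplyr)

## toy illustration: implementation levels 1-5 become scores 0.2-1.0
data.frame(value = 1:5) %>%
  mutate(score = value / 5)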
knitr::opts_chunk$set(fig.width = 6, fig.height = 4, fig.path = 'figs/', message = FALSE, warning = FALSE)
library(ohicore) # devtools::install_github('ohi-science/ohicore@dev')
library(dplyr)
library(tidyr)
library(stringr)
library(readr)
library(here)
library(tidyverse)
library(readxl)
library(janitor)
# directory paths and relevant files
source(here('workflow/R/common.R'))
## read in data and wrangle
raw_data <- read_xlsx(file.path(here(), "globalprep/ao/v2021/raw/raw_sdg_14_data.xlsx"), sheet = 2) %>%
  clean_names() ## raw SDG data

codes_raw <- read_xlsx(file.path(here(), "globalprep/ao/v2021/raw/raw_sdg_14_data.xlsx"), sheet = 3) %>%
  clean_names() ## shows what each of the codes means
## Here is the link to the countries that fall under each code (saved in the "raw" folder as a csv): https://unstats.un.org/unsd/methodology/m49/
<- read_csv("raw/UNSD_Methodology.csv") %>%
region_info clean_names() %>%
mutate(country_or_area = ifelse(country_or_area == "Bonaire", "Bonaire, Sint Eustatius and Saba", country_or_area)) %>%
mutate(country_or_area = ifelse(country_or_area == "Côte d’Ivoire", "Ivory Coast", country_or_area)) ## this shows the different over arching regions for each country
data_df <- raw_data %>%
  dplyr::select(geo_area_code, geo_area_name, time_detail, value, nature, observation_status, reporting_type, units) %>%
  filter(!(geo_area_code %in% c(344, 446))) %>% # filter out Hong Kong/Macao, they are NA anyway
  left_join(region_info, by = c("geo_area_name" = "country_or_area")) %>%
  filter(!(iso_alpha3_code %in% c("HK", "MO"))) %>% ## filter out Macao and Hong Kong again
  dplyr::select(geo_area_code, geo_area_name, time_detail, value, region_code, region_name, sub_region_code, sub_region_name, intermediate_region_code, intermediate_region_name, iso_alpha3_code, small_island_developing_states_sids) ## now we have a dataset with all of the information we need to begin
test <- data_df %>%
  group_by(geo_area_name) %>%
  summarise(n()) ## make sure no region has more than 2 observations (2018 and 2020)
# split the country codes into overarching geo regions and specific countries
## these are all the larger regions, like "Asia", "North America", etc. that will be used for gapfilling
bigger_regions <- c(1, 2, 5, 9, 11, 13, 14, 15, 17, 18, 19, 21, 29, 30, 34, 35, 39, 53, 54, 61, 62, 135, 142, 143, 145, 150, 151, 154, 155, 199, 202, 419, 432, 485, 513, 514, 515, 518, 543, 722, 738, 746, 747, 753) ## these are all of the region codes for the larger regions
data_rescale_df <- data_df %>%
  mutate(region_type = ifelse(geo_area_code %in% bigger_regions, "larger region", "country")) %>%
  mutate(score = case_when(
    value == 1 ~ 0.2,
    value == 2 ~ 0.4,
    value == 3 ~ 0.6,
    value == 4 ~ 0.8,
    value == 5 ~ 1
  ))
large_region_df <- data_rescale_df %>%
  filter(region_type == "larger region") ## save a larger-region data frame
## Now let's check which OHI regions we are missing, so that we can gapfill them later on. We want a score for every OHI region.
test <- data_rescale_df %>%
  filter(region_type == "country")

region_data()

setdiff(rgns_eez$rgn_name, test$geo_area_name)
## it looks like we are missing quite a few... however, many of these are name mismatches or regions that need to be split. We will fix these below.
Use the name_2_rgn function to fix some of the name mismatches. Additionally, we will manually split some regions.
The name-to-region function (in the ohicore package) reports regions that don’t have a match in the OHI region list. Below we report certain regions at a higher spatial resolution, based on the regions listed in the error message.
data_df <- data_rescale_df %>%
  filter(region_type == "country") %>%
  dplyr::select(geo_area_name, time_detail, region_name, sub_region_name, intermediate_region_name, small_island_developing_states_sids, region_type, score)
# Report these regions at higher spatial resolution:
country_split_1 <- data.frame(geo_area_name = "Bonaire, Sint Eustatius and Saba", region = c('Bonaire', 'Saba', 'Sint Eustatius')) ## split Bonaire, Saba, and Sint Eustatius

country_split_2 <- data.frame(geo_area_name = "French Southern Territories", region = c('Glorioso Islands', 'Juan de Nova Island', 'Bassas da India', 'Ile Europa', 'Ile Tromelin', 'Crozet Islands', 'Amsterdam Island and Saint Paul Island', 'Kerguelen Islands')) # split French Southern Territories

country_split_3 <- data.frame(geo_area_name = "United States Minor Outlying Islands", region = c('Wake Island', 'Jarvis Island', 'Palmyra Atoll', 'Howland Island and Baker Island', 'Johnston Atoll')) # split US Minor Outlying Islands

country_split_4 <- data.frame(geo_area_name = "Channel Islands", region = c("Jersey", "Guernsey")) # split Channel Islands

country_split_5 <- data.frame(geo_area_name = "China", region = c("China", "Taiwan")) # give Taiwan the same score as China (the UN does not report Taiwan separately)

country_split <- rbind(country_split_1, country_split_2, country_split_3, country_split_4, country_split_5)
country_split_data <- country_split %>%
  left_join(data_df) %>%
  dplyr::select(-geo_area_name) %>%
  rename(geo_area_name = region) # looks good
# Join country split data with data_df
data_df <- rbind(data_df, country_split_data)
## Fix the name mismatches from above
country_region_df <- data_df %>%
  filter(region_type == "country") %>%
  mutate(geo_area_name = case_when(
    geo_area_name == "Curaçao" ~ "Curacao",
    geo_area_name == "Réunion" ~ "Reunion",
    geo_area_name == "Côte d'Ivoire" ~ "Ivory Coast",
    geo_area_name == "Saint Martin (French Part)" ~ "Northern Saint-Martin",
    geo_area_name == "Svalbard and Jan Mayen Islands" ~ "Jan Mayen",
    TRUE ~ geo_area_name
  ))
## lots of landlocked countries included here
match_country_data_df <- name_2_rgn(df_in = country_region_df,
                                    fld_name = 'geo_area_name',
                                    flds_unique = c('time_detail'))
## removed: Aland (not OHI), Bonaire/Sint Eustatius/Saba (fixed above), Channel Islands (fixed above), Eswatini (not OHI), French Southern Territories (fixed above), Isle of Man (not OHI), North Macedonia (landlocked), Saint Barthelemy (not OHI), Palestine (not OHI), US Minor Outlying Islands (fixed above) - perfect!
## name_2_rgn reported the following duplicates:
# DUPLICATES found. Consider using collapse2rgn to collapse duplicates (function in progress).
# # A tibble: 11 × 1
# geo_area_name
# <chr>
# 1 China
# 2 Guadeloupe
# 3 Guam
# 4 Guernsey
# 5 Jersey
# 6 Martinique
# 7 Micronesia
# 8 Micronesia (Federated States of)
# 9 Northern Mariana Islands
# 10 Puerto Rico
# 11 United States Virgin Islands
## deal with the duplicates
fix_dups <- match_country_data_df %>%
  group_by(rgn_id, time_detail, rgn_name, region_type) %>%
  summarise(score = mean(score, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(score = ifelse(is.nan(score), NA, score)) %>%
  dplyr::select(rgn_id, rgn_name, time_detail, region_type, score)
## add in the larger regions associated with each OHI region
rgns_data_df <- match_country_data_df %>%
  distinct(rgn_name, rgn_id, region_type, intermediate_region_name, sub_region_name, region_name, small_island_developing_states_sids) %>%
  filter(rgn_name == "Ivory Coast" | !is.na(region_name)) %>%
  left_join(fix_dups) %>%
  filter(rgn_id <= 250) %>%
  left_join(rgns_eez) %>%
  dplyr::select(1:9)
## There is still one region, Kiribati, that needs to be split. For some reason, doing so with name_2_rgn does not work.
# Line Islands (Kiribati) == Kiribati, Phoenix Islands (Kiribati) == Kiribati, Gilbert Islands (Kiribati) == Kiribati
## filter for these regions in country_region_df and prep them so they match rgns_data_df
deleted_rgns <- country_region_df %>%
  filter(geo_area_name %in% c("Kiribati"))

## so what we have to do is:
# split "Kiribati" into "Line Islands (Kiribati)", "Phoenix Islands (Kiribati)", and "Gilbert Islands (Kiribati)"

## Split Kiribati
kiribati_split <- deleted_rgns %>%
  filter(geo_area_name %in% c("Kiribati")) %>%
  mutate(geo_area_name = ifelse(geo_area_name == "Kiribati", "Line Islands (Kiribati), Phoenix Islands (Kiribati), Gilbert Islands (Kiribati)", geo_area_name)) %>%
  separate_rows(geo_area_name, sep = ", ")
## Now rbind all fixes together
all_deleted_fixes <- kiribati_split %>%
  left_join(rgns_eez, by = c("geo_area_name" = "rgn_name")) %>%
  dplyr::select(region_name, sub_region_name, intermediate_region_name, region_type, rgn_id, "rgn_name" = "geo_area_name", time_detail, score, small_island_developing_states_sids)
## now bind with the rgns_data_df from before (the data that didn't need to be fixed)
all_rgns_data_df <- rbind(rgns_data_df, all_deleted_fixes) %>%
  filter(rgn_name != "Kiribati") # filter out original Kiribati
## Now let's see which OHI regions are still missing
sort(setdiff(rgns_eez$rgn_name, all_rgns_data_df$rgn_name))
# [1] "Andaman and Nicobar" "Antarctica" (dont need this one) "Ascension" "Azores" "Bouvet Island" (uninhabited, dont include)
# [6] "Canary Islands" - same as spain "Clipperton Island" (uninhabited) "Macquarie Island" (uninhabited) "Madeira" - same as portugal "Oecussi Ambeno"
# [11] "Prince Edward Islands" "Tristan da Cunha"
## None of these are located in the raw UN data. I we will have to manually assign them the appropriate regions by googling.
remaining_rgns <- data.frame(
  geo_area_name = c("Andaman and Nicobar", "Ascension", "Azores", "Canary Islands", "Madeira", "Oecussi Ambeno", "Prince Edward Islands", "Tristan da Cunha"),
  region_name = c("Asia", "Africa", "Europe", "Europe", "Europe", "Asia", "Americas", "Africa"),
  sub_region_name = c("South-eastern Asia", NA, "Western Europe", "Southern Europe", "Southern Europe", "South-eastern Asia", "Northern America", NA),
  intermediate_region_name = c(NA, "Western Africa", NA, NA, NA, NA, NA, NA),
  small_island_developing_states_sids = c("x", "x", NA, NA, NA, "x", NA, "x")
) %>%
  mutate(region_type = "country",
         score = NA) %>%
  crossing(time_detail = c(2018, 2020))
## Now run the name_2_rgn function to get OHI regions
match_remaining <- name_2_rgn(df_in = remaining_rgns,
                              fld_name = 'geo_area_name',
                              flds_unique = c('time_detail')) %>%
  dplyr::select(-geo_area_name)
## Now join with final dataset
all_rgns_data_df <- rbind(all_rgns_data_df, match_remaining)
## Now check to see what OHI regions are missing (should be uninhabited regions)
sort(setdiff(rgns_eez$rgn_name, all_rgns_data_df$rgn_name))
# [1] "Antarctica" "Bouvet Island" "Clipperton Island" "Macquarie Island" - perfect
After matching, the data fall into five groups:

Data type 1: 35 regions with 2018 data but no 2020 data
Data type 2: 21 regions with 2020 data but no 2018 data
Data type 3: 103 regions with no data for 2018 or 2020
Data type 4: 58 regions with complete data
Data type 5: 44 larger regions like “world”, “Asia”, “East Africa”, etc.

Steps to gapfill:

1. Fill missing 2020 values with the region’s 2018 score.
2. Fill missing 2018 values with the region’s 2020 score.
3. For regions with no data at all, use the score of the larger regions that contain them: first the intermediate region, then the sub-region, then the continent, and finally the global value if nothing else is available.

The mechanics of steps 1 and 2 are illustrated with a toy example below.
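As a quick illustration of steps 1 and 2 (toy data, not part of the prep), tidyr::fill() carries a region’s available score into its missing year when the rows are grouped by region:

library(dplyr)
library(tidyr)

## toy data (hypothetical region ids and scores), just to show the fill mechanics
toy <- data.frame(rgn_id = c(1, 1, 2, 2),
                  year   = c(2018, 2020, 2018, 2020),
                  score  = c(0.6, NA, NA, 0.8))

toy %>%
  group_by(rgn_id) %>%
  fill(score, .direction = "downup") %>% ## carry 2018 forward (region 1) and 2020 backward (region 2)
  ungroup()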
gf_df <- all_rgns_data_df %>%
  pivot_wider(names_from = time_detail, values_from = score) %>%
  mutate(no_2018 = ifelse(is.na(`2018`) & !is.na(`2020`), 1, 0),
         no_2020 = ifelse(is.na(`2020`) & !is.na(`2018`), 1, 0),
         no_data = ifelse(is.na(`2020`) & is.na(`2018`), 1, 0),
         complete_data = ifelse(!is.na(`2020`) & !is.na(`2018`), 1, 0)) %>%
  filter(rgn_id <= 250)
test <- gf_df %>%
  filter(no_data == 1)

sum(gf_df$complete_data)
## 35 regions with 2018 but no 2020
## 21 with 2020 but no 2018
## 103 regions with no data for 2018 or 2020
## 58 regions with complete data
35 + 21 + 103 + 58 # 217
217*2 #434
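The same tallies can be checked programmatically from the flags defined in gf_df (a quick sketch; the summary column names are just labels, and the counts shown above are those reported for v2021):

gf_df %>%
  summarise(only_2018     = sum(no_2020),        # 2018 score but no 2020 score
            only_2020     = sum(no_2018),        # 2020 score but no 2018 score
            no_data       = sum(no_data),
            complete_data = sum(complete_data),
            total_regions = dplyr::n())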
## Fill in the missing 2020 values with 2018 data
gf_2020_rgns <- gf_df %>%
  filter(no_2020 == 1)

gf_2020_df <- all_rgns_data_df %>%
  filter(rgn_id %in% c(gf_2020_rgns$rgn_id)) %>%
  mutate(score_gf = score) %>%
  group_by(rgn_id) %>%
  do(fill(., score_gf, .direction = "down")) %>%
  ungroup() %>%
  mutate(gapfilled = ifelse(is.na(score), 1, 0)) %>%
  mutate(method = ifelse(is.na(score), "Used prior year score", NA)) %>%
  dplyr::select(-score) %>%
  mutate(score = score_gf) %>%
  dplyr::select(-score_gf)
## fill in the missing 2018 values with 2020 values
gf_2018_rgns <- gf_df %>%
  filter(no_2018 == 1)

gf_2018_df <- all_rgns_data_df %>%
  filter(rgn_id %in% c(gf_2018_rgns$rgn_id)) %>%
  mutate(score_gf = score) %>%
  group_by(rgn_id) %>%
  do(fill(., score_gf, .direction = "updown")) %>%
  ungroup() %>%
  mutate(gapfilled = ifelse(is.na(score), 1, 0)) %>%
  mutate(method = ifelse(is.na(score), "Used later year score", NA)) %>%
  dplyr::select(-score) %>%
  mutate(score = score_gf) %>%
  dplyr::select(-score_gf)
## now gapfill the regions with no scores at all, using the larger regions...
# explore the larger regions
large_rgn_wide <- large_region_df %>%
  dplyr::select(geo_area_code, geo_area_name, time_detail, region_type, score) %>%
  pivot_wider(names_from = time_detail, values_from = score) %>%
  mutate(no_2018 = ifelse(is.na(`2018`) & !is.na(`2020`), 1, 0),
         no_2020 = ifelse(is.na(`2020`) & !is.na(`2018`), 1, 0),
         complete_data = ifelse(!is.na(`2020`) & !is.na(`2018`), 1, 0))
## a couple of the larger regions have no 2020 data, so we carry their 2018 values forward
large_rgn_df_tidy <- large_region_df %>%
  dplyr::select(geo_area_code, geo_area_name, time_detail, region_type, score) %>%
  group_by(geo_area_code) %>%
  do(fill(., score, .direction = "down")) %>%
  ungroup() %>%
  mutate(geo_area_name = ifelse(geo_area_name == "South-Eastern Asia", "South-eastern Asia", geo_area_name)) ## standardize capitalization so it matches the country-level data
## filter for regions we haven't gapfilled yet
gf_nodata_rgns <- gf_df %>%
  filter(no_data == 1)

gf_nodata_df <- all_rgns_data_df %>%
  filter(rgn_id %in% c(gf_nodata_rgns$rgn_id))
unique(gf_nodata_df$rgn_id) # 103 regions - perfect
## Now we gapfill the remaining regions using the larger regions: first filter for "intermediate regions" and gapfill with those values; if any regions remain, gapfill using their "sub_region" values; if any still remain, gapfill using the larger "region_name" (the continents); and finally, if there are still NAs, use the global "world" value. Small island developing states (SIDS) are assigned a score of 0.8 at each step rather than the larger-region value.
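For reference, the three steps below could in principle be collapsed into a single pass with dplyr::coalesce(), taking the first non-missing score in the order intermediate region, sub-region, continent. This is only a sketch (the object and column names gf_hierarchy_sketch, int_score, sub_score, and cont_score are just for illustration, and the global fallback is omitted); the explicit step-by-step version below is what the prep actually uses, since it makes the gapfilling method labels easier to assign.

## sketch only: hierarchical gapfill in a single pass
gf_hierarchy_sketch <- gf_nodata_df %>%
  left_join(large_rgn_df_tidy %>% dplyr::select(geo_area_name, time_detail, int_score = score),
            by = c("intermediate_region_name" = "geo_area_name", "time_detail")) %>%
  left_join(large_rgn_df_tidy %>% dplyr::select(geo_area_name, time_detail, sub_score = score),
            by = c("sub_region_name" = "geo_area_name", "time_detail")) %>%
  left_join(large_rgn_df_tidy %>% dplyr::select(geo_area_name, time_detail, cont_score = score),
            by = c("region_name" = "geo_area_name", "time_detail")) %>%
  mutate(score = ifelse(small_island_developing_states_sids %in% "x", 0.8,
                        coalesce(int_score, sub_score, cont_score))) ## SIDS get 0.8, otherwise the first available larger-region score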
##### Intermediate region gapfilling, Step 3a. #####
int_regions <- gf_nodata_df %>%
  filter(!is.na(intermediate_region_name))

# now filter for these intermediate regions in large_rgn_df_tidy
int_regions_data <- large_rgn_df_tidy %>%
  filter(geo_area_name %in% c(int_regions$intermediate_region_name))

int_regions_join_gf <- gf_nodata_df %>%
  filter(!is.na(intermediate_region_name)) %>%
  left_join(int_regions_data, by = c("intermediate_region_name" = "geo_area_name", "time_detail")) %>%
  mutate(score = ifelse(small_island_developing_states_sids %in% "x", 0.8, score.y), region_type = region_type.x) %>%
  dplyr::select(rgn_id, rgn_name, time_detail, score, region_type, region_name, sub_region_name, intermediate_region_name, small_island_developing_states_sids) %>%
  filter(!is.na(score)) %>% ## filter out the ones that are still NA
  mutate(gapfilled = 1) %>%
  mutate(method = ifelse(small_island_developing_states_sids %in% "x", "Used developing small island score", "Used intermediate regions score"))
##### Sub region gapfilling Step 3b. #####
sub_regions <- gf_nodata_df %>%
  filter(!is.na(sub_region_name)) %>% ## filter for regions with a sub-region
  filter(!(rgn_id %in% c(int_regions_join_gf$rgn_id))) ## filter out those that we have already gapfilled

# now filter for these sub-regions in large_rgn_df_tidy
sub_regions_data <- large_rgn_df_tidy %>%
  filter(geo_area_name %in% c(sub_regions$sub_region_name))

sub_regions_join_gf <- gf_nodata_df %>%
  filter(!is.na(sub_region_name)) %>%
  filter(!(rgn_id %in% c(int_regions_join_gf$rgn_id))) %>%
  left_join(sub_regions_data, by = c("sub_region_name" = "geo_area_name", "time_detail")) %>%
  mutate(score = ifelse(small_island_developing_states_sids %in% "x", 0.8, score.y), region_type = region_type.x) %>%
  dplyr::select(rgn_id, rgn_name, time_detail, score, region_type, region_name, sub_region_name, intermediate_region_name, small_island_developing_states_sids) %>%
  filter(!is.na(score)) %>%
  mutate(gapfilled = 1) %>%
  mutate(method = ifelse(small_island_developing_states_sids %in% "x", "Used developing small island score", "Used sub-regions score"))
#### Continental gapfilling (only 2 countries left, both in Oceania), Step 3c. ####
cont_regions <- gf_nodata_df %>%
  filter(!is.na(region_name)) %>% ## filter for regions with a continental region
  filter(!(rgn_id %in% c(int_regions_join_gf$rgn_id)) & !(rgn_id %in% c(sub_regions_join_gf$rgn_id))) ## filter out those that we have already gapfilled

# now filter for these continental regions in large_rgn_df_tidy
cont_regions_data <- large_rgn_df_tidy %>%
  filter(geo_area_name %in% c(cont_regions$region_name))

cont_regions_join_gf <- gf_nodata_df %>%
  filter(!is.na(region_name)) %>%
  filter(!(rgn_id %in% c(int_regions_join_gf$rgn_id)) & !(rgn_id %in% c(sub_regions_join_gf$rgn_id))) %>%
  left_join(cont_regions_data, by = c("region_name" = "geo_area_name", "time_detail")) %>%
  mutate(score = ifelse(small_island_developing_states_sids %in% "x", 0.8, score.y), region_type = region_type.x) %>%
  dplyr::select(rgn_id, rgn_name, time_detail, score, region_type, region_name, sub_region_name, intermediate_region_name, small_island_developing_states_sids) %>%
  filter(!is.na(score)) %>%
  mutate(gapfilled = 1) %>%
  mutate(method = ifelse(small_island_developing_states_sids %in% "x", "Used developing small island score", "Used continental score"))
nrow(cont_regions_join_gf) + nrow(int_regions_join_gf) + nrow(sub_regions_join_gf) # 206 - matches the number of rows that needed to be completely gapfilled. No need to do global gapfilling!
#### Bind the gapfilled datasets together ####
nodata_gf_final <- rbind(cont_regions_join_gf, sub_regions_join_gf, int_regions_join_gf)

somedata_gf_final <- rbind(gf_2020_df, gf_2018_df)

gapfilled_obs <- rbind(cont_regions_join_gf, sub_regions_join_gf, int_regions_join_gf, gf_2020_df, gf_2018_df)
#### Now bind together with non-gapfilled data to produce our final dataset ####
complete_data_rgns <- gf_df %>%
  filter(complete_data == 1)

final_gf_df <- all_rgns_data_df %>%
  filter(rgn_id %in% c(complete_data_rgns$rgn_id)) %>%
  mutate(gapfilled = 0,
         method = NA) %>%
  rbind(gapfilled_obs) %>%
  dplyr::select(rgn_id, "year" = "time_detail", "value" = "score", gapfilled, method)
hist(final_gf_df$value)
## save gapfilling flag dataset
gf_flag_final <- final_gf_df %>%
  dplyr::select(rgn_id, year, gapfilled, method)
write.csv(gf_flag_final, file.path(here(), "globalprep/ao/v2021/output/sdg_14_b_1_ao_gf.csv"), row.names = FALSE)
## save value dataset
final_data <- final_gf_df %>%
  dplyr::select(rgn_id, year, value)
write.csv(final_data, file.path(here(), "globalprep/ao/v2021/output/sdg_14_b_1_ao.csv"), row.names = FALSE)
Let’s compare to the old Mora AO data. It is likely to be very dissimilar.
region_data()
mora_data <- read_csv(file.path(here(), "globalprep/res_mora_ao/v2013/data/r_mora_s4_2013a_updated.csv")) %>%
  left_join(rgns_eez)

new_data <- read_csv(file.path(here(), "globalprep/ao/v2021/output/sdg_14_b_1_ao.csv")) %>%
  left_join(rgns_eez)
print(setdiff(mora_data$rgn_name, new_data$rgn_name))
## regions in Mora data that are not in SDG data
# [1] "Macquarie Island" "Clipperton Island" NA "Bouvet Island" - these are all uninhabited, so it does not matter
print(setdiff(mora_data$rgn_id, new_data$rgn_id))
# 4 85 88 90 107 58 57 55 195 26 NA 105 237
setdiff(new_data$rgn_id, mora_data$rgn_id)
## 2013 (mora) vs 2018 (SDG)
compare_2018 <- new_data %>%
  filter(year == 2018) %>%
  left_join(mora_data, by = "rgn_id") %>%
  mutate(difference = value.x - value.y)
ggplot(compare_2018, aes(x = value.y, y = value.x)) +
geom_point() +
geom_abline() +
labs(title = "AO Mora vs. SDG 14.b.1 values", x = "old value", y= "new value") +
theme_bw()
## doesn't look great, since the SDG data are essentially categorical, but they are more up-to-date
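If a single summary statistic is helpful, a rank correlation between the two sources can also be computed from compare_2018 (a sketch; value.x is the new SDG-based value and value.y is the old Mora value from the join above):

## optional: rank correlation between the old and new values
cor(compare_2018$value.x, compare_2018$value.y,
    method = "spearman", use = "pairwise.complete.obs")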