This script downloads aggregated catch data from the Sea Around Us Project (SAUP). We will download 2 different data sets from this page: https://www.seaaroundus.org/data/#/search
Download region-specific EEZ catch data.
Download high seas catch data.
We will assign FAO ids to each year/region/species record in the EEZ and high seas data. SAUP shared a lookup table with OHI (in v2016) that links SAUP EEZ region names and ids to the FAO region they are located in (some of the region names have changed, which required a bit of tinkering for v2021, but won’t be a problem for future assessments). The proportional area of each EEZ within the FAO region was calculated for overlapping EEZs, so that we assign the correct amount of catch to EEZ/FAO region overlaps.
Reference: Pauly D., Zeller D., Palomares M.L.D. (Editors), 2020. Sea Around Us Concepts, Design and Data (seaaroundus.org).
Downloaded: August 24, 2021
Description: Tons per year and SAUP region with information on sector type, industry type, fishing entity, reporting status and taxonomic information.
Time range: 1950 - 2018
Native data resolution: Country EEZ/FAO regions
Format: CSV
Additional Information: Methods
knitr::opts_chunk$set(fig.width = 6, fig.height = 4, fig.path = 'figs/', message = FALSE, warning = FALSE, echo = TRUE, eval = FALSE)
library(tidyverse)
library(rjson)
library(RCurl)
library(data.table)
library(purrr)
library(sf)
library(mapview)
setwd(here::here("globalprep/fis/v2021"))
source('../../../workflow/R/common.R')
## This file makes it easier to process data for the OHI global assessment
## by creating the following objects:
##
## * dir_M = identifies correct file path to Mazu (internal server) based on your operating system
## * mollCRS = the crs code for the mollweide coordinate reference system we use in the global assessment
## * regions_shape() = function to load global shapefile for land/eez/high seas/antarctica regions
## * ohi_rasters() = function to load two rasters: global eez regions and ocean region
## * region_data() = function to load 2 dataframes describing global regions
## * rgn_syns() = function to load dataframe of region synonyms (used to convert country names to OHI regions)
## * low_pop() = function to load dataframe of regions with low and no human population
## * UNgeorgn = function to load dataframe of UN geopolitical designations used to gapfill missing data
cat_msg <- function(x, ...) {
  if(is.null(knitr:::.knitEnv$input.dir)) {
    ### not in knitr environment, so use cat()
    cat(x, ..., '\n')
  } else {
    ### in knitr env, so use message()
    message(x, ...)
  }
  return(invisible(NULL))
}
Use the SAUP API to download production data by EEZ. To do this, we need SAUP region id numbers.
## this is the api link.. http://api.seaaroundus.org/api/v1/eez/tonnage/taxon/?format=csv&limit=10&sciname=false&region_id=
## we need to figure out all of their region_ids, paste them into a string with region_id = "" and run a for loop 10 countries at a time
# to download 10 regions worth of data, the api would look like this: http://api.seaaroundus.org/api/v1/eez/tonnage/taxon/?format=csv&limit=10&sciname=false&region_id=56&region_id=174&region_id=233&region_id=328&region_id=400&region_id=478&region_id=586&region_id=882&region_id=914&region_id=851
# this saup_rgn_ids.csv file is derived from v2016 data that SAUP provided. I updated it for the v2021 assessment by hand. SAUP rarely changes/adds region names and ids, so this list should suffice for the future.
<- read.csv("raw/saup_rgn_ids.csv") %>%
saup_regions distinct(area_name, rgn_num)
length(unique(saup_regions$rgn_num)) # 281.. perfect. I counted through the dropdown here http://www.seaaroundus.org/data/#/search, and they have 281 regions
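## Optional cross-check on the hand-maintained lookup: the API also appears to expose a region
## listing endpoint. The URL and the id/title JSON structure assumed below are NOT confirmed in
## this workflow, so verify them before relying on this check.
# eez_list_raw <- RCurl::getURL("http://api.seaaroundus.org/api/v1/eez/", ssl.verifypeer = FALSE)
# eez_list <- rjson::fromJSON(eez_list_raw)$data   # assumed: a list of regions, each with an id and title
# api_ids <- purrr::map_dbl(eez_list, "id")
# setdiff(api_ids, saup_regions$rgn_num)   # ids the API has that our lookup lacks
# setdiff(saup_regions$rgn_num, api_ids)   # ids in our lookup the API no longer has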
## to use the rep() function to group regions 10 at a time, I need a row count that divides evenly. Take the first 280 regions, create the API id strings that way, and then join back in the single remaining region.
region_ids_280 <- saup_regions %>%
  distinct(rgn_num) %>%
  head(280) %>%
  mutate(url_id = paste(sprintf("region_id=%s", rgn_num))) %>%
  mutate(tester = rep(c(1:28), times = 10)) %>%
  group_by(tester) %>%
  summarize(text = str_c(url_id, collapse = "&"))
region_ids_281 <- saup_regions %>%
  filter(rgn_num == 974) %>%
  mutate(url_id = paste(sprintf("region_id=%s", rgn_num))) %>%
  mutate(tester = 29, text = str_c(url_id, collapse = "&")) %>%
  dplyr::select(tester, text) %>%
  rbind(region_ids_280)
for(i in 1:29){
  # i = 22
  region_id_string <- region_ids_281$text[i]

  full_url <- paste("http://api.seaaroundus.org/api/v1/eez/tonnage/taxon/?format=csv&limit=10&sciname=false&", region_id_string, sep = "")

  full_url <- URLencode(full_url)

  bin <- getBinaryURL(full_url,
                      ssl.verifypeer = FALSE)

  con <- file(file.path(sprintf("/home/shares/ohi/git-annex/globalprep/_raw_data/SAUP/d2021/by_eez_zip_new/%s_saup_eez.zip", i)), open = "wb")

  writeBin(bin, con)
  close(con)

  cat_msg('Processed \n ', i, 'of', nrow(region_ids_281), region_id_string)
}
## unzip the files
<- list.files("/home/shares/ohi/git-annex/globalprep/_raw_data/SAUP/d2021/by_eez_zip_new")
unzip_files
walk(unzip_files, ~ unzip(zipfile = str_c("/home/shares/ohi/git-annex/globalprep/_raw_data/SAUP/d2021/by_eez_zip_new/", .x),
exdir = str_c("/home/shares/ohi/git-annex/globalprep/_raw_data/SAUP/d2021/by_eez_unzip_new/", .x)))
## extract the csvs and rbind
<- list.files("/home/shares/ohi/git-annex/globalprep/_raw_data/SAUP/d2021/by_eez_unzip_new", full.names = T, recursive = T, pattern = ".*.csv")
files
<- lapply(files, function(f) {
list_df fread(f) # faster
})
<- bind_rows(list_df, .id = "column_label")
all_df
## See if any regions are missing
test_df <- all_df %>%
  left_join(saup_regions, by = c("area_name"))
sum(all_df$tonnes) # 6129245230 ; perfect
setdiff(test_df$rgn_num, saup_regions$rgn_num) # 0
setdiff(saup_regions$rgn_num, test_df$rgn_num) # 0
setdiff(saup_regions$area_name, test_df$area_name) # 0
test_df2 <- all_df %>%
  distinct(area_name)
## save eez csv
write.csv(all_df, file.path("/home/shares/ohi/git-annex/globalprep/_raw_data/SAUP/d2021/saup_eez_production.csv"), row.names = FALSE)
sum(all_df$tonnes) # 6129245230

test <- all_df %>%
  filter(year == 2017)

sum(test$tonnes) # 104583995 # seems reasonable... I think Watson was something like 111 million (including high seas)
Unzip high seas data from SAUP
## I manually downloaded all of the high seas data from here: http://www.seaaroundus.org/data/#/search ; search by "High Sea(s)", select each high seas region in the drop down, and download. Since SAUP limits you to downloading 6 regions at a time, and there are 18 high seas regions, you will have to download 3 different zip files. For v2022, consider writing a for loop like the one above to save time.. if you feel like it.
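## A rough sketch of what that loop could look like, mirroring the EEZ loop above. NOTE: the
## 'highseas' endpoint name, the region id range 1:18, and the output file names are assumptions;
## confirm them against the SAUP API/website before using this.
# hs_ids <- 1:18
# for(i in hs_ids){
#
#   full_url <- sprintf("http://api.seaaroundus.org/api/v1/highseas/tonnage/taxon/?format=csv&limit=10&sciname=false&region_id=%s", i)
#   full_url <- URLencode(full_url)
#
#   bin <- getBinaryURL(full_url, ssl.verifypeer = FALSE)
#
#   con <- file(file.path(sprintf("/home/shares/ohi/git-annex/globalprep/_raw_data/SAUP/d2021/high_seas_zip/%s_saup_high_seas.zip", i)), open = "wb")
#   writeBin(bin, con)
#   close(con)
#
#   cat_msg('Processed', i, 'of', length(hs_ids))
# }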
## unzip the files
<- list.files("/home/shares/ohi/git-annex/globalprep/_raw_data/SAUP/d2021/high_seas_zip")
unzip_files
walk(unzip_files, ~ unzip(zipfile = str_c("/home/shares/ohi/git-annex/globalprep/_raw_data/SAUP/d2021/high_seas_zip/", .x),
exdir = str_c("/home/shares/ohi/git-annex/globalprep/_raw_data/SAUP/d2021/high_seas_unzip/", .x)))
## extract the csvs and rbind
<- list.files("/home/shares/ohi/git-annex/globalprep/_raw_data/SAUP/d2021/high_seas_unzip", full.names = T, recursive = T, pattern = ".*.csv")
files
<- lapply(files, function(f) {
list.df fread(f) # faster
})
<- lapply(files, read.csv)
list.df
<- bind_rows(list.df)
all_hs_df
sum(all_hs_df$tonnes) # 121796974
121796974 + 6129245230 # 6251042204 ; pretty close to the global and FAO estimates from SAUP
write.csv(all_hs_df, file.path("/home/shares/ohi/git-annex/globalprep/_raw_data/SAUP/d2021/saup_high_seas_production.csv"), row.names = FALSE)
all_hs_df <- read.csv(file.path("/home/shares/ohi/git-annex/globalprep/_raw_data/SAUP/d2021/saup_high_seas_production.csv"))

all_hs_df_2017 <- all_hs_df %>%
  filter(year == 2017)
sum(all_hs_df_2017$tonnes) # 3301944
Download SAUP data by FAO region. This will be used to obtain FAO ids for the high seas dataset downloaded above.
# http://api.seaaroundus.org/api/v1/fao/tonnage/eez/?format=csv&limit=10&sciname=false&region_id=18&region_id=48&region_id=34&region_id=27&region_id=21
## fao regions are 18, 48, 34, 27, 21, 47, 41, 31, 58, 57, 51, 37, 88, 77, 67, 61, 87, 81, 71
region_ids <- data.frame(rgn_num = c(18, 48, 34, 27, 21, 47, 41, 31, 58, 57, 51, 37, 88, 77, 67, 61, 87, 81, 71)) %>%
  distinct(rgn_num) %>%
  mutate(url_id = paste(sprintf("region_id=%s", rgn_num))) %>%
  mutate(tester = c(1:19)) %>%
  group_by(tester) %>%
  summarize(text = str_c(url_id, collapse = "&"))
for(i in 1:19){
  # i = 1
  region_id_string <- region_ids$text[i]

  full_url <- paste("http://api.seaaroundus.org/api/v1/fao/tonnage/eez/?format=csv&limit=10&sciname=false&", region_id_string, sep = "")

  full_url <- URLencode(full_url)

  bin <- getBinaryURL(full_url,
                      ssl.verifypeer = FALSE)

  con <- file(file.path(sprintf("/home/shares/ohi/git-annex/globalprep/_raw_data/SAUP/d2021/by_fao_area_zip/%s_saup_eez.zip", region_id_string)), open = "wb")

  writeBin(bin, con)
  close(con)
}
## unzip the files
<- list.files("/home/shares/ohi/git-annex/globalprep/_raw_data/SAUP/d2021/by_fao_area_zip")
unzip_files
walk(unzip_files, ~ unzip(zipfile = str_c("/home/shares/ohi/git-annex/globalprep/_raw_data/SAUP/d2021/by_fao_area_zip/", .x),
exdir = str_c("/home/shares/ohi/git-annex/globalprep/_raw_data/SAUP/d2021/by_fao_area_unzip/", .x)))
## extract the csvs and rbind
<- list.files("/home/shares/ohi/git-annex/globalprep/_raw_data/SAUP/d2021/by_fao_area_unzip", full.names = T, recursive = T, pattern = ".*.csv")
files
# list_df <- lapply(files, function(f) {
# fread(f) # faster
#
# })
<- function(filename){
read_csv_filename <- read.csv(filename)
ret $Source <- filename #EDIT
ret
ret
}
test <- read_csv_filename(files[[1]])

list.df <- lapply(files, function(x){

  read_csv_filename(x) %>%
    mutate(fao_id = sub('.*\\/', '', Source)) %>%
    mutate(fao_id = substr(fao_id, 9, 10))
})
all_fao_df <- bind_rows(list.df) %>%
  dplyr::select(-Source, fao_name = area_name)

sum(all_fao_df$tonnes) # 6253148740 # this is different from the EEZ dataset total above... did I miss a region or two? No. I think it is because the EEZ data I downloaded does not include high seas catch, while this FAO-level data does.
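## quick consistency check on that explanation: the gap between the FAO-level total and the
## EEZ-level total should be roughly the high seas catch downloaded above
sum(all_fao_df$tonnes) - sum(all_df$tonnes) # ~124 million tonnes
sum(all_hs_df$tonnes) # ~122 million tonnes; close, which supports the explanation above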
write.csv(all_fao_df, file.path("/home/shares/ohi/git-annex/globalprep/_raw_data/SAUP/d2021/saup_fao_production.csv"), row.names = FALSE)
Now obtain FAO ids for the high seas and the EEZ datasets.
saup_eez_fao <- st_read(file.path("/home/shares/ohi/git-annex/globalprep/_raw_data/SAUP/d2016/SAU_EEZ_FAO/SAU_EEZ_FAO.shp")) %>%
  st_drop_geometry() %>% ## this is what I need
  dplyr::select(-OBJECTID)

# - Join by EEZID
# - I will need to split the catch between records that are duplicated across FAO regions, so that we end up with the correct amount of EEZ catch. Do it by an area-weighted average, since we have the area of each FAO region within each EEZ: the bigger the shape area, the more catch is allocated to that FAO region for that species (see the toy sketch after these comments).
# - I will also need to add a new region for the North Korea split (SAUP has split North Korea into 2 regions since 2016).
# - Don't worry about high seas, we will keep that simple and just join by the fao region with the high seas region.
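## Toy sketch of the area-weighted split described above: one hypothetical EEZ overlapping two
## FAO regions. Column names mirror the real datasets, but all numbers here are made up.
# toy_lookup <- tibble(EEZID = 1, F_AREA = c("41", "87"), Area_km. = c(70, 30))
# toy_catch  <- tibble(rgn_num = 1, year = 2017, scientific_name = "Genus species", tonnes = 1000)
#
# toy_catch %>%
#   left_join(toy_lookup, by = c("rgn_num" = "EEZID")) %>%
#   group_by(rgn_num, year, scientific_name, tonnes) %>%
#   mutate(total_area = sum(Area_km.)) %>%
#   ungroup() %>%
#   mutate(area_prop = Area_km./total_area) %>%   # 0.7 and 0.3
#   mutate(tonnes_fix = tonnes * area_prop)       # 700 t to FAO 41, 300 t to FAO 87; still sums to 1000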
fao_areas_prod <- read_csv(file.path("/home/shares/ohi/git-annex/globalprep/_raw_data/SAUP/d2021/saup_fao_production.csv")) %>%
  dplyr::select(-landed_value)

saup_eezs_prod <- read.csv(file.path("/home/shares/ohi/git-annex/globalprep/_raw_data/SAUP/d2021/saup_eez_production.csv")) %>%
  dplyr::select(-data_layer, -uncertainty_score, -landed_value) %>%
  left_join(saup_regions)
sum(saup_eezs_prod$tonnes)
colnames(saup_eezs_prod)
high_seas_prod <- read.csv(file.path("/home/shares/ohi/git-annex/globalprep/_raw_data/SAUP/d2021/saup_high_seas_production.csv")) %>%
  mutate(rgn_num = NA) %>%
  dplyr::select(-landed_value)
sum(high_seas_prod$tonnes)
######## Join SAUP high seas production data to get the FAO id ########
join_fao_high_seas <- high_seas_prod %>%
  left_join(fao_areas_prod, by = c("area_name" = "fao_name", "year", "scientific_name", "common_name", "functional_group", "commercial_group", "fishing_entity", "fishing_sector", "catch_type", "reporting_status", "gear_type", "end_use_type")) %>%
  dplyr::select(1, "area_type" = "area_type.x", 3:13, "tonnes" = "tonnes.x", 18)
sum(join_fao_high_seas$tonnes) # 121796974 ; perfect
sum(high_seas_prod$tonnes) # 121796974
# save high seas FAO ids
write.csv(join_fao_high_seas, file.path("/home/shares/ohi/git-annex/globalprep/fis/v2021/high_seas_fao_ids_prod.csv"), row.names = FALSE)
################
######## Join SAUP eez production data to get FAO ids ########
# fix the North Korea split for the saup_eez_fao regions dataset. The old North Korea EEZID == 408. It is now split into Korea (North, Sea of Japan) == 973 and Korea (North, Yellow Sea) == 974. Split the old North Korea region into 2, and halve the area associated with it.
north_korea_rgn_fix <- saup_eez_fao %>%
  dplyr::filter(EEZID == 408) %>%
  mutate(Shape_Leng = Shape_Leng/2,
         Shape_Area = Shape_Area/2,
         Area_km. = Area_km./2) %>%
  add_row(EEZID = 973, F_AREA = "61", Shape_Leng = 3136525, Shape_Area = 57775239853, Area_km. = 57775.24) %>%
  add_row(EEZID = 974, F_AREA = "61", Shape_Leng = 3136525, Shape_Area = 57775239853, Area_km. = 57775.24) %>%
  dplyr::filter(EEZID != 408)
colnames(north_korea_rgn_fix)
# Now join back together with the original rgn_fao dataset
saup_eez_fao_fix <- saup_eez_fao %>%
  filter(EEZID != 408) %>%
  rbind(north_korea_rgn_fix)
length(unique(saup_eez_fao_fix$EEZID)) # 280?
setdiff(saup_regions$rgn_num, saup_eez_fao_fix$EEZID) # missing 925 - Canada pacific? Need to fix that.
# Add Canada (Pacific) into the dataset. We will give it FAO ID = 67 (since the Pacific ocean in Canada only intersects that region) and give it the same area as Canada (Arctic) in the Pacific ocean (470100.3 km2).
saup_eez_fao_fix_2 <- saup_eez_fao_fix %>%
  add_row(EEZID = 925, F_AREA = "67", Shape_Leng = NA, Shape_Area = NA, Area_km. = 470100.3) %>%
  dplyr::select(-Shape_Leng, -Shape_Area)
length(unique(saup_eez_fao_fix_2$EEZID)) # 281, perfect.
setdiff(saup_regions$rgn_num, saup_eez_fao_fix_2$EEZID) # 0
setdiff(saup_eez_fao_fix_2$EEZID, saup_regions$rgn_num) # 0 ; perfect
#### test argentina since it has overlapping FAO regions...
# test_2 <- saup_eezs_prod %>%
# filter(area_name == "Argentina")
# sum(test_2$tonnes) # 61154495
#
# test <- saup_eezs_prod %>%
# left_join(saup_eez_fao_fix_2, by = c("rgn_num" = "EEZID")) %>%
# filter(rgn_num == 32) %>%
# group_by(area_name, area_type, year, scientific_name, common_name, functional_group, commercial_group, fishing_entity, fishing_sector, catch_type, reporting_status, gear_type, end_use_type, rgn_num, tonnes) %>%
# mutate(total_area = sum(Area_km.)) %>%
# ungroup() %>%
# mutate(area_prop = Area_km./total_area) %>%
# mutate(tonnes_fix = tonnes*area_prop)
#
# sum(test$tonnes_fix) # 61154495 - perfect, it worked!
# Now let's join the SAUP 2021 production dataset to our FAO region dataset
join_eez_fao_ids <- saup_eezs_prod %>%
  left_join(saup_eez_fao_fix_2, by = c("rgn_num" = "EEZID")) %>%
  group_by(area_name, area_type, year, scientific_name, common_name, functional_group, commercial_group, fishing_entity, fishing_sector, catch_type, reporting_status, gear_type, end_use_type, rgn_num, tonnes) %>%
  mutate(total_area = sum(Area_km.)) %>%
  ungroup() %>%
  mutate(area_prop = Area_km./total_area) %>%
  mutate(tonnes_fix = tonnes*area_prop)
sum(saup_eezs_prod$tonnes) # 6129245227
sum(join_eez_fao_ids$tonnes_fix) # 6129245227 ; it worked!
test <- join_eez_fao_ids %>%
  head(10000)
# now write this out to mazu and we are finished!
write.csv(join_eez_fao_ids, file.path("/home/shares/ohi/git-annex/globalprep/fis/v2021/eez_fao_ids_prod.csv"), row.names = FALSE)