ohi logo
OHI Science | Citation policy

REFERENCE RMD FILE

#Summary This analysis converts FAO capture production data into the OHI 2020 targeted harvest pressure data.

#Updates from previous assessment One more year of data

v2019: Adding in here() where appropriate and incorporating read_csv() etc, changed some objects to have more descriptive names.


#Data Source http://www.fao.org/fishery/statistics/software/fishstatj/en#downlApp Release date: March 2019 FAO Global Capture Production Quantity 1950_2018 Information: http://www.fao.org/fishery/statistics/global-capture-production/en

Downloaded: May 11 2020

Description: Quantity (tonnes) of fisheries capture for each county, species, year.

Time range: 1950-2018


# load libraries, set directories
library(ohicore)  #devtools::install_github('ohi-science/ohicore@dev')
library(tidyverse)
library(plotly)
library(here)
library(janitor)

### Load FAO-specific user-defined functions
source(here('workflow/R/fao_fxn.R')) # function for cleaning FAO files (not combined into common.R like most other functions have been at this point)
source(here('workflow/R/common.R')) # directory locations

1 Read in the raw data

This includes the FAO capture production data and a list of the “target” species.

## FAO capture production data - all columns being parsed as characters and producing error in one column, but not sure which? (read.csv might help avoid this error?)
fis_fao_raw <-  read_csv(file.path(dir_M, 'git-annex/globalprep/_raw_data/FAO_capture/d2020/Global_capture_production_Quantity_1950-2018.csv'))

# species list - used same raw files from v2019
sp2grp <-  read_csv(here('globalprep/prs_targetedharvest/v2020/raw/species2group.csv')) %>%
  dplyr::filter(incl_excl == 'include') %>%
  dplyr::select(target, species); head(sp2grp)

2 Clean the FAO data

# Rename columns and remove unit column
fao_clean <- fis_fao_raw %>% 
  dplyr::rename(country = "Country (Country)",
                species = "ASFIS species (ASFIS species)",
                area = "FAO major fishing area (FAO major fishing area)") %>%
  select(-"Unit (Unit)")

# Gather by year and value to expand and make each line a single observation for country, species and year (tidy data!) 
fao_clean <- fao_clean %>%
  tidyr::gather("year", "value", -(1:3)) %>%
  dplyr::mutate(year = gsub("X", "", year)) %>%
    fao_clean_data() 

fao_clean <- fao_clean %>%
  dplyr::mutate(species = as.character(species)) %>%
  dplyr::mutate(species = ifelse(stringr::str_detect(species, "Henslow.*s swimming crab"), "Henslow's swimming crab", species))

3 Identify the target species

This analysis only includes target species. The warning messages need to be checked and, if necessary, changes should be made to the raw/species2group.csv

# check for discrepancies in species list
spgroups <-  sort(as.character(unique(fao_clean$species))) # species groups in FAO data 
groups <-  c('turtle', 'whale', 'dolphin', 'porpoise') # seals and sea lions removed from vector (pinnipeds no longer included) 

# Going through FAO data species and seeing if they're in our master list of species
## Looking to see if we need to add species that have changed name
### v2020: All species in error message are excluded. Chinese softshell turtle and River and lake turtles nei are not marine turtles, and Velvet whalefish and Common dolphinfish are fish, not cetaceans.
for (group in groups) {# group='dolphin'
possibles <- spgroups[grep(group, spgroups)]
d_missing_l <-  setdiff(possibles, sp2grp$species)
  if (length(d_missing_l)>0){
    cat(sprintf("\nMISSING in the lookup the following species in target='%s'.\n    %s\n", 
                group, paste(d_missing_l, collapse='\n    ')))
  }
}


# check for species in lookup not found in data
l_missing_d <-  setdiff(sp2grp$species, spgroups)
if (length(l_missing_d)>0){
  cat(sprintf('\nMISSING: These species in the lookup are not found in the FAO data \n'))
  print(l_missing_d)
}


## filter data to include only target species ----
target_spp <-  fao_clean %>%
  dplyr::filter(species %in% sp2grp$species) # this goes from 2211 spp in FAO list to just 69

unique(target_spp$area) # confirm these are all marine regions

4 Summarize data

# widen spread to expand years
wide = target_spp %>%
  tidyr::spread(year, value) %>%
  dplyr::left_join(sp2grp, by='species'); head(wide)


# gather long by target
long = wide %>%
  dplyr::select(-area) %>%
  tidyr::gather(year, value, -country, -species, -target, na.rm=T) %>%
  dplyr::mutate(year = as.integer(as.character(year))) %>%
  dplyr::arrange(country, target, year); head(long)


# explore Japan[210] as an example
japan <- long %>% 
  dplyr::group_by(country, target, year) %>%
  dplyr::summarize(value = sum(value)) %>% 
  dplyr::filter(country == 'Japan', target == 'cetacean', year >= 2000) 

# summarize totals per region per year - number of individual animals from each spp group? 
sum = long %>%
  dplyr::group_by(country, year) %>%
  dplyr::summarize(value = sum(value, na.rm=TRUE)) %>%
  dplyr::filter(value != 0) %>%
  dplyr::ungroup(); head(sum) 

5 Assign country names to OHI regions

sum <- sum %>%
  dplyr::mutate(country = as.character(country)) %>%
  dplyr::mutate(country = ifelse(stringr::str_detect(country, "C.*te d'Ivoire"), "Ivory Coast", country))


### Function to convert to OHI region ID
m_sum_rgn <- name_2_rgn(df_in = sum, 
                       fld_name='country', 
                       flds_unique=c('year'))

# Filter out duplicates based on error message from previous step
dplyr::filter(m_sum_rgn, country %in% c("Guadeloupe", "Martinique"))

# They will be summed:
m_sum_rgn <- m_sum_rgn %>%
  dplyr::group_by(rgn_id, rgn_name, year) %>%
  dplyr::summarize(value = sum(value)) %>%
  dplyr::ungroup()

6 Scale the data and save files

Data is rescaled by dividing by the 95th quantile of values across all regions from 2011 to 2018 (most recent year of FAO data).

target_harvest <- m_sum_rgn %>%
  dplyr::mutate(quant_95 = quantile(value[year %in% 2011:2018], 0.95, na.rm = TRUE)) %>%
  dplyr::mutate(score = value / quant_95) %>% 
  dplyr::mutate(score = ifelse(score>1, 1, score)) %>%
  dplyr::select(rgn_id, year, pressure_score = score) %>%
  dplyr::arrange(rgn_id, year); head(target_harvest); summary(target_harvest)
  
# v2020 quant_95 = 3477.15


# any regions that did not have a catch should have score = 0 
rgns <-  rgn_master %>%
  dplyr::filter(rgn_typ == "eez") %>%
  dplyr::select(rgn_id = rgn_id_2013) %>%
  dplyr::filter(rgn_id < 255) %>%
  base::unique() %>%
  dplyr::arrange(rgn_id)

# This is just a list of rgn IDS - do we want to update it to a rgn list more recent than 2013?

# Add year; for v2020, min year is 1950, and max year is 2018  
rgns <- expand.grid(rgn_id = rgns$rgn_id, year = min(target_harvest$year):max(target_harvest$year))

# Change NAs in pressure_score column to 0s
target_harvest <-  rgns %>%
  dplyr::left_join(target_harvest) %>%
  dplyr::mutate(pressure_score = ifelse(is.na(pressure_score), 0, pressure_score)) %>%
  dplyr::arrange(rgn_id); head(target_harvest); summary(target_harvest)

# Write target_harvest to "fao_targeted.csv" in output folder
write_csv(target_harvest, 
          file.path(here('globalprep/prs_targetedharvest/v2020/output/fao_targeted.csv')))

# Create gapfill dataframe
target_harvest_gf <- target_harvest %>%
  dplyr::mutate(gapfill = 0) %>%
  dplyr::select(rgn_id, year, gapfill)
# all zeroes for gapfill column; nothing being gapfilled but need to have a record 

# Write target_harvest_gf to "fao_targeted_gf.csv" in output folder
write_csv(target_harvest_gf,
          file.path(here('globalprep/prs_targetedharvest/v2020/output/fao_targeted_gf.csv')))

7 Data check

The data from last year and this year should be the same unless there were changes to underlying FAO data or the master species list.

In this case, all of the regions looked very similar.

new <- read_csv(here("globalprep/prs_targetedharvest/v2020/output/fao_targeted.csv")) %>% 
  filter(year==2016)
# pull just 2016 data from target_harvest df - should we change this to a more recent year for v2020?

old <- read_csv(here("globalprep/prs_targetedharvest/v2019/output/fao_targeted.csv")) %>%
  dplyr::filter(year == 2016) %>%
  dplyr::select(rgn_id, year, pressure_score_old=pressure_score) %>%
  dplyr::left_join(new, by=c("rgn_id", "year"))

# Compare pressure_score between last year and this year's assessments
compare_plot <- ggplot(data = old, aes(x=pressure_score_old, y= pressure_score, label=rgn_id))+
  geom_point()+
  geom_abline(color="red")

plot(compare_plot)
ggplotly(compare_plot)

### v2020: outliers for rgn_id 141 (Faroe Islands) and 163 (United States). I look at Faroe Islands below.

# explore United States[163]
unitedstates <- long %>% 
  dplyr::group_by(country, target, year) %>%
  dplyr::summarize(value = sum(value)) %>% 
  dplyr::filter(country == 'United States of America', target == 'cetacean', year >= 2000) 
# Could just be due to updated backfilled FAO data?

## Use more recent year: 2017
new <- read_csv(here("globalprep/prs_targetedharvest/v2020/output/fao_targeted.csv")) %>% 
  filter(year==2017)
# pull just 2016 data from target_harvest df - should we change this to a more recent year for v2020?

old <- read_csv(here("globalprep/prs_targetedharvest/v2019/output/fao_targeted.csv")) %>%
  dplyr::filter(year == 2017) %>%
  dplyr::select(rgn_id, year, pressure_score_old=pressure_score) %>%
  dplyr::left_join(new, by=c("rgn_id", "year"))

# Compare pressure_score between last year and this year's assessments
compare_plot <- ggplot(data = old, aes(x=pressure_score_old, y= pressure_score, label=rgn_id))+
  geom_point()+
  geom_abline(color="red")

plot(compare_plot)
ggplotly(compare_plot)


### v2020: outlier for rgn_id 141, which is the Faeroe Islands (OHI region name in m_sum_rgn) or Atlantic, Northeast (FAO area).

# explore Faeroe Islands[141]
faroe <- long %>% 
  dplyr::group_by(country, target, year) %>%
  dplyr::summarize(value = sum(value)) %>% 
  dplyr::filter(country == 'Faroe Islands', target == 'cetacean', year >= 2000) 
# It looks like in v2019, the pressure_score for rgn_id 141 in 2017 was 0, compared to a pressure_score of 0.5 in v2020. Looking at the capture production quantity value (in tonnes) for the Faroe Islands however, there were definitely cetaceans harvested in 2017. I looked at fao_targeted.csv for v2019, and noticed rgn_id 141 (Faroe Islands) has a pressure_score of 0 from 2014 to 2017. A pressure_score of 0 makes sense for 2006, 2007, and 2015, when there was no reported harvest value for the Faroe Islands, but there should be scores for 2014, 2016, and 2017. It looks like this was corrected in v2020 though, so might not be an issue. Could just be due to updated backfilled FAO data?