This document describes the steps for obtaining the data used to calculate the tourism and recreation goal for the 2024 global assessment.
The general calculation is: tr = Ap * Sr and Xtr = tr/90th quantile across regions
The following data are used:
/home/shares/ohi/git-annex/globalprep/_raw_data/WorldBank/d2024/WorldBank_global_country_population_1960-2023/API_SP.POP.TOTL_DS2_en_csv_v2_1584446.csv
)./ohiprep_v2024/globalprep/mar_prs_population/v2024/output/mar_pop_25mi.csv
,Citation: World Bank. (2024). Population, total. https://data.worldbank.org/indicator/SP.POP.TOTL © 2024 The World Bank Group, All Rights Reserved.
Methodology:
/ohiprep_v2024/globalprep/mar_prs_population/v2024/output/mar_pop_25mi.csv
,/home/shares/ohi/git-annex/globalprep/_raw_data/WorldBank/d2024/WorldBank_global_country_population_1960-2023/API_SP.POP.TOTL_DS2_en_csv_v2_1584446.csv
\[\frac{T_r}{T_{90th}}\]
/ohiprep_v2024/globalprep/ao/v2024/intermediate/gdppcppp_ohi.csv
and substituted with CIA
GDP per capita ppp adj is used to gapfill scores into previous years
(2005 - 2022).knitr::opts_chunk$set(message = FALSE, warning = FALSE, eval=FALSE)
# library(devtools)
# devtools::install_github("ohi-science/ohicore@dev") # dont worry about devtools
library(ohicore)
library(tidyverse)
library(stringr)
library(WDI)
library(here)
library(janitor)
library(plotly)
library(readxl)
library(naniar)
library(countrycode)
library(pdftools)
library(tictoc)
library(ggplot2)
# ---- sources! ----
source(here("workflow", "R", "common.R")) # file creates objects to process data
region_data() # for rgns_all and rgns_eez
regions_shape() # returns spatial shape object named 'regions' which includes land, eez, highseas, and antarctica regions
#source(here(paste0("globalprep/tr/v", version_year, "/R/tr_fxns.R"))) # not used presently
# ---- set year and file path info ----
current_year <- 2024 # Update this in the future!!
version_year <- paste0("v",current_year)
data_dir_version_year <- paste0("d", current_year)
prev_ver_yr <- paste0("d", (current_year - 1))
# ---- data directories ----
# Raw data directory (on Mazu)
raw_data_dir <- here::here(dir_M, "git-annex", "globalprep", "_raw_data")
# UNWTO (UN World Tourism) raw data directory
unwto_dir <- here(raw_data_dir, "UNWTO", data_dir_version_year)
# intermediate data outputs dir
int_dir <- here("globalprep","tr", version_year, "intermediate")
# final output dir
output_dir <- here("globalprep","tr", version_year, "output")
# final figs dir
figs_dir <- here("globalprep","tr", version_year, "figs")
International data: Click Inbound Arrivals, scroll down to “Total arrivals”, click download data.
Domestic data: Click Domestic Tourism, scroll down to “Total trips”, click download. Then download from “Accommodations”.
file_path_unwto_international <- here::here(unwto_dir, "unwto-inbound-arrivals-data.xlsx")
unwto_arrivals_int <- readxl::read_xlsx(file_path_unwto_international, skip = 4) # read in the raw data
unwto_clean <- unwto_arrivals_int %>%
select(country = `Basic data and indicators`, total_arrivals = `...6`, subdivision_1 = `...7`, subdivision_2 = `...8`, `1995`:`2021`) %>% # select relevant columns
fill(country, .direction = "down") %>% # add country name to all data associated with that country
pivot_longer(cols = c("total_arrivals", "subdivision_1", "subdivision_2"),
values_to = "metric",
values_drop_na = TRUE) %>% # make the metrics into one column
select(-name) %>% # get rid of the name column since it's just the titles of the metrics which are already there
select(country, metric, everything()) %>% # reorder things
replace_with_na_all(condition = ~.x == "..") %>% # swap .. with NAs
pivot_longer(cols = 3:ncol(.), names_to = "year",
values_to = "tourism_arrivals_ct") %>% # make the years not columns anymore
pivot_wider(names_from = metric, values_from = tourism_arrivals_ct) %>%
mutate(overnights = as.numeric(`Overnights visitors (tourists)`),
same_day = as.numeric(`Same-day visitors (excursionists)`),
total_arrivals = as.numeric(`Total arrivals`),
tourism_arrivals_ct = as.numeric(NA)) %>% # rename metrics so easier to work with, make numeric, and add a new column to fill with the new calculated values later
select(country, year, overnights, same_day, total_arrivals, tourism_arrivals_ct) %>% # select columns needed for analysis (cruise passengers seem to be included in same-day)
group_by(country, year) %>% # group by county and year
mutate(
tourism_arrivals_ct = case_when(
!is.na(overnights) ~ overnights, # if there is a value, dont gapfill
is.na(overnights) & !is.na(same_day) & !is.na(total_arrivals) ~ total_arrivals - same_day, # gapfill, when there is no data on overnights, fill with total_arrivals - same day
TRUE ~ tourism_arrivals_ct # otherwise, NA
), # there were 0 situations like this in v2024
# total_arrivals = case_when(
# !is.na(total_arrivals) ~ total_arrivals,
# is.na(total_arrivals) & !is.na(same_day) & !is.na(overnights) ~ overnights + same_day,
# TRUE ~ total_arrivals
# )
) %>% # v2024: overnights has 1036 NAs out of 6021
# v2024: same_day has 3131 NAs out of 6021
# v2024: total_arrivals has 2363 NAs
mutate(arrivals_method = ifelse(is.na(overnights) & !is.na(same_day) & !is.na(total_arrivals), "UNWTO - subtraction", NA)) %>%
mutate(arrivals_gapfilled = ifelse(arrivals_method == "UNWTO - subtraction", "gapfilled", NA)) %>% # prepare a "gapfilled" column to indicate "gapfilled" or NA
ungroup() %>% # ungroup since not needed anymore
select(country, year, tourism_arrivals_ct, total_arrivals, arrivals_method, arrivals_gapfilled) %>% # select only needed columns
mutate(country = str_to_title(country), # make countries look nice
tourism_arrivals_ct = round(as.numeric(tourism_arrivals_ct) * 1000),
total_arrivals = round(as.numeric(total_arrivals)*1000)) # since the units were in thousands
Now, run ohicore::name_2_rgn()
so that we can move
forward with OHI regions! Ensure duplicates are dealt with.
# Run name_2_rgn() to ensure the UNWTO data has OHI region names
unwto_clean_names_int <- name_2_rgn(df_in = unwto_clean, # do this just for Bonaire since it is the only region not matching above
fld_name = 'country',
# flds_unique = c('year'),
keep_fld_name = TRUE) %>%
dplyr::select(rgn_id, rgn_name, year, tourism_arrivals_ct, arrivals_method, arrivals_gapfilled) # v2024: not losing the USA!! Yay (compare to v2023). Also, total_arrivals was dropped at this point because tourism_arrivals_ct has already been gapfilled using totals if same-day was present (Overnights = total - same-day), and it feels like if total was used to gapfill places that don't have any same-day data, it may be overinflating the arrivals for that region. This can be revisited.
# ---- determine if there were any duplicates due to name_2_rgn ----
colSums(is.na(unwto_clean_names_int)) # v2024: 789 NAs in tourism_arrivals_ct
length(unwto_clean_names_int$tourism_arrivals_ct) # within 4833 observations for `tourism_arrivals_ct`
# this shows rows that have at least one duplicate, including both the first and subsequent occurrences of each duplicated row. using the tidyverse allows us to see all instances of duplicated data, not just the second and subsequent occurrences (which is what just a duplicated() call would give you).
duplicates_int <- unwto_clean_names_int %>%
group_by(rgn_id, rgn_name, year) %>%
filter(n() > 1) %>%
ungroup()
unique(duplicates_int$rgn_name)
# v2024:
# [1] "China"
# [2] "Guadeloupe and Martinique"
# [3] "Northern Mariana Islands and Guam"
# [4] "Montenegro"
# [5] "Puerto Rico and Virgin Islands of the United States"
## v2024: this code to fix duplicates was not working, and actually removed the duplicates from above from the data
# unwto_dupe_fix <- unwto_clean_names_int %>%
# group_by(rgn_id, year, arrivals_method, arrivals_gapfilled) %>%
# summarize(sum_fix = ifelse(all(is.na(tourism_arrivals_ct)), NA, sum(tourism_arrivals_ct, na.rm = TRUE)),
# sum_fix_2 = ifelse(all(is.na(total_arrivals)), NA, sum(total_arrivals, na.rm = TRUE))) %>%
# mutate(arrivals_method = ifelse(is.na(arrivals_method) & !is.na(sum_fix), "UNWTO", arrivals_method)) %>%
# rename(tourism_arrivals_ct = sum_fix,
# total_arrivals = sum_fix_2)
# instead, make a function to avoid losing NAs (they turn to 0s) when there are values to aggregate with
sum_with_na <- function(x) {
if (all(is.na(x))) { # for x, if all of the values are NA, leave as NA.
return(NA)
} else {
return(sum(x, na.rm = TRUE)) # if there are values for that grouped dataframe, then sum them according to the grouping (in this case, by region and year)
}
}
# use the function within a summarize to aggregate for all regions that have duplicate years
international_clean_names_agg <- unwto_clean_names_int %>%
group_by(rgn_id, rgn_name, year) %>%
summarize(
tourism_arrivals_ct = sum_with_na(tourism_arrivals_ct), # put each column in the function, so no NAs are lost
arrivals_method = first(arrivals_method, na_rm = TRUE),
arrivals_gapfilled = first(arrivals_gapfilled, na_rm = TRUE),
.groups = "drop" # to ungroup, fixes warning from summarise to use reframe (when it is not applicable here because we want to reduce each group down to a single row)
) %>%
mutate(
arrivals_method = ifelse(is.na(arrivals_method) & !is.na(tourism_arrivals_ct), "UNWTO", arrivals_method) # so that if there is data from UNWTO that was not gapfilled, the arrivals_method is accurate
)
# double check to test
duplicates_int_2 <- international_clean_names_agg %>%
group_by(rgn_id, year) %>%
filter(n() > 1) %>%
ungroup()
unique(duplicates_int_2$rgn_name) # v2024: 0! yay -- and China/other duplicates are preserved here!!
# check out things so far
summary(international_clean_names_agg)
# v2023: 828 NAs in arrivals (before filtering the years down and gapfilling), 1708 in `total_arrivals`
# v2024: 757 NAs in `tourism_arrivals_ct` (before filtering the years down and gapfilling), `total_arrivals` removed already
international_clean_names_agg <- international_clean_names_agg %>% # year is a character, let's fix that quickly
mutate(year = as.numeric(year))
summary(international_clean_names_agg) # now its numeric!
# see how many regions have values
check_international <- international_clean_names_agg %>%
filter(!is.na(tourism_arrivals_ct))
length(unique(check_international$rgn_id)) # v2024: 161 regions present!
# filter international data for usable years to be used in the future. Ensure that it is between 2005 - 2021, and that there are at least three raw data values!
international_arrivals_rgns <- international_clean_names_agg %>%
filter(year %in% 2005:2021) %>%
group_by(rgn_id) %>%
summarize(count_values = sum(!is.na(tourism_arrivals_ct))) %>% # for each region, how many values in tourism_arrivals_ct did it have
filter(count_values >= 3) # keep only the regions that have greater than or equal to three values
length(unique(international_arrivals_rgns$rgn_id)) # v2024: 156 regions!
international_arrivals <- international_clean_names_agg %>%
filter(rgn_id %in% international_arrivals_rgns$rgn_id) %>%
filter(year %in% 2005:2021) # filter for regions we want in the right timeframe
length(unique(international_arrivals$rgn_id)) # v2024: 156 regions! great
summary(international_arrivals) # only 212 NAs out of 2652 observations to gapfill!
# bring in domestic accommodation data from UNWTO
file_path_unwto_domestic_acc <- here::here(unwto_dir, "unwto-domestic-accommodation-data.xlsx")
unwto_arrivals_dom_acc <- readxl::read_xlsx(file_path_unwto_domestic_acc, skip = 4) # read in the raw domestic accomodations data
# clean up the raw xlsx! We want to have the country, the source (Total:Guests, Total:Overnights, Hotel:Guests, Hotel:Overnights), then the values
unwto_clean_dom <- unwto_arrivals_dom_acc %>%
select(country = `Basic data and indicators`, source_arrivals = `...6`, subdivision_1 = `...7`, subdivision_2 = `...8`, `1995`:`2021`) %>% # select relevant columns
fill(country, .direction = "down") %>% # add country name to all data associated with that country
fill(source_arrivals, .direction = "down") %>%
filter(subdivision_1 %in% "Overnights" | subdivision_1 %in% "Guests") %>%
select(-subdivision_2) %>% # get rid of the NA column
replace_with_na_all(condition = ~.x == "..") %>% # swap ".." with NAs
mutate(source_arrivals_all = paste(source_arrivals, subdivision_1, sep = ":")) %>% # unite the columns that define the source for easier cleaning and pivoting
select(country, source_arrivals_all, 4:ncol(.)) # use the . to specify the whole dataframe
# tidy up the data to be usable
dom_acc_guests <- unwto_clean_dom %>%
pivot_longer(cols = 3:ncol(.), names_to = "year",
values_to = "tourism_arrivals_ct") %>% # make the years not columns anymore
pivot_wider(names_from = source_arrivals_all, values_from = tourism_arrivals_ct) %>%
mutate(
total_guests = as.numeric(`Total:Guests`),
total_overnights = as.numeric(`Total:Overnights`),
hotel_guests = as.numeric(`Hotels and similar establishments:Guests`),
hotel_overnights = as.numeric(`Hotels and similar establishments:Overnights`),
tourism_arrivals_ct = as.numeric(NA)
) %>% # rename metrics so easier to work with, make numeric, and add a new column to fill with the new calculated values later
select(country, year, total_guests, total_overnights, hotel_guests, hotel_overnights, tourism_arrivals_ct) %>% # select columns needed for future raw gapfilling
mutate(country = str_to_title(country)) # make countries look nice
Now let’s take a look at the raw data for Accomodations!
# see how many NAs are within the raw data for Total:Guests
summary(dom_acc_guests)
# total_guests total_overnights
# Min. : 1.0 Min. : 2
# 1st Qu.: 753.8 1st Qu.: 1532
# Median : 5226.0 Median : 13165
# Mean : 19478.2 Mean : 53824
# 3rd Qu.: 15603.5 3rd Qu.: 39173
# Max. :371590.0 Max. :576370
# NA's :4947 NA's :4614
# hotel_guests hotel_overnights
# Min. : 0.2 Min. : 0.2
# 1st Qu.: 160.0 1st Qu.: 412.2
# Median : 1326.0 Median : 2981.5
# Mean : 11091.4 Mean : 21297.9
# 3rd Qu.: 7737.2 3rd Qu.: 13999.5
# Max. :342883.0 Max. :440121.0
# NA's :4129 NA's :3835
length(unique(dom_acc_guests$country)) # 223, which includes regions that have all NA values (UNWTO always has datasets with those 223 countries)
dom_acc_guests_noNA <- dom_acc_guests %>%
filter(!is.na(total_guests)) # remove NAs to see how many regions have total_guests values
length(unique(dom_acc_guests_noNA$country)) # 57 unique countries, not that many. let's move on to raw gapfilling and see if we can increase those numbers using other columns.
summary(lm(total_guests ~ as.numeric(year), data = dom_acc_guests)) # significant model and slope, but very low Multiple R-squared: 0.01165. Not the best...
#### come back to this!! maybe we can add one more raw gapfilling using this data!
summary(lm(total_guests ~ hotel_overnights, data = dom_acc_guests)) # p-value: < 0.00000000000000022, multiple R squared is 0.9013, not bad!! slope is significant, 0.692837.
mod5 <- lm(total_guests ~ hotel_overnights, data = dom_acc_guests)
plot(dom_acc_guests$hotel_overnights, dom_acc_guests$total_guests)
abline(mod5, col="red") # not a linear fit, would be better to do nonlinear, but this is good for now. we can use the slope and intercept from this model to estimate total_guests
ggplot(dom_acc_guests, aes(x = total_guests, y = hotel_overnights)) +
geom_point(alpha = 0.5) + # Add some transparency to points
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
theme_minimal()
So, it looks like there are about 4947 NAs within the Total:Guests column. However, we want to preserve guests/individuals as the unit (rather than overnights), so in the Raw Gapfilling section, let’s see if we can gapfill with either other columns (hotel_guests) or the “Trips” data.
Let’s run ohicore::name_2_rgn() so we don’t have to worry about it in the future.
# run name_2_rgn on the domestic data so it has OHI region ids
rgns_dom_acc_guests <- name_2_rgn(df_in = dom_acc_guests,
fld_name = 'country',
keep_fld_name = TRUE) # lots of landlocked countries were removed, and there are duplicates. Let's take a look.
# we only need mainland China data, not necessarily Hong Kong or Macao.
duplicates <- rgns_dom_acc_guests %>%
group_by(rgn_id, year) %>%
filter(n() > 1) %>%
ungroup()
unique(duplicates$rgn_name)
# [1] "China"
# [2] "Guadeloupe and Martinique"
# [3] "Northern Mariana Islands and Guam"
# [4] "Montenegro"
# [5] "Puerto Rico and Virgin Islands of the United States"
# see which ones even have data -- if none of the columns have a value, then we should drop these regions anyways.
duplicates_no_all_NAs <- duplicates %>%
filter(if_any(c(total_guests, total_overnights, hotel_guests, hotel_overnights), ~!is.na(.))) # only keeps the rows where at least one of the columns has a value
unique(duplicates_no_all_NAs$rgn_name) # [1] "China" [2] "Montenegro" [3] "Puerto Rico and Virgin Islands of the United States" need to be aggregated and fixed... the rest did not have data anyway.
# looking at the data, it looks like the country "Serbia and Montenegro" does not have data past 2001... so it is not usable anyway. So we can just drop that country but leave the country "Montenegro". Then we can aggregate puerto rico and virgin islands `tourism_arrivals_ct` values per year.
# therefore, move forward by aggregating by region id and year for puerto rico and virgin islands.
## make a function to avoid losing NAs (they turn to 0s) when there are values to aggregate with
sum_with_na <- function(x) {
if (all(is.na(x))) {
return(NA)
} else {
return(sum(x, na.rm = TRUE))
}
}
# now use the function to aggregate by region and year!
dom_acc_guests_agg <- rgns_dom_acc_guests %>%
filter(!(rgn_id %in% c(140, 13))) %>% # removing duplicates that are all NAs. 140 is Guadeloupe and Martinique, 13 is Northern Mariana Islands and Guam
filter(!(country %in% "Serbia And Montenegro")) %>% # removing a duplicate that does not have good data anyway
group_by(rgn_id, rgn_name, year) %>%
summarize(
tourism_arrivals_ct = sum_with_na(tourism_arrivals_ct), # put each column in the function, so no NAs are lost
total_guests = sum_with_na(total_guests),
total_overnights = sum_with_na(total_overnights),
hotel_guests = sum_with_na(hotel_guests),
hotel_overnights = sum_with_na(hotel_overnights),
.groups = "drop" # to ungroup, fixes warning from summarise to use reframe (when it is not applicable here)
) #### looks good!!!
# double check to test
duplicates_2 <- dom_acc_guests_agg %>%
group_by(rgn_id, year) %>%
filter(n() > 1) %>%
ungroup()
unique(duplicates_2$rgn_name) # v2024: 0! yay
file_path_unwto_domestic_trips <- here::here(unwto_dir, "unwto-domestic-trips-data.xlsx")
unwto_arrivals_dom_trips <- readxl::read_xlsx(file_path_unwto_domestic_trips, skip = 4) # read in the raw domestic trips data
# clean up the Trips data!
unwto_dom_trips_clean <- unwto_arrivals_dom_trips %>%
select(country = `Basic data and indicators`, total_trips = `...6`, subdivision_1 = `...7`, subdivision_2 = `...8`, `1995`:`2021`) %>% # select relevant columns
fill(country, .direction = "down") %>% # add country name to all data associated with that country
pivot_longer(cols = c("total_trips", "subdivision_1", "subdivision_2"),
values_to = "metric",
values_drop_na = TRUE) %>% # make the metrics into one column
select(-name) %>% # get rid of the name column since it's just the titles of the metrics which are already there
select(country, metric, everything()) %>% # reorder things
replace_with_na_all(condition = ~.x == "..") %>% # swap .. with NAs
pivot_longer(cols = 3:ncol(.), names_to = "year",
values_to = "tourism_arrivals_trips") %>% # make the years not columns anymore
mutate(country = str_to_title(country)) %>% # make the country names look nicer
pivot_wider(names_from = metric, values_from = tourism_arrivals_trips) %>%
mutate(trips_overnights = as.numeric(`Overnights visitors (tourists)`),
trips_same_day = as.numeric(`Same-day visitors (excursionists)`),
trips_total = as.numeric(`Total trips`)) %>% # rename metrics so easier to work with, make numeric, and add a new column to fill with the new calculated values later
select(country, year, trips_overnights, trips_same_day, trips_total) # select only the columns we will be working with
# see how many NAs are within the raw data for Trips:Overnights
summary(unwto_dom_trips_clean)
# country year trips_overnights
# Length:6021 Length:6021 Min. : 3
# Class :character Class :character 1st Qu.: 2918
# Mode :character Mode :character Median : 14376
# Mean : 60263
# 3rd Qu.: 50884
# Max. :2321983
# NA's :5145
# trips_same_day trips_total
# Min. : 0.1 Min. : 3
# 1st Qu.: 4799.2 1st Qu.: 8438
# Median : 21174.0 Median : 37058
# Mean : 89189.9 Mean : 237510
# 3rd Qu.: 98575.5 3rd Qu.: 201468
# Max. :1834200.0 Max. :6005852
# NA's :5507 NA's :5305
length(unique(unwto_dom_trips_clean$country)) # 223, which includes regions that have all NA values (UNWTO always has datasets with those 223 countries)
unwto_dom_trips_clean_noNA <- unwto_dom_trips_clean %>%
filter(!is.na(trips_overnights)) # remove NAs to see how many regions have total_guests values
length(unique(unwto_dom_trips_clean_noNA$country)) # 82 unique countries, not bad! Let's see if there is an association between trips_overnights and total_guests, to see if we can use the trips_overnights values for total_guests etc!
Let’s run ohicore::name_2_rgn() so we don’t have to worry about it in the future.
# run name_2_rgn on the domestic data so it has OHI region ids
rgns_unwto_dom_trips_clean <- name_2_rgn(df_in = unwto_dom_trips_clean,
fld_name = 'country',
keep_fld_name = TRUE) # lots of landlocked countries were removed, and there are duplicates. Let's take a look.
# we only need mainland China data, not necessarily Hong Kong or Macao.
duplicates_trips <- rgns_unwto_dom_trips_clean %>%
group_by(rgn_id, year) %>%
filter(n() > 1) %>%
ungroup()
unique(duplicates_trips$rgn_name)
# [1] "China"
# [2] "Guadeloupe and Martinique"
# [3] "Northern Mariana Islands and Guam"
# [4] "Montenegro"
# [5] "Puerto Rico and Virgin Islands of the United States"
# see which ones even have data -- if none of the columns have a value, then we should drop these regions anyways.
duplicates_trips_no_all_NAs <- duplicates_trips %>%
filter(if_any(c(trips_overnights, trips_same_day, trips_total), ~!is.na(.))) # only keeps the rows where at least one of the columns has a value
unique(duplicates_trips_no_all_NAs$rgn_name) # [1] "China" needs to be aggregated and fixed... the rest did not have data anyway.
# therefore, move forward by aggregating by region id and year for china.
## make a function to avoid losing NAs (they turn to 0s) when there are values to aggregate with
sum_with_na <- function(x) {
if (all(is.na(x))) {
return(NA)
} else {
return(sum(x, na.rm = TRUE))
}
}
# now use the function to aggregate by region and year!
dom_trips_agg <- rgns_unwto_dom_trips_clean %>%
filter(!(rgn_id %in% c(140, 13, 186, 116))) %>% # removing duplicates that are all NAs. 140 is Guadeloupe and Martinique, 13 is Northern Mariana Islands and Guam, 186 is Montenegro, 116 is puerto rico and virgin islands
group_by(rgn_id, rgn_name, year) %>%
summarize(
trips_overnights = sum_with_na(trips_overnights), # put each column in the function, so no NAs are lost
trips_same_day = sum_with_na(trips_same_day),
trips_total = sum_with_na(trips_total),
.groups = "drop" # to ungroup, fixes warning from summarise to use reframe (when it is not applicable here)
) #### looks good!!!
# double check to test
duplicates_trips_2 <- dom_trips_agg %>%
group_by(rgn_id, year) %>%
filter(n() > 1) %>%
ungroup()
unique(duplicates_trips_2$rgn_name) # v2024: 0! yay
Hotel guests are registered individuals that stayed overnight at a hotel or similar establishment. Total guests includes dwellings or places one might stay in that region that is not considered a hotel, but would still be associated with tourism. Let’s use hotel_guests to estimate total_guests.
# ------- total_guests ~ hotel_guests ---------
mod1 <- lm(total_guests ~ hotel_guests, data = dom_acc_guests_agg)
summary(mod1)
# significant model! multiple r squared = 0.9503
# total_guests = 4014.424577 + 1.136106*hotel_guests
# plot the data to compare
ggplot(dom_acc_guests_agg, aes(x = hotel_guests, y = total_guests)) +
geom_point(alpha = 0.5) + # Add some transparency to points
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
theme_minimal() + # Use a minimal theme
labs(title = "Hotel:Guests vs Total:Guests within Accommodation",
x = "Hotel Guests",
y = "Total Guests") # does not look awful, the slope is slightly more than 1
# log scale
ggplot(dom_acc_guests_agg, aes(x = log(hotel_guests + 1), y = log(total_guests + 1))) +
geom_point(alpha = 0.5) +
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
theme_minimal() +
labs(title = "Hotel:Guests vs Total:Guests (Log Scale) within Accommodation",
x = "Log(Hotel Guests + 1)",
y = "Log(Total Guests + 1)") # looks a lot better!
# let's look at their correlation. see ?`cor` if you are interested in learning more about how this is calculated (`cor` computes the variance of x and the covariance or correlation of x and y). Kendall correlation is preferred when there are small samples or some outliers (https://datascience.stackexchange.com/questions/64260/pearson-vs-spearman-vs-kendall)
cor(dom_acc_guests_agg$total_guests, dom_acc_guests_agg$hotel_guests, method = "kendall") # NA
sd(dom_acc_guests_agg$total_guests) # NA
sd(dom_acc_guests_agg$hotel_guests) # NA
# lets try a different cor method to see if that is why we are getting NAs
cor(dom_acc_guests_agg$total_guests, dom_acc_guests_agg$hotel_guests, method = "pearson") # NA
sum(!is.finite(dom_acc_guests_agg$total_guests)) # 0
sum(!is.finite(dom_acc_guests_agg$hotel_guests)) # 0, no infinite values, which is expected and good. so what is the issue...
range(dom_acc_guests_agg$total_guests, na.rm = TRUE) # 53 371590
range(dom_acc_guests_agg$hotel_guests, na.rm = TRUE) # 0.18 342883.00
sum(is.na(dom_acc_guests_agg$total_guests)) # 3793 NAs
sum(is.na(dom_acc_guests_agg$hotel_guests)) # 3244 NAs
mean(!is.na(dom_acc_guests_agg$total_guests)) # 0.1784709, aka 18% NA/total obs
mean(!is.na(dom_acc_guests_agg$hotel_guests)) # 0.2973793, aka 30% NA/total obs
# significant amt of NAs, so we need to remove them before finding the correlation:
clean_dom_acc_guests <- dom_acc_guests_agg[complete.cases(dom_acc_guests_agg[c("total_guests", "hotel_guests")]), ] # filtering the dom_acc_guests data frame to keep only the rows where both "total_guests" and "hotel_guests" have non-missing values
# now lets try to find the correlation:
cor(clean_dom_acc_guests$total_guests, clean_dom_acc_guests$hotel_guests, method = "kendall") # 0.8830874, great!! This means that there is a pretty high correlation between the two variables (cor is close to 1). Correlation measures the strength of association between two variables and the direction of the relationship: so in this case, it is positive and they have high association.
sd(clean_dom_acc_guests$total_guests) # 48210.07 guests
sd(clean_dom_acc_guests$hotel_guests) # 41365.82 guests, fairly similar to each other
# therefore, go ahead and use the linear model to conduct gapfilling in the next chunk using this data.
Let’s move on to gapfilling. Since it is slightly more than 1, let’s use the slope and coefficient from the linear model to calculate total_guests from hotel_guests. This is just to improve accuracy.
# ------- find coefficients for gapfilling using hotel_guests when total_guests = NA ------------------
# reminder: total_guests = 4014.424577 + 1.136106*hotel_guests
plot(dom_acc_guests_agg$hotel_guests, dom_acc_guests_agg$total_guests)
abline(mod1, col="red") # looks pretty good! let's move forward using this mode that we explored in the last chunk.
# will need to gapfill these slopes and intercepts for regions that have hotel_guests data but no total_guests data. join with georegions, group by r2, and later r1 and take the mean for those regions. Otherwise, use regional level.
georegions <- ohicore::georegions %>%
left_join(rgns_eez, by = "rgn_id") %>%
select(-c(Notes)) # not needed
# first, remove any regions that only have NAs, then calculate coefficients
dom_acc_guests_coef <- dom_acc_guests_agg %>%
left_join(georegions, by = c("rgn_id")) %>%
group_by(rgn_id) %>% # group by region
filter(if_all(c(total_guests, hotel_guests), ~!is.na(.))) %>% # remove rows where there are NAs for both columns
mutate(
total_guests_slope = lm(total_guests ~ 0 + hotel_guests)$coefficient[1]
) %>% # obtain the intercept and slope for each region that has data
ungroup() %>%
group_by(r2) %>% # get values for r1 and r2, so that if regional is available, use that... then if r2 is available, use that... finally, use r1 if you can't otherwise.
mutate(
total_guests_slope_r2 = lm(total_guests ~ 0 + hotel_guests)$coefficient[1]
) %>%
ungroup() %>%
group_by(r1) %>%
mutate(
total_guests_slope_r1 = lm(total_guests ~ 0 + hotel_guests)$coefficient[1]
) %>%
ungroup() %>%
distinct(rgn_id, r2, r1, total_guests_slope, total_guests_slope_r2, total_guests_slope_r1) %>% # prep for left join, otherwise many-to-many
select(rgn_id, r2, r1, total_guests_slope, total_guests_slope_r2, total_guests_slope_r1) # select the columns that will be needed
# ------- gapfill using hotel_guests when total_guests = NA ---------
dom_acc_guests_gf1 <- dom_acc_guests_agg %>%
left_join(georegions, by = c("rgn_id","rgn_name")) %>%
left_join(dom_acc_guests_coef, by = c("rgn_id","r2","r1")) %>%
group_by(r2) %>% # get values for r1 and r2 for regions that do not have data, so that if regional is available, use that... then if r2 is available, use that... finally, use r1 if you can't otherwise.
mutate(
total_guests_slope_r2mean = mean(total_guests_slope_r2, na.rm = TRUE),
total_guests_slope_r2mean = ifelse(is.nan(total_guests_slope_r2mean), NA, total_guests_slope_r2mean) # any NaN values to NAs
) %>%
ungroup() %>%
group_by(r1) %>%
mutate(
total_guests_slope_r1mean = mean(total_guests_slope_r1, na.rm = TRUE),
total_guests_slope_r1mean = ifelse(is.nan(total_guests_slope_r1mean), NA, total_guests_slope_r1mean) # any NaN values to NAs
) %>%
ungroup() %>% # now we have intercept and slope values for all regions (at different levels), lets use that to gapfill total_guests with hotel_overnights!
mutate(
arrivals_method = ifelse(is.na(total_guests) & !is.na(hotel_guests), "UNWTO - Hotel:Guests", NA), # do this first so that we can use when tourism_arrivals_ct is NA as a qualifier
tourism_arrivals_ct = case_when(
!is.na(total_guests) ~ total_guests, # if there is a value, dont gapfill
is.na(total_guests) & !is.na(hotel_guests) & !is.na(total_guests_slope) ~ (hotel_guests*total_guests_slope),
is.na(total_guests) & !is.na(hotel_guests) & is.na(total_guests_slope) & !is.na(total_guests_slope_r2mean) ~ (hotel_guests*total_guests_slope_r2mean), # if regional lm coefficients weren't possible, go to r2 which is second least specific, then r1
is.na(total_guests) & !is.na(hotel_guests) & is.na(total_guests_slope) & is.na(total_guests_slope_r2mean) & !is.na(total_guests_slope_r1mean) ~ (hotel_guests*total_guests_slope_r1mean),
TRUE ~ tourism_arrivals_ct
),
arrivals_gapfilled = ifelse(arrivals_method == "UNWTO - Hotel:Guests", "gapfilled", NA)
) %>% # example: New Caledonia and Monaco. The numbers make sense because the estimated total_guests were larger than hotel_guests (makes sense because total includes other establishments than just hotels) and we can compare to hotel_overnights, of which tourism_arrivals_ct is about half ish, which based on our knowledge of mean_stays (found later on in raw gf for dom) the average is between 2-3 nights. So this makes sense!
select(rgn_id, rgn_name, year, tourism_arrivals_ct, total_guests, total_overnights, hotel_guests, hotel_overnights, arrivals_method, arrivals_gapfilled) # select all that we need moving forward
# take a look at how that changed the number of NAs after our first dom gf step:
summary(dom_acc_guests_gf1) # now, `tourism_arrivals_ct` has 3223 NAs, versus 3793 NAs in `total_guests` from the raw clean accommodation data
dom_acc_guests_gf1_noNA <- dom_acc_guests_gf1 %>%
filter(!is.na(tourism_arrivals_ct)) # remove NAs to see how many regions have total_guests values
length(unique(dom_acc_guests_gf1_noNA$rgn_name)) # we now have 69 unique countries, thats okay. Let's move on and see if we can gapfill further.
Join Accomodations and Trips data frames first.
dom_acc_trips_join <- left_join(dom_acc_guests_gf1, dom_trips_agg, by = c("rgn_id","rgn_name", "year")) # now it has all variables within one data frame
Now, evaluate if gapfilling is appropriate:
# ------- total_guests ~ trips_overnights ---------
mod2 <- lm(total_guests ~ trips_overnights, data = dom_acc_trips_join)
summary(mod2)
# significant model and slope! almost sig intercept, but that is okay. multiple r-squared was 0.7657 (okay enough to move forward, not the best).
# total_guests = -2799.2542 + 0.7111*trips_overnights
# plot the data to compare
ggplot(dom_acc_trips_join, aes(x = total_guests, y = trips_overnights)) +
geom_point(alpha = 0.5) + # Add some transparency to points
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
theme_minimal() + # Use a minimal theme
labs(title = "Accomodation:Total:Guests vs Trips:Overnight",
x = "Trips Overnight Visitors (from Trips)",
y = "Total Guests (from Accomodation)") # does not look that bad, the slope is slightly less than 1
# log scale
ggplot(dom_acc_trips_join, aes(x = log(total_guests + 1), y = log(trips_overnights + 1))) +
geom_point(alpha = 0.5) +
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
theme_minimal() +
labs(title = "Accomodation:Total:Guests vs Trips:Overnight (Log Scale)",
x = "Log(Trips Overnight Visitors (from Trips) + 1)",
y = "Log(Total Guests (from Accomodation) + 1)") # mediocre, but not awful.
# let's look at their correlation. see ?`cor` if you are interested in learning more about how this is calculated (`cor` computes the variance of x and the covariance or correlation of x and y). Kendall correlation is preferred when there are small samples or some outliers (https://datascience.stackexchange.com/questions/64260/pearson-vs-spearman-vs-kendall)
range(dom_acc_trips_join$total_guests, na.rm = TRUE) # 53 371590
range(dom_acc_trips_join$trips_overnights, na.rm = TRUE) # 3 2321983, much larger range than total_guests, but it does include India and the US now
sum(is.na(dom_acc_trips_join$total_guests)) # 3793 NAs
sum(is.na(dom_acc_trips_join$trips_overnights)) # 3957 NAs
mean(!is.na(dom_acc_trips_join$total_guests)) # 0.1784709, aka 18% NA/total obs
mean(!is.na(dom_acc_trips_join$trips_overnights)) # 0.14295, aka 14% NA/total obs
# significant amt of NAs, so we need to remove them before finding the correlation:
clean_dom_acc_trips_join <- dom_acc_trips_join[complete.cases(dom_acc_trips_join[c("total_guests", "trips_overnights")]), ] #filtering the dom_acc_guests data frame to keep only the rows where both "total_guests" and "hotel_guests" have non-missing values
# now lets try to find the correlation:
cor(clean_dom_acc_trips_join$total_guests, clean_dom_acc_trips_join$trips_overnights, method = "kendall") # 0.8060229, not bad at all! This means that there is a positive correlation between the two variables (cor is relatively close to 1). Correlation measures the strength of association between two variables and the direction of the relationship: so in this case, it is positive and they have an association.
sd(clean_dom_acc_trips_join$total_guests) # 59897.74 guests
sd(clean_dom_acc_trips_join$trips_overnights) # 73703.34 guests, larger standard deviation than total_guests
# go ahead and do raw gapfilling in the next chunk using this data.
Let’s gf!
# -------- gf using trips_overnights for NA values of tourism_arrivals_ct -----
dom_acc_guests_gf2 <- dom_acc_trips_join %>%
group_by(rgn_id, year) %>% # group by county and year
mutate(
tourism_arrivals_ct = case_when(
is.na(tourism_arrivals_ct) & !is.na(trips_overnights) ~ trips_overnights, # gapfill, when there is no data on total guests or hotel guests, fill with trips_overnights visitors
TRUE ~ tourism_arrivals_ct # otherwise, NA
),
arrivals_method = ifelse(is.na(total_guests) & is.na(hotel_guests) & !is.na(trips_overnights), "UNWTO - Trips:Overnights", arrivals_method),
arrivals_gapfilled = ifelse(arrivals_method == "UNWTO - Trips:Overnights", "gapfilled", arrivals_gapfilled) # fill the "gapfilled" column to indicate "gapfilled" or NA
) %>%
ungroup() # ungroup since not needed anymore
# take a look at how that changed the number of NAs:
summary(dom_acc_guests_gf2) # now, `tourism_arrivals_ct` has 3078 NAs
dom_acc_guests_gf2_noNA <- dom_acc_guests_gf2 %>%
filter(!is.na(tourism_arrivals_ct)) # remove NAs to see how many regions have total_guests values
length(unique(dom_acc_guests_gf2_noNA$rgn_id)) # we now have 82 unique regions,that's better! Let's move on and see if we can gapfill even further!
So, there are some instances in which there is data for Total:Overnights, but not Total:Guests. Based on looking at the methodology (https://www.e-unwto.org/doi/epdf/10.18111/9789284424160?role=tab), it seems that guests are the individuals coming to a specific place, versus overnights includes the same people over again if they stay for multiple nights.
Essentially, Guests = Overnights / Mean Stay (in days).
So, we need to find mean stays! As a reminder, this would need to be
calculated using the raw total_guests
and
total_overnights
variables.
We can do this by calculating mean_stay
for each
region, if possible. Using an lm, see if mean_stay
could be correlated to the georegion (r2 ideally because it is more
specific, but r1 works as well). If so, take the mean for that
georegion, and use it as a way to convert Accommodation:Total:Overnights
into Accommodation:Total:Guests.
dom_mean_stay <- dom_acc_guests_gf2 %>%
group_by(rgn_id, rgn_name, year) %>%
mutate(mean_stay = total_overnights/total_guests) # by referring to UNWTO documentation
range(dom_mean_stay$mean_stay, na.rm = TRUE) # 0.9666667 8.4101144
sd(dom_mean_stay$mean_stay, na.rm = TRUE) # 1.234238
# re-pasting the code as a reminder of what the object `georegions` is from
georegions <- ohicore::georegions %>%
left_join(rgns_eez, by = "rgn_id") %>%
select(-c(Notes)) # not needed
# join mean_stay df with georegions! so we can see if there can be an association by georgn...
dom_mean_stay_georgns <- dom_mean_stay %>%
left_join(georegions, by = c("rgn_id","rgn_name"))
mod3 <- lm(mean_stay ~ r2, data = dom_mean_stay_georgns)
summary(mod3)
# not significant model :(, so we have to move forward with the more general and less specific r1 groupings
mod4 <- lm(mean_stay ~ r1, data = dom_mean_stay_georgns)
summary(mod4)
# not significant model!! Multiple R-squared is 0.01365..... NOT GOOD.
# lets do a quick plot to compare mean stays and r1, just to be sure that we cant use georegions to estimate mean_stay:
# plot the data to compare
ggplot(dom_mean_stay_georgns, aes(x = as.numeric(r1), y = mean_stay)) +
geom_point(alpha = 0.5) + # Add some transparency to points
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
theme_minimal() + # Use a minimal theme
labs(title = "Georegions vs Mean Stay",
x = "Georegion (r1)",
y = "Mean Stay (Days)") # yikesss!!
# log scale
ggplot(dom_mean_stay_georgns, aes(x = log(r1 + 1), y = log(mean_stay + 1))) +
geom_point(alpha = 0.5) +
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
theme_minimal() +
labs(title = "Georegions vs Mean Stay",
x = "Log(Georegion (r1) + 1)",
y = "Log(Mean Stay (Days) + 1)") # yup, definitely no correlation! lets check numerically though
# let's look at their correlation. see ?`cor` if you are interested in learning more about how this is calculated (`cor` computes the variance of x and the covariance or correlation of x and y). Kendall correlation is preferred when there are small samples or some outliers (https://datascience.stackexchange.com/questions/64260/pearson-vs-spearman-vs-kendall)
range(dom_mean_stay_georgns$r1, na.rm = TRUE) # 2 419, interesting to know
sum(is.na(dom_mean_stay_georgns$mean_stay)) # 3829 NAs
sum(is.na(dom_mean_stay_georgns$r1)) # 27 NAs: high seas regions
mean(!is.na(dom_mean_stay_georgns$mean_stay)) # 0.1706736, aka 17% NA/total obs
mean(!is.na(dom_mean_stay_georgns$r1)) # 0.994152, aka 99% NA/total obs, great!
# significant amt of NAs, so we need to remove them before finding the correlation:
clean_dom_mean_stay_georgns <- dom_mean_stay_georgns %>%
filter(!is.na(mean_stay)) #filtering the dom_acc_guests data frame to keep only the rows where mean_stay has non-missing values
# now lets try to find the correlation:
cor(clean_dom_mean_stay_georgns$mean_stay, clean_dom_mean_stay_georgns$r1, method = "kendall") # 0.08158826, VERY LITTLE correlation... to be expected.
We should not move forward using georegions to group by and take the mean. Instead, we should either use the global mean to gapfill for mean_stays, or consider using a moving average.
What I will do for now is take the mean by region, assuming that there are some instances in which there may be some years that have overnights and guest values, but in other years it could have only overnights values. In that case, it would make the most sense to use the value found within the region if possible.
For regions that only have overnights data, we will use the global mean of mean_stays as our value to divide Overnights by.
As a reminder:
Mean Stay = Overnights/Guests (to see how many nights they stayed on average)
Therefore, if we have Overnights, we can find Guests by dividing by the Mean Stay. (Guests = Overnights / Mean Stay)
dom_mean_stay_georgns_gf <- dom_mean_stay_georgns %>%
group_by(rgn_id, rgn_name) %>%
mutate(
mean_stay_filled = mean(mean_stay, na.rm = TRUE),
mean_stay_filled = if_else(is.nan(mean_stay_filled), NA, mean_stay_filled)
)
# global mean multiplier (technically, divider lol)
global_mean_stay <- mean(dom_mean_stay_georgns$mean_stay, na.rm = TRUE) # v2024: 2.893876 days
dom_mean_stay_georgns_filled <- dom_mean_stay_georgns_gf %>%
mutate(
mean_stay_filled = ifelse(is.na(mean_stay_filled), global_mean_stay, mean_stay_filled)
) # if the mean_stay was already found regionally, leave it, otherwise replace with global mean_stay value
# ------ use mean_stay to gapfill values --------
dom_acc_guests_gf3 <- dom_mean_stay_georgns_filled %>%
mutate(
tourism_arrivals_ct = case_when(
is.na(tourism_arrivals_ct) & !is.na(total_overnights) ~ total_overnights/mean_stay_filled,
TRUE ~ tourism_arrivals_ct
),
arrivals_method = ifelse(is.na(total_guests) & !is.na(total_overnights), "UNWTO - Total:Overnights / Mean Stay", arrivals_method),
arrivals_gapfilled = ifelse(arrivals_method == "UNWTO - Total:Overnights / Mean Stay", "gapfilled", arrivals_gapfilled) # fill the "gapfilled" column to indicate "gapfilled" or NA
) %>% # it worked!! Angola 1998 is a good example of where it may have worked. There are others.
select(rgn_id, rgn_name, year, total_guests, hotel_overnights, tourism_arrivals_ct, arrivals_method, arrivals_gapfilled) # only select columns needed for future analysis
# take a look at how that changed the number of NAs:
summary(dom_acc_guests_gf3) # now, `tourism_arrivals_ct` has 2928 NAs
dom_acc_guests_gf3_noNA <- dom_acc_guests_gf3 %>%
filter(!is.na(tourism_arrivals_ct)) # remove NAs to see how many regions have total_guests values
length(unique(dom_acc_guests_gf3_noNA$rgn_name)) # we now have 87 it added only 5 more regions --- it is still worth it!!
As a reminder, within the “Clean Accommodations Data” section we did a linear model between total_guests and hotel_overnights, and found that it was a significant model with a good multiple r-squared. Therefore, we are going to move forward and use the significant intercept and slope to estimate total_guests. Let’s first do some exploring:
# --------- lm between total_guests ~ hotel_overnights --------------
summary(lm(total_guests ~ hotel_overnights, data = dom_acc_guests_agg)) # p-value: < 0.00000000000000022, multiple R squared is 0.8971, not bad!! intercept and slope is significant, at -1477.876236 and 0.694437, respectively.
# name the model
mod5 <- lm(total_guests ~ hotel_overnights, data = dom_acc_guests_agg)
# plot the variables against each other to see their relationship, see if the model runs through the data appropriately
ggplot(dom_acc_guests_agg, aes(x = total_guests, y = hotel_overnights)) +
geom_point(alpha = 0.5) + # Add some transparency to points
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
theme_minimal() +
labs(
title = "Total:Guests vs Hotel:Overnights"
) # it makes sense that the hotel overnights have a larger amount of individuals, since it includes mean stay (it is hotel_guests*mean_stay_hotel). In this case, because hotel_overnights is like two/three steps removed from the total_guests unit, let's see if we can use the lm to find values for total_guests from hotel_overnights, instead of recalculating a mean_stay for hotel_guests specifically and hoping that the resulting hotel_guest values could pass for total_guest values.
# lets see if mod5 is explanatory:
plot(dom_acc_guests$hotel_overnights, dom_acc_guests$total_guests)
abline(mod5, col="red") # not a linear fit, would be better to do nonlinear, but this is good for now (perhaps update this in future years). we can use the slope and intercept from this model to estimate total_guests!
# total_guests = -1270.660768 + 0.692837*hotel_overnights
Let’s move forward, using the intercept and slope to determine total_guests values from hotel_overnights!
# first, remove any regions that only have NAs. Then use region and georegions to get lm coefficients
dom_gf4_coef <- dom_acc_guests_gf3 %>%
left_join(georegions, by = c("rgn_id")) %>%
group_by(rgn_id) %>% # group by region
filter(if_all(c(total_guests, hotel_overnights), ~!is.na(.))) %>% # remove rows where there are NAs for both columns
mutate(
total_guests_slope = lm(total_guests ~ 0 + hotel_overnights)$coefficient[1]
) %>% # obtain the intercept and slope for each region that has data
ungroup() %>%
group_by(r2) %>% # get values for r1 and r2, so that if regional is available, use that... then if r2 is available, use that... finally, use r1 if you can't otherwise.
mutate(
total_guests_slope_r2 = lm(total_guests ~ 0 + hotel_overnights)$coefficient[1]
) %>%
ungroup() %>%
group_by(r1) %>%
mutate(
total_guests_slope_r1 = lm(total_guests ~ 0 + hotel_overnights)$coefficient[1]
) %>%
ungroup() %>%
distinct(rgn_id, r2, r1, total_guests_slope, total_guests_slope_r2, total_guests_slope_r1) %>% # prep for left join, otherwise many-to-many
select(rgn_id, r2, r1, total_guests_slope, total_guests_slope_r2, total_guests_slope_r1) # select the columns that will be needed
# ------- gapfill using hotel_overnights when total_guests = NA ---------
# will need to gapfill these slopes and intercepts for regions that have hotel_overnights data but no total_guests data. join with georegions, group by r2, and take the mean for those regions. Otherwise, use regional level.
dom_acc_guests_gf4 <- dom_acc_guests_gf3 %>%
left_join(georegions, by = c("rgn_id","rgn_name")) %>%
left_join(dom_gf4_coef, by = c("rgn_id","r2","r1")) %>%
group_by(r2) %>% # get values for r1 and r2 for regions that do not have data, so that if regional is available, use that... then if r2 is available, use that... finally, use r1 if you can't otherwise.
mutate(
total_guests_slope_r2mean = mean(total_guests_slope_r2, na.rm = TRUE),
total_guests_slope_r2mean = ifelse(is.nan(total_guests_slope_r2mean), NA, total_guests_slope_r2mean) # any NaN values to NAs
) %>%
ungroup() %>%
group_by(r1) %>%
mutate(
total_guests_slope_r1mean = mean(total_guests_slope_r1, na.rm = TRUE),
total_guests_slope_r1mean = ifelse(is.nan(total_guests_slope_r1mean), NA, total_guests_slope_r1mean) # any NaN values to NAs
) %>%
ungroup() %>% # now we have intercept and slope values for all regions (at different levels), lets use that to gapfill total_guests with hotel_overnights!
mutate(
arrivals_method = ifelse(is.na(total_guests) & !is.na(hotel_overnights), "UNWTO - Hotel:Overnights", arrivals_method), # do this first so that we can use when tourism_arrivals_ct is NA as a qualifier
tourism_arrivals_ct = case_when(
is.na(tourism_arrivals_ct) & !is.na(hotel_overnights) & !is.na(total_guests_slope) ~ (hotel_overnights*total_guests_slope),
is.na(tourism_arrivals_ct) & !is.na(hotel_overnights) & is.na(total_guests_slope) & !is.na(total_guests_slope_r2mean) ~ (hotel_overnights*total_guests_slope_r2mean), # if regional lm coefficients weren't possible, go to r2 which is second least specific, then r1
is.na(tourism_arrivals_ct) & !is.na(hotel_overnights) & is.na(total_guests_slope) & is.na(total_guests_slope_r2mean) & !is.na(total_guests_slope_r1mean) ~ (hotel_overnights*total_guests_slope_r1mean),
TRUE ~ tourism_arrivals_ct
),
arrivals_gapfilled = ifelse(arrivals_method == "UNWTO - Hotel:Overnights", "gapfilled", arrivals_gapfilled)
) %>% # example: New Caledonia and Monaco. The numbers make sense because the estimated total_guests were larger than hotel_guests (makes sense because total includes other establishments than just hotels) and we can compare to hotel_overnights, of which tourism_arrivals_ct is about half ish, which based on our knowledge of mean_stays (found later on in raw gf for dom) the average is between 2-3 nights. So this makes sense!
select(rgn_id, rgn_name, year, tourism_arrivals_ct, arrivals_method, arrivals_gapfilled) # select all that we need moving forward
Now that we can gapfilled using this method, lets see how it changed our data:
summary(dom_acc_guests_gf4) # we now have 2688 out of 4617 observations. This is now 58% filled!
# rgn_id rgn_name
# Min. : 5.0 Length:4617
# 1st Qu.: 65.0 Class :character
# Median :123.0 Mode :character
# Mean :123.3
# 3rd Qu.:184.0
# Max. :250.0
#
# year tourism_arrivals_ct
# Length:4617 Min. : 0.3
# Class :character 1st Qu.: 586.1
# Mode :character Median : 6287.5
# Mean : 34599.9
# 3rd Qu.: 24990.9
# Max. :2321983.0
# NA's :2688
# arrivals_method arrivals_gapfilled
# Length:4617 Length:4617
# Class :character Class :character
# Mode :character Mode :character
dom_acc_guests_gf4_noNA <- dom_acc_guests_gf4 %>%
filter(!is.na(tourism_arrivals_ct)) # remove NAs to see how many regions have total_guests values
length(unique(dom_acc_guests_gf4_noNA$rgn_name)) # 98 unique regions, that's great!!!
# Let's see how many are left if we filter to 2005-2021, which is our years of interest.
dom_acc_guests_filt <- dom_acc_guests_gf4_noNA %>%
filter(year %in% 2005:2021)
length(unique(dom_acc_guests_filt$rgn_name)) # 91 unique regions, NOT BAD!!! Let's move forward :)
# filter domestic data for usable years to be used in the future. Ensure that it is between 2005 - 2021, and that there are at least three raw data values!
domestic_guests_rgns <- dom_acc_guests_gf4 %>%
filter(year %in% 2005:2021) %>%
group_by(rgn_id) %>%
summarize(count_values = sum(!is.na(tourism_arrivals_ct))) %>% # for each region, how many values in tourism_arrivals_ct did it have
filter(count_values >= 3) # keep only the regions that have greater than or equal to three values
length(unique(domestic_guests_rgns$rgn_id)) # v2024: 87 rgns
domestic_guests <- dom_acc_guests_gf4 %>%
filter(rgn_id %in% domestic_guests_rgns$rgn_id) %>% # filter only for regions that are in that time frame and have at least three values
filter(year %in% 2005:2021)
length(unique(domestic_guests$rgn_id)) # v2024: 87 regions! expected, hopefully enough to be useable
summary(domestic_guests) # only 229 NAs out of 1479 observations to gapfill!
Here is the second big gapfilling step in the script, which is the gapfilling of prepped international and domestic tourism arrivals data, for regions that are missing values for certain years.
# gapfill for international arrivals!
# downfill then upfill missing values using a linear model of the average increase per years across all years of data for 1995-2019
# for 2020 and 2021, use the global average proportion increase or decrease and add to the previous years value
# reminder:
# `tourism_arrivals_ct` is the tourism arrivals count from the INTERNATIONAL data, which consists of overnights, and was gapfilled with (total_arrivals - same_day).
test_2021_int <- international_arrivals %>%
filter(year %in% c(2020, 2021)) %>%
# filter(!is.na(tourism_arrivals_ct)) %>%
pivot_wider(names_from = year,
values_from = tourism_arrivals_ct,
names_prefix = "int_tourism_") %>%
mutate(tourism_ct_diff = (int_tourism_2021 - int_tourism_2020)/int_tourism_2020)
gf_2021_tourism_int <- mean(test_2021_int$tourism_ct_diff, na.rm = TRUE) # global average increase for 2021 tourism_arrivals_ct
# v2024: 0.1290837
# ok, lets use these values to gapfill 2021 if it is NA and 2020 exists. So increase by X proportion (0.129)
test_2020_int <- international_arrivals %>%
filter(year %in% c(2019, 2020)) %>%
filter(!is.na(tourism_arrivals_ct)) %>%
pivot_wider(names_from = year,
values_from = tourism_arrivals_ct,
names_prefix = "int_tourism_") %>%
mutate(tourism_ct_diff = (int_tourism_2020 - int_tourism_2019)/int_tourism_2019)
gf_2020_tourism_int <- mean(test_2020_int$tourism_ct_diff, na.rm = TRUE) # global average decrease for 2020 toursism
# v2024: -0.7118338
# ok, lets use these values to gapfill 2020 if it is NA and 2019 exists. So decrease by X proportion ~70%
# upfill earlier years that do not have data with nearby year data
unwto_int_upfill <- international_arrivals %>%
filter(year < 2020) %>% # change to 2005:2021?
group_by(rgn_id) %>%
arrange(rgn_id, year) %>%
tidyr::fill(tourism_arrivals_ct, .direction = "up") %>% # fill in any values that are empty from early years with values from the nearest year. Doing this because doesn't make sense to add earlier years based on a trend
mutate(
arrivals_gapfilled = ifelse(is.na(arrivals_method) & !is.na(tourism_arrivals_ct), "gapfilled", arrivals_gapfilled),
arrivals_method = ifelse(is.na(arrivals_method) & !is.na(tourism_arrivals_ct), "UNWTO - nearby year", arrivals_method)
)
## calculate regional average increase or decrease in number of tourism arrivals
lm_coef_data_tourism_int <- international_arrivals %>%
mutate(year = as.numeric(year)) %>%
filter(!(year %in% c(2020, 2021))) %>%
group_by(rgn_id) %>%
filter(!is.na(tourism_arrivals_ct)) %>%
summarize(lm_coef_tourism = lm(tourism_arrivals_ct ~ year)$coefficients[2])
# Initialize a flag to check if there are still NAs
na_flag <- TRUE
# filter out any regions that had all NAs
unwto_gapfill_lm_2019_tourism_int <- unwto_int_upfill %>%
mutate(year = as.numeric(year)) %>%
left_join(lm_coef_data_tourism_int) %>%
ungroup() %>%
dplyr::select(rgn_id, rgn_name, year, tourism_arrivals_ct, arrivals_method, arrivals_gapfilled, lm_coef_tourism)
## now lets fill in any values down with the linear model average increase per year for tourists
while(na_flag) {
unwto_gapfill_lm_2019_tourism_int <- unwto_gapfill_lm_2019_tourism_int %>%
group_by(rgn_id) %>%
arrange(year) %>%
mutate(
tourism_arrivals_ct = case_when(
is.na(tourism_arrivals_ct) & !is.na(lag(tourism_arrivals_ct)) ~ lag(tourism_arrivals_ct) + lm_coef_tourism, # if there is a previous value, but no current value, gapfill using the lag plus the lm_coef_tourism
TRUE ~ tourism_arrivals_ct
)
) %>%
mutate(arrivals_method = ifelse(is.na(arrivals_method) & !is.na(tourism_arrivals_ct), "UNWTO - linear model", arrivals_method)) %>%
mutate(arrivals_gapfilled = ifelse(arrivals_method == "UNWTO - linear model", "gapfilled", arrivals_gapfilled)) %>%
ungroup()
# Check if there are still NAs left in either column
na_flag <- any(is.na(unwto_gapfill_lm_2019_tourism_int$tourism_arrivals_ct))
}
# ==== gapfill for 2020 and 2021 now ==== #
unwto_2020_2021_int <- international_arrivals %>%
filter(year > 2019)
unwto_all_gf_int <- unwto_gapfill_lm_2019_tourism_int %>%
select(-lm_coef_tourism) %>%
complete(year = 2005:2021) %>%
rbind(unwto_2020_2021_int) %>%
group_by(rgn_id) %>%
arrange(rgn_id, year) %>%
# apply global average proportional increase or decrease for 2020 and 2021, because of covid pandemic messing up trends...
mutate(tourism_arrivals_ct = ifelse(year == 2020 & is.na(tourism_arrivals_ct), lag(tourism_arrivals_ct, n = 1) + lag(tourism_arrivals_ct, n = 1)*gf_2020_tourism_int, tourism_arrivals_ct)) %>%
mutate(tourism_arrivals_ct = ifelse(year == 2021 & is.na(tourism_arrivals_ct), lag(tourism_arrivals_ct, n = 1) + lag(tourism_arrivals_ct, n = 1)*gf_2021_tourism_int, tourism_arrivals_ct)) %>%
mutate(arrivals_method = ifelse(is.na(arrivals_method) & !is.na(tourism_arrivals_ct), "UNWTO - 2020 and 2021 gapfill method", arrivals_method)) %>%
mutate(arrivals_gapfilled = ifelse(arrivals_method == "UNWTO - 2020 and 2021 gapfill method", "gapfilled", arrivals_gapfilled)) %>%
filter(year >= 2008) %>% # get only the year we need and beyond
drop_na(tourism_arrivals_ct) %>% # remove any remaining NAs (any remaining have all NAs for that region)
mutate(year = as.numeric(year)) # ensure it is numeric
length(unique(unwto_all_gf_int$rgn_name)) # still 156 regions after gapfilling! good, same as `international_arrivals`
summary(unwto_all_gf_int) # however, we do see that Libya has negative values for `tourism_arrivals_ct` when the lm slope is used to gapfill by year, since we have the lag and now are between 2008:2021. This needs to be looked into....
# gapfill for domestic arrivals!
# downfill then upfill missing values using a linear model of the average increase per years across all years of data for 1995-2019
# for 2020 and 2021, use the global average proportion increase or decrease and add to the previous years value
# reminder:
# `tourism_arrivals_ct` is the tourism arrivals count from the DOMESTIC data, which is primarily total guests (accommodation) data, and was gapfilled using domestic trip data also from UNWTO.
test_2021_dom <- domestic_guests %>%
filter(year %in% c(2020, 2021)) %>%
filter(!is.na(tourism_arrivals_ct)) %>%
pivot_wider(names_from = year,
values_from = tourism_arrivals_ct,
names_prefix = "int_tourism_") %>%
mutate(tourism_arrivals_ct_diff = (int_tourism_2021 - int_tourism_2020)/int_tourism_2020)
gf_2021_tourism_dom <- mean(test_2021_dom$tourism_arrivals_ct_diff, na.rm = TRUE) # global average increase for 2021 tourism_arrivals_ct
# v2024: 0.2361595
# ok, lets use these values to gapfill 2021 if it is NA and 2020 exists. So increase by X proportion (0.294)
test_2020_dom <- domestic_guests %>%
filter(year %in% c(2019, 2020)) %>%
filter(!is.na(tourism_arrivals_ct)) %>%
pivot_wider(names_from = year,
values_from = tourism_arrivals_ct,
names_prefix = "int_tourism_") %>%
mutate(tourism_arrivals_ct_diff = (int_tourism_2020 - int_tourism_2019)/int_tourism_2019)
gf_2020_tourism_dom <- mean(test_2020_dom$tourism_arrivals_ct_diff, na.rm = TRUE) # global average decrease for 2020 toursism
# v2024: -0.2984019
# ok, lets use these values to gapfill 2020 if it is NA and 2019 exists. So decrease by X proportion ~35%
# upfill earlier years that do not have data with nearby year data
unwto_upfill_dom <- domestic_guests %>%
filter(year < 2020) %>%
group_by(rgn_id) %>%
arrange(rgn_id, year) %>%
tidyr::fill(tourism_arrivals_ct, .direction = "up") %>% # fill in any values that are empty from early years with values from the nearest year. Doing this because doesn't make sense to add earlier years based on a trend
mutate(
arrivals_gapfilled = ifelse(is.na(arrivals_method) & !is.na(tourism_arrivals_ct), "gapfilled", arrivals_gapfilled),
arrivals_method = ifelse(is.na(arrivals_method) & !is.na(tourism_arrivals_ct), "UNWTO - nearby year", arrivals_method)
)
## calculate regional average increase or decrease in number of tourism arrivals
lm_coef_data_tourism_dom <- domestic_guests %>%
filter(!(year %in% c(2020, 2021))) %>%
mutate(year = as.numeric(year)) %>%
group_by(rgn_id) %>%
filter(!is.na(tourism_arrivals_ct)) %>%
summarize(lm_coef_tourism = lm(tourism_arrivals_ct ~ year)$coefficients[2]) # pull out the coeffiecient
# Initialize a flag to check if there are still NAs
na_flags <- TRUE
unwto_future_gapfill_lm_2019_tourism_dom <- unwto_upfill_dom %>%
left_join(lm_coef_data_tourism_dom) %>%
ungroup() %>%
dplyr::select(rgn_id, rgn_name, year, tourism_arrivals_ct, arrivals_method, arrivals_gapfilled, lm_coef_tourism)
## now lets fill in any values down with the linear model average increase per year for tourists
## v2024: issue with running!! It will complete the process but it won't finish running -- on Anna's session as well.
while(na_flags) {
unwto_future_gapfill_lm_2019_tourism_dom <- unwto_future_gapfill_lm_2019_tourism_dom %>%
group_by(rgn_id) %>%
arrange(year) %>%
mutate(
tourism_arrivals_ct = case_when(
is.na(tourism_arrivals_ct) & !is.na(dplyr::lag(tourism_arrivals_ct)) ~ dplyr::lag(tourism_arrivals_ct) + lm_coef_tourism, # if there is a previous value, but no current value, gapfill using the lag plus the lm_coef_tourism
TRUE ~ tourism_arrivals_ct
)
) %>%
mutate(arrivals_method = ifelse(is.na(arrivals_method) & !is.na(tourism_arrivals_ct), "UNWTO - linear model", arrivals_method)) %>%
mutate(arrivals_gapfilled = ifelse(arrivals_method == "UNWTO - linear model", "gapfilled", arrivals_gapfilled)) %>%
ungroup()
# Check if there are still NAs left in either column
na_flags <- any(is.na(unwto_future_gapfill_lm_2019_tourism_dom$tourism_arrivals_ct)) # FALSE, great!
}
# ==== gapfill for 2020 and 2021 now ==== #
unwto_2020_2021_dom <- domestic_guests %>%
filter(year > 2019)
unwto_all_gf_dom <- unwto_future_gapfill_lm_2019_tourism_dom %>%
select(-lm_coef_tourism) %>%
rbind(unwto_2020_2021_dom) %>%
group_by(rgn_id) %>%
arrange(rgn_id, year) %>%
# apply global average proportional increase or decrease for 2020 and 2021, because of covid pandemic messing up trends...
mutate(tourism_arrivals_ct = ifelse(year == 2020 & is.na(tourism_arrivals_ct), lag(tourism_arrivals_ct, n = 1) + lag(tourism_arrivals_ct, n = 1)*gf_2020_tourism_dom, tourism_arrivals_ct)) %>%
mutate(tourism_arrivals_ct = ifelse(year == 2021 & is.na(tourism_arrivals_ct), lag(tourism_arrivals_ct, n = 1) + lag(tourism_arrivals_ct, n = 1)*gf_2021_tourism_dom, tourism_arrivals_ct)) %>%
mutate(arrivals_method = ifelse(is.na(arrivals_method) & !is.na(tourism_arrivals_ct), "UNWTO - 2020 and 2021 gapfill method", arrivals_method)) %>%
mutate(arrivals_gapfilled = ifelse(arrivals_method == "UNWTO - 2020 and 2021 gapfill method", "gapfilled", arrivals_gapfilled)) %>%
filter(year >= 2008) %>% # get only the year we need and beyond
drop_na(tourism_arrivals_ct) %>% # remove any remaining NAs (any remaining have all NAs for that region)
mutate(year = as.numeric(year)) # ensure it is numeric
length(unique(unwto_all_gf_dom$rgn_name)) # now still 87 regions after gapfilling and removing regions with only NAs!
summary(unwto_all_gf_dom) # and yay, no negative guest values!
Let’s move on, as we decided that it makes sense to aggregate the international and domestic data! This is because we are creating a score for each region, and only comparing across regions once that score has been calculated (versus ranking regions and comparing them to each other based on tourism density, not the rescaled score).
We decided that there would be less error after aggregating if we at least put a mean (by georegion) domestic guest value for regions that have no domestic data, but they have international data. That way, all international regions will be adding to a domestic value, rather than some regions only having international data.
# first, let's join international and domestic data to see where the gaps are!
## aggregating international and domestic data by region and year
arrivals_join <- left_join(unwto_all_gf_int, unwto_all_gf_dom, by = c("rgn_id","rgn_name","year")) %>%
rename(
tourism_arrivals_ct_int = tourism_arrivals_ct.x,
tourism_arrivals_ct_dom = tourism_arrivals_ct.y,
data_source = arrivals_method.x,
arrivals_gf = arrivals_gapfilled.x,
arrivals_method = arrivals_method.y,
arrivals_gapfilled = arrivals_gapfilled.y
)
length(unique(arrivals_join$rgn_name)) # v2024: 156 regions that have data! This checks out!
Now that we have a joined dataframe, let’s adjust it to gapfill by years.
# join with georegions again, and take the mean of the domestic guests data for r2 and r1 groups
mean_domestic_guests <- arrivals_join %>%
group_by(rgn_id) %>%
complete(year = 2008:2021) %>% # ensures that there is a row for every year for each region, to ensure there are no gaps
left_join(georegions, by = c("rgn_id","rgn_name")) %>%
ungroup() %>%
group_by(r2) %>%
mutate(
tourism_arrivals_ct_dom_r2 = mean(tourism_arrivals_ct_dom, na.rm = TRUE), # take the mean of arrivals by georegion
tourism_arrivals_ct_dom_r2 = ifelse(is.nan(tourism_arrivals_ct_dom_r2), NA, tourism_arrivals_ct_dom_r2) # to make NaN = NAs
) %>%
ungroup() %>%
group_by(r1) %>%
mutate(
tourism_arrivals_ct_dom_r1 = mean(tourism_arrivals_ct_dom, na.rm = TRUE),
tourism_arrivals_ct_dom_r1 = ifelse(is.nan(tourism_arrivals_ct_dom_r1), NA, tourism_arrivals_ct_dom_r1)
) %>%
ungroup()
length(unique(mean_domestic_guests$rgn_name)) # 156 yay!
length(unique(georegions$r2)) # 22 r2 regions
length(unique(mean_domestic_guests$tourism_arrivals_ct_dom_r2)) # 20, so it missed only two r2 regions!
length(unique(georegions$r1)) # 7 regions
length(unique(mean_domestic_guests$tourism_arrivals_ct_dom_r1)) # 7, that means it got all the r1 regions
# ----- use the mean domestic data to fill for any domestic rows that need guest values, then aggregate! -----
arrivals_aggregated <- mean_domestic_guests %>%
mutate(
tourism_arrivals_ct_dom = case_when(
is.na(tourism_arrivals_ct_dom) & !is.na(tourism_arrivals_ct_dom_r2) ~ tourism_arrivals_ct_dom_r2,
is.na(tourism_arrivals_ct_dom) & is.na(tourism_arrivals_ct_dom_r2) & !is.na(tourism_arrivals_ct_dom_r1) ~ tourism_arrivals_ct_dom_r1,
TRUE ~ tourism_arrivals_ct_dom
),
tourism_arrivals_ag = case_when(
!is.na(tourism_arrivals_ct_int) & !is.na(tourism_arrivals_ct_dom) ~ tourism_arrivals_ct_int + tourism_arrivals_ct_dom,
is.na(tourism_arrivals_ct_int) ~ tourism_arrivals_ct_dom,
is.na(tourism_arrivals_ct_dom) ~ tourism_arrivals_ct_int,
TRUE ~ tourism_arrivals_ct_int
),
ag_method = case_when(
!is.na(tourism_arrivals_ct_int) & !is.na(tourism_arrivals_ct_dom) ~ "UNWTO - aggregated international and domestic",
is.na(tourism_arrivals_ct_int) ~ "UNWTO - international",
is.na(tourism_arrivals_ct_dom) ~ "UNWTO - domestic",
TRUE ~ data_source
),
ag_gapfilled = case_when(
arrivals_gf == "gapfilled" & arrivals_gapfilled == "gapfilled" ~ "gapfilled - both",
arrivals_gf == "gapfilled" & is.na(arrivals_gapfilled) ~ "gapfilled - international",
is.na(arrivals_gf) & arrivals_gapfilled == "gapfilled" ~ "gapfilled - domestic",
TRUE ~ arrivals_gf
)
)
length(!is.na(arrivals_aggregated$tourism_arrivals_ct_int)) # 2184
length(!is.na(arrivals_aggregated$tourism_arrivals_ct_dom)) # 2184, yay! now we can aggregate international and domestic with the least amount of error with the data we have
# save gf information
tourism_aggregated_gf <- arrivals_aggregated %>%
rename(
int_method = data_source,
int_gapfilled = arrivals_gf,
dom_method = arrivals_method,
dom_gapfilled = arrivals_gapfilled
) %>%
select(rgn_id, rgn_name, year, int_method, int_gapfilled, dom_method, dom_gapfilled, ag_method, ag_gapfilled)
write_csv(tourism_aggregated_gf, here::here(int_dir, "tourism_aggregated_gf.csv"))
Now, lets bring in our tourism data and calculate the coastal proportion! The coastal proportion will then be multiplied by the arrivals value per region and year. Finally, the coastal area will be divided, giving us a density of tourism per region (individuals/km^2).
# ------- coastal population data -----------
# In 2024, the coastal population data was fixed! This is great. We will now use `globalprep/mar_prs_population/v2024/output/mar_pop_25mi.csv`.
coastal_pop_data <- read_csv(here("globalprep","mar_prs_population","v2024","output","mar_pop_25mi.csv")) ## read in coastal population data from other data layer
coastal_pop_data_fill <- coastal_pop_data %>%
filter(year == 2020) %>%
mutate(year = 2021) %>%
rbind(coastal_pop_data) # add 2021 data year, just using year 2020
length(unique(coastal_pop_data_fill$rgn_id)) # v2024: 220 regions available! Any region without data is uninhabited.
# ------- total global population data -----------
# bring in World Bank Global population data from `/home/shares/ohi/git-annex/globalprep/_raw_data/WorldBank/d2024/WorldBank_global_country_population_1960-2023`
total_population_wb <- read_csv(here(raw_data_dir, "WorldBank", data_dir_version_year, "WorldBank_global_country_population_1960-2023","API_SP.POP.TOTL_DS2_en_csv_v2_1584446.csv"), skip = 3) %>%
pivot_longer(cols = 5:ncol(.), names_to = "year",
values_to = "total_pop") %>%
janitor::clean_names()
# give ohi rgns to world bank data
total_population_rgns <- name_2_rgn(df_in = total_population_wb,
fld_name = 'country_name',
keep_fld_name = TRUE)
# this shows rows that have at least one duplicate, including both the first and subsequent occurrences of each duplicated row. using the tidyverse allows us to see all instances of duplicated data, not just the second and subsequent occurrences (which is what just a duplicated() call would give you).
duplicates_wb <- total_population_rgns %>%
group_by(rgn_id, rgn_name, year) %>%
filter(n() > 1) %>%
ungroup()
unique(duplicates_wb$rgn_name)
# [1] "China"
# [2] "Northern Mariana Islands and Guam"
# [3] "Puerto Rico and Virgin Islands of the United States"
# drop Macao and Hong Kong, since we only need mainland China
# Northern Mariana Islands and Guam, Puerto Rico and Virgin Islands of the United States: aggregate!
total_population_final <- total_population_rgns %>%
filter(country_name != "Hong Kong SAR, China" & country_name != "Macao SAR, China") %>% # only keep mainland, not city specific data
group_by(rgn_id, rgn_name, year) %>%
summarise(
total_pop = sum(total_pop, na.rm = TRUE), # aggregate by rgn and year
.groups = "drop" # ungroup
) %>%
mutate(year = as.numeric(year))
length(unique(total_population_final$rgn_name)) # v2024: 166 regions available of total population data!
length(unique(arrivals_aggregated$rgn_name)) # 156
length(union(arrivals_aggregated$rgn_name, total_population_final$rgn_name)) # v2024: 176, so 10 regions do not have total population data
length(intersect(arrivals_aggregated$rgn_name, total_population_final$rgn_name)) # v2024: 146, only 146 regions have the same name
Now, lets bring in the \[km^2\] of coastal area for all OHI regions by aggregating the area protected 1 kilometer inland, and 3 nautical miles out into the eez.
# ------- coastal area data -----------
# read in area of coastline data
inland_filepath <- here::here("globalprep", "lsp", version_year, "int", "area_protected_1km.csv")
inland_data <- read_csv(inland_filepath)
offshore_filepath <- here::here("globalprep", "lsp", version_year, "int", "area_protected_3nm.csv")
offshore_data <- read_csv(offshore_filepath)
# get combined value of inland and offshore for each ohi region
inland_offshore <- inland_data %>%
left_join(offshore_data, by = join_by(rgn_id, year, rgn_name)) %>%
select(rgn_id, year, a_tot_km2.x, a_tot_km2.y) %>%
group_by(rgn_id, year) %>%
mutate(total_inland_offshore_area = a_tot_km2.x + a_tot_km2.y,
year = as.numeric(year)) %>% # add the areas together, as numeric
select(rgn_id, year, total_inland_offshore_area)
Now we can join all of these data sources and conduct \[\frac{TourismArrivals * (CoastalPop/TotalPop)}{CoastalArea}\].
# ============= join the tourism arrival, coastal population, and total population data =================== #
arrivals_prop_df <- left_join(arrivals_aggregated, total_population_final, by = c("rgn_id","rgn_name","year")) %>%
mutate(year = as.numeric(year)) %>%
left_join(., coastal_pop_data_fill, by = c("rgn_id","year")) %>%
left_join(., inland_offshore, by = c("rgn_id","year")) %>%
filter(year %in% 2000:2021) %>% # to keep consistent with the coastal pop data
mutate(coastal_prop = (popsum/total_pop)) %>% # values that are greater than 1 are because of inconsistencies between World Bank and the coastal population data. However, those with a value greater than 1 are mostly islands or small regions with a large coastal area. In that case, we could say 100% of the tourism could be coastally related.
mutate(coastal_prop = ifelse(coastal_prop > 1, 1, coastal_prop)) # for rgns and years in which the total_population is greater than 1, we cap it at 1
ap_final <- arrivals_prop_df %>%
mutate(Ap = (tourism_arrivals_ag * coastal_prop)/(total_inland_offshore_area)) %>%
drop_na(Ap) %>% # regions that did not have tourism arrival values but were present in the coastal population data
select(rgn_id, rgn_name, year, Ap) # this final `Ap` (arrivals proportion) is in the units of (tourists/km2). Aka, a higher proportion indicates that there is a larger number of tourists per km2 of coastal area, and there is therefore a higher tourism density. This number is now ready to be divided by `Sr`, or the sustainability of tourism.
# save the Ap proportions dataframe
# write_csv(ap_final, here::here(int_dir, "ap.csv"))
# # save which observations were gapfilled and how into output -- from v2023 version
# tr_arrivals_props_tourism_gf <- arrivals_prop_df %>%
# select(rgn_id, rgn_name, year, data_source, arrivals_gf)
#
# write_csv(tr_arrivals_props_tourism_gf, here::here(output_dir, "tr_arrivals_props_tourism_gf.csv"))
# interactive plot of the arrival proportions before rescaling
ap_plot <- ap_final %>%
plot_ly(x = ~year, y = ~Ap, color = ~rgn_name,
type = "scatter", mode = "lines") %>%
layout(title = "Global Arrival proportions (Ap) by Region",
xaxis = list(title = "Year"),
yaxis = list(title = "Arrival proportion (Ap)"))
ap_plot
# htmlwidgets::saveWidget(ap_plot, here::here(figs_dir, "ap_interactive_plot.html")) # save plot
As according to the OHI methods, the Ap values for each year and region is rescaled using the 90th quartile for each region.
\[\frac{T_r}{T_{90th}}\]
# to make the Ap scaled between 0 and 1, we can divide the current Ap by the 90th quartile values by region.
ap_rescaled <- ap_final %>%
group_by(rgn_id, rgn_name) %>%
summarize(value_90th_percentile = quantile(Ap, 0.9, na.rm = TRUE))
tr_arrivals_props_tourism <- left_join(ap_final, ap_rescaled, by = c("rgn_id", "rgn_name")) %>%
mutate(
Ap = Ap/value_90th_percentile,
Ap = if_else(Ap < 0, 0, Ap), # if less than 1, that is due to gapfilling where linear models drew the tourism arrival values less than 0.
Ap = if_else(Ap > 1, 1, Ap) # if more than 1, then the Ap value for that year was above the 90th percentile. So the value is adjusted to 1
) %>%
select(rgn_id, year, Ap) # keep in rgn_name when you want to plot!!!
# interactive plot of the arrival proportions after rescaling
ap_final_plot <- tr_arrivals_props_tourism %>%
plot_ly(x = ~year, y = ~Ap, color = ~rgn_name,
type = "scatter", mode = "lines") %>%
layout(title = "Global Arrival proportions (Ap) by Region, Rescaled",
xaxis = list(title = "Year"),
yaxis = list(title = "Arrival proportion (Ap)"))
ap_final_plot
# htmlwidgets::saveWidget(ap_final_plot, here::here(figs_dir, "ap_rescaled.html")) # save plot
# save the final tourism arrivals csv
write_csv(tr_arrivals_props_tourism, here::here(output_dir, "tr_arrivals_props_tourism.csv"))
These data are from the World Economic Forum’s “Travel and Tourism Development Index” (https://www.weforum.org/publications/travel-tourism-development-index-2024/) Downloaded 08/20/2024.
See mazu: _raw_data/WEF-Economics/ for more details and the raw data.
The TTDI was formerly the TTCI which was a similar index, but unfortunately not comparable. v2024’s updated TTDI uses data up through 2023 to calculate their sustainability scores.
These data are gapfilled using gdppcppp and UN georegion information (see next section for obtaining and preparing these data).
# bring in TTDI pdf! Download it to your computer, place it in Mazu
ttdi_file <- "WEF_Travel_and_Tourism_Development_Index_2024.pdf" # the file name
ttdi_raw <- pdf_text(here::here(raw_data_dir, "WEF-Economics", data_dir_version_year, ttdi_file)) # using the package `pdftools`, read in using pdf_text()
sr_scores_table <- ttdi_raw[[11]] %>% # page number of the pdf that has the data we want (aka the sustainability scores)
as.data.frame() # make the page as a data frame. Be warned, the data is all there! You just can't see it yet.
columns = c("rank","economy","score","rank_change","score_change","diff_avg","rank_2","economy_2","score_2","rank_change_2","score_change_2","diff_avg_2", "rank_3","economy_3","score_3","rank_change_3","score_change_3","diff_avg_3") # mirror the pdf and create clean column names
sr_scores <- sr_scores_table %>%
separate_rows('.', sep = "\n") %>% # separate the rows: \n separates by moving down to the next line without returning to the beginning of the line
rename(main_column = ".") %>% # rename the only column as main_column instead of just the period
slice(16:55) %>% # choose the rows that have the table in them (disregard the page title etc)
separate_wider_position(
main_column, # the issue here is that there is not a common delimiter, and the three columns blur together. So, we can use `separate_wider_position()`, which allows you to instead define column breaks by the number of characters, which includes spaces
widths = c(rank = 2, economy = 24, score = 4, rank_change = 3, score_change = 5, diff_avg = 19,
rank_2 = 7, economy_2 = 24, score_2 = 5, rank_change_2 = 3, score_change_2 = 7, diff_avg_2 = 19,
rank_3 = 9, economy_3 = 26, score_3 = 8, rank_change_3 = 1, score_change_3 = 1, diff_avg_3 = 1), # these numbers were found through trial and error, but are always at least the length of the longest character string
names_sep = NULL, # default, to avoid renaming the column names
names_repair = "check_unique", #the default), no name repair, but checks they are unique
too_many = "debug", # will add additional columns to the output to help you locate and resolve the underlying problem
too_few = "debug", # adds additional columns to the output to help you locate and resolve the underlying problem... these will not remain in final code
cols_remove = FALSE # always FALSE if too_few or too_many are set to "debug"
) %>%
select(rank, economy, score, rank_2, economy_2, score_2, rank_3, economy_3, score_3) %>% # to get rid of debugging columns and rankings
mutate(
economy = str_trim(economy, side = "both"), # remove the whitespace before and after the string
rank_2 = str_trim(rank_2, side = "both"),
economy_3 = str_trim(economy_3, side = "both"),
rank_3 = str_trim(rank_3, side = "both"),
score_3 = str_trim(score_3, side = "both"),
economy_3 = str_replace(economy_3, pattern = "Trinidad and Tobago 3.5", replacement = "Trinidad and Tobago"), # this line specifically had issues with spacing, and no matter what it always had bleeding from the score. Therefore, the economy name and score were fixed using str_replace.
score_3 = str_replace(score_3, pattern = "2 -5", replacement = "3.52")
)
# now, we have a dataframe that contains three columns we would like to make into one. let's subset and create three dfs that each have the same name (they couldnt have the same name beforehand because they were in the same df), then rbind them together to have one df of all regions and their scores.
sr_scores_sub1 <- sr_scores %>%
select(rank, economy, score)
sr_scores_sub2 <- sr_scores %>%
select(rank_2, economy_2, score_2) %>%
rename(
rank = rank_2,
economy = economy_2,
score = score_2
)
sr_scores_sub3 <- sr_scores %>%
select(rank_3, economy_3, score_3) %>%
rename(
rank = rank_3,
economy = economy_3,
score = score_3
)
# rbind them together and clean up -- we do not need the column "rank", it was just preserved until now as an easy way to ensure a clean bind
sr_scores_clean <- rbind(sr_scores_sub1, sr_scores_sub2, sr_scores_sub3) %>%
select(-rank) %>% # remove rank
filter(!is.na(score)) %>% # remove NA, leaves 119 regions, good
mutate(
S_score = as.numeric(score)) # ensure it is numeric
# run name_2_gn so each economy receives an OHI rgn name and id
ttdi_rgn <- name_2_rgn(df_in = sr_scores_clean,
fld_name = 'economy',
keep_fld_name = TRUE) %>% # v2024: no duplicates! looks good
select(rgn_id, rgn_name, S_score)
### Save TTDI data file
# write_csv(ttdi_rgn, here(int_dir,"wef_ttdi.csv")) # save to int folder
These data are used to gapfill missing values in tourism sustainability. Most of the data are from the World Bank, but CIA data fill some gaps (CIA data is available for only the most recent year).
The Artisanal Opportunities goal uses gdppcppp data, so we will get the data that was processed for that goal.
wb <- read_csv(here::here("globalprep","ao", version_year, "intermediate","gdppcppp_ohi.csv")) %>%
dplyr::select(rgn_id, year, value)
CIA data are used to fill in missing gaps in the gdppcppp data (https://www.cia.gov/the-world-factbook/field/real-gdp-per-capita/country-comparison)
Downloaded: 09/22/2024
See README on the raw folder for instructions on how to download this data. Place in Mazu.
The following code is used to prepare these data for OHI:
cia_gdp <- read_csv(here::here(raw_data_dir, "CIA", data_dir_version_year, "Real_GDP_per_capita.csv")) %>%
# remove dollar signs and commas and convert to numeric
mutate(value = as.numeric(gsub("[$,]", "", value))) %>%
select(name, value) %>%
rename(country = name, pcgdp_cia = value)
## Data reported in a lower resolution than OHI regions
splits <- data.frame(country = "Saint Helena, Ascension, and Tristan da Cunha",
country2 = c("Saint Helena", "Ascension","Tristan da Cunha"))
cia_gdp_splits <- cia_gdp %>%
left_join(splits, by='country') %>%
mutate(country2 = ifelse(is.na(country2), country, country2)) %>%
select(country = country2, pcgdp_cia)
cia_gdp_rgn <- name_2_rgn(df_in = cia_gdp_splits,
fld_name='country')
### Duplicated regions: Collapse regions after weighting by population (regions we include as a single region) -
population_weights <- data.frame(country = c("Virgin Islands", "Puerto Rico",
"China", "Hong Kong", "Macau",
"Guam", "Northern Mariana Islands"),
# from world bank - updated v2024 using 2023 data from `total_population_wb` read in earlier during aggregation of tourism data
population = c(104917, 3205691, 1410710000,
7536100, 704149, 172952, 49796))
cia_gdp_rgn_clean <- cia_gdp_rgn %>%
left_join(population_weights, by="country") %>%
mutate(population = ifelse(is.na(population), 1, population)) %>%
group_by(rgn_id) %>%
summarize(pcgdp_cia = weighted.mean(pcgdp_cia, population)) %>%
ungroup() %>%
filter(rgn_id <= 250) %>%
select(rgn_id, pcgdp_cia)
# save the cia data in intermediate folder
# write_csv(cia_gdp_rgn_clean, here::here(int_dir, "wb_rgn_cia_GDPPCPPP.csv"))
The following code combines the two gdp datasets and gapfills missing regions using UN georegions.
If there is no World Bank gdppcppp data (pcgdp), the CIA data is used (pcgdp_cia). The pcgdp2 variable includes both the World Bank and CIA data (with CIA data only used if there is not World Bank data). The remaining data are estimated using UN geopolitical regions. Ideally, the mean gdppcppp value is calculated at the r2 scale (gdp_pred_r2) using regions within each class with gdppcppp data. If there were not enough regions with data at the r2 scale, the average at the r1 scale was used (gdp_pred_r1). The gdp_all variable combines all estimates using the following hierarchy: World Bank -> CIA -> estimated using mean from r2 UN geopolitical regions -> estimated using mean from r1 UN geopolitical regions.
### world bank gdp data
gdppcppp <- wb %>%
select(rgn_id, year, pcgdp = value)
### cia gdp data
gdppcppp2 <- read_csv(here(int_dir, "wb_rgn_cia_GDPPCPPP.csv"))
### Use WB data, but if missing, use pcgdp_cia.
### combine with UN georegion data
years <- data.frame(year = min(gdppcppp$year):max(gdppcppp$year))
# bring in georegions and join with their labels
georegions <- ohicore::georegions
regions <- georegions %>%
left_join(georegion_labels, by = 'rgn_id')
# merge the df "regions" with years, so there is one year per region from 2005 to 2023. Then join the wb and cia data
gdp_raw <- merge(years, regions, by=NULL) %>%
left_join(gdppcppp, by = c("rgn_id", "year")) %>%
left_join(gdppcppp2, by = c("rgn_id"))
## quick compare to make sure the CIA and World Bank data are compatible
plot(gdp_raw$pcgdp[gdp_raw$year==2022], gdp_raw$pcgdp_cia[gdp_raw$year==2022])
abline(0,1, col="red")
# a few minor outliers but overall looks good!
gdp_replace_raw <- gdp_raw %>%
mutate(pcgdp2 = ifelse(is.na(pcgdp), pcgdp_cia, pcgdp)) %>% # create an object to replace WB data with CIA data if necessary
select(-c(pcgdp, pcgdp_cia))
## Calculating the means across different geopolitical levels (e.g. r2, r1)
gdp_means_georegions <- gdp_replace_raw %>%
group_by(r2, year) %>%
mutate(gdp_pred_r2 = mean(pcgdp2, na.rm=TRUE)) %>%
ungroup() %>%
group_by(r1, year) %>%
mutate(gdp_pred_r1 = mean(pcgdp2, na.rm=TRUE)) %>%
ungroup()
#if we dont have gdp data from the world bank or cia, then replace with the r2 mean. If it still does not have data, replace with the r1 mean.
gdp_georegions_gf <- gdp_means_georegions %>%
mutate(gdp_all = ifelse(is.na(pcgdp2), gdp_pred_r2, pcgdp2)) %>%
mutate(gdp_all = ifelse(is.na(gdp_all), gdp_pred_r1, gdp_all)) %>%
mutate(gapfilled = ifelse(is.na(pcgdp2) & !is.na(gdp_all), "gapfilled", NA)) %>%
mutate(method = ifelse(is.na(pcgdp2) & !is.na(gdp_pred_r2), "UN georegion (r2)", NA)) %>%
mutate(method = ifelse(is.na(pcgdp2) & is.na(gdp_pred_r2) & !is.na(gdp_pred_r1), "UN georegion (r1)", method))
# write_csv(gdp_georegions_gf, here(int_dir, "gdp_raw_gf.csv"))
gdp_data_gf <- gdp_georegions_gf %>%
select(rgn_id, year, gapfilled, method)
# write_csv(gdp_data_gf, here(int_dir, "gdp_gf.csv"))
gdp_data <- gdp_georegions_gf %>%
select(rgn_id, year, pcgdp = gdp_all)
# write_csv(gdp_data, here(int_dir, "gdp.csv"))
The final step is gapfilling the Sustainability data using an ANCOVA model with gdppcppp and UN geopolitical regions as predictor variables.
# tr sustainability raw TTDI scores
sust <- read_csv(here(int_dir, "wef_ttdi.csv"))
# bring in tourism arrivals data to see which regions need gapfilled S_scores
ap_gf <- read_csv(here(output_dir, "tr_arrivals_props_tourism.csv")) %>%
# filter(year == 2021) %>%
select(rgn_id, Ap, year) %>%
filter(!is.na(Ap)) %>%
mutate(year = as.numeric(year))
# gdp dataframe prepared above (World Bank, CIA, and gapfilled gdp data)
gdp_raw_gf <- read_csv(here(int_dir, "gdp_raw_gf.csv")) %>%
# filter(year == 2021) %>%
select(rgn_id, r0_label, r1_label, r2_label, rgn_label, pcgdp2, gdp_all, year) %>%
mutate(year = as.numeric(year))
tr_sust <- gdp_raw_gf %>%
left_join(sust, by = c("rgn_id")) %>%
left_join(ap_gf, by = c("rgn_id", "year"))
# filter(rgn_id != 213)
### Add gapfill flag variable
## Reminder:
## pcgdp2: includes both the World Bank and CIA data (with CIA data only used if there is not World Bank data)
## Ap: Proportion of tourism arrivals (international and domestic when available)
## S_score: tourism sustainability score
# specify whether each S_score will be gapfilled and with what method
tr_sust_gf <- tr_sust %>%
mutate(gapfilled = ifelse(is.na(S_score) & !is.na(Ap), "gapfilled", NA)) %>%
mutate(method = ifelse(is.na(S_score) & !is.na(Ap) & is.na(pcgdp2), "lm georegion + gdppcppp, with est. gdppcppp", NA)) %>%
mutate(method = ifelse(is.na(S_score) & !is.na(Ap) & !is.na(pcgdp2), "lm georegion + gdppcppp", method)) %>%
select(rgn_id, gapfilled, method, year)
# write_csv(tr_sust_gf, here(output_dir, "tr_sustainability_gf.csv"))
ANOVA model using gdppcppp (GDP per capita, PPP) and UN geopolitical regions (as factors) as predictor variables. However if there is no gdppc data we estimate the gdppc using the UN georegions and then used in the linear model to gapfill the sustainability score.
### Gapfill S using r1 and/or r2 regional data and PPP-adjusted per-capita GDP
### Looked at models with a year variable, but wasn't significant and decided to exclude
mod3 <- lm(S_score ~ as.factor(r2_label) + gdp_all, data=tr_sust, na.action = na.exclude)
summary(mod3)
anova(mod3)
mod4 <- lm(S_score ~ as.factor(r1_label) + gdp_all, data=tr_sust, na.action = na.exclude)
summary(mod4)
anova(mod4)
plot(predict(mod3), tr_sust$S_score)
abline(0,1)
plot(predict(mod4), tr_sust$S_score)
abline(0,1)
## Estimate missing data and gapfill
# Some of the r1 levels do not have data and consequently causes a fail. This chunk of code drops these levels so an NA is returned
# Select only r2 column
new_data <- tr_sust %>%
dplyr::select(r2_label, gdp_all)
# see what regions are
unique(tr_sust$r2_label)
r2_w_data <- unique(tr_sust$r2_label[!is.na(tr_sust$S_score)]) # only the r2s that dont have NAs for the sustainability score
new_data_r2 <- new_data %>%
mutate(r2_label = ifelse(r2_label %in% r2_w_data, r2_label, NA))
# Predict sustainability scores using linear model 3 (using r2 data)
tr_sust_mod3 <- tr_sust %>%
dplyr::mutate(s_score_pred_r2 = predict(mod3, newdata = new_data_r2))
# Select only r1 column
new_data <- tr_sust %>%
dplyr::select(r1_label, gdp_all)
unique(tr_sust$r1_label)
r1_w_data <- unique(tr_sust$r1_label[!is.na(tr_sust$S_score)])
new_data_r1 <- new_data %>%
mutate(r1_label = ifelse(r1_label %in% r1_w_data, r1_label, NA))
# Predict sustainability scores using linear model 4 (using r1 data)
tr_sust_mod4 <- tr_sust_mod3 %>%
dplyr::mutate(s_score_pred_r1 = predict(mod4, newdata = new_data_r1))
## some are missing the r1 predictions, but none of these have Ap scores, so not relevant
View(filter(tr_sust_mod4, is.na(s_score_pred_r1)))
tr_sust_final <- tr_sust_mod4 %>%
mutate(s_score_2 = ifelse(is.na(S_score), s_score_pred_r2, S_score)) %>%
mutate(s_score_2 = ifelse(is.na(s_score_2), s_score_pred_r1, s_score_2)) %>%
# filter(year %in% c(2019, 2021)) %>%
select(rgn_id, year, S_score=s_score_2)
summary(tr_sust_final)
# write_csv(tr_sust_final, here(output_dir, "tr_sustainability.csv"))