library(dplyr) # data manipulation
library(tidyr) # data tidying
library(here) # file path management
library(readr) # reading and writing CSV files
library(countrycode) # country and region codes
03 - Data preparation for models
03_D_data_preparation_models.qmd
General overview
Here we provided data preparation for generalized linear models performed using the script 04_A_model_NBTs.qmd
Reading libraries and data
Load data
# Data from 01_C_data_preparation
<- readr::read_csv(here::here("data", "processed", "flow_country.csv"))
flow_country
<- readr::read_csv(here::here("data", "raw", "spp_native_distribution.csv"))
spp_native_distribution
<- readr::read_csv(file = here::here("data","processed", "df_country_native.csv"))
df_country_native
<- readr::read_csv(here::here("data", "raw", "spp_type_distribution.csv"))
spp_type_distribution
<- readr::read_csv(file = here::here("data","processed", "df_country_type.csv"))
df_country_type
<- readr::read_csv(file = here::here("data", "raw", "bio-dem_data.csv"))
df_bio_dem
<- readr::read_csv(here::here("data", "processed", "infra_museum.csv"))
infra_museum
# Data from 02_D_beta-countries.qmd
<- readr::read_csv(here::here("data", "processed", "df_endemic_beta.csv")) df_endemic_beta
Get a list of countries and regions
<-
countries_list ::codelist |>
countrycode::select(region, iso3c) |>
dplyr::drop_na(region, iso3c) tidyr
Joining primary type (NBT) and native data with Bio-Dem, Infra-museum, and Beta metrics (NBT turnover and native turnover)
# joining type and native data frames
<-
df_country_all |>
df_country_native ::full_join(df_country_type,
dplyrby = c("country_distribution" = "country_museum")) |>
::mutate(
dplyrregion_distribution = ifelse(
is.na(region_distribution),
region_museum,
region_distribution),# REU doesn't have region on dataset
region_distribution = ifelse(
== "REU",
country_distribution "Sub-Saharan Africa",
region_distribution
)|>
) ::select(
dplyrregion = region_distribution,
country = country_distribution,
native_richness = native.richness,
type_richness|>
) ::drop_na(country)|>
tidyr# change NAs to zero
::mutate(
dplyr::across(c(native_richness,
dplyr
type_richness),~ifelse(is.na(.),0,.)))
# joining type and native data frame with biodem information
<-
df_country_complete |>
df_country_all ::left_join(df_bio_dem, by = "country")
dplyr
# joining with museum infrastructure information
<-
df_country_complete2 |>
df_country_complete ::left_join(infra_museum, by = c("country" = "country_museum"))
dplyr
# joining with beta metric
<-
df_country_complete3 |>
df_country_complete2 ::left_join(df_endemic_beta, by = c("country" = "countries")) dplyr
Modelling number of primary types per country
Here we will model the number (counting) of total NBT in each country. Countries with no NBT in their museums and natural history collections are represented with NA. However the absence of primary types are an important information, and consist in a true absence, meaning that these countries host no NBT. So, first we transformed the NA values in type_richness
variable to 0. Then we standardized all the predictors to 0 mean and 1 variation unit.
<-
df_country_complete4 |>
df_country_complete3 ::ungroup() |>
dplyr::mutate(
dplyrtype.richness.pad = ifelse(is.na(type_richness), 0, type_richness),
n.museums = ifelse(is.na(n.museums), 0, n.museums)) |>
::select(-type_richness) |>
dplyr::drop_na() |> # removing NAs
tidyr::mutate(
dplyryears.independence = ifelse(yearsSinceIndependence == "undefined", 0, yearsSinceIndependence),
native.richness.pad = scale(native_richness, center = T, scale = TRUE)[, 1],
records.per.area.pad = scale(records_per_area, center = T, scale = T)[, 1],
years.independence.pad = scale(as.numeric(years.independence), center = T, scale = T)[, 1],
years.independence.fac = ifelse(as.numeric(years.independence) >= 1, 1, 0),
gdp.pad = scale(e_migdppc, center = T, scale = T)[, 1],
n.museums.pad = scale(n.museums, center = T, scale = T)[, 1],
colonization = relevel(as.factor(years.independence.fac), ref = "0")
)
Transforming turnover metrics (NBT turnover and Native turnover) to fit in the model. Here we transform beta metrics to fit it between zero and one, following Ferrari and Cribari-Neto
# just transforming type and native turnover variable to fit it between zero and one
# First creating a function to transform the data and avoid 0 and 1
<- function(x, s){
std_beta <- (x*((length(x) - 1)) + s)/(length(x))
x_std return(x_std)
}
<-
df_country_complete5 |>
df_country_complete4 ::mutate(type.beta.model = std_beta(x = type.beta, s = 0.5),
dplyrnative.beta.model = std_beta(x = native.beta, s = 0.5))
Data frame used to model Domestic Contribution (DC) and Domestic Retention (DR)
<-
df_country_prop |>
flow_country ::group_by(country_museum) |>
dplyr::add_count(name = "total_country_museum",
dplyrwt = n) |>
::ungroup() |>
dplyr#add total_world
::add_count(name = "total_type_world",
dplyrwt = n) |>
#filter only the flow to the same country
::filter(country_type == country_museum) |>
dplyr#rename to domestic_type_retained
::rename(domestic_type_retained = n) |>
dplyr#add prop_DC, prop_DR, and prop_CW
# DC - Domestic Contribution
# DR - Domestic Retention
# WC - World Contribution
::mutate(
dplyrprop_DC = domestic_type_retained/total_country_museum,
prop_DR = domestic_type_retained/total_country_type,
prop_WC = total_country_museum/total_type_world,
prop_DC = ifelse(is.na(prop_DC), 0, prop_DC)) |>
::select(-country_museum) |>
dplyr::rename(country = country_type) |>
dplyr::filter(total_country_type != 0) dplyr
Join the df_country_prop with df_country_biodem5
Additionally, create a metric of success and failure for each proportion. These quantities will be later used in the modelling approach
<-
df_country_complete6 |>
df_country_prop ::inner_join(df_country_complete5, by = c("country")) |>
dplyr::mutate(
dplyrprop_DC.pad = std_beta(x = prop_DC, s = 0.5),
prop_DR.pad = std_beta(x = prop_DR, s = 0.5),
success.DC = domestic_type_retained,
failures.DC = total_country_museum - domestic_type_retained,
success.DR = domestic_type_retained,
failures.DR = total_country_type - domestic_type_retained
)
Saving data sets to be used in the models 04_A_model_NBTs.qmd
::write_csv(df_country_complete6, here::here("data", "processed", "df_country_complete6.csv"))
readr::write_csv(df_country_prop, here::here("data", "processed", "df_country_prop.csv")) readr