library(dplyr) # data manipulation
library(tidyr) # data tidying
library(here) # file path management
library(readr) # reading and writing CSV files
library(countrycode) # country and region codes
03 - Data preparation for models
General overview
Here we prepare the data for the generalized linear models fitted in the script 04_A_model_NBTs.qmd
Reading libraries and data
Load data
# Load every input table used by this script.
# NOTE(review): the assignment targets were missing from the damaged source.
# The names of the processed tables below are taken from how the objects are
# used later in this script. The two raw tables (spp_native_distribution,
# spp_type_distribution) are not referenced later in the visible script, so
# their names are placeholders derived from the file names -- confirm.

# Data from 01_C_data_preparation
flow_country <- readr::read_csv(
  here::here("data", "processed", "flow_country.csv")
)
spp_native_distribution <- readr::read_csv(
  here::here("data", "raw", "spp_native_distribution.csv")
)
df_country_native <- readr::read_csv(
  file = here::here("data", "processed", "df_country_native.csv")
)
spp_type_distribution <- readr::read_csv(
  here::here("data", "raw", "spp_type_distribution.csv")
)
df_country_type <- readr::read_csv(
  file = here::here("data", "processed", "df_country_type.csv")
)
df_bio_dem <- readr::read_csv(
  file = here::here("data", "raw", "bio-dem_data.csv")
)
infra_museum <- readr::read_csv(
  here::here("data", "processed", "infra_museum.csv")
)

# Data from 02_D_beta-countries.qmd
df_endemic_beta <- readr::read_csv(
  here::here("data", "processed", "df_endemic_beta.csv")
)
Get a list of countries and regions
# Lookup table of world regions and ISO3 country codes, taken from the
# `codelist` data set shipped with countrycode. Rows lacking either a region
# or an ISO3 code are dropped.
countries_list <- countrycode::codelist |>
  dplyr::select(region, iso3c) |>
  tidyr::drop_na(region, iso3c)
Joining primary type (NBT) and native data with Bio-Dem, Infra-museum, and Beta metrics (NBT turnover and native turnover)
# joining type and native data frames
# NOTE(review): this chunk is corrupted -- tokens are displaced and some are
# missing, so it does not parse as R. The intended shape appears to be
#   df_country_all <- df_country_native |> dplyr::full_join(...) |>
#     dplyr::mutate(...) |> dplyr::select(...) |> tidyr::drop_na(country)
# but the pieces flagged below cannot be recovered from this file and must be
# restored from the original document.
df_country_all |>
df_country_native ::full_join(df_country_type,
dplyrby = c("country_distribution" = "country_museum")) |>
# NOTE(review): the condition and the fallback value of this ifelse() are
# missing; only the "else" branch (region_distribution) survives.
dplyrregion_distribution = ifelse(,
region_distribution),# REU doesn't have region on dataset
# NOTE(review): presumably the test is `country_distribution == "REU"`; the
# final ("else") argument of this ifelse() is missing -- confirm.
region_distribution = ifelse(
== "REU",
country_distribution "Sub-Saharan Africa",
) ::select(
dplyrregion = region_distribution,
country = country_distribution,
native_richness = native.richness,
# NOTE(review): a later step of this script reads a `type_richness` column,
# which is not selected here -- a line of this select() was probably lost.
) ::drop_na(country)|>
tidyr# change NAs to zero
# NOTE(review): the pipe above has no right-hand side; the step announced by
# the "change NAs to zero" comment is missing.
# Enrich the per-country table step by step. Each step is a left join, so
# every country of the left-hand table is kept even when it has no match.

# joining type and native data frame with biodem information
df_country_complete <-
  df_country_all |>
  dplyr::left_join(df_bio_dem, by = "country")

# joining with museum infrastructure information
df_country_complete2 <-
  df_country_complete |>
  dplyr::left_join(infra_museum, by = c("country" = "country_museum"))

# joining with beta metric
df_country_complete3 <-
  df_country_complete2 |>
  dplyr::left_join(df_endemic_beta, by = c("country" = "countries"))
Modelling number of primary types per country
Here we model the number (count) of NBT held in each country. Countries with no NBT in their museums and natural history collections are represented with NA. However, the absence of primary types is important information and represents a true absence, meaning that these countries host no NBT. So, we first replaced the NA values in the type_richness
variable with 0. Then we standardized all the predictors to zero mean and unit variance.
# Build the table used to model the number of primary types per country.
# Steps, in order:
#   1. replace missing type richness and museum counts with 0;
#   2. drop the raw `type_richness` column, then every row that still has an
#      NA in any remaining column;
#   3. derive the model predictors; scale(..., center = TRUE, scale = TRUE)
#      standardises a variable and `[, 1]` extracts the resulting column from
#      the one-column matrix that scale() returns.
df_country_complete4 <-
  df_country_complete3 |>
  dplyr::ungroup() |>
  dplyr::mutate(
    # NOTE(review): the is.na() conditions were missing from the damaged
    # source; they are restored from the accompanying text ("transformed the
    # NA values in type_richness variable to 0") -- confirm for n.museums.
    type.richness.pad = ifelse(is.na(type_richness), 0, type_richness),
    n.museums = ifelse(is.na(n.museums), 0, n.museums)
  ) |>
  dplyr::select(-type_richness) |>
  tidyr::drop_na() |> # removing NAs
  dplyr::mutate(
    # countries flagged "undefined" are given 0 years since independence
    years.independence = ifelse(yearsSinceIndependence == "undefined",
                                0,
                                yearsSinceIndependence),
    native.richness.pad = scale(native_richness,
                                center = TRUE, scale = TRUE)[, 1],
    records.per.area.pad = scale(records_per_area,
                                 center = TRUE, scale = TRUE)[, 1],
    years.independence.pad = scale(as.numeric(years.independence),
                                   center = TRUE, scale = TRUE)[, 1],
    # 1 when at least one year since independence, 0 otherwise
    years.independence.fac = ifelse(as.numeric(years.independence) >= 1, 1, 0),
    gdp.pad = scale(e_migdppc, center = TRUE, scale = TRUE)[, 1],
    n.museums.pad = scale(n.museums, center = TRUE, scale = TRUE)[, 1],
    # factor version of the 0/1 flag, with "0" as the reference level
    colonization = relevel(as.factor(years.independence.fac), ref = "0")
  )
Transforming the turnover metrics (NBT turnover and native turnover) for use in the models. Here we rescale the beta metrics so that they lie strictly between zero and one, following Ferrari and Cribari-Neto
# just transforming type and native turnover variable to fit it between zero and one
# First creating a function to transform the data and avoid 0 and 1
#
# std_beta(x, s)
#   x: numeric vector of values between 0 and 1 (boundaries included).
#   s: constant added in the numerator (this script uses 0.5).
#   Returns a vector the same length as x, computed as
#     (x * (n - 1) + s) / n,   with n = length(x).
#   n counts every element of x, NAs included.
std_beta <- function(x, s) {
  n_obs <- length(x)
  x_std <- (x * (n_obs - 1) + s) / n_obs
  return(x_std)
}
# Add the rescaled turnover metrics (type.beta and native.beta passed through
# std_beta() with s = 0.5) as new columns for modelling.
df_country_complete5 <-
  df_country_complete4 |>
  dplyr::mutate(type.beta.model = std_beta(x = type.beta, s = 0.5),
                native.beta.model = std_beta(x = native.beta, s = 0.5))
Data frame used to model Domestic Contribution (DC) and Domestic Retention (DR)
# Per-country proportions used to model Domestic Contribution (DC) and
# Domestic Retention (DR), derived from the country-to-country flow table.
# NOTE(review): `total_country_type` is read but not created in this chain, so
# it is presumably already a column of flow_country -- confirm.
df_country_prop <-
  flow_country |>
  dplyr::group_by(country_museum) |>
  # total per museum country: sum of `n` within each country_museum
  dplyr::add_count(name = "total_country_museum",
                   wt = n) |>
  dplyr::ungroup() |>
  # add total_world: sum of `n` over the whole table
  dplyr::add_count(name = "total_type_world",
                   wt = n) |>
  # filter only the flow to the same country
  dplyr::filter(country_type == country_museum) |>
  # rename to domestic_type_retained
  dplyr::rename(domestic_type_retained = n) |>
  # add prop_DC, prop_DR, and prop_WC
  # DC - Domestic Contribution
  # DR - Domestic Retention
  # WC - World Contribution
  dplyr::mutate(
    prop_DC = domestic_type_retained / total_country_museum,
    prop_DR = domestic_type_retained / total_country_type,
    prop_WC = total_country_museum / total_type_world,
    # NOTE(review): the condition was missing from the damaged source;
    # is.na() (which also catches NaN from 0/0) is assumed -- confirm.
    prop_DC = ifelse(is.na(prop_DC), 0, prop_DC)
  ) |>
  dplyr::select(-country_museum) |>
  dplyr::rename(country = country_type) |>
  dplyr::filter(total_country_type != 0)
Join df_country_prop with df_country_complete5
Additionally, create success and failure counts for each proportion. These quantities will later be used in the modelling approach
# Final modelling table: proportions joined with the country-level predictors.
# inner_join keeps only countries present in both tables.
# For each proportion, the rescaled version (std_beta, s = 0.5) and the
# success / failure counts are added:
#   DC: successes = types retained domestically,
#       failures  = total_country_museum - domestic_type_retained
#   DR: successes = types retained domestically,
#       failures  = total_country_type - domestic_type_retained
df_country_complete6 <-
  df_country_prop |>
  dplyr::inner_join(df_country_complete5, by = c("country")) |>
  dplyr::mutate(
    prop_DC.pad = std_beta(x = prop_DC, s = 0.5),
    prop_DR.pad = std_beta(x = prop_DR, s = 0.5),
    success.DC = domestic_type_retained,
    failures.DC = total_country_museum - domestic_type_retained,
    success.DR = domestic_type_retained,
    failures.DR = total_country_type - domestic_type_retained
  )
Saving data sets to be used in the models 04_A_model_NBTs.qmd
# Write the two prepared tables to data/processed as CSV files.
readr::write_csv(
  df_country_complete6,
  here::here("data", "processed", "df_country_complete6.csv")
)
readr::write_csv(
  df_country_prop,
  here::here("data", "processed", "df_country_prop.csv")
)