03 - Data preparation for models

03_D_data_preparation_models.qmd

General overview

Here we provided data preparation for generalized linear models performed using the script 04_A_model_NBTs.qmd

Reading libraries and data

library(dplyr)       # data manipulation
library(tidyr)       # data tidying
library(here)        # file path management
library(readr)       # reading and writing CSV files
library(countrycode) # country and region codes

Load data

# Data from 01_C_data_preparation
flow_country <- readr::read_csv(here::here("data", "processed", "flow_country.csv")) 

spp_native_distribution <- readr::read_csv(here::here("data", "raw", "spp_native_distribution.csv")) 

df_country_native <- readr::read_csv(file = here::here("data","processed", "df_country_native.csv"))

spp_type_distribution <- readr::read_csv(here::here("data", "raw", "spp_type_distribution.csv")) 

df_country_type <- readr::read_csv(file = here::here("data","processed", "df_country_type.csv"))

df_bio_dem <- readr::read_csv(file = here::here("data", "raw", "bio-dem_data.csv"))

infra_museum <- readr::read_csv(here::here("data", "processed", "infra_museum.csv"))

# Data from 02_D_beta-countries.qmd
df_endemic_beta <- readr::read_csv(here::here("data", "processed", "df_endemic_beta.csv"))

Get a list of countries and regions

countries_list <- 
  countrycode::codelist |>
  dplyr::select(region, iso3c) |>
  tidyr::drop_na(region, iso3c)

Joining primary type (NBT) and native data with Bio-Dem, Infra-museum, and Beta metrics (NBT turnover and native turnover)

# joining type and native data frames
df_country_all <-
  df_country_native |>
  dplyr::full_join(df_country_type, 
            by = c("country_distribution" = "country_museum")) |> 
  dplyr::mutate(
    region_distribution = ifelse(
      is.na(region_distribution), 
      region_museum, 
      region_distribution),
    # REU doesn't have region on dataset
    region_distribution = ifelse(
      country_distribution == "REU", 
      "Sub-Saharan Africa", 
      region_distribution
      )
    ) |>
  dplyr::select(
    region = region_distribution,
    country = country_distribution,
    native_richness = native.richness,
    type_richness
  ) |>
  tidyr::drop_na(country)|>
  # change NAs to zero
  dplyr::mutate(
    dplyr::across(c(native_richness,
                    type_richness),
                  ~ifelse(is.na(.),0,.)))
  

# joining type and native data frame with biodem information
df_country_complete <-
  df_country_all |>
  dplyr::left_join(df_bio_dem, by = "country")

# joining with museum infrastructure information
df_country_complete2 <-
  df_country_complete |>
  dplyr::left_join(infra_museum, by = c("country" = "country_museum"))

# joining with beta metric
df_country_complete3 <-
  df_country_complete2 |>
  dplyr::left_join(df_endemic_beta, by = c("country" = "countries"))

Modelling number of primary types per country

Here we will model the number (counting) of total NBT in each country. Countries with no NBT in their museums and natural history collections are represented with NA. However the absence of primary types are an important information, and consist in a true absence, meaning that these countries host no NBT. So, first we transformed the NA values in type_richness variable to 0. Then we standardized all the predictors to 0 mean and 1 variation unit.

df_country_complete4 <-
  df_country_complete3 |>
  dplyr::ungroup() |>
  dplyr::mutate(
    type.richness.pad = ifelse(is.na(type_richness), 0, type_richness),
    n.museums = ifelse(is.na(n.museums), 0, n.museums)) |>
  dplyr::select(-type_richness) |>
  tidyr::drop_na() |>  # removing NAs
  dplyr::mutate(
    years.independence = ifelse(yearsSinceIndependence == "undefined", 0, yearsSinceIndependence),
    native.richness.pad = scale(native_richness, center = T, scale = TRUE)[, 1],
    records.per.area.pad = scale(records_per_area, center = T, scale = T)[, 1],
    years.independence.pad = scale(as.numeric(years.independence), center = T, scale = T)[, 1],
    years.independence.fac = ifelse(as.numeric(years.independence) >= 1, 1, 0),
    gdp.pad = scale(e_migdppc, center = T, scale = T)[, 1],
    n.museums.pad = scale(n.museums, center = T, scale = T)[, 1],
    colonization = relevel(as.factor(years.independence.fac), ref = "0")
  )

Transforming turnover metrics (NBT turnover and Native turnover) to fit in the model. Here we transform beta metrics to fit it between zero and one, following Ferrari and Cribari-Neto

# just transforming type and native turnover variable to fit it between zero and one

# First creating a function to transform the data and avoid 0 and 1

std_beta <- function(x, s){
  x_std <- (x*((length(x) - 1)) + s)/(length(x))
  return(x_std)
}

df_country_complete5 <-
  df_country_complete4 |>
  dplyr::mutate(type.beta.model = std_beta(x = type.beta, s = 0.5),
                native.beta.model = std_beta(x = native.beta, s = 0.5))

Data frame used to model Domestic Contribution (DC) and Domestic Retention (DR)

df_country_prop <- 
  flow_country |> 
  dplyr::group_by(country_museum) |>
  dplyr::add_count(name = "total_country_museum",
                  wt = n) |>
  dplyr::ungroup() |>
  #add total_world
  dplyr::add_count(name = "total_type_world",
                   wt = n) |>
  #filter only the flow to the same country
  dplyr::filter(country_type == country_museum) |>
  #rename to domestic_type_retained
  dplyr::rename(domestic_type_retained = n) |>
  #add prop_DC, prop_DR, and prop_CW
  # DC - Domestic Contribution
  # DR - Domestic Retention
  # WC - World Contribution
  dplyr::mutate(
    prop_DC = domestic_type_retained/total_country_museum,
    prop_DR = domestic_type_retained/total_country_type,
    prop_WC = total_country_museum/total_type_world,
    prop_DC = ifelse(is.na(prop_DC), 0, prop_DC)) |> 
  dplyr::select(-country_museum) |> 
  dplyr::rename(country = country_type) |> 
  dplyr::filter(total_country_type != 0)

Join the df_country_prop with df_country_biodem5

Additionally, create a metric of success and failure for each proportion. These quantities will be later used in the modelling approach

df_country_complete6 <- 
  df_country_prop |> 
  dplyr::inner_join(df_country_complete5, by = c("country")) |> 
  dplyr::mutate(
    prop_DC.pad = std_beta(x = prop_DC, s = 0.5), 
    prop_DR.pad = std_beta(x = prop_DR, s = 0.5),
    success.DC = domestic_type_retained, 
    failures.DC = total_country_museum - domestic_type_retained, 
    success.DR = domestic_type_retained, 
    failures.DR = total_country_type - domestic_type_retained
    )

Saving data sets to be used in the models 04_A_model_NBTs.qmd

readr::write_csv(df_country_complete6, here::here("data", "processed", "df_country_complete6.csv"))
readr::write_csv(df_country_prop, here::here("data", "processed", "df_country_prop.csv"))