03 - Data preparation for models

03_D_data_preparation_models.qmd

General overview

Here we provided data preparation for generalized linear models performed using the script 04_A_model_NBTs.qmd

Reading libraries and data

library(dplyr)       # data manipulation
library(tidyr)       # data tidying
library(here)        # file path management
library(readr)       # reading and writing CSV files
library(countrycode) # country and region codes

Load data

# Data from 01_C_data_preparation
flow_country <- readr::read_csv(here::here("data", "processed", "flow_country.csv")) 

spp_native_distribution <- readr::read_csv(here::here("data", "raw", "spp_native_distribution.csv")) 

df_country_native <- readr::read_csv(file = here::here("data","processed", "df_country_native.csv"))

spp_type_distribution <- readr::read_csv(here::here("data", "raw", "spp_type_distribution.csv")) 

df_country_type <- readr::read_csv(file = here::here("data","processed", "df_country_type.csv"))

df_bio_dem <- readr::read_csv(file = here::here("data", "raw", "bio-dem_data.csv"))

infra_museum <- readr::read_csv(here::here("data", "processed", "infra_museum.csv"))

# Data from 02_D_beta-countries.qmd
df_endemic_beta <- readr::read_csv(here::here("data", "processed", "df_endemic_beta.csv"))

Get a list of countries and regions

countries_list <- 
  countrycode::codelist |>
  dplyr::select(region, iso3c) |>
  tidyr::drop_na(region, iso3c)

Joining name bearers and native data with Bio-Dem, Infra-museum, and native deficit and non-endemic name bearers proportions

# joining type and native data frames
df_country_all <-
  df_country_native |>
  dplyr::full_join(df_country_type, 
            by = c("country_distribution" = "country_museum")) |> 
  dplyr::mutate(
    region_distribution = ifelse(
      is.na(region_distribution), 
      region_museum, 
      region_distribution),
    # REU doesn't have region on dataset
    region_distribution = ifelse(
      country_distribution == "REU", 
      "Sub-Saharan Africa", 
      region_distribution
      )
    ) |>
  dplyr::select(
    region = region_distribution,
    country = country_distribution,
    native_richness = native.richness,
    type_richness
  ) |>
  tidyr::drop_na(country)|>
  # change NAs to zero
  dplyr::mutate(
    dplyr::across(c(native_richness,
                    type_richness),
                  ~ifelse(is.na(.),0,.)))
  

# joining type and native data frame with biodem information
df_country_complete <-
  df_country_all |>
  dplyr::left_join(df_bio_dem, by = "country")

# joining with museum infrastructure information
df_country_complete2 <-
  df_country_complete |>
  dplyr::left_join(infra_museum, by = c("country" = "country_museum"))

# joining with beta metric
df_country_complete3 <-
  df_country_complete2 |>
  dplyr::left_join(df_endemic_beta, by = c("country" = "countries"))

Modelling number of primary types per country

Here we will model the number (counting) of total name bearers in each country. Countries with no name bearers in their museums and natural history collections are represented with NA. However the absence of primary types are an important information, and consist in a true absence, meaning that these countries host no name bearers. So, first we transformed the NA values in type_richness variable to 0. Then we standardized all the predictors to 0 mean and 1 variation unit.

df_country_complete4 <-
  df_country_complete3 |>
  dplyr::ungroup() |>
  dplyr::mutate(
    type.richness.pad = ifelse(is.na(type_richness), 0, type_richness),
    n.museums = ifelse(is.na(n.museums), 0, n.museums)) |>
  dplyr::select(-type_richness) |>
  tidyr::drop_na() |>  # removing NAs
  dplyr::mutate(
    years.independence = ifelse(yearsSinceIndependence == "undefined", 0, yearsSinceIndependence),
    native.richness.pad = scale(native_richness, center = T, scale = TRUE)[, 1],
    records.per.area.pad = scale(records_per_area, center = T, scale = T)[, 1],
    years.independence.pad = scale(as.numeric(years.independence), center = T, scale = T)[, 1],
    years.independence.fac = ifelse(as.numeric(years.independence) >= 1, 1, 0),
    gdp.pad = scale(e_migdppc, center = T, scale = T)[, 1],
    n.museums.pad = scale(n.museums, center = T, scale = T)[, 1],
    colonization = relevel(as.factor(years.independence.fac), ref = "0")
  )

Transforming endemic deficit and non-endemic name bearers proportions to fit in the model. Here we transform these proportions to fit it between zero and one, following Ferrari and Cribari-Neto

# just transforming type and native turnover variable to fit it between zero and one

# First creating a function to transform the data and avoid 0 and 1

std_beta <- function(x, s){
  x_std <- (x*((length(x) - 1)) + s)/(length(x))
  return(x_std)
}

df_country_complete5 <-
  df_country_complete4 |>
  dplyr::mutate(type.beta.model = std_beta(x = type.beta, s = 0.5),
                native.beta.model = std_beta(x = native.beta, s = 0.5))

Here we perform some other calculations to add the total name bearers in museums, countries and some simple proportions

df_country_prop <- 
  flow_country |> 
  dplyr::group_by(country_museum) |>
  dplyr::add_count(name = "total_country_museum",
                  wt = n) |>
  dplyr::ungroup() |>
  #add total_world
  dplyr::add_count(name = "total_type_world",
                   wt = n) |>
  #filter only the flow to the same country
  dplyr::filter(country_type == country_museum) |>
  #rename to domestic_type_retained
  dplyr::rename(domestic_type_retained = n) |>
  #add prop_DC, prop_DR, and prop_CW
  # DC - Domestic Contribution
  # DR - Domestic Retention
  # WC - World Contribution
  dplyr::mutate(
    prop_DC = domestic_type_retained/total_country_museum,
    prop_DR = domestic_type_retained/total_country_type,
    prop_WC = total_country_museum/total_type_world,
    prop_DC = ifelse(is.na(prop_DC), 0, prop_DC)) |> 
  dplyr::select(-country_museum) |> 
  dplyr::rename(country = country_type) |> 
  dplyr::filter(total_country_type != 0)

Join the df_country_prop with df_country_biodem5

Additionally, create a metric of success and failure for each proportion. These quantities will be later used in the modelling approach

df_country_complete6 <- 
  df_country_prop |> 
  dplyr::inner_join(df_country_complete5, by = c("country")) |> 
  dplyr::mutate(
    prop_DC.pad = std_beta(x = prop_DC, s = 0.5), 
    prop_DR.pad = std_beta(x = prop_DR, s = 0.5),
    success.DC = domestic_type_retained, 
    failures.DC = total_country_museum - domestic_type_retained, 
    success.DR = domestic_type_retained, 
    failures.DR = total_country_type - domestic_type_retained
    )

Saving data sets to be used in the models `04_A_model_NBTs.qmd`

readr::write_csv(df_country_complete6, here::here("data", "processed", "df_country_complete6.csv"))
readr::write_csv(df_country_prop, here::here("data", "processed", "df_country_prop.csv"))

General overview

Reading libraries and data

Load data

Modelling number of primary types per country

Saving data sets to be used in the models 04_A_model_NBTs.qmd

Saving data sets to be used in the models `04_A_model_NBTs.qmd`