This document describes how to get and manipulate data from the unified checklist for checklist-based indicators.

1 Setup

Load libraries:

library(tidyverse) # To do data science
library(tidylog) # To provide feedback on dplyr functions
library(janitor) # To clean column names
library(here) # To find files
library(rgbif) # To get taxa from published unified checklist
library(trias) # To get indicator functionalities

2 Get data

2.1 Get taxonomic data

Get taxa from the published unified checklist, the Global Register of Introduced and Invasive Species - Belgium:

# Request up to 10000 name usages of the checklist dataset from GBIF and keep
# only the records whose origin is "SOURCE".
taxa <-
  name_usage(
    datasetKey = "6d9e952f-948c-4483-9807-575348147c7e",
    limit = 10000
  ) %>%
  pluck("data") %>%
  filter(origin == "SOURCE")

Select the columns we are interested in:

# Taxonomic columns to keep from the name usage records.
checklist_info <- c(
  "key", "nubKey", "scientificName", "datasetKey",
  "canonicalName", "species", "genus", "family",
  "order", "class", "phylum", "kingdom",
  "rank", "speciesKey", "genusKey", "familyKey",
  "orderKey", "classKey", "phylumKey", "kingdomKey",
  "taxonomicStatus", "acceptedKey", "accepted"
)
# any_of() replaces the superseded one_of(): it selects the listed columns
# that are present and ignores any that are not in the data.
taxa <-
  taxa %>%
  select(any_of(checklist_info))

2.2 Get distribution

Extract distribution information

# Query the distribution extension once per taxon key and stack the returned
# data frames row-wise.
distribution <- map_dfr(
  taxa$key,
  function(taxon_key) {
    name_usage(taxon_key, data = "distribution")$data
  }
)
# The `source` column is not used further on.
distribution <- select(distribution, -source)
head(distribution)

2.3 Get description

Extract description information

# Query the description extension once per taxon key and stack the returned
# data frames row-wise.
description <- map_dfr(
  taxa$key,
  function(taxon_key) {
    name_usage(taxon_key, data = "description")$data
  }
)
# Drop the columns that are not used further on.
description <- select(description, -c(key, language, source))
head(description, n = 10)

Description contains a column type with the following descriptors:

# List the distinct descriptor types.
distinct(description, type)

Patch: types "introduction pathway" and "pathway of introduction" should be renamed to "pathway". Bug documented in issue #68.

# Harmonise the two alternative spellings of the pathway descriptor type.
pathway_type_aliases <- c("introduction pathway", "pathway of introduction")
description <- mutate(
  description,
  type = replace(type, type %in% pathway_type_aliases, "pathway")
)

We are interested in the following types:

# Descriptor types kept for the indicators.
types <- c("native range", "degree of establishment", "pathway")

We filter out the other types:

# Keep only the rows whose type is one of the selected descriptor types.
description <- filter(description, type %in% types)

We tidy this data.frame, thus having different descriptors as different columns:

# Spread the descriptors over one column per type. A taxon can carry several
# values for the same type, so every cell is first collected into a list and
# the three list-columns are then expanded one after the other.
# `values_fn` has to be keyed by the value column (`description`); the former
# key `name` matched no column, so the aggregation was never applied.
# NOTE(review): recent tidyr releases document that unnest_longer() drops rows
# whose list element is empty unless keep_empty = TRUE -- confirm that taxa
# lacking one of the three types are retained with the installed version.
description <-
  description %>%
  as_tibble() %>%
  pivot_wider(
    names_from = type,
    values_from = description,
    values_fn = list(description = list)
  ) %>%
  unnest_longer(all_of(types[1])) %>%
  unnest_longer(all_of(types[2])) %>%
  unnest_longer(all_of(types[3]))
description %>% head(n = 20)

We also clean column names by standardizing column names (snake_case):

# Convert all column names to snake_case, then restore the camelCase name
# `taxonKey` for the key column.
description <- janitor::clean_names(description, case = "snake")
description <- rename(description, taxonKey = taxon_key)
names(description)
## [1] "taxonKey"                "pathway"                
## [3] "degree_of_establishment" "native_range"

2.3.1 Mapping degree of establishment

Values in column degree_of_establishment:

# Distinct values of degree_of_establishment before mapping.
distinct(description, degree_of_establishment)

We map them as follows:

# Lookup table: raw value (name) -> standardised degree of establishment.
# Values that are not listed are left unchanged by recode().
establishment_lookup <- c(
  "captive (blackburn_2011:B1)" = "captive",
  "blackburn_et_al_2011:B2" = "cultivated",
  "released (blackburn_2011:B3)" = "released",
  "blackburn_et_al_2011:C0" = "failing",
  "casual (blackburn_2011:C1)" = "casual",
  "reproducing (blackburn_2011:C2)" = "reproducing",
  "established (blackburn_2011:C3)" = "established",
  "blackburn_et_al_2011:C3" = "established",
  "colonizing (blackburn_2011:D1)" = "colonizing",
  "blackburn_et_al_2011:D2" = "invasive",
  "blackburn_et_al_2011:E" = "widespreadInvasive"
)
description <- mutate(
  description,
  degree_of_establishment = recode(
    degree_of_establishment,
    !!!establishment_lookup
  )
)

Mapping result:

# Distinct values of degree_of_establishment after mapping.
distinct(description, degree_of_establishment)

2.3.2 Add native continent based on native range

We use native range information to create a similar column containing native continent:

# Derive the native continent from the native range by case-insensitive
# substring matching. The rules are tried top to bottom and the first match
# wins; a native range matching none of them yields NA.
description <- mutate(
  description,
  native_continent = case_when(
    grepl("Africa", native_range, ignore.case = TRUE) ~ "Africa",
    grepl("America", native_range, ignore.case = TRUE) ~ "America",
    grepl("Asia", native_range, ignore.case = TRUE) ~ "Asia",
    grepl("Australia", native_range, ignore.case = TRUE) ~ "Oceania",
    grepl("nesia", native_range, ignore.case = TRUE) ~ "Oceania",
    grepl("Europe", native_range, ignore.case = TRUE) ~ "Europe",
    TRUE ~ NA_character_
  )
)

Mapping result:

# Distinct native continents obtained from the mapping.
distinct(description, native_continent)

2.3.3 Mapping pathway

This section contains a series of patches to standardize pathway information.

Add prefix cbd_2014_pathway: where missing (see issue #69):

# Prepend the prefix to pathways that do not start with it yet. A missing
# pathway stays missing because the condition is NA for it.
description <- mutate(
  description,
  pathway = if_else(
    str_detect(pathway, "^cbd_2014_pathway:"),
    pathway,
    paste0("cbd_2014_pathway:", pathway)
  )
)

Pathway cbd_2014_pathway:agriculture should be replaced by cbd_2014_pathway:escape_agriculture (see issue #38):

# Replace the agriculture pathway by escape_agriculture; all other values,
# including missing ones, are left as they are.
description <- mutate(
  description,
  pathway = recode(
    pathway,
    "cbd_2014_pathway:agriculture" = "cbd_2014_pathway:escape_agriculture"
  )
)

Pathway contaminant_plant should be replaced by contaminant_on_plants (see issue #9)

# Replace contaminant_plant by contaminant_on_plants; all other values,
# including missing ones, are left as they are.
description <- mutate(
  description,
  pathway = replace(
    pathway,
    pathway %in% "cbd_2014_pathway:contaminant_plant",
    "cbd_2014_pathway:contaminant_on_plants"
  )
)

Remove pathway unintentional as it is not a valid pathway (see issue #3):

# Drop the rows with pathway "unintentional"; rows with a missing pathway are
# kept.
description <- filter(
  description,
  is.na(pathway) | pathway != "cbd_2014_pathway:unintentional"
)

Rename pathway nursery as contaminant_nursery (see issue #40):

# Replace nursery by contaminant_nursery; all other values, including missing
# ones, are left as they are.
description <- mutate(
  description,
  pathway = recode(
    pathway,
    "cbd_2014_pathway:nursery" = "cbd_2014_pathway:contaminant_nursery"
  )
)

2.4 Species profile

Extract species profile

# Query the species profile extension once per taxon key and stack the
# returned data frames row-wise.
speciesProfiles <- map_dfr(
  taxa$key,
  function(taxon_key) {
    name_usage(taxon_key, data = "speciesProfile")$data
  }
)
head(speciesProfiles)

3 Merge data

Merge distribution, description and species profile information:

# Full joins keep a taxon even when it is missing from one of the extensions.
merged_extensions <-
  distribution %>%
  full_join(description, by = "taxonKey") %>%
  full_join(speciesProfiles, by = "taxonKey")

to merge them all with the taxonomic information:

# Attach the extension data to every taxon; `key` in taxa corresponds to
# `taxonKey` in the extensions.
merged_info <-
  taxa %>%
  left_join(merged_extensions, by = c("key" = "taxonKey"))
head(merged_info, n = 100)

4 Tidy data

Some columns should be split in order to make the data.frame completely tidy: temporal and pathway.

4.1 Split column temporal

The column temporal contains one or two dates.

If there are two dates, we split them into first_observed and last_observed. If only one date is present, it is used for both first_observed and last_observed.

# Split `temporal` on "/" into two columns (the original column is removed).
# When only one value is present, the second column is filled with NA and
# then copied from first_observed.
merged_info <-
  merged_info %>%
  separate(
    temporal,
    into = c("first_observed", "last_observed"),
    sep = "/",
    fill = "right",
    convert = TRUE
  ) %>%
  mutate(
    last_observed = ifelse(
      !is.na(last_observed),
      last_observed,
      first_observed
    )
  )

An example:

Some first_observed values are greater than last_observed (see issue #41):

# Rows whose last observation precedes the first one.
wrong_temporal <- filter(merged_info, last_observed < first_observed)
distinct(wrong_temporal, key, first_observed, last_observed)

Correct them:

# Invert first/last_observed in the preview of the inconsistent rows.
wrong_temporal <-
  wrong_temporal %>%
  mutate(
    first_observed_orig = first_observed,
    first_observed = last_observed,
    last_observed = first_observed_orig
  ) %>%
  select(-first_observed_orig)
wrong_temporal %>%
  distinct(key, first_observed, last_observed)
# Apply the same swap in place in merged_info. The earlier anti_join() on a
# hard-coded column list left out the date columns, so it also deleted correct
# rows that differed from a wrong row only in their dates, and it failed when
# a listed column was absent. Only rows with both dates present and
# last_observed < first_observed are changed.
merged_info <-
  merged_info %>%
  mutate(
    swap_dates = !is.na(first_observed) & !is.na(last_observed) &
      last_observed < first_observed,
    first_observed_orig = first_observed,
    first_observed = ifelse(swap_dates, last_observed, first_observed),
    last_observed = ifelse(swap_dates, first_observed_orig, last_observed)
  ) %>%
  select(-swap_dates, -first_observed_orig) %>%
  arrange(key)

4.2 Split column pathway

The column pathway contains a prefix, cbd_2014_pathway: and two different pathway levels divided by symbol _ (Note: this is not valid for pathway natural_dispersal, which should not be divided in two levels as it is a level 2 pathway of level 1 unaided):

We split pathway in pathway_level1 and pathway_level2 assigning value unknown if pathway is empty (NA) or a zero-length string (""). We also assign level 2 natural_dispersal to taxa with pathway level 1 unaided, as suggested by Tim Adriaens in trias-project/indicators#61 (comment):

# Split the pathway into its two levels for all rows at once, instead of
# row by row with the same split computed twice.
# pathway_suffix: text after the first "pathway:" ("" or NA when absent).
# pathway_levels: two-column matrix, text before / after the first "_".
pathway_suffix <- str_split_fixed(
  merged_info$pathway,
  pattern = "pathway:",
  n = 2
)[, 2]
pathway_levels <- str_split_fixed(pathway_suffix, pattern = "_", n = 2)

merged_info <-
  merged_info %>%
  mutate(
    pathway_level1 = pathway_levels[, 1],
    pathway_level2 = pathway_levels[, 2]
  ) %>%
  mutate(
    # Level 2 is derived first because its rules depend on the raw level 1.
    # Rules are tried top to bottom; a condition that evaluates to NA does
    # not match.
    pathway_level2 = case_when(
      # unaided without a level 2 gets natural_dispersal
      pathway_level1 == "unaided" &
        pathway_level2 %in% c(NA, "") ~ "natural_dispersal",
      # an unknown level 1 always implies an unknown level 2
      pathway_level1 == "unknown" ~ "unknown",
      # empty or missing level 2
      pathway_level2 %in% c(NA, "") ~ "unknown",
      # "natural_dispersal" was split on "_" into natural / dispersal
      pathway_level2 == "dispersal" ~ "natural_dispersal",
      TRUE ~ pathway_level2
    ),
    pathway_level1 = case_when(
      # empty or missing level 1
      pathway_level1 %in% c(NA, "") ~ "unknown",
      # level 1 of the split "natural_dispersal" is unaided
      pathway_level1 == "natural" ~ "unaided",
      TRUE ~ pathway_level1
    )
  ) %>%
  select(-pathway)

Full mapping:

5 Save data

The data.frame merged_info is saved as output file:

# Write merged_info as a tab-separated file under data/interim, relative to
# the project root; missing values are written as empty strings.
output_file <- here::here(
  "data",
  "interim",
  "data_input_checklist_indicators.tsv"
)
write_tsv(merged_info, file = output_file, na = "")

This file is the starting point for building checklist-based indicators.