1 Setup

Load libraries:

library(tidyverse)      # To do data science
library(here)           # To find files
library(janitor)        # To clean input data

2 Read source data

The data is maintained in this Google Spreadsheet.

Read the relevant worksheet (published as csv):

raw_data <- read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vTl8IEk2fProQorMu5xKQPdMXl3OQp-c0f6eBXitv0BiVFZ3JSJCde0PtbFXuETgguf6vK8b43FDX1C/pub?gid=1518765324&single=true&output=csv", show_col_types = FALSE)

Copy the source data to the repository to keep track of changes:

write_csv(raw_data, here("data", "raw", "alien_mollusca_dump.csv"), na = "")

3 Preprocessing: tidy data

To link taxa with information in the extension(s), each taxon needs a unique and relatively stable taxonID. We have created one in the form of dataset_shortname:taxon:hash, where hash is unique code based on scientific name and kingdom. Once this is created, it is added to the source data.

input_data <-
  raw_data %>%
  remove_empty("rows") %>%
  clean_names() %>%
  mutate(
    taxon_id = paste(
      "alien-mollusca-checklist",
      "taxon",
      .data$taxon_id_hash,
      sep = ":"
    )
  )

4 Darwin Core mapping

4.1 Taxon core

Create a dataframe with unique taxa only (ignoring multiple distribution rows). Map the data to Darwin Core Taxon.

taxon <-
  input_data %>%
  distinct(taxon_id, .keep_all = TRUE) %>%
  mutate(
    language = "en",
    license = "http://creativecommons.org/publicdomain/zero/1.0/",
    rightsHolder = "Royal Belgian Institute of Natural Sciences",
    accessRights = "https://www.inbo.be/en/norms-data-use",
    datasetID = "https://doi.org/10.15468/t13kwo",
    institutionCode = "RBINS",
    datasetName = "Registry of introduced terrestrial molluscs in Belgium",
    taxonID = taxon_id,
    scientificName = scientific_name,
    kingdom = kingdom,
    phylum = phylum,
    order = order,
    family = family,
    genus = genus,
    taxonRank = taxon_rank,
    nomenclaturalCode = nomenclatural_code,
    .keep = "none"
  ) |>
  arrange(taxonID) |>
  select(
    "language", "license", "rightsHolder", "accessRights", "datasetID",
    "institutionCode", "datasetName", "taxonID", "scientificName", "kingdom", 
    "phylum", "order", "family", "genus", "taxonRank", "nomenclaturalCode"
  )

5 Vernacular name extension

Map the data to Vernacular Nams. Vernacular names are available in four languages: Dutch, French, English and German. We will gather these columns to generate a single column containing the vernacular name (vernacularName) and an additional column with the language (language):

vernacular_name <- 
  input_data |>
  tidyr::gather(
    key = language,
    value = vernacularName,
    common_name, dutch_name, french_name, german_name,
    na.rm = TRUE,
    convert = TRUE) %>%
  dplyr::mutate(vernacularName = strsplit(vernacularName, "\\|")) |>
  unnest(vernacularName) |>
  dplyr::mutate(
    taxonID = taxon_id,
    vernacularName = str_trim(vernacularName),
    language = 
      dplyr::recode(
        language,
        "dutch_name" = "nl",
        "french_name" = "fr",
        "common_name" = "en",
        "german_name" = "ge"
        ) 
    ) |>
  dplyr::select(taxonID, vernacularName, language) |>
  dplyr::arrange(taxonID)

5.1 Species profile extension

In this extension we will express broad habitat characteristics of the species (e.g. isTerrestrial).

Create a dataframe with unique taxa only (ignoring multiple distribution rows). Only keep records for which terrestrial, marine and freshwater is not empty.

Map the data to Species Profile.

species_profile <-
  input_data %>%
  distinct(taxon_id, .keep_all = TRUE) |>
  filter(
    !is.na(terrestrial) |
      !is.na(marine) |
      !is.na(freshwater)
  ) |>
  mutate(
    .keep = "none",
    taxonID = taxon_id,
    isMarine = marine,
    isFreshwater = freshwater,
    isTerrestrial = terrestrial
  ) |>
  arrange(taxonID)

5.2 Distribution extension

Create a dataframe with all data (including multiple distributions). Map the data to Species Distribution.

Information for eventDate is contained in date_first_observation and date_last_observation, which we will express here in an ISO 8601 date format yyyy/yyyy (start_date/end_date).

Not all cells for date_first_observation (DFO) and/or date_last_observation (DLO) are populated. So, we used the following rules for those records:

case 1. If DFO is empty and DLO is empty, eventDate is NA case 2. If DFO is empty and DLO is not empty: eventDate = /DLO case 3. If DFO is not empty and DLO is empty, eventDate is DFO/

distribution <-
  input_data %>%
  # pathway
  pivot_longer(
    names_to = "key",
    values_to = "pathway",
    starts_with("introduction_pathway"),
    values_drop_na = FALSE) %>%
  filter( # keep NA value for species with no pathway provided
    !is.na(pathway) |
      (is.na(pathway) & key == "introduction_pathway_1")
    ) %>%
  # other terms
  mutate(
    taxonID = taxon_id,
    locationID = case_when(
      location == "Flanders" ~ "ISO_3166-2:BE-VLG",
      location == "Wallonia" ~ "ISO_3166-2:BE-WAL",
      location == "Brussels" ~ "ISO_3166-2:BE-BRU",
      location == "Belgium" ~ "ISO_3166-2:BE"
    ),
    locality = case_when(
      location == "Flanders" ~ "Flemish Region",
      location == "Wallonia" ~ "Walloon Region",
      location == "Brussels" ~ "Brussels-Capital Region",
      location == "Belgium" ~ "Belgium"
    ),
    countryCode = country_code,
    occurrenceStatus = occurrence_status,
    establishmentMeans = establishment_means,
    degreeOfEstablishment = degree_of_establishment,
    eventDate = case_when(
      is.na(date_first_observation) & is.na(date_last_observation) ~ NA,
      is.na(date_first_observation) ~ paste0("/", date_last_observation),
      is.na(date_last_observation) ~ paste0(date_first_observation, "/"),
      !is.na(date_first_observation) & !is.na(date_last_observation) ~
        paste(date_first_observation, date_last_observation, sep = "/")
    ),
    source = source,
    occurrenceRemarks = occurrence_remarks
  ) %>%
  select(
    "taxonID", "locationID", "locality", "countryCode", "occurrenceStatus",
    "establishmentMeans", "degreeOfEstablishment", "pathway", 
    "eventDate", "source", "occurrenceRemarks"
  ) %>%
  arrange(taxonID)

5.3 Description extension

In the description extension we want to include the native range of a species

description <-
  input_data |>
  # unique taxa only (ignoring multiple distribution rows)
  distinct(taxon_id, .keep_all = TRUE) |>
  # Separate values on `|` 
  mutate(native_range = strsplit(native_range, "\\|")) |>
  unnest(native_range) |>
  filter(!is.na(native_range)) |>
  mutate(
    .keep = "none",
    taxonID = taxon_id,
    description = str_trim(native_range),
    type = "native range",
    language = "en"
    ) |>
  select("taxonID", "description", "type", "language") |>
  arrange(taxonID)

6 Save to CSV:

write_csv(taxon, here("data", "processed", "taxon.csv"), na = "")
write_csv(vernacular_name, here("data", "processed", "vernacularname.csv"), na = "")
write_csv(distribution, here("data", "processed", "distribution.csv"), na = "")
write_csv(species_profile, here("data", "processed", "speciesprofile.csv"), na = "")
write_csv(description, here("data", "processed", "description.csv"), na = "")