This script prepares checklist data for publication on GBIF. The source file for this document can be found here. It downloads the checklist data from this Google Spreadsheet, where these data are maintained. The script transforms the data to a Darwin Core checklist that can be harvested by GBIF. It was developed for the TrIAS project.

Load libraries:

library(tidyverse)
library(magrittr)
library(janitor)
library(digest)
library(here)

Read source data

Read the relevant worksheet (published as csv) and copy the source data to the repository to keep track of changes.

# The checklist is maintained in a Google Spreadsheet. To read the published
# worksheet directly, use this call instead of the local file below:
# input_data <- read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vSXWde261c45tj0pIMeESy15M3BMogW-1Q-92a4m7A8aKtztHi4JPKEJHDSjEHYHGjQ6de1z27avnOs/pub?gid=0&single=true&output=csv", na = "NA")

# Read the local copy of the worksheet. The path is built with here() so it is
# resolved from the project root, like every other path in this script, and
# does not depend on the working directory. Only the string "NA" is read as a
# missing value.
input_data <- read_csv(
  here("data", "raw", "Checklist alien freshwater fishes - Checklist.csv"),
  na = "NA"
)

# Save a dump of the source data in the repository to keep track of changes
readr::write_csv(
  input_data,
  here("data", "raw", "alien_fisches_checklist_dump.csv"),
  na = ""
)

Preprocessing

Tidy data

# Drop rows that are completely empty, then clean the column names so they are
# sensible (lowercase) and easy to reference in the mapping below
input_data <-
  input_data %>%
  janitor::remove_empty(which = "rows") %>%
  janitor::clean_names()

Taxon IDs

To link taxa with information in the extension(s), each taxon needs a unique and relatively stable taxonID. Here we create one in the form of dataset_shortname:taxon:hash, where hash is a unique code based on the scientific name (that will remain the same as long as the scientific name remains the same):

# digest() hashes its input as a single object; the vectorized wrapper hashes
# every element of a vector separately
vdigest <- Vectorize(digest)

# Add taxon_id as "alien-fishes-checklist:taxon:<md5 hash of scientific name>"
input_data <-
  input_data %>%
  dplyr::mutate(
    taxon_id = paste(
      "alien-fishes-checklist",
      "taxon",
      vdigest(paste(scientific_name), algo = "md5"),
      sep = ":"
    )
  )

Darwin Core mapping

Taxon core

Create a dataframe with unique taxa only (ignoring multiple distribution rows). Map the data to Darwin Core Taxon.

# Build the Taxon core: one record per taxon_id, with dataset-level metadata
# added as constants and the source columns mapped to Darwin Core terms.
taxon <-
  input_data %>%
  # Keep only the first row of each taxon (a taxon can have several
  # distribution rows)
  dplyr::distinct(taxon_id, .keep_all = TRUE) %>%
  dplyr::mutate(
    # Record-level terms, constant for the whole dataset
    language = "en",
    license = "http://creativecommons.org/publicdomain/zero/1.0/",
    rightsHolder = "INBO",
    # Alternative URL noted earlier: https://www.inbo.be/en/norms-data-use
    accessRights = "http://www.inbo.be/en/norms-for-data-use",
    datasetID = "https://doi.org/10.15468/xvuzfh",
    institutionCode = "INBO",
    # TODO: add capital later
    datasetName = "Checklist of non-native freshwater fishes in Flanders, Belgium",
    # Taxon terms, taken from the source columns
    taxonID = taxon_id,
    scientificName = scientific_name,
    kingdom = kingdom,
    phylum = phylum,
    order = order,
    family = family,
    genus = genus,
    taxonRank = taxon_rank,
    nomenclaturalCode = nomenclatural_code,
    .keep = "none"
  ) %>%
  # Fix the column order of the output. institutionCode, phylum, order, family
  # and genus are mapped above but currently not exported; sorting on taxonID
  # is also disabled.
  dplyr::select(
    dplyr::all_of(
      c(
        "language", "license", "rightsHolder", "accessRights", "datasetID",
        "datasetName", "taxonID", "scientificName", "kingdom",
        "taxonRank", "nomenclaturalCode"
      )
    )
  )

Create vernacular names extension

Map the data to Vernacular Names. Vernacular names are available in two languages: English (common_name) and Dutch (dutch_name). We will gather these columns to generate a single column containing the vernacular name (vernacularName) and an additional column with the language (language):

# Build the Vernacular Names extension: one row per taxon and available
# vernacular name, with a two-letter code for the language of the name.
vernacular_names <-
  input_data %>%
  # Stack the English and Dutch name columns into vernacularName and keep the
  # source column name in language; missing names are dropped.
  # pivot_longer() replaces the superseded gather().
  tidyr::pivot_longer(
    cols = c(common_name, dutch_name),
    names_to = "language",
    values_to = "vernacularName",
    values_drop_na = TRUE
  ) %>%
  dplyr::mutate(
    taxonID = taxon_id,
    # Translate the source column names to language codes
    language = dplyr::recode(
      language,
      "common_name" = "en",
      "dutch_name" = "nl"
    )
  ) %>%
  dplyr::select(taxonID, vernacularName, language) %>%
  # Sorting on language as well keeps English names before Dutch names within
  # a taxon, which is the order gather() used to produce
  dplyr::arrange(taxonID, language)

Create distribution extension

Map the data to Species Distribution.

# Build the Species Distribution extension: one row per source row and
# introduction pathway. Rows without a pathway or without a date of first
# observation are kept, with those fields left empty.
distribution <-
  input_data %>%
  dplyr::mutate(
    taxonID = taxon_id,
    # Region code and name; locations other than the three below become NA
    locationID = dplyr::case_when(
      location == "Flanders" ~ "ISO_3166-2:BE-VLG",
      location == "Wallonia" ~ "ISO_3166-2:BE-WAL",
      location == "Brussels" ~ "ISO_3166-2:BE-BRU"
    ),
    locality = dplyr::case_when(
      location == "Flanders" ~ "Flemish Region",
      location == "Wallonia" ~ "Walloon Region",
      location == "Brussels" ~ "Brussels-Capital Region"
    ),
    countryCode = "BE",
    occurrenceStatus = occurrence_status,
    establishmentMeans = "introduced",
    degreeOfEstablishment = degree_of_establishment,
    pathway = introduction_pathway,
    # Replace imprecise dates by the first year of the period. This list needs
    # to be updated when other imprecise values appear in the source data.
    eventDate = dplyr::recode(
      date_first_observation,
      "20xx" = "2000",
      "17th c." = "1601",
      "1980s" = "1980",
      "13th c." = "1201"
    ),
    # Express the date as an open-ended interval ("<year>/"). Missing dates
    # stay missing: paste0() alone would turn them into "NA/" or "/".
    eventDate = dplyr::if_else(
      is.na(eventDate) | eventDate == "",
      NA_character_,
      paste0(eventDate, "/")
    ),
    .keep = "none"
  ) %>%
  # Pathways are separated by " | ": split on spaces, create one row per
  # element and then remove the "|" separators
  dplyr::mutate(pathway = strsplit(as.character(pathway), " ")) %>%
  # keep_empty = TRUE keeps rows whose pathway is an empty string
  tidyr::unnest(pathway, keep_empty = TRUE) %>%
  # NA != "|" evaluates to NA, which filter() treats as FALSE, so rows without
  # a pathway have to be kept explicitly
  dplyr::filter(is.na(pathway) | pathway != "|") %>%
  dplyr::arrange(taxonID)

Create description extension

Map the data to Taxon Description. native_range can contain multiple values separated by " or ", so we split it on " or " and create one row per value.

# Build the Taxon Description extension: one "native range" description per
# taxon and native range value.
description <-
  input_data %>%
  # Rows without a native range have nothing to describe
  tidyr::drop_na(native_range) %>%
  dplyr::mutate(
    taxonID = taxon_id,
    # Multiple native ranges are separated by " or "
    description = strsplit(as.character(native_range), split = " or "),
    type = "native range",
    language = "en",
    .keep = "none"
  ) %>%
  # One row per native range value
  tidyr::unnest(cols = description) %>%
  dplyr::arrange(taxonID)

Create species profile extension

# Build the Species Profile extension: one logical flag per realm, TRUE when
# realm equals that value exactly (NA when realm is missing).
species_profile <-
  input_data %>%
  dplyr::mutate(
    taxonID = taxon_id,
    isMarine = realm == "marine",
    isFreshwater = realm == "freshwater",
    isTerrestrial = realm == "terrestrial",
    .keep = "none"
  ) %>%
  dplyr::arrange(taxonID)
# NOTE(review): no flag is derived for an estuarine realm - is it missing?

Save to CSV:

# Write the core and each extension to data/processed, with missing values
# written as empty strings
taxon %>%
  readr::write_csv(
    file = here::here("data", "processed", "taxon.csv"),
    na = ""
  )
vernacular_names %>%
  readr::write_csv(
    file = here::here("data", "processed", "vernacularname.csv"),
    na = ""
  )
distribution %>%
  readr::write_csv(
    file = here::here("data", "processed", "distribution.csv"),
    na = ""
  )
description %>%
  readr::write_csv(
    file = here::here("data", "processed", "description.csv"),
    na = ""
  )
species_profile %>%
  readr::write_csv(
    file = here::here("data", "processed", "speciesprofile.csv"),
    na = ""
  )