This script prepares checklist data for publication on GBIF. The source file for this document can be found here. It downloads the checklist data from this Google Spreadsheet, where these data are maintained. The script transforms the data to a Darwin Core checklist that can be harvested by GBIF. It was developed for the TrIAS project.
Load libraries:
library(tidyverse)
library(magrittr)
library(janitor)
library(digest)
library(here)
Read the relevant worksheet (published as csv) and copy the source data to the repository to keep track of changes.
# Read the checklist from the local export of the worksheet. The path is built
# with here::here() so that it is resolved from the project root, like every
# other path in this script, instead of depending on the working directory.
# The literal string "NA" is the only value treated as missing.
#
# To read the published Google Sheet directly instead, use:
# input_data <- read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vSXWde261c45tj0pIMeESy15M3BMogW-1Q-92a4m7A8aKtztHi4JPKEJHDSjEHYHGjQ6de1z27avnOs/pub?gid=0&single=true&output=csv", na = "NA")
input_data <- readr::read_csv(
  here::here("data", "raw", "Checklist alien freshwater fishes - Checklist.csv"),
  na = "NA"
)

# Keep a dump of the data exactly as read in the repository, so that changes
# in the source show up in version control. Missing values are written as
# empty fields.
readr::write_csv(
  input_data,
  here::here("data", "raw", "alien_fisches_checklist_dump.csv"),
  na = ""
)

# Tidy the table before mapping: drop rows that are completely empty and
# normalise the column names.
input_data <- input_data %>%
  janitor::remove_empty("rows") %>% # Remove empty rows
  janitor::clean_names() # Have sensible (lowercase) column names
To link taxa with information in the extension(s), each taxon needs a unique and relatively stable `taxonID`. Here we create one in the form of `dataset_shortname:taxon:hash`, where `hash` is a unique code based on the scientific name (it will remain the same as long as the scientific name remains the same):
# digest() hashes one object at a time; the vectorised wrapper applies it to
# every element of a vector separately.
vdigest <- Vectorize(digest)

# Add taxon_id in the form "alien-fishes-checklist:taxon:<hash>", where <hash>
# is the md5 digest() of the scientific name. Only scientific_name feeds the
# hash, so the identifier stays the same as long as that name is unchanged.
input_data <- dplyr::mutate(
  input_data,
  taxon_id = paste(
    "alien-fishes-checklist", "taxon",
    vdigest(paste(scientific_name), algo = "md5"),
    sep = ":"
  )
)
Create a dataframe with unique taxa only (ignoring multiple distribution rows). Map the data to Darwin Core Taxon.
# Darwin Core Taxon core: one row per taxon_id. distinct() keeps the first row
# found for each taxon_id, so additional rows of the same taxon are ignored.
taxon <-
  input_data %>%
  dplyr::distinct(taxon_id, .keep_all = TRUE) %>%
  dplyr::mutate(
    # Taxon terms taken from the source columns
    taxonID = taxon_id,
    scientificName = scientific_name,
    kingdom = kingdom,
    phylum = phylum,
    order = order,
    family = family,
    genus = genus,
    taxonRank = taxon_rank,
    nomenclaturalCode = nomenclatural_code,
    # Record-level terms, identical for every row
    language = "en",
    license = "http://creativecommons.org/publicdomain/zero/1.0/",
    rightsHolder = "INBO",
    # Alternative URL noted by the author: "https://www.inbo.be/en/norms-data-use"
    accessRights = "http://www.inbo.be/en/norms-for-data-use",
    datasetID = "https://doi.org/10.15468/xvuzfh",
    institutionCode = "INBO",
    datasetName = "Checklist of non-native freshwater fishes in Flanders, Belgium", # Add capital later
    # Keep only the columns named in this call
    .keep = "none"
  ) %>%
  # Final column set and order. institutionCode, phylum, order, family and
  # genus are computed above but are currently not written out. The earlier,
  # wider output is kept here for reference:
  # dplyr::arrange(taxonID) %>%
  # dplyr::select(
  #   "language", "license", "rightsHolder", "accessRights", "datasetID",
  #   "institutionCode", "datasetName", "taxonID", "scientificName", "kingdom",
  #   "phylum", "order", "family", "genus", "taxonRank", "nomenclaturalCode"
  # )
  dplyr::select(
    language,
    license,
    rightsHolder,
    accessRights,
    datasetID,
    datasetName,
    taxonID,
    scientificName,
    kingdom,
    taxonRank,
    nomenclaturalCode
  )
Map the data to Vernacular Names. Vernacular names are available in two languages: English (`common_name`) and Dutch (`dutch_name`). We will gather these columns to generate a single column containing the vernacular name (`vernacularName`) and an additional column with the language (`language`):
# Vernacular Names extension: one row per taxon, name and language.
#
# The English (common_name) and Dutch (dutch_name) columns are stacked into a
# single vernacularName column, with the originating column recorded in
# language and then translated to an ISO 639-1 code. Taxa without a name in a
# given language get no row for that language.
vernacular_names <-
  input_data %>%
  dplyr::select(taxonID = taxon_id, common_name, dutch_name) %>%
  tidyr::pivot_longer(
    cols = c(common_name, dutch_name),
    names_to = "language",
    values_to = "vernacularName",
    values_drop_na = TRUE
  ) %>%
  dplyr::mutate(
    language =
      dplyr::recode(
        language,
        "common_name" = "en",
        "dutch_name" = "nl"
      )
  ) %>%
  dplyr::select(taxonID, vernacularName, language) %>%
  # input_data can hold several rows per taxon (one per distribution record);
  # without this step every such taxon would repeat its names once per row.
  dplyr::distinct() %>%
  dplyr::arrange(taxonID)
Map the data to Species Distribution.
# Species Distribution extension: one row per input row and introduction
# pathway.
#
# - locationID / locality are derived from `location`; values other than
#   Flanders, Wallonia or Brussels yield NA for both.
# - eventDate is the (recoded) year of first observation followed by "/",
#   i.e. an open-ended interval. Rows without a date keep an empty eventDate.
# - introduction_pathway can list several pathways separated by " | "; each
#   pathway gets its own row. Rows without a pathway are kept, with pathway NA.
distribution <-
  input_data %>%
  dplyr::mutate(
    taxonID = taxon_id,
    locationID = dplyr::case_when(
      location == "Flanders" ~ "ISO_3166-2:BE-VLG",
      location == "Wallonia" ~ "ISO_3166-2:BE-WAL",
      location == "Brussels" ~ "ISO_3166-2:BE-BRU"
    ),
    locality = dplyr::case_when(
      location == "Flanders" ~ "Flemish Region",
      location == "Wallonia" ~ "Walloon Region",
      location == "Brussels" ~ "Brussels-Capital Region"
    ),
    countryCode = "BE",
    occurrenceStatus = occurrence_status,
    establishmentMeans = "introduced",
    degreeOfEstablishment = degree_of_establishment,
    pathway = introduction_pathway,
    # Replace imprecise dates by a start year. The column is cast to character
    # first so the recode also works when every value was parsed as a number.
    # NOTE (from the original author): this mapping needs to be updated.
    eventDate = dplyr::recode(
      as.character(date_first_observation),
      "20xx" = "2000",
      "17th c." = "1601",
      "1980s" = "1980",
      "13th c." = "1201"
    ),
    # Only append "/" to real dates: pasting onto a missing value would write
    # the literal string "NA/" (or a bare "/") to the output.
    eventDate = dplyr::if_else(
      is.na(eventDate) | !nzchar(eventDate),
      NA_character_,
      paste0(eventDate, "/")
    ),
    .keep = "none"
  ) %>%
  # Split on spaces, which leaves the "|" separators as tokens of their own
  dplyr::mutate(pathway = strsplit(as.character(pathway), " ")) %>%
  # keep_empty: a row whose pathway splits into nothing must not disappear
  tidyr::unnest(pathway, keep_empty = TRUE) %>%
  # Drop the separator tokens only. The explicit is.na() test is needed
  # because filter() discards rows for which the condition evaluates to NA.
  dplyr::filter(is.na(pathway) | pathway != "|") %>%
  dplyr::arrange(taxonID)
Map the data to Taxon Description. `native_range` can contain multiple values (currently not more than 2), so we split it on ` or ` into one row per value.
# Taxon Description extension: one row per taxon and native range.
#
# Rows without a native_range are skipped. A native_range holding several
# values separated by " or " is split into one description row per value.
description <-
  input_data %>%
  dplyr::filter(!is.na(native_range)) %>%
  dplyr::mutate(
    taxonID = taxon_id,
    description = strsplit(as.character(native_range), " or "),
    type = "native range",
    language = "en",
    .keep = "none"
  ) %>%
  # strsplit() produced a list column; expand it to one row per value
  tidyr::unnest(description) %>%
  # input_data can hold several rows per taxon (one per distribution record);
  # without this step every such taxon would repeat its descriptions.
  dplyr::distinct() %>%
  dplyr::arrange(taxonID)
# Species Profile extension: habitat flags per taxon, derived from `realm`.
#
# Each flag is TRUE when realm equals the corresponding value, FALSE when it
# equals something else and NA when realm is missing.
species_profile <-
  input_data %>%
  dplyr::mutate(
    taxonID = taxon_id,
    isMarine = realm == "marine",
    isFreshwater = realm == "freshwater",
    isTerrestrial = realm == "terrestrial",
    .keep = "none"
  ) %>%
  # input_data can hold several rows per taxon (one per distribution record);
  # without this step every such taxon would repeat its profile.
  dplyr::distinct() %>%
  dplyr::arrange(taxonID)
# Open question from the original author: is "estuarine" missing here? No flag
# is derived for it.
# Write the core and extension tables to data/processed, one csv file each,
# with missing values written as empty fields. The tables are written in the
# order listed; the file name at the same position is used for each table.
invisible(Map(
  function(tbl, file_name) {
    readr::write_csv(
      tbl,
      file = here::here("data", "processed", file_name),
      na = ""
    )
  },
  list(taxon, vernacular_names, distribution, description, species_profile),
  c(
    "taxon.csv",
    "vernacularname.csv",
    "distribution.csv",
    "description.csv",
    "speciesprofile.csv"
  )
))