Load libraries:
library(tidyverse) # To do data science
library(here) # To find files
library(janitor) # To clean input data
The data is maintained in this Google Spreadsheet.
Read the relevant worksheet (published as csv):
raw_data <- read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vTl8IEk2fProQorMu5xKQPdMXl3OQp-c0f6eBXitv0BiVFZ3JSJCde0PtbFXuETgguf6vK8b43FDX1C/pub?gid=1518765324&single=true&output=csv", show_col_types = FALSE)
Copy the source data to the repository to keep track of changes:
write_csv(raw_data, here("data", "raw", "alien_mollusca_dump.csv"), na = "")
To link taxa with information in the extension(s), each taxon needs a
unique and relatively stable taxonID. We have created one
in the form of dataset_shortname:taxon:hash, where
hash is unique code based on scientific name and kingdom.
Once this is created, it is added to the source data.
input_data <-
raw_data %>%
remove_empty("rows") %>%
clean_names() %>%
mutate(
taxon_id = paste(
"alien-mollusca-checklist",
"taxon",
.data$taxon_id_hash,
sep = ":"
)
)
Create a dataframe with unique taxa only (ignoring multiple distribution rows). Map the data to Darwin Core Taxon.
taxon <-
input_data %>%
distinct(taxon_id, .keep_all = TRUE) %>%
mutate(
language = "en",
license = "http://creativecommons.org/publicdomain/zero/1.0/",
rightsHolder = "Royal Belgian Institute of Natural Sciences",
accessRights = "https://www.inbo.be/en/norms-data-use",
datasetID = "https://doi.org/10.15468/t13kwo",
institutionCode = "RBINS",
datasetName = "Registry of introduced terrestrial molluscs in Belgium",
taxonID = taxon_id,
scientificName = scientific_name,
kingdom = kingdom,
phylum = phylum,
order = order,
family = family,
genus = genus,
taxonRank = taxon_rank,
nomenclaturalCode = nomenclatural_code,
.keep = "none"
) |>
arrange(taxonID) |>
select(
"language", "license", "rightsHolder", "accessRights", "datasetID",
"institutionCode", "datasetName", "taxonID", "scientificName", "kingdom",
"phylum", "order", "family", "genus", "taxonRank", "nomenclaturalCode"
)
Map the data to Vernacular
Nams. Vernacular names are available in four languages: Dutch,
French, English and German. We will gather these columns to generate a
single column containing the vernacular name
(vernacularName) and an additional column with the language
(language):
vernacular_name <-
input_data |>
tidyr::gather(
key = language,
value = vernacularName,
common_name, dutch_name, french_name, german_name,
na.rm = TRUE,
convert = TRUE) %>%
dplyr::mutate(vernacularName = strsplit(vernacularName, "\\|")) |>
unnest(vernacularName) |>
dplyr::mutate(
taxonID = taxon_id,
vernacularName = str_trim(vernacularName),
language =
dplyr::recode(
language,
"dutch_name" = "nl",
"french_name" = "fr",
"common_name" = "en",
"german_name" = "ge"
)
) |>
dplyr::select(taxonID, vernacularName, language) |>
dplyr::arrange(taxonID)
In this extension we will express broad habitat characteristics of
the species (e.g. isTerrestrial).
Create a dataframe with unique taxa only (ignoring multiple
distribution rows). Only keep records for which
terrestrial, marine and
freshwater is not empty.
Map the data to Species Profile.
species_profile <-
input_data %>%
distinct(taxon_id, .keep_all = TRUE) |>
filter(
!is.na(terrestrial) |
!is.na(marine) |
!is.na(freshwater)
) |>
mutate(
.keep = "none",
taxonID = taxon_id,
isMarine = marine,
isFreshwater = freshwater,
isTerrestrial = terrestrial
) |>
arrange(taxonID)
Create a dataframe with all data (including multiple distributions). Map the data to Species Distribution.
Information for eventDate is contained in
date_first_observation and
date_last_observation, which we will express here in an ISO
8601 date format yyyy/yyyy
(start_date/end_date).
Not all cells for date_first_observation (DFO) and/or
date_last_observation (DLO) are populated. So, we used the
following rules for those records:
case 1. If DFO is empty and
DLO is empty, eventDate is NA
case 2. If DFO is empty and
DLO is not empty: eventDate = /DLO
case 3. If DFO is not empty and
DLO is empty, eventDate is DFO/
distribution <-
input_data %>%
# pathway
pivot_longer(
names_to = "key",
values_to = "pathway",
starts_with("introduction_pathway"),
values_drop_na = FALSE) %>%
filter( # keep NA value for species with no pathway provided
!is.na(pathway) |
(is.na(pathway) & key == "introduction_pathway_1")
) %>%
# other terms
mutate(
taxonID = taxon_id,
locationID = case_when(
location == "Flanders" ~ "ISO_3166-2:BE-VLG",
location == "Wallonia" ~ "ISO_3166-2:BE-WAL",
location == "Brussels" ~ "ISO_3166-2:BE-BRU",
location == "Belgium" ~ "ISO_3166-2:BE"
),
locality = case_when(
location == "Flanders" ~ "Flemish Region",
location == "Wallonia" ~ "Walloon Region",
location == "Brussels" ~ "Brussels-Capital Region",
location == "Belgium" ~ "Belgium"
),
countryCode = country_code,
occurrenceStatus = occurrence_status,
establishmentMeans = establishment_means,
degreeOfEstablishment = degree_of_establishment,
eventDate = case_when(
is.na(date_first_observation) & is.na(date_last_observation) ~ NA,
is.na(date_first_observation) ~ paste0("/", date_last_observation),
is.na(date_last_observation) ~ paste0(date_first_observation, "/"),
!is.na(date_first_observation) & !is.na(date_last_observation) ~
paste(date_first_observation, date_last_observation, sep = "/")
),
source = source,
occurrenceRemarks = occurrence_remarks
) %>%
select(
"taxonID", "locationID", "locality", "countryCode", "occurrenceStatus",
"establishmentMeans", "degreeOfEstablishment", "pathway",
"eventDate", "source", "occurrenceRemarks"
) %>%
arrange(taxonID)
In the description extension we want to include the native range of a species
description <-
input_data |>
# unique taxa only (ignoring multiple distribution rows)
distinct(taxon_id, .keep_all = TRUE) |>
# Separate values on `|`
mutate(native_range = strsplit(native_range, "\\|")) |>
unnest(native_range) |>
filter(!is.na(native_range)) |>
mutate(
.keep = "none",
taxonID = taxon_id,
description = str_trim(native_range),
type = "native range",
language = "en"
) |>
select("taxonID", "description", "type", "language") |>
arrange(taxonID)
write_csv(taxon, here("data", "processed", "taxon.csv"), na = "")
write_csv(vernacular_name, here("data", "processed", "vernacularname.csv"), na = "")
write_csv(distribution, here("data", "processed", "distribution.csv"), na = "")
write_csv(species_profile, here("data", "processed", "speciesprofile.csv"), na = "")
write_csv(description, here("data", "processed", "description.csv"), na = "")