This document describes how we map the checklist data to Darwin Core. The source file for this document can be found here.
Load libraries:
library(tidyverse) # To do data science
library(magrittr) # To use %<>% pipes
library(here) # To find files
library(janitor) # To clean input data
library(digest) # To generate hashes
library(rgbif) # To use GBIF services
The data is maintained in this Google Spreadsheet.
Read the relevant worksheet (published as csv):
# Import the published worksheet; every column is read as character
input_data <- read_csv(
  file = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSgGi_Un0-7cyg-SzaiE0-RYY5-WvuZNF9kG2GLgeonX6heR6U3xpechdMKWVMQ9raT6AuR86U_gQt9/pub?gid=0&single=true&output=csv",
  col_types = cols(.default = col_character())
)
Copy the source data to the repository to keep track of changes:
# Snapshot of the raw input; missing values are written as empty strings
write_csv(
  input_data,
  here("data", "raw", "alien_bird_checklist_dump.csv"),
  na = ""
)
Preview data:
head(input_data)
Clean the data somewhat:
input_data <- input_data %>%
  remove_empty(which = "rows") %>% # Drop rows without any values
  clean_names() # Standardise the column names (lowercase)
Use the GBIF nameparser to retrieve nomenclatural information for the scientific names in the checklist:
# Send each distinct scientific name once to parsenames() (an rgbif function)
parsed_names <- parsenames(unique(input_data$scientific_name))
Show scientific names with nomenclatural issues, i.e. not of type = SCIENTIFIC
or that could not be fully parsed (should be 0). Note: these are not necessarily incorrect.
# List names that are not fully parsed scientific names.
# %in% is used instead of == so that a missing type/parsed/parsedpartially
# value counts as an issue: filter() silently drops rows whose condition is NA,
# which would hide exactly the names this check is meant to reveal.
parsed_names %>%
  select(scientificname, type, parsed, parsedpartially, rankmarker) %>%
  filter(
    !(type %in% "SCIENTIFIC" & parsed %in% "TRUE" & parsedpartially %in% "FALSE")
  )
To link taxa with information in the extension(s), each taxon needs a unique and relatively stable `taxonID`. Here we create one in the form of `dataset_shortname:taxon:hash`, where `hash` is a unique code based on the scientific name (it will remain the same as long as the scientific name remains the same):
# digest() hashes a single object, so wrap it to hash each name separately
vdigest <- Vectorize(digest)
input_data <- input_data %>%
  mutate(
    taxon_id = paste(
      "alien-birds-checklist", # Dataset short name
      "taxon",
      vdigest(scientific_name, algo = "md5"),
      sep = ":"
    )
  )
Show the number of taxa and distributions per rank:
# One line per rank plus a totals row added by janitor
input_data %>%
  group_by(taxon_rank) %>%
  summarise(
    `# taxa` = n_distinct(taxon_id),
    `# distributions` = n()
  ) %>%
  adorn_totals(where = "row")
Preview data:
head(input_data)
Create a dataframe with unique taxa only (ignoring multiple distribution rows):
# Keep the first row for each taxon_id, with all its columns
taxon <- distinct(input_data, taxon_id, .keep_all = TRUE)
Map the data to Darwin Core Taxon.
taxon <- taxon %>%
  mutate(
    # Record-level terms (constant for the whole dataset)
    dwc_language = "en",
    dwc_license = "http://creativecommons.org/publicdomain/zero/1.0/",
    dwc_rightsHolder = "INBO",
    dwc_accessRights = "https://www.inbo.be/en/norms-data-use",
    dwc_datasetID = "https://doi.org/10.15468/wr3gis",
    dwc_institutionCode = "INBO",
    dwc_datasetName = "Checklist of alien birds of Belgium",
    # Taxon terms
    dwc_taxonID = taxon_id,
    dwc_scientificName = scientific_name,
    dwc_kingdom = "Animalia",
    dwc_phylum = "Chordata",
    dwc_class = "Aves",
    dwc_order = order,
    dwc_family = family,
    dwc_genus = genus
  )
Inspect values:
group_by(taxon, taxon_rank) %>% count()
Values conform to the GBIF rank vocabulary, so we map them as is:
taxon <- taxon %>%
  mutate(
    dwc_taxonRank = taxon_rank,
    dwc_nomenclaturalCode = "ICZN"
  )
Create a dataframe with all data:
# Start from the complete input: no de-duplication on taxon_id is applied here
distribution <- input_data
Map the data to Species Distribution.
distribution <- mutate(distribution, dwc_taxonID = taxon_id)
Inspect values:
group_by(distribution, country_code, location) %>% count()
Set current `NA`s in `location` to `Belgium`:
# Missing locations default to the country as a whole
distribution <- distribution %>%
  mutate(location = coalesce(location, "Belgium"))
Map locationID
to ISO 3166 code:
# Any location outside these four gets NA as locationID
distribution <- distribution %>%
  mutate(
    dwc_locationID = recode(
      location,
      "Belgium" = "ISO_3166-2:BE",
      "Flanders" = "ISO_3166-2:BE-VLG",
      "Wallonia" = "ISO_3166-2:BE-WAL",
      "Brussels" = "ISO_3166-2:BE-BRU",
      .default = NA_character_
    )
  )
Inspect values:
group_by(distribution, location, dwc_locationID) %>% count()
Map to standard values:
# Regions get their official name; every other value is kept unchanged
distribution <- distribution %>%
  mutate(
    dwc_locality = case_when(
      location == "Flanders" ~ "Flemish Region",
      location == "Wallonia" ~ "Walloon Region",
      location == "Brussels" ~ "Brussels-Capital Region",
      TRUE ~ location
    )
  )
Inspect values:
group_by(distribution, location, dwc_locality) %>% count()
Inspect values:
group_by(distribution, country_code) %>% count()
Map values:
distribution <- mutate(distribution, dwc_countryCode = country_code)
Inspect values:
group_by(distribution, occurrence_status) %>% count()
Map values:
distribution <- distribution %>%
  mutate(
    dwc_occurrenceStatus = occurrence_status,
    dwc_establishmentMeans = "introduced"
  )
Inspect values for date_first_observation
:
group_by(distribution, date_first_observation) %>% count()
All date information should comply with the ISO 8601 standard, which requires a four-digit year as a minimum. Here we transform the deviating values:
# Replace the free-text periods with a four-digit year; keep all other values
distribution <- distribution %>%
  mutate(
    date_first_observation = case_when(
      date_first_observation == "18th century" ~ "1701",
      date_first_observation == "1890s" ~ "1890",
      date_first_observation == "1950ies" ~ "1950",
      TRUE ~ date_first_observation
    )
  )
Inspect values for date_last_observation
:
group_by(distribution, date_last_observation) %>% count()
Inspect all combinations of date_first_observation
and date_last_observation
:
group_by(distribution, date_first_observation, date_last_observation) %>%
  count()
Map eventDate
:
distribution <- distribution %>%
  mutate(
    dwc_eventDate = case_when(
      # Both dates known: express as a first/last interval
      !is.na(date_first_observation) & !is.na(date_last_observation) ~ paste(
        date_first_observation, date_last_observation,
        sep = "/"
      ),
      # Otherwise: whichever single date is available, or NA when both are missing
      TRUE ~ coalesce(date_first_observation, date_last_observation)
    )
  )
Show mapping:
group_by(
  distribution,
  date_first_observation, date_last_observation, dwc_eventDate
) %>%
  count()
Inspect values:
group_by(distribution, source) %>% count()
Map values:
distribution <- mutate(distribution, dwc_source = source)
Inspect values:
group_by(distribution, remarks) %>% count()
Map values:
distribution <- mutate(distribution, dwc_occurrenceRemarks = remarks)
Create a dataframe with unique taxa only (ignoring multiple distribution rows):
# Keep the first row for each taxon_id, with all its columns
species_profile <- distinct(input_data, taxon_id, .keep_all = TRUE)
Map the data to Species Profile.
species_profile <- mutate(species_profile, dwc_taxonID = taxon_id)
Inspect realm
:
group_by(species_profile, realm) %>% count()
# Flags are stored as the strings "TRUE"/"FALSE"; a missing or any other realm
# value yields "FALSE" (%in% never returns NA)
species_profile <- species_profile %>%
  mutate(
    dwc_isMarine = "FALSE",
    dwc_isTerrestrial = if_else(realm %in% "terrestrial", "TRUE", "FALSE"),
    dwc_isFreshwater = if_else(realm %in% "freshwater", "TRUE", "FALSE")
  )
In the description extension we want to include several important characteristics (hereafter referred to as descriptors) about the species: native range, pathway of introduction and degree of establishment.
For each descriptor, we create a separate dataframe to process the specific information. We always specify which descriptor we map (`type` column) and its specific content (`description` column). After the mapping of these Darwin Core terms `type` and `description`, we merge the dataframes to generate one single description extension. We then continue the mapping process by adding the other Darwin Core terms (whose content is independent of the type of descriptor, such as `language`).
Create separate dataframe:
# Keep the first row for each taxon_id, with all its columns
native_range <- distinct(input_data, taxon_id, .keep_all = TRUE)
Inspect values:
group_by(native_range, native_range) %>% count()
Separate content using the pipe symbol as a separator:
# Split the " | "-delimited native ranges over five columns, keeping the
# original column. Rows with fewer than five values are padded with NA on the
# right (fill = "right"), which is the expected situation and therefore should
# not warn. `extra` keeps its default, so a row with MORE than five values
# still raises a warning: those extra values would be dropped.
native_range <- native_range %>%
  separate(
    native_range,
    into = c(
      "native_range_1", "native_range_2", "native_range_3",
      "native_range_4", "native_range_5"
    ),
    sep = "\\s\\|\\s",
    remove = FALSE,
    fill = "right"
  )
Change from a wide to a long dataset and trim:
# One row per taxon and native range; empty slots are dropped
native_range <- native_range %>%
  gather(
    key = "key",
    value = "value",
    starts_with("native_range_"),
    na.rm = TRUE
  ) %>%
  mutate(value = str_trim(value))
Inspect unique values:
group_by(native_range, value) %>% count()
Map to the WGSRPD vocabulary:
# Lookup from source value to labelled WGSRPD value, spliced into recode();
# values without an entry are kept unchanged
native_range <- native_range %>%
  mutate(description = recode(value, !!!c(
    "Africa" = "Africa (WGSRPD:2)",
    "Australia" = "Australia (WGSRPD:50)",
    "Caribbean" = "Caribbean (WGSRPD:81)",
    "Central America" = "Central America (WGSRPD:80)",
    "China" = "China (WGSRPD:36)",
    "East Asia" = "Eastern Asia (WGSRPD:38)",
    "Eastern Asia" = "Eastern Asia (WGSRPD:38)",
    "Eastern Europe" = "Eastern Europe (WGSRPD:14)",
    "Europe" = "Europe (WGSRPD:1)",
    "Galapagos" = "Galapagos (WGSRPD:GAL)",
    "Hawaiian Islands" = "Hawaiian Islands (WGSRPD:HI)",
    "India" = "Indian Subcontinent (WGSRPD:40)",
    "Indonesia" = "Indonesia (WGSRPD:ID)",
    "Japan" = "Japan (WGSRPD:JAP)",
    "Madagascar" = "Madagascar (WGSRPD:MDG)",
    "New Zealand" = "New Zealand (WGSRPD:51)",
    "North America" = "Northern America (WGSRPD:7)",
    "Northern America" = "Northern America (WGSRPD:7)",
    "Panama" = "Panama (WGSRPD:PAN)",
    "Philippines" = "Philippines (WGSRPD:PHI)",
    "Portugal" = "Portugal (WGSRPD:POR)",
    "South America" = "Southern America (WGSRPD:8)",
    "Southeast Europe" = "Southeastern Europe (WGSRPD:13)",
    "Southern Africa" = "Southern Africa (WGSRPD:27)",
    "Spain" = "Spain (WGSRPD:SPA)",
    "Tanzania" = "Tanzania (WGSRPD:TAN)",
    "Western Asia" = "Western Asia (WGSRPD:34)"
  )))
Inspect mapped values:
group_by(native_range, value, description) %>% count()
Add type
of description:
native_range <- mutate(native_range, type = "native range")
Select taxon_id
, description
and type
:
native_range <- select(native_range, taxon_id, description, type)
Create separate dataframe:
# Keep the first row for each taxon_id, with all its columns
pathway <- distinct(input_data, taxon_id, .keep_all = TRUE)
Inspect values:
group_by(pathway, introduction_pathway) %>% count()
Separate content using the pipe symbol as a separator:
# Split the " | "-delimited pathways over three columns, keeping the original
# column. Rows with fewer than three values are padded with NA on the right
# (fill = "right"), which is the expected situation and therefore should not
# warn. `extra` keeps its default, so a row with MORE than three values still
# raises a warning: those extra values would be dropped.
pathway <- pathway %>%
  separate(
    introduction_pathway,
    into = c(
      "introduction_pathway_1",
      "introduction_pathway_2",
      "introduction_pathway_3"
    ),
    sep = "\\s\\|\\s",
    remove = FALSE,
    fill = "right"
  )
Change from a wide to a long dataset and trim:
# One row per taxon and pathway; empty slots are dropped
pathway <- pathway %>%
  gather(
    key = "key",
    value = "value",
    starts_with("introduction_pathway_"),
    na.rm = TRUE
  ) %>%
  mutate(value = str_trim(value))
Inspect unique values:
group_by(pathway, value) %>% count()
Add the prefix `cbd_2014_pathway:` to refer to the CBD standard for pathway information:
pathway <- pathway %>%
  mutate(description = paste0("cbd_2014_pathway:", value))
Add type
of description:
pathway <- mutate(pathway, type = "pathway")
Select taxon_id
, description
and type
:
pathway <- select(pathway, taxon_id, description, type)
Create separate dataframe:
# Keep the first row for each taxon_id, with all its columns
degree_of_establishment <- distinct(input_data, taxon_id, .keep_all = TRUE)
Inspect values:
group_by(degree_of_establishment, degree_of_establishment) %>% count()
Remove empty records:
degree_of_establishment <- degree_of_establishment %>%
  drop_na(degree_of_establishment)
These categories of `degree_of_establishment` conform to the vocabulary of Blackburn et al. 2011 for mapping the degree of establishment. For each level, we append the corresponding Blackburn et al. 2011 category code (from categories A-E) in parentheses, prefixed with `blackburn_2011:`:
# Lookup from source value to labelled value, spliced into recode();
# values without an entry are kept unchanged
degree_of_establishment <- degree_of_establishment %>%
  mutate(description = recode(degree_of_establishment, !!!c(
    "captive" = "captive (blackburn_2011:B1)",
    "casual" = "casual (blackburn_2011:C1)",
    "colonizing" = "colonizing (blackburn_2011:D1)",
    "established" = "established (blackburn_2011:C3)",
    "invasive" = "invasive (blackburn_2011:D2)",
    "released" = "released (blackburn_2011:B3)",
    "reproducing" = "reproducing (blackburn_2011:C2)"
  )))
Inspect mapped values:
group_by(degree_of_establishment, degree_of_establishment, description) %>%
  count()
Add type
of description:
degree_of_establishment <- mutate(
  degree_of_establishment,
  type = "degree of establishment"
)
Select taxon_id
, description
and type
:
degree_of_establishment <- select(
  degree_of_establishment,
  taxon_id, description, type
)
Union native range, pathway of introduction and degree of establishment into a single description extension:
# Stack the three descriptor tables, then add the Darwin Core columns
description <- bind_rows(native_range, pathway, degree_of_establishment) %>%
  mutate(
    dwc_taxonID = taxon_id,
    dwc_description = description,
    dwc_type = type,
    dwc_language = "en"
  )
Only keep the Darwin Core columns:
taxon <- select(taxon, starts_with("dwc_"))
distribution <- select(distribution, starts_with("dwc_"))
species_profile <- select(species_profile, starts_with("dwc_"))
description <- select(description, starts_with("dwc_"))
Drop the dwc_
prefix:
# Remove the first "dwc_" from every column name
names(taxon) <- sub("dwc_", "", names(taxon), fixed = TRUE)
names(distribution) <- sub("dwc_", "", names(distribution), fixed = TRUE)
names(species_profile) <- sub("dwc_", "", names(species_profile), fixed = TRUE)
names(description) <- sub("dwc_", "", names(description), fixed = TRUE)
Sort on taxonID
(to maintain some consistency between updates of the dataset):
taxon <- arrange(taxon, taxonID)
distribution <- arrange(distribution, taxonID)
species_profile <- arrange(species_profile, taxonID)
description <- arrange(description, taxonID)
Preview taxon core:
head(taxon)
Preview distribution extension:
head(distribution)
Preview species profile extension:
head(species_profile)
Preview description extension:
head(description, 10)
Save to CSV:
# Write the core and the three extensions; missing values become empty strings
write_csv(
  taxon,
  here("data", "processed", "taxon.csv"),
  na = ""
)
write_csv(
  distribution,
  here("data", "processed", "distribution.csv"),
  na = ""
)
write_csv(
  species_profile,
  here("data", "processed", "speciesprofile.csv"),
  na = ""
)
write_csv(
  description,
  here("data", "processed", "description.csv"),
  na = ""
)