This document describes how to get and manipulate data from the unified checklist for checklist-based indicators.
Load libraries:
library(tidyverse) # To do data science
library(tidylog) # To provide feedback on dplyr functions
library(janitor) # To clean column names
library(here) # To find files
library(rgbif) # To get taxa from publisehd unified checklist
library(trias) # To get indicator functionalities
Get taxa from the published unified checklist, the Global Register of Introduced and Invasive Species - Belgium:
# Request up to 10000 name usages of the checklist dataset and keep only the
# records whose `origin` is "SOURCE".
taxa <-
  name_usage(
    datasetKey = "6d9e952f-948c-4483-9807-575348147c7e",
    limit = 10000
  )$data %>%
  filter(origin == "SOURCE")
Select the columns we are interested in:
# Names of the taxonomic columns to keep from the checklist taxa.
checklist_info <- c(
  "key", "nubKey", "scientificName", "datasetKey",
  "canonicalName", "species", "genus", "family",
  "order", "class", "phylum", "kingdom",
  "rank", "speciesKey", "genusKey", "familyKey",
  "orderKey", "classKey", "phylumKey", "kingdomKey",
  "taxonomicStatus", "acceptedKey", "accepted"
)
# `any_of()` replaces the superseded `one_of()`: it selects the listed columns
# that are present in `taxa` and skips the ones that are not.
taxa <-
  taxa %>%
  select(any_of(checklist_info))
Extract distribution information
# Request the "distribution" data of every taxon key, one call per key, and
# bind the returned data frames row-wise.
distribution <- map_dfr(
  taxa$key,
  function(taxon_key) {
    name_usage(taxon_key, data = "distribution")$data
  }
)
# The `source` column is not needed further on.
distribution <- select(distribution, -source)
head(distribution)
Extract description information
# Request the "description" data of every taxon key, one call per key, and
# bind the returned data frames row-wise.
description <- map_dfr(
  taxa$key,
  function(taxon_key) {
    name_usage(taxon_key, data = "description")$data
  }
)
# Drop the columns `key`, `language` and `source`.
description <- select(description, -c(key, language, source))
head(description, n = 10)
The data.frame `description` contains a column `type` with the following descriptors:
# Show the distinct descriptor types.
distinct(description, type)
Patch: the types `introduction pathway` and `pathway of introduction` should be renamed to `pathway`. Bug documented in issue #68.
# Harmonize the two alternative spellings of the pathway type to "pathway";
# every other value of `type` (including NA) is left untouched.
description <- mutate(
  description,
  type = replace(
    type,
    type %in% c("introduction pathway", "pathway of introduction"),
    "pathway"
  )
)
We are interested in the following types:
# Descriptor types to keep.
types <- c("native range", "degree of establishment", "pathway")
We filter out the other types:
# Keep only the rows whose type is one of `types`.
description <- filter(description, type %in% types)
We tidy this data.frame, thus having different descriptors as different columns:
# Spread the descriptors over columns, one column per value of `type`.
# The cells are collected into lists and then unnested one list-column at a
# time.
description <-
  description %>%
  as_tibble() %>%
  pivot_wider(
    names_from = type,
    values_from = description,
    # The names of `values_fn` must be names of `values_from` columns; the
    # value column here is `description`, so an entry called `name` is never
    # applied.
    values_fn = list(description = list)
  ) %>%
  unnest_longer(types[1]) %>%
  unnest_longer(types[2]) %>%
  unnest_longer(types[3])
description %>% head(n = 20)
We also standardize the column names (snake_case), keeping `taxonKey` as it is:
# Convert the column names to snake_case ...
description <- janitor::clean_names(description, case = "snake")
# ... and rename `taxon_key` back to `taxonKey`, the name used as join key.
description <- rename(description, taxonKey = taxon_key)
names(description)
## [1] "taxonKey" "pathway"
## [3] "degree_of_establishment" "native_range"
degree of establishment
Values in column `degree_of_establishment`:
# Show the distinct values before mapping.
distinct(description, degree_of_establishment)
We map them as follows:
# Lookup table: original value (name) -> standardized value (element).
degree_of_establishment_mapping <- c(
  "captive (blackburn_2011:B1)" = "captive",
  "blackburn_et_al_2011:B2" = "cultivated",
  "released (blackburn_2011:B3)" = "released",
  "blackburn_et_al_2011:C0" = "failing",
  "casual (blackburn_2011:C1)" = "casual",
  "reproducing (blackburn_2011:C2)" = "reproducing",
  "established (blackburn_2011:C3)" = "established",
  "blackburn_et_al_2011:C3" = "established",
  "colonizing (blackburn_2011:D1)" = "colonizing",
  "blackburn_et_al_2011:D2" = "invasive",
  "blackburn_et_al_2011:E" = "widespreadInvasive"
)
# Values that are not in the lookup table are left unchanged by `recode()`.
description <- mutate(
  description,
  degree_of_establishment = recode(
    degree_of_establishment,
    !!!degree_of_establishment_mapping
  )
)
Mapping result:
# Show the distinct values after mapping.
distinct(description, degree_of_establishment)
We use native range information to create a similar column containing native continent:
# Derive `native_continent` from `native_range` by case-insensitive substring
# matching. The first matching rule wins; ranges matching no rule get NA.
# NOTE(review): because matching is by substring, a value such as
# "Australasia" or "Eurasia" is caught by the "Asia" rule - confirm this is
# the intended classification.
description <- mutate(
  description,
  native_continent = case_when(
    grepl("Africa", native_range, ignore.case = TRUE) ~ "Africa",
    grepl("America", native_range, ignore.case = TRUE) ~ "America",
    grepl("Asia", native_range, ignore.case = TRUE) ~ "Asia",
    grepl("Australia|nesia", native_range, ignore.case = TRUE) ~ "Oceania",
    grepl("Europe", native_range, ignore.case = TRUE) ~ "Europe",
    TRUE ~ NA_character_
  )
)
Mapping result:
# Show the distinct continents obtained.
distinct(description, native_continent)
pathway
This section contains a series of patches to standardize pathway information.
Add prefix `cbd_2014_pathway:` where missing (see issue #69):
# Prepend the prefix to the pathways that do not start with it yet.
# Missing pathways stay missing.
description <- mutate(
  description,
  pathway = if_else(
    !str_starts(pathway, "cbd_2014_pathway:"),
    paste0("cbd_2014_pathway:", pathway),
    pathway
  )
)
Pathway `cbd_2014_pathway:agriculture` should be replaced by `cbd_2014_pathway:escape_agriculture` (see issue #38):
# Replace the pathway "agriculture" with "escape_agriculture"; all other
# values (including NA) are kept as they are.
description <- mutate(
  description,
  pathway = recode(
    pathway,
    "cbd_2014_pathway:agriculture" = "cbd_2014_pathway:escape_agriculture"
  )
)
Pathway `contaminant_plant` should be replaced by `contaminant_on_plants` (see issue #9):
# Replace the pathway "contaminant_plant" with "contaminant_on_plants"; all
# other values (including NA) are kept as they are.
description <- mutate(
  description,
  pathway = case_when(
    pathway == "cbd_2014_pathway:contaminant_plant" ~
      "cbd_2014_pathway:contaminant_on_plants",
    TRUE ~ pathway
  )
)
Remove pathway `unintentional` as it is not a valid pathway (see issue #3):
# Drop the rows with pathway "unintentional". Rows without a pathway (NA) are
# kept.
description <- filter(
  description,
  is.na(pathway) | pathway != "cbd_2014_pathway:unintentional"
)
Rename pathway `nursery` to `contaminant_nursery` (see issue #40):
# Replace the pathway "nursery" with "contaminant_nursery"; all other values
# (including NA) are kept as they are.
description <- mutate(
  description,
  pathway = replace(
    pathway,
    pathway %in% "cbd_2014_pathway:nursery",
    "cbd_2014_pathway:contaminant_nursery"
  )
)
Extract species profile
# Request the "speciesProfile" data of every taxon key, one call per key, and
# bind the returned data frames row-wise.
speciesProfiles <- map_dfr(
  taxa$key,
  function(taxon_key) {
    name_usage(taxon_key, data = "speciesProfile")$data
  }
)
head(speciesProfiles)
Merge distribution, description and species profile information:
# Full joins on `taxonKey`, so taxa present in only one of the three tables
# are retained.
merged_extensions <-
  distribution %>%
  full_join(description, by = "taxonKey") %>%
  full_join(speciesProfiles, by = "taxonKey")
We then merge them all with the taxonomic information:
# Attach the merged extension data to every taxon (`key` in `taxa`
# corresponds to `taxonKey` in the extensions).
merged_info <-
  taxa %>%
  left_join(merged_extensions, by = c("key" = "taxonKey"))
head(merged_info, n = 100)
Some columns should be split in order to make the data.frame completely tidy: `temporal` and `pathway`.
temporal
The column `temporal` contains one or two dates. If there are two dates, we split them into `first_observed` and `last_observed`. If only one date is present, it is used for both `first_observed` and `last_observed`.
# Split `temporal` on "/" into two columns. When there is no second piece,
# `last_observed` is filled with NA (fill = "right").
merged_info <- separate(
  merged_info,
  col = temporal,
  into = c("first_observed", "last_observed"),
  sep = "/",
  remove = TRUE,
  convert = TRUE,
  fill = "right"
)
# Where `last_observed` is missing, use `first_observed` for it as well.
merged_info <- mutate(
  merged_info,
  last_observed = ifelse(!is.na(last_observed), last_observed, first_observed)
)
An example:
Some `first_observed` values are greater than `last_observed` (see issue #41):
# Rows whose first observation comes after the last one.
wrong_temporal <- filter(merged_info, first_observed > last_observed)
distinct(wrong_temporal, key, first_observed, last_observed)
Correct them:
# Swap first_observed and last_observed in the flagged rows.
wrong_temporal <-
  wrong_temporal %>%
  mutate(
    aux = first_observed,
    first_observed = last_observed,
    last_observed = aux
  ) %>%
  select(-aux)
wrong_temporal %>%
  distinct(key, first_observed, last_observed)
# Replace the wrong rows with the corrected ones.
# The wrong rows are removed with the negation of the condition that flagged
# them (last_observed < first_observed), instead of an anti_join on a
# hard-coded list of columns. This does not depend on which columns are
# present and cannot remove rows that share those attributes but carry valid
# dates. Rows where either date is missing were never flagged, so they are
# kept.
merged_info <-
  merged_info %>%
  filter(
    is.na(first_observed) |
      is.na(last_observed) |
      last_observed >= first_observed
  ) %>%
  bind_rows(wrong_temporal) %>%
  arrange(key)
pathway
The column `pathway` contains a prefix, `cbd_2014_pathway:`, and two different pathway levels separated by the symbol `_`. (Note: this is not valid for pathway `natural_dispersal`, which should not be divided in two levels as it is a level 2 pathway of level 1 `unaided`.)
We split `pathway` into `pathway_level1` and `pathway_level2`, assigning the value `unknown` if the pathway is missing (`NA`) or a zero-length string (`""`). We also assign level 2 `natural_dispersal` to taxa with pathway level 1 `unaided`, as suggested by Tim Adriaens in trias-project/indicators#61 (comment):
# Split `pathway` into `pathway_level1` and `pathway_level2`, then drop
# `pathway`.
# The split is vectorized over the whole column (no `rowwise()`):
# 1. take the text after "pathway:" (e.g. "escape_agriculture");
# 2. split that text on the first "_" only, so a level 2 name such as
#    "on_plants" stays in one piece.
merged_info <-
  merged_info %>%
  mutate(
    pathway_suffix = str_split_fixed(pathway, pattern = "pathway:", n = 2)[, 2]
  ) %>%
  separate(
    col = pathway_suffix,
    into = c("pathway_level1", "pathway_level2"),
    sep = "_",
    extra = "merge",
    fill = "right"
  ) %>%
  mutate(
    # Level 2 is derived first because its rules read the raw level 1.
    pathway_level2 = case_when(
      # level 1 "unaided" without level 2 -> "natural_dispersal"
      pathway_level1 == "unaided" &
        (is.na(pathway_level2) | pathway_level2 == "") ~ "natural_dispersal",
      # level 1 "unknown" -> level 2 "unknown"
      pathway_level1 == "unknown" ~ "unknown",
      # missing or empty level 2 -> "unknown"
      is.na(pathway_level2) | pathway_level2 == "" ~ "unknown",
      # "natural_dispersal" was split into "natural" + "dispersal": restore it
      pathway_level2 == "dispersal" ~ "natural_dispersal",
      TRUE ~ pathway_level2
    ),
    pathway_level1 = case_when(
      # missing or empty level 1 -> "unknown"
      is.na(pathway_level1) | pathway_level1 == "" ~ "unknown",
      # "natural" is the first half of "natural_dispersal": its level 1 is
      # "unaided"
      pathway_level1 == "natural" ~ "unaided",
      TRUE ~ pathway_level1
    )
  ) %>%
  select(-pathway)
Full mapping:
The data.frame merged_info
is saved as output file:
# Write `merged_info` as a tab-separated file; missing values are written as
# empty strings.
output_file <- here::here(
  "data", "interim", "data_input_checklist_indicators.tsv"
)
write_tsv(merged_info, file = output_file, na = "")
This file is the starting point for building checklist-based indicators.