5 Unify related information
In this chapter we unify related information for each verified taxon.
5.1 Read data
# Checklist metadata (column types are guessed by readr)
checklists <- read_csv(file = here("data", "raw", "checklists.csv"))

# Taxa with verification information (interim file, column types guessed)
taxa <- read_csv(file = here("data", "interim", "taxa_with_verification.csv"))

# Distributions: taxonKey is numeric, every other column is read as text
distributions <- read_csv(
  file = here("data", "raw", "distributions.csv"),
  col_types = cols(
    taxonKey = col_double(),
    .default = col_character()
  )
)

# Species profiles (column types are guessed by readr)
speciesprofiles <- read_csv(file = here("data", "raw", "speciesprofiles.csv"))

# Descriptions: key and taxonKey are numeric, every other column is read as text
descriptions <- read_csv(
  file = here("data", "raw", "descriptions.csv"),
  col_types = cols(
    key = col_double(),
    taxonKey = col_double(),
    .default = col_character()
  )
)
5.2 Assign checklist order
Get checklist keys as (ordered) vector.
Assign the checklist order (= index of checklist keys) as an extra column to taxa.
5.3 Unify distribution
- Parse `temporal` (eventDate) information.
# Derive integer startYear and endYear columns from temporal
distributions_unified <-
  distributions %>%
  # temporal holds either a single date or a "start/end" range: the piece
  # before the first "/" becomes startYear, the next piece (if any) endYear.
  # Without a range endYear is NA; any further pieces are dropped.
  separate(
    col = temporal,
    into = c("startYear", "endYear"),
    sep = "/",
    remove = FALSE,
    convert = TRUE,
    extra = "drop",
    fill = "right"
  ) %>%
  # Reduce dates to their year (1968-11-21 -> 1968) and store as integer
  mutate(
    startYear = as.integer(str_sub(startYear, start = 1, end = 4)),
    endYear = as.integer(str_sub(endYear, start = 1, end = 4))
  ) %>%
  # No range: endYear falls back on startYear (2018 -> 2018 & 2018)
  mutate(endYear = coalesce(endYear, startYear))
Filter distributions: this was already done in 1.3, where only distributions of interest were kept.
Convert specific localities (from WRIMS) to regional or national localities:
# Map specific localities to a regional or national locality and derive a
# standardized locationId from the resulting locality.
distributions_unified <-
  distributions_unified %>%
  # Localities that are not listed here are left unchanged by recode()
  mutate(locality = recode(locality,
    "Belgian part of the North Sea" = "Belgium",
    "Belgian Exclusive Economic Zone" = "Belgium",
    "Baai van Heist" = "Flemish Region",
    "Belgian Coast" = "Flemish Region",
    "Blankenberge Harbour" = "Flemish Region",
    "Bredene" = "Flemish Region",
    "Coastal Polders" = "Flemish Region",
    "De Panne" = "Flemish Region",
    "Haven van Antwerpen" = "Flemish Region",
    "Kanaal Gent - Terneuzen" = "Flemish Region",
    "Knokke-Heist" = "Flemish Region",
    "Koksijde" = "Flemish Region",
    "Nieuwpoort Harbour" = "Flemish Region",
    # "Oostende" was listed twice; the redundant duplicate entry is removed
    "Oostende" = "Flemish Region",
    "Oostende Harbour" = "Flemish Region",
    "port of Bruges-Zeebrugge" = "Flemish Region",
    "Port of Ghent" = "Flemish Region",
    "Spuikom" = "Flemish Region",
    "Zeebrugge" = "Flemish Region",
    "Flanders" = "Flemish Region",
    "Wallonia" = "Walloon Region"
  )) %>%
  # Standardize locationId for all records; localities other than the four
  # below get NA
  mutate(locationId = case_when(
    locality == "Belgium" ~ "ISO_3166:BE",
    locality == "Brussels-Capital Region" ~ "ISO_3166:BE-BRU",
    locality == "Flemish Region" ~ "ISO_3166:BE-VLG",
    locality == "Walloon Region" ~ "ISO_3166:BE-WAL",
    TRUE ~ NA_character_
  ))
- Add a Belgian distribution from regional distributions within a checklist if not present.
# Build a national ("Belgium") distribution for every taxonKey that only has
# other (regional) distributions, spanning the earliest startYear and latest
# endYear of those distributions.
distributions_belgium_from_regions <-
  distributions_unified %>%
  # Group by taxonKey within a checklist
  group_by(taxonKey) %>%
  # Filter on those groups without a national distribution
  filter(!("Belgium" %in% locality)) %>%
  # Take earliest and latest year from regional distributions
  # Don't copy source and remarks
  # min()/max() with na.rm = TRUE return Inf/-Inf (and warn) when a group has
  # no known year at all, so return an integer NA for such groups directly
  summarize(
    startYear = if (all(is.na(startYear))) {
      NA_integer_
    } else {
      as.integer(min(startYear, na.rm = TRUE))
    },
    endYear = if (all(is.na(endYear))) {
      NA_integer_
    } else {
      as.integer(max(endYear, na.rm = TRUE))
    }
  ) %>%
  ungroup() %>%
  # Add locality and locationId
  mutate(
    locality = "Belgium",
    locationId = "ISO_3166:BE"
  ) %>%
  # Order columns as in distributions_unified (keeping those that exist here)
  select(one_of(names(distributions_unified)))

# Add newly created Belgian distributions to distributions
distributions_unified <-
  distributions_unified %>%
  bind_rows(distributions_belgium_from_regions) %>%
  # Sort by taxonKey
  arrange(taxonKey)
- Choose a single distribution per locality within a checklist.
Note: In contrast to the other extensions, the distribution information can come from multiple taxa within the same checklist. For example:
scientific name | eventDate |
---|---|
https://www.gbif.org/species/141266662/verbatim | 1885/1901 |
Medicago monantha subsp. noeana (Boiss.) Greuter & Burdet | 1886/1955 |
Both are considered (verified) synonyms of Medicago monantha Trautv., so their distribution information is merged to `1885/1955`. We note both in `taxonKeys` and `scientificNames`.
# Merge the distributions of all taxa that share a verificationKey into a
# single record per locality within a checklist.
distributions_unified <-
  distributions_unified %>%
  # Join distribution with taxon to get verificationKey and checklistOrder
  left_join(taxa, by = "taxonKey") %>%
  # Remove records that have no verificationKey (e.g. one wasn't assigned yet)
  filter(!is.na(verificationKey)) %>%
  # Group by verificationKey and locality within a checklist
  group_by(
    datasetKey,
    checklistOrder,
    verificationKey,
    locality,
    locationId
  ) %>%
  # Take earliest year, latest year and note taxonKey(s) and scientificName(s)
  # min()/max() with na.rm = TRUE return Inf/-Inf (and warn) when a group has
  # no known year at all, so return an integer NA for such groups directly
  summarize(
    startYear = if (all(is.na(startYear))) {
      NA_integer_
    } else {
      as.integer(min(startYear, na.rm = TRUE))
    },
    endYear = if (all(is.na(endYear))) {
      NA_integer_
    } else {
      as.integer(max(endYear, na.rm = TRUE))
    },
    # Contributing taxa, pipe-separated
    taxonKeys = paste(unique(taxonKey), collapse = "|"),
    scientificNames = paste(unique(scientificName), collapse = "|")
  ) %>%
  ungroup()
- Choose a single distribution per locality across checklists.
# Keep one distribution per verificationKey and locality across checklists
distributions_unified <-
  distributions_unified %>%
  # Lowest checklistOrder (most trustworthy checklist) first, so that it is the
  # first row of every group below
  arrange(checklistOrder) %>%
  group_by(verificationKey, locality, locationId) %>%
  # Keep the years of that first checklist, together with its datasetKey,
  # taxonKey(s) and scientificName(s)
  summarize(
    startYear = startYear[1L],
    endYear = endYear[1L],
    datasetKey = datasetKey[1L],
    taxonKeys = taxonKeys[1L],
    scientificNames = scientificNames[1L]
  ) %>%
  ungroup() %>%
  # Final order: by verificationKey
  arrange(verificationKey)
- Save to CSV.
5.4 Unify species profiles
- Filter species profiles.
# Keep only complete species profiles: marine, freshwater and terrestrial must
# all be populated. Incomplete profiles are rare: usually either all three
# attributes are filled in or the species has no species profile at all.
speciesprofiles_unified <-
  speciesprofiles %>%
  filter(
    !is.na(marine),
    !is.na(freshwater),
    !is.na(terrestrial)
  )
- Choose a single species profile within a checklist.
# Keep one species profile per verificationKey within a checklist.
speciesprofiles_unified <-
  speciesprofiles_unified %>%
  # Join species profile with taxon to get verificationKey and checklist order
  left_join(taxa, by = "taxonKey") %>%
  # Remove records that have no verificationKey (e.g. one wasn't assigned yet)
  filter(!is.na(verificationKey)) %>%
  # Group by verificationKey within checklist
  group_by(
    datasetKey,
    checklistOrder,
    verificationKey
  ) %>%
  # Take first species profile and note its taxonKey and scientificName
  summarize(
    marine = first(marine),
    freshwater = first(freshwater),
    terrestrial = first(terrestrial),
    taxonKey = first(taxonKey),
    scientificName = first(scientificName)
  ) %>%
  # summarize() only drops the last grouping level: remove the remaining
  # grouping so later steps work on an ungrouped data frame
  ungroup()
- Choose a single species profile across checklists.
# Keep one species profile per verificationKey across checklists
speciesprofiles_unified <-
  speciesprofiles_unified %>%
  # Lowest checklistOrder (most trustworthy checklist) first, so that it is the
  # first row of every group below
  arrange(checklistOrder) %>%
  group_by(verificationKey) %>%
  # Keep the profile of that first checklist, together with its datasetKey,
  # taxonKey and scientificName
  summarize(
    marine = marine[1L],
    freshwater = freshwater[1L],
    terrestrial = terrestrial[1L],
    datasetKey = datasetKey[1L],
    taxonKey = taxonKey[1L],
    scientificName = scientificName[1L]
  ) %>%
  # Final order: by verificationKey
  arrange(verificationKey)
- Save to CSV.
5.5 Unify descriptions
5.5.1 Native range
Retrieve native range.
Standardize native range:
# Standardize the free-text native range descriptions to a controlled list of
# regions. A description can map to several regions, separated by " | ".
# Descriptions mapped to "" (and, via .default, all descriptions not listed
# here) end up as an empty string.
#
# NOTE(review): every multi-region value below repeats its first region (e.g.
# "Eurasia" = "Europe | Europe", "Central And South America" =
# "Central America | Central America"). This looks like the second region got
# lost; verify these values against the intended mapping.
native_range <-
  native_range %>%
  mutate(description = recode(description,
    "Africa" = "Africa",
    "Africa (WGSRPD:2)" = "Africa",
    "Algeria" = "Northern Africa",
    "America" = "Americas",
    "Arctic" = "",
    "Asia" = "Asia",
    "Asia And Pacific Islands" = "Asia | Asia | Micronesia | Polynesia",
    "Australasia (WGSRPD:5)" = "Australia and New Zealand",
    "Australia" = "Australia and New Zealand",
    "Australia (WGSRPD:50)" = "Australia and New Zealand",
    "Austria" = "Western Europe",
    "Balearic Islands" = "Southern Europe",
    "Balkan" = "Southern Europe | Southern Europe",
    "Bolivia, Chile, Argentina" = "South America",
    "Brazil, Uruguay, Paraguay, Argentina" = "South America",
    "Canada To Peru, Caribbean" = "Americas",
    "Canary Is. To Turkey" = "Northern Africa | Northern Africa",
    "Canary Islands (WGSRPD:21_CNY)" = "Northern Africa",
    "Cape Verde Is., Africa, Madagascar, Comoros" = "Africa",
    "Caribbean (WGSRPD:81)" = "Caribbean",
    "Caribbean To Brazil" = "Caribbean | Caribbean",
    "Carpathians" = "Eastern Europe",
    "Caucasus region" = "Western Asia",
    "Central America (WGSRPD:80)" = "Central America",
    "Central And South America" = "Central America | Central America",
    "Central Asia" = "Central Asia",
    "central Europe" = "Western Europe | Western Europe | Eastern Europe",
    "central Italy" = "Southern Europe",
    "central Mediterranean coastal areas" = "Southern Europe | Southern Europe",
    "central Turkey" = "Western Asia",
    "Central, South America" = "Central America | Central America",
    "China" = "Eastern Asia",
    "China (WGSRPD:36)" = "Eastern Asia",
    "circum western Mediterranean coastal areas" = "Southern Europe | Southern Europe",
    "circum-Mediterranean" = "Southern Europe | Southern Europe",
    "coastal areas of the western Balkan" = "Southern Europe",
    "Corsica" = "Southern Europe",
    "Cosmopolitan" = "",
    "Costa Rica (WGSRPD:80_COS)" = "Central America",
    "Costa Rica, Panama" = "Central America",
    "Crete" = "Southern Europe",
    "cultivated origin" = "",
    "Cyprus, Turkey, Middle East, Egypt, Sudan" = "Western Asia | Western Asia",
    "East Asia" = "Eastern Asia",
    "eastern Africa" = "Eastern Africa",
    "Eastern Africa" = "Eastern Africa",
    "Eastern Asia (WGSRPD:38)" = "Eastern Asia",
    "Eastern Asian Russia" = "Eastern Asia | Eastern Asia",
    "Eastern Europe" = "Eastern Europe",
    "Eastern Europe (WGSRPD:14)" = "Eastern Europe",
    "Eastern North America" = "Northern America",
    "Eastern Palearctic Including China, Korea, Japan" = "Eastern Asia",
    "Egypt, Ethiopia, Kenya, Uganda, Rwanda" = "Northern Africa | Northern Africa",
    "Ethiopia" = "Northern Africa",
    "Eurasia" = "Europe | Europe",
    "Eurasia Africa" = "Europe | Europe | Africa",
    "Europe (WGSRPD:1)" = "Europe",
    "Europe To Azerbaijan, Egypt" = "Europe | Europe | Northern Africa",
    "Europe To Northern Africa" = "Europe | Europe",
    "Europe, Africa, Yemen, Pakistan, Philippines" = "Europe | Europe | Asia",
    "Europe, North Africa To Near East, Turkey, Caucasus, Russia To Central Asia, Iran, Afghanistan, China, Mongolia, Korea" = "Europe | Europe | Western Asia",
    "Galapagos (WGSRPD:GAL)" = "South America",
    "Greece" = "Southern Europe",
    "Hawaiian Islands (WGSRPD:HI)" = "Northern America",
    "hybrid origin" = "",
    "Iberia" = "Southern Europe",
    "Iberian Peninsula" = "Southern Europe",
    "India, Southeast Asia To Australia, New Zealand" = "Southern Asia | Southern Asia | Australia and New Zealand",
    "Indian Subcontinent (WGSRPD:40)" = "Southern Asia",
    "Indo-Pacific" = "Africa | Africa | Australia and New Zealand",
    "Indonesia (WGSRPD:ID)" = "Southeastern Asia",
    "Italian Peninsula" = "Southern Europe",
    "Italy" = "Southern Europe",
    "Japan (WGSRPD:38_JAP)" = "Eastern Asia",
    "Japan (WGSRPD:JAP)" = "Eastern Asia",
    "Macaronesia" = "Northern Africa",
    "Macaronesia, Europe, North Africa To Turkey, Caucasus, Turkmenistan, Iran" = "Northern Africa | Northern Africa | Western Asia",
    "Macaronesia, North Africa, Europe, Turkey, Caucasus, Near East, Kazakhstan" = "Northern Africa | Northern Africa | Western Asia",
    "Madagascar (WGSRPD:MDG)" = "Eastern Africa",
    "Mallorca" = "Southern Europe",
    "Mediterranean" = "Southern Europe",
    "Mediterranean & Portugal" = "Southern Europe",
    "Mediterranean To Central Asia" = "Southern Europe | Southern Europe",
    "Mediterranean To Russia" = "Southern Europe | Southern Europe",
    "Mediterranean To Turkey, Israel" = "Southern Europe | Southern Europe",
    "Mediterranean, Africa, India, Myanmar, China" = "Southern Europe | Southern Europe | Asia",
    "Mediterranean, Northern Africa" = "Southern Europe | Southern Europe",
    "Middle Africa" = "Middle Africa",
    "Middle East" = "Western Asia",
    "Morocco" = "Northern Africa",
    "Nam" = "Northern America",
    "Near East" = "Western Asia",
    "Nearctic" = "",
    "Neotropic" = "",
    "New Zealand" = "Australia and New Zealand",
    "New Zealand (WGSRPD:51)" = "Australia and New Zealand",
    "North Africa" = "Northern Africa",
    "North America" = "Northern America",
    # Target was "North Africa": aligned with the "Northern Africa" spelling
    # used by all other values, so it does not become a separate region
    "North America, Europe, North Africa, Turkey, Israel, Caucasus, Russia (European To Far East), Central Asia, China, Korea" = "Northern America | Northern America | Asia | Northern Africa",
    "North Pacific Ocean" = "",
    "North, Central And South America" = "Americas",
    "Northeast Asia" = "Eastern Asia",
    "Northern Africa" = "Northern Africa",
    "Northern Africa And Middle East" = "Northern Africa | Northern Africa",
    "Northern Africa, Southern Europe To Turkey" = "Northern Africa | Northern Africa | Western Asia",
    "Northern America" = "Northern America",
    "Northern America (WGSRPD:7)" = "Northern America",
    "northern Balkan" = "Eastern Europe",
    "northern coastal areas of the western Mediterranean" = "Southern Europe",
    "northern Italy" = "Southern Europe",
    "northern part of the Iberian Peninsula" = "Southern Europe",
    "northwestern Croatia" = "Southern Europe",
    "northwestern Italy" = "Southern Europe",
    "nortwestern Africa" = "Northern Africa",
    "Palearctic" = "",
    "pan-American" = "Americas",
    "Panama (WGSRPD:PAN)" = "Central America",
    "Pantropical" = "",
    "Papua New Guinea" = "Melanesia",
    "Philippines (WGSRPD:PHI)" = "Southeastern Asia",
    "Ponto-Caspian" = "Eastern Europe",
    "Portugal (WGSRPD:POR)" = "Southern Europe",
    "Probably Native To North America Only" = "Northern America",
    "Russia" = "Eastern Europe",
    "Sardinia" = "Southern Europe",
    "Sicily" = "Southern Europe",
    "Slovenia" = "Eastern Europe",
    "South America" = "South America",
    "Southeast Asia" = "Southeastern Asia",
    "southeastern Alps" = "Southern Europe",
    "Southeastern Asia" = "Southeastern Asia",
    "southeastern Europe" = "Southern Europe",
    "Southeastern Europe (WGSRPD:13)" = "Southern Europe",
    "southeastern France" = "Western Europe",
    "Southern Africa" = "Southern Africa",
    "Southern Africa (WGSRPD:27)" = "Southern Africa",
    "southern Alps" = "Southern Europe",
    "Southern America (WGSRPD:8)" = "South America",
    "Southern Asia" = "Southern Asia",
    "southern Europe" = "Southern Europe",
    "Southern Europe" = "Southern Europe",
    "Southern Europe, Africa, Turkey, Caucasus, Near East, Iran" = "Southern Europe | Southern Europe | Western Asia",
    "southern France" = "Western Europe",
    "Southern Hemisphere" = "",
    "Southwestern Africa" = "Southern Africa",
    "southwestern France" = "Western Europe",
    "southwestern Germany" = "Western Europe",
    "Spain" = "Southern Europe",
    "Spain (WGSRPD:SPA)" = "Southern Europe",
    "Spain, Italy, North Africa" = "Southern Europe | Southern Europe",
    "Sub-Saharan Africa" = "Western Africa | Western Africa | Eastern Africa | Southern Africa",
    "Switzerland" = "Western Europe",
    "Tanzania (WGSRPD:TAN)" = "Eastern Africa",
    "Tasmania (WGSRPD:50_TAS)" = "Australia and New Zealand",
    "Temperate Asia" = "Eastern Asia | Eastern Asia | Eastern Europe | Western Asia",
    "temperate Asia (WGSRPD:3)" = "Eastern Asia | Eastern Asia | Eastern Europe | Western Asia",
    "Tropical Africa" = "",
    "Tropical and warm seas" = "",
    "Tropical Asia" = "Southern Asia | Southern Asia",
    "tropical Asia (WGSRPD:4)" = "Southern Asia | Southern Asia",
    "tropical western Africa" = "Western Africa",
    "Tunisia" = "Northern Africa",
    "unclear" = "",
    "United States" = "Northern America",
    "Usa To Guatemala" = "Northern America | Northern America",
    "West Africa" = "Western Africa",
    "Western Africa" = "Western Africa",
    "Western Asia (WGSRPD:34)" = "Western Asia",
    "Western Atlantic" = "",
    "western Balkan" = "Southern Europe",
    "western circum-Mediterranean" = "Southern Europe | Southern Europe",
    "western Italy" = "Southern Europe",
    "western Mediterranean" = "Southern Europe",
    "western Mediterranean coastal areas" = "Southern Europe",
    "western Mediterranean region" = "Southern Europe",
    .default = "" # Change to description to discover new values
  ))
5.5.2 Degree of establishment
- Rename invasion stage to degree of establishment, so the values can be combined.
5.5.3 Combine and process descriptions
Combine descriptions.
- Split and gather descriptions on `|`.
- Remove `NA` and empty descriptions.
- Select unique descriptions (within their type) within a checklist.
# Keep every description only once per type and verificationKey within a
# checklist.
descriptions_unified <-
  descriptions_unified %>%
  # Join descriptions with taxon to get verificationKey and checklist order
  left_join(taxa, by = "taxonKey") %>%
  # Remove records that have no verificationKey (e.g. one wasn't assigned yet)
  filter(!is.na(verificationKey)) %>%
  # Group by type, description and verificationKey within checklist
  group_by(
    datasetKey,
    checklistOrder,
    verificationKey,
    type,
    description
  ) %>%
  # Take first taxonKey and scientificName
  summarize(
    taxonKey = first(taxonKey),
    scientificName = first(scientificName)
  ) %>%
  # summarize() only drops the last grouping level: remove the remaining
  # grouping so later steps work on an ungrouped data frame
  ungroup()
- Select unique descriptions (within their type) across checklists:
# Keep every description only once per type and verificationKey across
# checklists, attributed to the most trustworthy checklist that has it.
descriptions_unified <-
  descriptions_unified %>%
  # Sort by checklist order (trustworthiness)
  arrange(checklistOrder) %>%
  # Group by type, description and verificationKey across checklists
  group_by(
    type,
    description,
    verificationKey
  ) %>%
  # Select first datasetKey, taxonKey and scientificName
  # (checklistOrder is not summarized and is therefore dropped here)
  summarize(
    datasetKey = first(datasetKey),
    taxonKey = first(taxonKey),
    scientificName = first(scientificName)
  ) %>%
  # The result would otherwise stay grouped by type and description
  ungroup() %>%
  # Move verificationKey to beginning
  select(verificationKey, everything()) %>%
  # Sort by verificationKey and type
  arrange(verificationKey, type)
- Save to CSV.