• 1 Get taxa from checklists
    • 1.1 Choose checklists
    • 1.2 Get taxa
    • 1.3 Filter on distributions
    • 1.4 Get GBIF backbone taxonomy information
    • 1.5 Show summary and save
  • 2 Get related information
    • 2.1 Read taxa
    • 2.2 Get distributions
    • 2.3 Get species profiles
    • 2.4 Get descriptions
  • 3 Verify taxa
    • 3.1 Read taxa
    • 3.2 Read verification information
    • 3.3 Run and save verification
    • 3.4 Show verification status
      • 3.4.1 Unmatched taxa
      • 3.4.2 Synonyms
      • 3.4.3 Accepted taxa
      • 3.4.4 Erroneous verificationKeys
      • 3.4.5 Additional information
  • 4 Unify taxa
    • 4.1 Read taxa
    • 4.2 Unify taxa
    • 4.3 Get GBIF backbone taxonomy information
    • 4.4 Explicitly remove incorrect taxa
  • 5 Unify related information
    • 5.1 Read data
    • 5.2 Assign checklist order
    • 5.3 Unify distribution
    • 5.4 Unify species profiles
    • 5.5 Unify descriptions
      • 5.5.1 Native range
      • 5.5.2 Degree of establishment
      • 5.5.3 Combine and process descriptions
  • 6 Darwin Core mapping
    • 6.1 Read unified data
    • 6.2 Preview data
    • 6.3 How we cite our sources
    • 6.4 Taxon core
      • 6.4.1 Pre-processing
      • 6.4.2 Term mapping
      • 6.4.3 Post-processing
    • 6.5 Distribution extension
      • 6.5.1 Pre-processing
      • 6.5.2 Term mapping
      • 6.5.3 Post-processing
    • 6.6 Species profile extension
      • 6.6.1 Term mapping
      • 6.6.2 Post-processing
    • 6.7 Description extension
      • 6.7.1 Pre-processing
      • 6.7.2 Term mapping
      • 6.7.3 Post-processing
  • 7 GRIIS mapping
    • 7.1 Read Darwin Core data
    • 7.2 GRIIS format
      • 7.2.1 Pre-processing
      • 7.2.2 Term mapping
      • 7.2.3 eventDate
    • 7.3 Post-processing
  • 8 Get synonyms of all accepted taxa
    • 8.1 Get unified taxa
    • 8.2 Find synonyms

Global Register of Introduced and Invasive Species - Belgium

5 Unify related information

In this chapter we unify related information for each verified taxon.

5.1 Read data

# Checklist metadata
checklists <- read_csv(file = here("data", "raw", "checklists.csv"))

# Taxa with verification information (column types are guessed by readr)
taxa <- read_csv(file = here("data", "interim", "taxa_with_verification.csv"))

# Distributions: all columns as text, except the numeric taxonKey
distributions <- read_csv(
  file = here("data", "raw", "distributions.csv"),
  col_types = cols(taxonKey = col_double(), .default = col_character())
)

# Species profiles (column types are guessed by readr)
speciesprofiles <- read_csv(file = here("data", "raw", "speciesprofiles.csv"))

# Descriptions: all columns as text, except the numeric key and taxonKey
descriptions <- read_csv(
  file = here("data", "raw", "descriptions.csv"),
  col_types = cols(
    key = col_double(),
    taxonKey = col_double(),
    .default = col_character()
  )
)

5.2 Assign checklist order

  1. Get checklist keys as (ordered) vector.

  2. Assign the checklist order (= index of checklist keys) as an extra column to taxa.

5.3 Unify distribution

  1. Parse temporal (eventDate) information.
distributions_unified <-
  distributions %>%

  # temporal is either a single date/year or a "start/end" range: cut it on
  # "/" and keep the original column. Without a range, endYear is left empty.
  separate(
    col = temporal,
    into = c("startYear", "endYear"),
    sep = "/",
    remove = FALSE,
    convert = TRUE,
    extra = "drop",
    fill = "right"
  ) %>%

  # Reduce both parts to their year (first 4 characters: 1968-11-21 -> 1968).
  # An empty endYear falls back on the (already reduced) startYear, so a single
  # year such as 2018 results in 2018 & 2018.
  mutate(
    startYear = as.integer(str_sub(startYear, start = 1, end = 4)),
    endYear = coalesce(
      as.integer(str_sub(endYear, start = 1, end = 4)),
      startYear
    )
  )
  1. Filter distributions: this was already done in 1.3, where only distributions of interest were kept.

  2. Convert specific localities (from WRIMS) to regional or national localities:

distributions_unified <-
  distributions_unified %>%

  # Map specific localities to the national or regional locality they belong
  # to, and harmonize the region names. Localities that are not listed are
  # kept as they are (recode() without .default).
  # Each locality is listed once: "Oostende" used to appear twice with the
  # same value.
  mutate(locality = recode(locality,
    "Belgian part of the North Sea" = "Belgium",
    "Belgian Exclusive Economic Zone" = "Belgium",
    "Baai van Heist" = "Flemish Region",
    "Belgian Coast" = "Flemish Region",
    "Blankenberge Harbour" = "Flemish Region",
    "Bredene" = "Flemish Region",
    "Coastal Polders" = "Flemish Region",
    "De Panne" = "Flemish Region",
    "Haven van Antwerpen" = "Flemish Region",
    "Kanaal Gent - Terneuzen" = "Flemish Region",
    "Knokke-Heist" = "Flemish Region",
    "Koksijde" = "Flemish Region",
    "Nieuwpoort Harbour" = "Flemish Region",
    "Oostende" = "Flemish Region",
    "Oostende Harbour" = "Flemish Region",
    "port of Bruges-Zeebrugge" = "Flemish Region",
    "Port of Ghent" = "Flemish Region",
    "Spuikom" = "Flemish Region",
    "Zeebrugge" = "Flemish Region",
    "Flanders" = "Flemish Region",
    "Wallonia" = "Walloon Region"
  )) %>%

  # Standardize locationId for all: derive it from the (recoded) locality.
  # Any locality other than the four below gets no locationId (NA).
  mutate(locationId = case_when(
    locality == "Belgium" ~ "ISO_3166:BE",
    locality == "Brussels-Capital Region" ~ "ISO_3166:BE-BRU",
    locality == "Flemish Region" ~ "ISO_3166:BE-VLG",
    locality == "Walloon Region" ~ "ISO_3166:BE-WAL",
    TRUE ~ NA_character_
  ))
  1. Add a Belgian distribution from regional distributions within a checklist if not present.
# Derive a national ("Belgium") distribution for every taxonKey that only has
# other (regional) distributions
distributions_belgium_from_regions <-
  distributions_unified %>%

  # Group by taxonKey within a checklist
  group_by(taxonKey) %>%

  # Keep the groups in which none of the localities is "Belgium"
  filter(!("Belgium" %in% locality)) %>%

  # Span the regional distributions: earliest start year, latest end year
  # (source and remarks are deliberately not carried over)
  summarize(
    startYear = min(startYear, na.rm = TRUE),
    endYear = max(endYear, na.rm = TRUE)
  ) %>%
  ungroup() %>%

  # min()/max() return Inf/-Inf for a group without any year: set those to NA
  # and cast the other years to integer. Then label the record as national.
  mutate(
    startYear = ifelse(is.infinite(startYear), NA_integer_, as.integer(startYear)),
    endYear = ifelse(is.infinite(endYear), NA_integer_, as.integer(endYear)),
    locality = "Belgium",
    locationId = "ISO_3166:BE"
  ) %>%

  # Follow the column order of distributions_unified
  select(one_of(names(distributions_unified)))

# Append the derived Belgian distributions and sort everything by taxonKey
distributions_unified <-
  bind_rows(distributions_unified, distributions_belgium_from_regions) %>%
  arrange(taxonKey)
  1. Choose a single distribution per locality within a checklist.

Note: In contrast to the other extensions, the distribution information can come from multiple taxa within the same checklist. For example:

scientific name eventDate
https://www.gbif.org/species/141266662/verbatim 1885/1901
Medicago monantha subsp. noeana (Boiss.) Greuter & Burdet 1886/1955

Both are considered (verified) synonyms of Medicago monantha Trautv., so their distribution information is merged to 1885/1955. We note both in taxonKeys and scientificNames.

distributions_unified <-
  distributions_unified %>%

  # Look up the taxon of each distribution: this adds (among others) the
  # verificationKey and checklistOrder used below
  left_join(taxa, by = "taxonKey") %>%

  # Distributions of taxa without verificationKey (e.g. not assigned yet)
  # cannot be unified: drop them
  filter(!is.na(verificationKey)) %>%

  # One group = one verified taxon in one locality within one checklist
  group_by(datasetKey, checklistOrder, verificationKey, locality, locationId) %>%

  # Merge the group into a single record: widest year range, plus all
  # contributing taxonKeys and scientificNames as pipe-separated lists
  summarize(
    startYear = min(startYear, na.rm = TRUE),
    endYear = max(endYear, na.rm = TRUE),
    taxonKeys = paste(unique(taxonKey), collapse = "|"),
    scientificNames = paste(unique(scientificName), collapse = "|")
  ) %>%
  ungroup() %>%

  # min()/max() return Inf/-Inf for a group without any year: set those to NA
  # and cast the other years to integer
  mutate(
    startYear = ifelse(is.infinite(startYear), NA_integer_, as.integer(startYear)),
    endYear = ifelse(is.infinite(endYear), NA_integer_, as.integer(endYear))
  )
  1. Choose a single distribution per locality across checklists.
distributions_unified <-
  distributions_unified %>%

  # Put the most trustworthy checklist (lowest checklistOrder) on top, so that
  # first() below picks its values
  arrange(checklistOrder) %>%

  # One group = one verified taxon in one locality, across all checklists
  group_by(verificationKey, locality, locationId) %>%

  # Keep the years of the most trustworthy checklist, together with that
  # checklist (datasetKey) and its taxonKey(s) and scientificName(s)
  summarize(
    startYear = first(startYear),
    endYear = first(endYear),
    datasetKey = first(datasetKey),
    taxonKeys = first(taxonKeys),
    scientificNames = first(scientificNames)
  ) %>%
  ungroup() %>%

  # Order the result by verified taxon
  arrange(verificationKey)
  1. Save to CSV.

5.4 Unify species profiles

  1. Filter species profiles.
speciesprofiles_unified <-
  speciesprofiles %>%

  # Only keep complete species profiles, i.e. with a value for marine,
  # freshwater and terrestrial. Incomplete profiles are rare: normally all
  # attributes are populated or a species has no species profile at all.
  filter(
    !is.na(marine),
    !is.na(freshwater),
    !is.na(terrestrial)
  )
  1. Choose a single species profile within a checklist.
speciesprofiles_unified <-
  speciesprofiles_unified %>%

  # Join species profile with taxon to get verificationKey and checklistOrder
  left_join(taxa, by = "taxonKey") %>%

  # Remove records that have no verificationKey (e.g. one wasn't assigned yet)
  filter(!is.na(verificationKey)) %>%

  # Group by verificationKey within checklist
  group_by(
    datasetKey,
    checklistOrder,
    verificationKey
  ) %>%

  # Take first species profile and note its taxonKey and scientificName
  summarize(
    marine = first(marine),
    freshwater = first(freshwater),
    terrestrial = first(terrestrial),
    taxonKey = first(taxonKey),
    scientificName = first(scientificName)
  ) %>%

  # summarize() only drops the last grouping level: remove the remaining
  # grouping so that later steps start from an ungrouped data frame
  ungroup()
  1. Choose a single species profile across checklists.
speciesprofiles_unified <-
  speciesprofiles_unified %>%

  # Put the most trustworthy checklist (lowest checklistOrder) on top, so that
  # first() below picks its species profile
  arrange(checklistOrder) %>%

  # One group = one verified taxon, across all checklists
  group_by(verificationKey) %>%

  # Keep the species profile of the most trustworthy checklist, together with
  # that checklist (datasetKey) and its taxonKey and scientificName
  summarize(
    marine = first(marine),
    freshwater = first(freshwater),
    terrestrial = first(terrestrial),
    datasetKey = first(datasetKey),
    taxonKey = first(taxonKey),
    scientificName = first(scientificName)
  ) %>%

  # Order the result by verified taxon
  arrange(verificationKey)
  1. Save to CSV.

5.5 Unify descriptions

5.5.1 Native range

  1. Retrieve native range.

  2. Standardize native range:

# Map the verbatim native range values onto a standardized vocabulary of
# regions. A value can map to several regions, separated by " | ". Values that
# cannot be assigned to a region, and all values not listed here (.default),
# map to an empty string.
# NOTE(review): several values repeat their first region (e.g.
# "Southern Europe | Southern Europe"). This looks unintended, but the intended
# regions cannot be derived from this code, so the values are kept as they
# are - confirm against the source checklists.
native_range <-
  native_range %>%
  mutate(description = recode(description,
    "Africa" = "Africa",
    "Africa (WGSRPD:2)" = "Africa",
    "Algeria" = "Northern Africa",
    "America" = "Americas",
    "Arctic" = "",
    "Asia" = "Asia",
    "Asia And Pacific Islands" = "Asia | Asia | Micronesia | Polynesia",
    "Australasia (WGSRPD:5)" = "Australia and New Zealand",
    "Australia" = "Australia and New Zealand",
    "Australia (WGSRPD:50)" = "Australia and New Zealand",
    "Austria" = "Western Europe",
    "Balearic Islands" = "Southern Europe",
    "Balkan" = "Southern Europe | Southern Europe",
    "Bolivia, Chile, Argentina" = "South America",
    "Brazil, Uruguay, Paraguay, Argentina" = "South America",
    "Canada To Peru, Caribbean" = "Americas",
    "Canary Is. To Turkey" = "Northern Africa | Northern Africa",
    "Canary Islands (WGSRPD:21_CNY)" = "Northern Africa",
    "Cape Verde Is., Africa, Madagascar, Comoros" = "Africa",
    "Caribbean (WGSRPD:81)" = "Caribbean",
    "Caribbean To Brazil" = "Caribbean | Caribbean",
    "Carpathians" = "Eastern Europe",
    "Caucasus region" = "Western Asia",
    "Central America (WGSRPD:80)" = "Central America",
    "Central And South America" = "Central America | Central America",
    "Central Asia" = "Central Asia",
    "central Europe" = "Western Europe | Western Europe | Eastern Europe",
    "central Italy" = "Southern Europe",
    "central Mediterranean coastal areas" = "Southern Europe | Southern Europe",
    "central Turkey" = "Western Asia",
    "Central, South America" = "Central America | Central America",
    "China" = "Eastern Asia",
    "China (WGSRPD:36)" = "Eastern Asia",
    "circum western Mediterranean coastal areas" = "Southern Europe | Southern Europe",
    "circum-Mediterranean" = "Southern Europe | Southern Europe",
    "coastal areas of the western Balkan" = "Southern Europe",
    "Corsica" = "Southern Europe",
    "Cosmopolitan" = "",
    "Costa Rica (WGSRPD:80_COS)" = "Central America",
    "Costa Rica, Panama" = "Central America",
    "Crete" = "Southern Europe",
    "cultivated origin" = "",
    "Cyprus, Turkey, Middle East, Egypt, Sudan" = "Western Asia | Western Asia",
    "East Asia" = "Eastern Asia",
    "eastern Africa" = "Eastern Africa",
    "Eastern Africa" = "Eastern Africa",
    "Eastern Asia (WGSRPD:38)" = "Eastern Asia",
    "Eastern Asian Russia" = "Eastern Asia | Eastern Asia",
    "Eastern Europe" = "Eastern Europe",
    "Eastern Europe (WGSRPD:14)" = "Eastern Europe",
    "Eastern North America" = "Northern America",
    "Eastern Palearctic Including China, Korea, Japan" = "Eastern Asia",
    "Egypt, Ethiopia, Kenya, Uganda, Rwanda" = "Northern Africa | Northern Africa",
    "Ethiopia" = "Northern Africa",
    "Eurasia" = "Europe | Europe",
    "Eurasia Africa" = "Europe | Europe | Africa",
    "Europe (WGSRPD:1)" = "Europe",
    "Europe To Azerbaijan, Egypt" = "Europe | Europe | Northern Africa",
    "Europe To Northern Africa" = "Europe | Europe",
    "Europe, Africa, Yemen, Pakistan, Philippines" = "Europe | Europe | Asia",
    "Europe, North Africa To Near East, Turkey, Caucasus, Russia To Central Asia, Iran, Afghanistan, China, Mongolia, Korea" = "Europe | Europe | Western Asia",
    "Galapagos (WGSRPD:GAL)" = "South America",
    "Greece" = "Southern Europe",
    "Hawaiian Islands (WGSRPD:HI)" = "Northern America",
    "hybrid origin" = "",
    "Iberia" = "Southern Europe",
    "Iberian Peninsula" = "Southern Europe",
    "India, Southeast Asia To Australia, New Zealand" = "Southern Asia | Southern Asia | Australia and New Zealand",
    "Indian Subcontinent (WGSRPD:40)" = "Southern Asia",
    "Indo-Pacific" = "Africa | Africa | Australia and New Zealand",
    "Indonesia (WGSRPD:ID)" = "Southeastern Asia",
    "Italian Peninsula" = "Southern Europe",
    "Italy" = "Southern Europe",
    "Japan (WGSRPD:38_JAP)" = "Eastern Asia",
    "Japan (WGSRPD:JAP)" = "Eastern Asia",
    "Macaronesia" = "Northern Africa",
    "Macaronesia, Europe, North Africa To Turkey, Caucasus, Turkmenistan, Iran" = "Northern Africa | Northern Africa | Western Asia",
    "Macaronesia, North Africa, Europe, Turkey, Caucasus, Near East, Kazakhstan" = "Northern Africa | Northern Africa | Western Asia",
    "Madagascar (WGSRPD:MDG)" = "Eastern Africa",
    "Mallorca" = "Southern Europe",
    "Mediterranean" = "Southern Europe",
    "Mediterranean & Portugal" = "Southern Europe",
    "Mediterranean To Central Asia" = "Southern Europe | Southern Europe",
    "Mediterranean To Russia" = "Southern Europe | Southern Europe",
    "Mediterranean To Turkey, Israel" = "Southern Europe | Southern Europe",
    "Mediterranean, Africa, India, Myanmar, China" = "Southern Europe | Southern Europe | Asia",
    "Mediterranean, Northern Africa" = "Southern Europe | Southern Europe",
    "Middle Africa" = "Middle Africa",
    "Middle East" = "Western Asia",
    "Morocco" = "Northern Africa",
    "Nam" = "Northern America",
    "Near East" = "Western Asia",
    "Nearctic" = "",
    "Neotropic" = "",
    "New Zealand" = "Australia and New Zealand",
    "New Zealand (WGSRPD:51)" = "Australia and New Zealand",
    "North Africa" = "Northern Africa",
    "North America" = "Northern America",
    # The standardized region name is "Northern Africa", not "North Africa"
    "North America, Europe, North Africa, Turkey, Israel, Caucasus, Russia (European To Far East), Central Asia, China, Korea" = "Northern America | Northern America | Asia | Northern Africa",
    "North Pacific Ocean" = "",
    "North, Central And South America" = "Americas",
    "Northeast Asia" = "Eastern Asia",
    "Northern Africa" = "Northern Africa",
    "Northern Africa And Middle East" = "Northern Africa | Northern Africa",
    "Northern Africa, Southern Europe To Turkey" = "Northern Africa | Northern Africa | Western Asia",
    "Northern America" = "Northern America",
    "Northern America (WGSRPD:7)" = "Northern America",
    "northern Balkan" = "Eastern Europe",
    "northern coastal areas of the western Mediterranean" = "Southern Europe",
    "northern Italy" = "Southern Europe",
    "northern part of the Iberian Peninsula" = "Southern Europe",
    "northwestern Croatia" = "Southern Europe",
    "northwestern Italy" = "Southern Europe",
    "nortwestern Africa" = "Northern Africa",
    "Palearctic" = "",
    "pan-American" = "Americas",
    "Panama (WGSRPD:PAN)" = "Central America",
    "Pantropical" = "",
    "Papua New Guinea" = "Melanesia",
    "Philippines (WGSRPD:PHI)" = "Southeastern Asia",
    "Ponto-Caspian" = "Eastern Europe",
    "Portugal (WGSRPD:POR)" = "Southern Europe",
    "Probably Native To North America Only" = "Northern America",
    "Russia" = "Eastern Europe",
    "Sardinia" = "Southern Europe",
    "Sicily" = "Southern Europe",
    "Slovenia" = "Eastern Europe",
    "South America" = "South America",
    "Southeast Asia" = "Southeastern Asia",
    "southeastern Alps" = "Southern Europe",
    "Southeastern Asia" = "Southeastern Asia",
    "southeastern Europe" = "Southern Europe",
    "Southeastern Europe (WGSRPD:13)" = "Southern Europe",
    "southeastern France" = "Western Europe",
    "Southern Africa" = "Southern Africa",
    "Southern Africa (WGSRPD:27)" = "Southern Africa",
    "southern Alps" = "Southern Europe",
    "Southern America (WGSRPD:8)" = "South America",
    "Southern Asia" = "Southern Asia",
    "southern Europe" = "Southern Europe",
    "Southern Europe" = "Southern Europe",
    "Southern Europe, Africa, Turkey, Caucasus, Near East, Iran" = "Southern Europe | Southern Europe | Western Asia",
    "southern France" = "Western Europe",
    "Southern Hemisphere" = "",
    "Southwestern Africa" = "Southern Africa",
    "southwestern France" = "Western Europe",
    "southwestern Germany" = "Western Europe",
    "Spain" = "Southern Europe",
    "Spain (WGSRPD:SPA)" = "Southern Europe",
    "Spain, Italy, North Africa" = "Southern Europe | Southern Europe",
    "Sub-Saharan Africa" = "Western Africa | Western Africa | Eastern Africa | Southern Africa",
    "Switzerland" = "Western Europe",
    "Tanzania (WGSRPD:TAN)" = "Eastern Africa",
    "Tasmania (WGSRPD:50_TAS)" = "Australia and New Zealand",
    "Temperate Asia" = "Eastern Asia | Eastern Asia | Eastern Europe | Western Asia",
    "temperate Asia (WGSRPD:3)" = "Eastern Asia | Eastern Asia | Eastern Europe | Western Asia",
    "Tropical Africa" = "",
    "Tropical and warm seas" = "",
    "Tropical Asia" = "Southern Asia | Southern Asia",
    "tropical Asia (WGSRPD:4)" = "Southern Asia | Southern Asia",
    "tropical western Africa" = "Western Africa",
    "Tunisia" = "Northern Africa",
    "unclear" = "",
    "United States" = "Northern America",
    "Usa To Guatemala" = "Northern America | Northern America",
    "West Africa" = "Western Africa",
    "Western Africa" = "Western Africa",
    "Western Asia (WGSRPD:34)" = "Western Asia",
    "Western Atlantic" = "",
    "western Balkan" = "Southern Europe",
    "western circum-Mediterranean" = "Southern Europe | Southern Europe",
    "western Italy" = "Southern Europe",
    "western Mediterranean" = "Southern Europe",
    "western Mediterranean coastal areas" = "Southern Europe",
    "western Mediterranean region" = "Southern Europe",
    .default = "" # Change to description to discover new values
  ))

5.5.2 Degree of establishment

  1. Rename invasion stage to degree of establishment, so the values can be combined.

5.5.3 Combine and process descriptions

  1. Combine descriptions.

  2. Split and gather descriptions on |.

  3. Remove NA and empty descriptions.

  4. Select unique descriptions (within their type) within a checklist.

descriptions_unified <-
  descriptions_unified %>%

  # Join description with taxon to get verificationKey and checklistOrder
  left_join(taxa, by = "taxonKey") %>%

  # Remove records that have no verificationKey (e.g. one wasn't assigned yet)
  filter(!is.na(verificationKey)) %>%

  # Group by verificationKey, type and description within checklist, so that
  # each description appears once per type for a verified taxon
  group_by(
    datasetKey,
    checklistOrder,
    verificationKey,
    type,
    description
  ) %>%

  # Take first taxonKey and scientificName
  summarize(
    taxonKey = first(taxonKey),
    scientificName = first(scientificName)
  ) %>%

  # summarize() only drops the last grouping level: remove the remaining
  # grouping so that later steps start from an ungrouped data frame
  ungroup()
  1. Select unique descriptions (within their type) across checklists:
descriptions_unified <-
  descriptions_unified %>%

  # Sort by checklist order (trustworthiness), so that first() below picks the
  # values of the most trustworthy checklist
  arrange(checklistOrder) %>%

  # Group by type, description and verificationKey across checklists
  group_by(
    type,
    description,
    verificationKey
  ) %>%

  # Select first datasetKey, taxonKey and scientificName
  # checklistOrder is not summarized and therefore no longer part of the result
  summarize(
    datasetKey = first(datasetKey),
    taxonKey = first(taxonKey),
    scientificName = first(scientificName)
  ) %>%

  # summarize() only drops the last grouping level: remove the remaining
  # grouping (type, description) so the result is a plain data frame
  ungroup() %>%

  # Move verificationKey to beginning
  select(verificationKey, everything()) %>%

  # Sort by verificationKey and type
  arrange(verificationKey, type)
  1. Save to CSV.