In this document we analyze the Belgian datacube to detect possible (re)appearing taxa in the last year. Such detection is based on some basic rules instead of using GAM (Generalized Additive Models) as the number of GBIF observations decreases sensibly during the current year as effect of a publishing delay. We also take into account the appearing/reappearing in protected areas as additional warning.

1 Setup

Load libraries:

library(tidyverse) # To do data science
library(tidylog) # To provide feedback on dplyr functions
library(here) # To find files
library(rgbif) # To get information from GBIF

2 Get data

We read the output of preprocessing pipeline.

Time series:

df_ts <- read_tsv(
  here::here("data", "interim", "df_timeseries.tsv"),
  na = ""
)

Columns pa_obs and pa_native_obs indicate the presence (1) or absence (0) of the specific taxon and native species at class level.

Preview:

# Get a taxon
taxon <-
  df_ts$taxonKey[10]

# Preview
df_ts %>%
  tidylog::filter(
    taxonKey == taxon,
    year %in% c(2016, 2017),
    eea_cell_code == "1kmE3924N3102"
  )

Taxonomic information:

spec_names <- read_tsv(
  file = here::here(
    "data",
    "interim",
    "timeseries_taxonomic_info.tsv"
  )
)

3 Detect appearing - reappearing species

Years to evaluate:

# Last evaluation year
last_year <- lubridate::year(Sys.Date()) - 2

# First evaluation year
first_year <- last_year - 2

# Evaluation years
years <- seq(first_year, last_year)

We define the minimum latency (in years) before we speak about reappearance. For example, a latency time window of 3 years means that we consider a taxon reappearing in 2019 if observations occur in 2019 and 2015 or before:

years_latency <- 3

3.1 Appearing - reappearing species in Belgium

Lump geographic information:

df_ts_compact <-
  df_ts %>%
  tidylog::group_by(taxonKey, year, classKey) %>%
  tidylog::summarise(
    obs = sum(obs),
    cobs = sum(cobs),
    ncells = sum(pa_obs),
    c_ncells = sum(pa_cobs)
  ) %>%
  tidylog::ungroup()

We define the function is_appearing for appearing/reappearing detection (this function could be moved to TrIAS package trias later):

#' @param df data frame with time series
#' @param eval_years vector with years used for evaluation. Warning is returned
#' if not a sequence. Typically present year and year before.
#' @param latency years of latency (no observations). Strictly positive integer.
#' @param df_reference data frame with time series used to assess reappearing. If NULL (default) df is used.
#' @return data.frame with appearing or reappearing taxa
is_appearing <- function(df,
                         eval_years,
                         latency = 2,
                         df_reference = NULL,
                         data_col = "ncells",
                         id_col = "taxonKey",
                         year_col = "year") {
  df <-
    df %>%
    tidylog::select(id_col, year_col, data_col) %>%
    tidylog::rename(
      "taxonKey" = id_col,
      "year" = year_col,
      "occ" = data_col
    ) %>%
    tidylog::filter(occ > 0) %>%
    tidylog::group_by(taxonKey)

  if (!is.null(df_reference)) {
    df_reference <-
      df_reference %>%
      tidylog::select(id_col, year_col, data_col) %>%
      tidylog::rename(
        "taxonKey" = id_col,
        "year" = year_col,
        "occ" = data_col
      ) %>%
      tidylog::filter(occ > 0) %>%
      tidylog::group_by(taxonKey)
  } else {
    df_reference <- df
  }

  # Taxa present in evaluated years

  taxa_eval_years_df <-
    df %>%
    tidylog::filter(year %in% eval_years) %>%
    tidylog::filter(year == min(year))

  taxa_eval_years <-
    taxa_eval_years_df %>%
    tidylog::distinct(taxonKey) %>%
    dplyr::pull()

  # appearing taxa
  appearing_taxa <-
    df %>%
    tidylog::filter(taxonKey %in% taxa_eval_years) %>%
    tidylog::filter(min(year) %in% eval_years) %>%
    tidylog::filter(year == min(year)) %>%
    tidylog::ungroup()

  appearing_taxa <-
    appearing_taxa %>%
    tidylog::rename(!!sym(data_col) := "occ")

  # reappearing taxa
  reappearing_taxa <-
    df_reference %>%
    tidylog::filter(taxonKey %in% taxa_eval_years) %>%
    tidylog::filter(year < min(eval_years)) %>%
    tidylog::filter(year == max(year)) %>%
    tidylog::ungroup() %>%
    tidylog::rename(last_year_before_reappearing = year) %>%
    tidylog::left_join(taxa_eval_years_df, by = "taxonKey") %>%
    tidylog::group_by(taxonKey) %>%
    tidylog::summarise(n_latent_years = year - last_year_before_reappearing) %>%
    tidylog::filter(n_latent_years > latency) %>%
    tidylog::left_join(taxa_eval_years_df %>% tidylog::select(taxonKey, occ, year),
      by = "taxonKey"
    )

  reappearing_taxa <-
    reappearing_taxa %>%
    tidylog::rename(!!sym(data_col) := "occ")

  return(list(
    appearing_taxa = appearing_taxa,
    reappearing_taxa = reappearing_taxa
  ))
}

Apply function defined above:

appearing_reappearing_taxa <- is_appearing(
  df = df_ts_compact,
  eval_years = years,
  latency = years_latency,
  df_reference = NULL
)

Number of appearing taxa:

nrow(appearing_reappearing_taxa$appearing_taxa)
## [1] 127

Number of reappearing taxa:

nrow(appearing_reappearing_taxa$reappearing_taxa)
## [1] 188

3.2 Appearing - reappearing species in Belgian protected areas

We repeat the workflow defined above to assess which taxa are appearing or reappearing in protected areas.

First we select( data referring to protected areas (Natura2000) and then we lump geographical information:

df_ts_compact_prot_areas <-
  df_ts %>%
  tidylog::filter(natura2000 == TRUE) %>%
  tidylog::group_by(taxonKey, year) %>%
  tidylog::summarise(
    obs = sum(obs),
    cobs = sum(cobs),
    ncells = sum(pa_cobs),
    c_ncells = sum(pa_cobs)
  ) %>%
  tidylog::ungroup()

We apply function is_appearing. Notice that we use occurrences about all Belgium to assess the reappearance in protected areas:

appearing_reappearing_taxa_prot_areas <- is_appearing(
  df = df_ts_compact_prot_areas,
  eval_years = years,
  latency = years_latency,
  df_reference = df_ts_compact
)

Number of appearing taxa:

nrow(appearing_reappearing_taxa_prot_areas$appearing_taxa)
## [1] 146

Number of reappearing taxa:

nrow(appearing_reappearing_taxa_prot_areas$reappearing_taxa)
## [1] 100

3.3 Merge

We merge appearing taxa from all Belgium and from protected areas. We rank taxa by area of occupancy (number of cells) in protected areas and in all Belgium. We do the same for reappearing taxa as well:

app_reapp_global <-
  purrr::map2(
    appearing_reappearing_taxa_prot_areas,
    appearing_reappearing_taxa,
    function(x, y) {
      x <-
        x %>%
        tidylog::rename(ncells_prot_areas = ncells)
      y <-
        y %>%
        tidylog::rename(ncells_BE = ncells)
      tidylog::full_join(x, y) %>%
        tidylog::mutate(
          in_prot_areas = ifelse(!is.na(ncells_prot_areas),
            TRUE,
            FALSE
          ),
          in_BE = ifelse(!is.na(ncells_BE),
            TRUE,
            FALSE
          )
        ) %>%
        arrange(
          desc(ncells_prot_areas),
          desc(ncells_BE)
        )
    }
  )

3.4 Add canonical names

Add taxonomic information for better readability.

app_reapp_global <- purrr::map(
  app_reapp_global,
  function(x) {
    x %>%
      tidylog::left_join(spec_names, by = "taxonKey")
  }
)

4 Table of appearing and reappearing taxa

4.1 Appearing taxa

Reorder columns of data.frame containing appearing taxa:

app_reapp_global$appearing_taxa <-
  app_reapp_global$appearing_taxa %>%
  tidylog::select(
    taxonKey,
    canonicalName,
    year,
    ncells_prot_areas,
    ncells_BE,
    in_prot_areas,
    in_BE,
    class,
    kingdom,
    classKey,
    kingdomKey
  )

Appearing taxa:

app_reapp_global$appearing_taxa

Empty ncells_BE and in_BE equal to FALSE means that the taxon is appearing in protected areas but not appearing outside, i.e. the taxon is maybe reappearing at Belgian level or occurring with a more stable presence. See next section for more details about.

4.2 Reappearing taxa

Reorder columns of data.frame containing reappearing taxa:

app_reapp_global$reappearing_taxa <-
  app_reapp_global$reappearing_taxa %>%
  tidylog::select(
    taxonKey,
    canonicalName,
    year,
    ncells_prot_areas,
    ncells_BE,
    in_prot_areas,
    in_BE,
    n_latent_years,
    class,
    kingdom,
    classKey,
    kingdomKey
  )

Reappearing taxa:

app_reapp_global$reappearing_taxa

Taxa appearing in protected areas and reappearing outside:

app_reapp_global$appearing_taxa %>%
  tidylog::filter(in_BE == FALSE) %>%
  tidylog::filter(taxonKey %in% app_reapp_global$reappearing_taxa$taxonKey) %>%
  tidylog::select(taxonKey, canonicalName)

Taxa appearing in protected areas and with a stable presence outside:

app_reapp_global$appearing_taxa %>%
  tidylog::filter(in_BE == FALSE) %>%
  tidylog::filter(!taxonKey %in% app_reapp_global$reappearing_taxa$taxonKey) %>%
  tidylog::select(taxonKey, canonicalName, class, kingdom, classKey, kingdomKey)

Two examples:

5 Save data

5.1 Appearing taxa

Save appearing taxa:

write_tsv(app_reapp_global$appearing_taxa,
  file = here::here("data", "output", "appearing_taxa.tsv"),
  na = ""
)

5.2 Reappearing taxa

Save reappearing taxa:

write_tsv(app_reapp_global$reappearing_taxa,
  file = here::here("data", "output", "reappearing_taxa.tsv"),
  na = ""
)