In this document we analyze the Belgian datacube to detect possible (re)appearing taxa in the last year. Such detection is based on some basic rules instead of using GAM (Generalized Additive Models) as the number of GBIF observations decreases sensibly during the current year as effect of a publishing delay. We also take into account the appearing/reappearing in protected areas as additional warning.
Load libraries:
library(tidyverse) # To do data science
library(tidylog) # To provide feedback on dplyr functions
library(here) # To find files
library(rgbif) # To get information from GBIF
We read the output of preprocessing pipeline.
Time series:
df_ts <- read_tsv(
here::here("data", "interim", "df_timeseries.tsv"),
na = ""
)
Columns pa_obs
and pa_native_obs
indicate
the presence (1) or absence (0) of the specific taxon and native species
at class level.
Preview:
# Get a taxon
taxon <-
df_ts$taxonKey[10]
# Preview
df_ts %>%
tidylog::filter(
taxonKey == taxon,
year %in% c(2016, 2017),
eea_cell_code == "1kmE3924N3102"
)
Taxonomic information:
spec_names <- read_tsv(
file = here::here(
"data",
"interim",
"timeseries_taxonomic_info.tsv"
)
)
Years to evaluate:
# Last evaluation year
last_year <- lubridate::year(Sys.Date()) - 2
# First evaluation year
first_year <- last_year - 2
# Evaluation years
years <- seq(first_year, last_year)
We define the minimum latency (in years) before we speak about reappearance. For example, a latency time window of 3 years means that we consider a taxon reappearing in 2019 if observations occur in 2019 and 2015 or before:
years_latency <- 3
Lump geographic information:
df_ts_compact <-
df_ts %>%
tidylog::group_by(taxonKey, year, classKey) %>%
tidylog::summarise(
obs = sum(obs),
cobs = sum(cobs),
ncells = sum(pa_obs),
c_ncells = sum(pa_cobs)
) %>%
tidylog::ungroup()
We define the function is_appearing
for
appearing/reappearing detection (this function could be moved to TrIAS
package trias
later):
#' @param df data frame with time series
#' @param eval_years vector with years used for evaluation. Warning is returned
#' if not a sequence. Typically present year and year before.
#' @param latency years of latency (no observations). Strictly positive integer.
#' @param df_reference data frame with time series used to assess reappearing. If NULL (default) df is used.
#' @return data.frame with appearing or reappearing taxa
is_appearing <- function(df,
eval_years,
latency = 2,
df_reference = NULL,
data_col = "ncells",
id_col = "taxonKey",
year_col = "year") {
df <-
df %>%
tidylog::select(id_col, year_col, data_col) %>%
tidylog::rename(
"taxonKey" = id_col,
"year" = year_col,
"occ" = data_col
) %>%
tidylog::filter(occ > 0) %>%
tidylog::group_by(taxonKey)
if (!is.null(df_reference)) {
df_reference <-
df_reference %>%
tidylog::select(id_col, year_col, data_col) %>%
tidylog::rename(
"taxonKey" = id_col,
"year" = year_col,
"occ" = data_col
) %>%
tidylog::filter(occ > 0) %>%
tidylog::group_by(taxonKey)
} else {
df_reference <- df
}
# Taxa present in evaluated years
taxa_eval_years_df <-
df %>%
tidylog::filter(year %in% eval_years) %>%
tidylog::filter(year == min(year))
taxa_eval_years <-
taxa_eval_years_df %>%
tidylog::distinct(taxonKey) %>%
dplyr::pull()
# appearing taxa
appearing_taxa <-
df %>%
tidylog::filter(taxonKey %in% taxa_eval_years) %>%
tidylog::filter(min(year) %in% eval_years) %>%
tidylog::filter(year == min(year)) %>%
tidylog::ungroup()
appearing_taxa <-
appearing_taxa %>%
tidylog::rename(!!sym(data_col) := "occ")
# reappearing taxa
reappearing_taxa <-
df_reference %>%
tidylog::filter(taxonKey %in% taxa_eval_years) %>%
tidylog::filter(year < min(eval_years)) %>%
tidylog::filter(year == max(year)) %>%
tidylog::ungroup() %>%
tidylog::rename(last_year_before_reappearing = year) %>%
tidylog::left_join(taxa_eval_years_df, by = "taxonKey") %>%
tidylog::group_by(taxonKey) %>%
tidylog::summarise(n_latent_years = year - last_year_before_reappearing) %>%
tidylog::filter(n_latent_years > latency) %>%
tidylog::left_join(taxa_eval_years_df %>% tidylog::select(taxonKey, occ, year),
by = "taxonKey"
)
reappearing_taxa <-
reappearing_taxa %>%
tidylog::rename(!!sym(data_col) := "occ")
return(list(
appearing_taxa = appearing_taxa,
reappearing_taxa = reappearing_taxa
))
}
Apply function defined above:
appearing_reappearing_taxa <- is_appearing(
df = df_ts_compact,
eval_years = years,
latency = years_latency,
df_reference = NULL
)
Number of appearing taxa:
nrow(appearing_reappearing_taxa$appearing_taxa)
## [1] 127
Number of reappearing taxa:
nrow(appearing_reappearing_taxa$reappearing_taxa)
## [1] 188
We repeat the workflow defined above to assess which taxa are appearing or reappearing in protected areas.
First we select( data referring to protected areas (Natura2000) and then we lump geographical information:
df_ts_compact_prot_areas <-
df_ts %>%
tidylog::filter(natura2000 == TRUE) %>%
tidylog::group_by(taxonKey, year) %>%
tidylog::summarise(
obs = sum(obs),
cobs = sum(cobs),
ncells = sum(pa_cobs),
c_ncells = sum(pa_cobs)
) %>%
tidylog::ungroup()
We apply function is_appearing
. Notice that we use
occurrences about all Belgium to assess the reappearance in protected
areas:
appearing_reappearing_taxa_prot_areas <- is_appearing(
df = df_ts_compact_prot_areas,
eval_years = years,
latency = years_latency,
df_reference = df_ts_compact
)
Number of appearing taxa:
nrow(appearing_reappearing_taxa_prot_areas$appearing_taxa)
## [1] 146
Number of reappearing taxa:
nrow(appearing_reappearing_taxa_prot_areas$reappearing_taxa)
## [1] 100
We merge appearing taxa from all Belgium and from protected areas. We rank taxa by area of occupancy (number of cells) in protected areas and in all Belgium. We do the same for reappearing taxa as well:
app_reapp_global <-
purrr::map2(
appearing_reappearing_taxa_prot_areas,
appearing_reappearing_taxa,
function(x, y) {
x <-
x %>%
tidylog::rename(ncells_prot_areas = ncells)
y <-
y %>%
tidylog::rename(ncells_BE = ncells)
tidylog::full_join(x, y) %>%
tidylog::mutate(
in_prot_areas = ifelse(!is.na(ncells_prot_areas),
TRUE,
FALSE
),
in_BE = ifelse(!is.na(ncells_BE),
TRUE,
FALSE
)
) %>%
arrange(
desc(ncells_prot_areas),
desc(ncells_BE)
)
}
)
Add taxonomic information for better readability.
app_reapp_global <- purrr::map(
app_reapp_global,
function(x) {
x %>%
tidylog::left_join(spec_names, by = "taxonKey")
}
)
Reorder columns of data.frame containing appearing taxa:
app_reapp_global$appearing_taxa <-
app_reapp_global$appearing_taxa %>%
tidylog::select(
taxonKey,
canonicalName,
year,
ncells_prot_areas,
ncells_BE,
in_prot_areas,
in_BE,
class,
kingdom,
classKey,
kingdomKey
)
Appearing taxa:
app_reapp_global$appearing_taxa
Empty ncells_BE
and in_BE
equal to FALSE
means that the taxon is appearing in protected areas but not appearing
outside, i.e. the taxon is maybe reappearing at Belgian level or
occurring with a more stable presence. See next section for more details
about.
Reorder columns of data.frame containing reappearing taxa:
app_reapp_global$reappearing_taxa <-
app_reapp_global$reappearing_taxa %>%
tidylog::select(
taxonKey,
canonicalName,
year,
ncells_prot_areas,
ncells_BE,
in_prot_areas,
in_BE,
n_latent_years,
class,
kingdom,
classKey,
kingdomKey
)
Reappearing taxa:
app_reapp_global$reappearing_taxa
Taxa appearing in protected areas and reappearing outside:
app_reapp_global$appearing_taxa %>%
tidylog::filter(in_BE == FALSE) %>%
tidylog::filter(taxonKey %in% app_reapp_global$reappearing_taxa$taxonKey) %>%
tidylog::select(taxonKey, canonicalName)
Taxa appearing in protected areas and with a stable presence outside:
app_reapp_global$appearing_taxa %>%
tidylog::filter(in_BE == FALSE) %>%
tidylog::filter(!taxonKey %in% app_reapp_global$reappearing_taxa$taxonKey) %>%
tidylog::select(taxonKey, canonicalName, class, kingdom, classKey, kingdomKey)
Two examples:
Save appearing taxa:
write_tsv(app_reapp_global$appearing_taxa,
file = here::here("data", "output", "appearing_taxa.tsv"),
na = ""
)
Save reappearing taxa:
write_tsv(app_reapp_global$reappearing_taxa,
file = here::here("data", "output", "reappearing_taxa.tsv"),
na = ""
)