8  Check Date Consistency

8.1 Problem Description

This document describes the process of checking date and time consistency in the camera trap dataset. The goal is to identify records with inconsistent sampling periods, future dates, or species records outside the sampling interval.

8.2 Problem Solving

8.2.1 Common Steps

We start by loading the required functions and packages.

Code
library(hms)
library(tidyverse)
source("R/FUNCTIONS.R")

We load the camera trap setup data for further checks.

Code
# Load the "Camera_trap" sheet; the strings "NA" and "na" are read as missing.
# The result is used downstream as a named collection with one table per
# dataset (it is iterated with map() and indexed with ct[[dataset]]).
# NOTE(review): read_sheet() is presumably the project helper sourced from
# R/FUNCTIONS.R — confirm its return type there.
ct <- read_sheet(
  path = "Example",
  sheet = "Camera_trap",
  na = c("NA", "na")
)

8.2.2 Check for Short Sampling Durations

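We identify camera deployments with a sampling duration of less than 24 hours (86,400 seconds). Because the duration is signed, deployments whose end date precedes their start date (a negative duration) are flagged as well, as in row 3 of the output below.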

Code
# Deployments whose signed duration (End_date minus Start_date) is below
# 24 hours, stacked across datasets with the dataset name in `dataset`.
# A deployment that ends before it starts has a negative duration and is
# therefore kept by the filter too.
# NOTE(review): dttm_update() comes from outside this chunk (presumably
# R/FUNCTIONS.R); it appears to fold each *_time column into the matching
# *_date column — confirm there.
check_date_before <- ct |>
  map(\(deployments) {
    with_datetimes <- deployments |>
      dttm_update(date_col = "Start_date", time_col = "Start_time") |>
      dttm_update(date_col = "End_date", time_col = "End_time")

    with_datetimes |>
      select(-ends_with("_time")) |>
      mutate(duration = as.duration(interval(Start_date, End_date))) |>
      filter(duration < 86400) # 86400 s = 24 h
  }) |>
  bind_rows(.id = "dataset") |>
  select(dataset, Camera_id, Start_date:duration)

check_date_before
# A tibble: 4 × 30
  dataset  Camera_id Start_date          End_date            Camera_problem
  <chr>    <chr>     <dttm>              <dttm>              <chr>         
1 Example8 cam5      2013-03-28 10:19:00 2013-03-29 00:31:00 Sí            
2 Example8 cam5      2013-03-28 10:19:00 2013-03-29 00:31:00 Sí            
3 Example1 cam06_26  2014-08-02 12:30:00 2013-09-01 10:51:00 Não           
4 Example8 cam5      2013-03-28 10:19:00 2013-03-29 00:31:00 Sí            
# ℹ 25 more variables: Problem1_from <dttm>, Problem1_to <dttm>,
#   Problem2_from <dttm>, Problem2_to <dttm>, Problem3_from <dttm>,
#   Problem3_to <dttm>, Problem4_from <dttm>, Problem4_to <dttm>,
#   Problem5_from <dttm>, Problem5_to <dttm>, Problem6_from <dttm>,
#   Problem6_to <dttm>, Problem7_from <dttm>, Problem7_to <dttm>,
#   Problem8_from <dttm>, Problem8_to <dttm>, Problem9_from <dttm>,
#   Problem9_to <dttm>, Problem10_from <dttm>, Problem10_to <dttm>, …

8.2.3 Check for Excessively Long Sampling Durations

We identify deployments with a sampling duration longer than 90 days (roughly 3 months; 7,776,000 seconds).

Code
# Deployments whose duration (End_date minus Start_date) exceeds
# 7,776,000 seconds (90 days), stacked across datasets with the dataset name
# in `dataset`.
# NOTE(review): dttm_update() comes from outside this chunk (presumably
# R/FUNCTIONS.R); it appears to fold each *_time column into the matching
# *_date column — confirm there.
check_date_after <- ct |>
  map(\(deployments) {
    with_datetimes <- deployments |>
      dttm_update(date_col = "Start_date", time_col = "Start_time") |>
      dttm_update(date_col = "End_date", time_col = "End_time")

    with_datetimes |>
      select(-ends_with("_time")) |>
      mutate(duration = as.duration(interval(Start_date, End_date))) |>
      filter(duration > 7776000) # 7776000 s = 90 days
  }) |>
  bind_rows(.id = "dataset") |>
  select(dataset, Camera_id, Start_date:duration)

check_date_after
# A tibble: 77 × 30
   dataset  Camera_id Start_date          End_date            Camera_problem
   <chr>    <chr>     <dttm>              <dttm>              <chr>         
 1 Example4 MX2_004   2018-09-08 14:50:00 2019-09-29 12:21:00 No            
 2 Example4 MX2_003   2018-09-08 15:29:00 2019-09-08 13:13:00 No            
 3 Example4 MX2_010   2018-09-09 15:19:00 2019-03-04 11:24:00 Sí            
 4 Example4 MX007     2018-09-08 17:58:00 2019-03-05 11:54:00 Sí            
 5 Example4 MX2_009   2018-09-09 14:36:00 2019-03-05 12:16:00 Sí            
 6 Example4 MX2_001   2018-09-09 12:31:00 2019-07-28 11:38:00 Sí            
 7 Example4 MX2_006   2018-09-08 11:51:00 2019-03-05 13:22:00 Sí            
 8 Example4 MX2_013   2018-08-15 13:55:00 2019-01-27 13:52:00 Sí            
 9 Example4 MX2_014   2018-08-15 14:20:00 2019-03-14 14:24:00 Sí            
10 Example4 MX2_015   2018-08-15 15:15:00 2019-05-24 13:38:00 No            
# ℹ 67 more rows
# ℹ 25 more variables: Problem1_from <dttm>, Problem1_to <dttm>,
#   Problem2_from <dttm>, Problem2_to <dttm>, Problem3_from <dttm>,
#   Problem3_to <dttm>, Problem4_from <dttm>, Problem4_to <dttm>,
#   Problem5_from <dttm>, Problem5_to <dttm>, Problem6_from <dttm>,
#   Problem6_to <dttm>, Problem7_from <dttm>, Problem7_to <dttm>,
#   Problem8_from <dttm>, Problem8_to <dttm>, Problem9_from <dttm>, …

8.2.4 Check for Future Dates

We flag deployments whose start or end date falls after a fixed threshold date. We set the threshold to April 30th, 2025, so any later date is treated as being in the future.

Code
# Deployments whose start or end date is later than the fixed cutoff
# "2025-04-30", stacked across datasets with the dataset name in `dataset`.
check_date_future <- ct |>
  map(\(deployments) {
    future_cutoff <- "2025-04-30"

    # Re-parse both dates as plain Date values in helper columns so the
    # original Start_date / End_date columns stay untouched.
    with_dates <- deployments |>
      mutate(
        date_start = ymd(as.character(Start_date)),
        date_end = ymd(as.character(End_date))
      )

    # Keep a row as soon as any column whose name starts with "date_" is
    # past the cutoff.
    with_dates |>
      filter(if_any(starts_with("date_"), \(d) d > future_cutoff))
  }) |>
  bind_rows(.id = "dataset") |>
  select(dataset, Camera_id, date_start:date_end)

check_date_future
# A tibble: 0 × 4
# ℹ 4 variables: dataset <chr>, Camera_id <chr>, date_start <date>,
#   date_end <date>

8.2.5 Check Species Records Within Sampling Interval

We load the species records data for cross-checking with camera trap intervals.

Code
# Load the "Species_records_camera" sheet; "NA" and "na" are read as missing.
# NOTE(review): read_sheet() is presumably the project helper sourced from
# R/FUNCTIONS.R — confirm its return type there.
rec <- read_sheet(
  sheet = "Species_records_camera",
  path = "Example",
  na = c("NA", "na")
)

# Names of the elements of `rec`; they drive the per-dataset loop that follows.
datasets <- rec |> names()

We check whether each species record falls within the sampling interval of the corresponding camera. First, we inspect the error log to see whether any dataset failed during processing. There were no errors.

Code
# For every dataset, keep the species records whose Record_date lies outside
# the Start_date–End_date interval of the camera they are joined to.
# Results are stored per dataset in `species_records_within_ct_date`; a dataset
# whose processing fails is recorded in `error_log` instead of stopping the run.
species_records_within_ct_date <- list()
error_log <- tibble(dataset = character(), error_message = character())

for (dataset in datasets) {
  message(str_glue("Starting dataset {dataset}\n"))
  tryCatch(
    {
      # Only the join keys and the deployment interval are needed.
      camera <- ct[[dataset]] |>
        select(Structure_id, Camera_id, Start_date, End_date)

      species_records_within_ct_date[[dataset]] <- rec[[dataset]] |>
        # Number the records BEFORE the join (+1 skips the header row).
        # inner_join() drops records without a matching camera and repeats
        # records that match several camera rows, so numbering afterwards
        # would no longer point at the record's row in the spreadsheet.
        mutate(excel_row = row_number() + 1) |>
        # NOTE(review): records with no matching camera are dropped here and
        # therefore never reported — confirm this is intended.
        inner_join(camera, by = c("Camera_id", "Structure_id")) |>
        mutate(
          across(ends_with("date"), as_datetime),
          # A missing result of %within% falls through to "NO".
          check = case_when(
            Record_date %within% (Start_date %--% End_date) ~ "YES",
            TRUE ~ "NO"
          )
        ) |>
        filter(check == "NO") |>
        select(excel_row, Species, Camera_id, ends_with("date"), check)
      message(str_glue("Finalizing dataset {dataset}\n"))
    },
    error = function(e) {
      # conditionMessage() returns the full formatted message, including the
      # details that rlang/dplyr errors keep outside of e$message.
      msg <- conditionMessage(e)
      # `<<-` is required: the handler is a function, so a plain `<-` would
      # only create a local copy of error_log.
      error_log <<- bind_rows(
        error_log,
        tibble(dataset = dataset, error_message = msg)
      )
      message(str_glue("Error in dataset {dataset}: {msg}\n"))
      NULL
    }
  )
}

error_log
# A tibble: 0 × 2
# ℹ 2 variables: dataset <chr>, error_message <chr>

Next, we summarize and print the number of records outside the sampling interval for each dataset.

Code
# Keep only the datasets that have at least one record outside its camera's
# sampling interval.
clean_species_records_within_ct_date <- species_records_within_ct_date |>
  keep(\(records) nrow(records) > 0)

# One summary row per remaining dataset: its name and the number of
# out-of-interval records.
clean_species_records_within_ct_date |>
  imap(\(records, dataset_name) {
    tibble(dataset = dataset_name, n = nrow(records))
  }) |>
  bind_rows()
# A tibble: 7 × 2
  dataset       n
  <chr>     <int>
1 Example1    304
2 Example2      1
3 Example3      1
4 Example4    104
5 Example8     50
6 Example9     53
7 Example13    38

Finally, we export the records outside the sampling interval to an Excel file.

Code
# Write the out-of-interval records to an Excel workbook, passing the
# per-dataset list as `x`, with table formatting and automatic column widths.
openxlsx::write.xlsx(
  x = clean_species_records_within_ct_date,
  file = "Output/REGISTROS_SP_FORA_DA_DATA.xlsx",
  asTable = TRUE,
  colWidths = "auto"
)