14  Check Photos

14.1 Problem Description

This script validates the consistency between the photo/video filenames in the spreadsheets and the files present in the media folders. It also organizes media by dataset and type, identifies exact and partial matches, and generates a review spreadsheet.

14.2 Solution

We follow a staged workflow: (1) check whether dataset folder names match Excel filenames; (2) organize raw files into Example/Media/{dataset}/{image|video} (optional); (3) build a media inventory; (4) compare names between spreadsheets and files using stringdist; (5) copy exact matches to Example/Media/{dataset}/{ct|under|over}; (6) save a report of partial matches to Example/12/Output; (7) remove duplicates from the media root.

14.2.1 Common steps

First, we load the project’s utility functions. They include read_sheet, used to read the spreadsheets.

Code
source("R/FUNCTIONS.R")

14.2.2 Functions to organize media

Below we define one function per step of the media organization process. Each function has a single responsibility, which makes the pipeline easier to debug and reuse.

14.2.2.1 list_dataset_folders()

Lists only the first-level folders under the given media_path. These folders represent datasets that should match the Excel filenames.

Code
list_dataset_folders <- function(media_path) {
  if (!dir.exists(media_path)) {
    cli::cli_abort("Folder {media_path} not found.")
  }
  list.dirs(path = media_path, recursive = FALSE)
}

14.2.2.2 list_excel_dataset_names()

Reads the Excel files in a directory and returns their names without extensions. We use this to validate the dataset list.

Code
list_excel_dataset_names <- function(path = "Example/12") {
  read_sheet(path = path, results = FALSE, recurse = FALSE) |>
    names()
}

14.2.2.3 extract_dataset_from_path()

Extracts the dataset name from a full file path. The dataset_index lets you control which path segment should be interpreted as the dataset name.

Code
extract_dataset_from_path <- function(path, dataset_index = 3) {
  stringr::str_split_i(path, pattern = "\\/", dataset_index)
}

14.2.2.4 organize_media_files()

Builds the mapping between raw media files and their standardized destinations in Example/Media/{dataset}/{image|video}. When execute = TRUE, it creates the folders and copies the files.

Code
organize_media_files <- function(
  media_path = "Example/12",
  media_root = "Example/Media",
  dataset_index = 3,
  execute = FALSE
) {
  if (execute && !dir.exists(media_root)) {
    dir.create(media_root, recursive = TRUE)
  }

  folders <- list_dataset_folders(media_path)
  files <- list.files(folders, full.names = TRUE, recursive = TRUE)

  df <- tibble::tibble(file = files) |>
    dplyr::mutate(type = mime::guess_type(file)) |>
    dplyr::filter(stringr::str_detect(type, "^image/|^video")) |>
    dplyr::mutate(
      dataset = extract_dataset_from_path(file, dataset_index = dataset_index),
      media_type = stringr::str_split_i(type, pattern = "\\/", 1),
      folder = stringr::str_glue("{media_root}/{dataset}/{media_type}/"),
      filename = stringr::str_glue("{folder}{basename(file)}")
    )

  if (execute) {
    purrr::walk(unique(df$folder), ~ dir.create(.x, recursive = TRUE))
    file.copy(from = df$file, to = df$filename, recursive = TRUE)
  }

  df
}

14.2.3 Functions to compare names and generate reports

Now we define one function per comparison step: inventory, validation, matching, and output generation.

14.2.3.1 build_media_inventory()

Scans Example/Media and constructs three structures: the full file list, the list excluding ct/under/over, and a split list of already classified media.

Code
build_media_inventory <- function(
  media_root = "Example/Media",
  sources = c("ct", "under", "over")
) {
  if (!dir.exists(media_root)) {
    cli::cli_abort(
      "Media folder {media_root} not found. Run organize_media_files() with execute = TRUE first."
    )
  }

  media_files <- list.files(
    path = media_root,
    recursive = TRUE,
    full.names = TRUE
  ) |>
    dplyr::as_tibble() |>
    dplyr::mutate(
      dataset = stringr::word(value, 3, 3, sep = "\\/"),
      media_type = stringr::word(value, 4, 4, sep = "\\/"),
      file = basename(value)
    )

  media_files_types <- media_files |>
    dplyr::filter(media_type %in% sources)

  media_files_types_list <- media_files_types |>
    (\(df) split(df, df$dataset))() |>
    purrr::map(\(df) split(df, df$media_type))

  media_anti_join <- dplyr::anti_join(media_files, media_files_types)
  media_list <- split(media_anti_join, media_anti_join$dataset)

  list(
    media_files = media_files,
    media_list = media_list,
    media_files_types_list = media_files_types_list
  )
}

14.2.3.2 validate_source()

Confirms that the requested source is one of the accepted values.

Code
validate_source <- function(source, allowed = c("ct", "under", "over")) {
  if (!source %in% allowed) {
    cli::cli_abort(
      "Source {source} is not an accepted value. Accepted values are 'ct', 'under', 'over'"
    )
  }
}

14.2.3.3 load_source_data()

Loads the appropriate sheet based on the source and returns a named list of datasets.

Code
load_source_data <- function(source, path = "Example/12") {
  source <- as.character(source)

  sheet_name <- dplyr::case_match(
    source,
    "ct" ~ "Camera_trap",
    "under" ~ "Underpasses",
    "over" ~ "Overpasses"
  )

  if (is.na(sheet_name)) {
    cli::cli_abort("Source {source} is not mapped to a sheet.")
  }

  read_sheet(
    path = path,
    sheet = sheet_name,
    na = c("NA", "-"),
    recurse = FALSE
  )
}

14.2.3.4 ensure_dataset_names_match()

Checks whether dataset names in the Excel sheets match the dataset folders found in the media inventory.

Code
ensure_dataset_names_match <- function(datasets, media_list) {
  if (length(media_list) == 0) {
    cli::cli_abort(
      "Media inventory is empty. Make sure media files were copied to the Media folder."
    )
  }

  datasets_not_in_common <- dplyr::setdiff(names(datasets), names(media_list))

  if (length(datasets_not_in_common) != 0) {
    cli::cli_abort(
      "There are different datasets between media folder and Excel sheets."
    )
  }
}

14.2.3.5 extract_filenames_on_sheet()

Returns the unique list of filenames referenced in the spreadsheet for a given source.

Code
extract_filenames_on_sheet <- function(df, source) {
  column <- if (source == "ct") "Camera_vision_photo" else "Structure_photo"
  col_sym <- rlang::sym(column)

  df |>
    dplyr::distinct(dplyr::across(dplyr::all_of(column))) |>
    dplyr::filter(!is.na(!!col_sym)) |>
    dplyr::pull(!!col_sym)
}

14.2.3.6 media_candidates()

Filters out files that are already inside the source-specific folders (ct, under, over).

Code
media_candidates <- function(dataset_media, source) {
  dataset_media |>
    dplyr::filter(!stringr::str_detect(value, glue::glue("\\/{source}\\/")))
}

14.2.3.7 stringdist_table()

Builds a distance matrix between sheet filenames and media filenames, and flags rows with any exact match.

Code
stringdist_table <- function(filenames_on_sheet, media) {
  stringdist::stringdistmatrix(
    filenames_on_sheet,
    media,
    method = "lv",
    useNames = "strings"
  ) |>
    as.data.frame() |>
    tibble::rownames_to_column("sheet") |>
    dplyr::as_tibble() |>
    dplyr::mutate(
      match_exactly = dplyr::if_any(dplyr::where(is.numeric), ~ . == 0)
    )
}

14.2.3.8 build_match_candidates()

Converts the matrix to long format and computes matching flags for partial and case-insensitive matches.

Code
build_match_candidates <- function(df_stringdist) {
  df_stringdist |>
    tidyr::pivot_longer(
      cols = -c(sheet, match_exactly),
      names_to = "file",
      values_to = "stringdist"
    ) |>
    dplyr::mutate(
      match_file_no_extension = sheet == tools::file_path_sans_ext(file),
      match_file_diff_capitalization = stringr::str_to_upper(sheet) ==
        stringr::str_to_upper(file),
      match_partially = dplyr::if_any(
        dplyr::where(is.numeric),
        ~ dplyr::between(., 1, 5)
      )
    ) |>
    dplyr::relocate(match_exactly, .after = stringdist) |>
    dplyr::arrange(desc(match_file_no_extension), file)
}

14.2.3.9 get_media_type_files()

Returns the list of files already placed in a given source folder for a dataset.

Code
get_media_type_files <- function(media_files_types_list, dataset, source) {
  if (!dataset %in% names(media_files_types_list)) {
    return(character())
  }
  if (!source %in% names(media_files_types_list[[dataset]])) {
    return(character())
  }
  media_files_types_list[[dataset]][[source]]$file
}

14.2.3.10 dedupe_matches()

Removes redundant candidate pairs, keeping only the most informative combinations.

Code
dedupe_matches <- function(match_candidates, files_to_exclude = character()) {
  filtered_candidates <- match_candidates |>
    dplyr::filter(!file %in% files_to_exclude)

  dup_sheet_file <- filtered_candidates |>
    dplyr::filter(
      match_file_no_extension == TRUE |
        match_file_diff_capitalization == TRUE
    ) |>
    dplyr::select(sheet, file)

  filtered_candidates |>
    dplyr::mutate(
      keep = dplyr::case_when(
        match_file_no_extension == FALSE &
          match_file_diff_capitalization == FALSE &
          sheet %in% dup_sheet_file$sheet ~
          "REMOVE",
        match_file_no_extension == FALSE &
          match_file_diff_capitalization == FALSE &
          file %in% dup_sheet_file$file ~
          "REMOVE",
        TRUE ~ "KEEP"
      )
    ) |>
    dplyr::filter(keep == "KEEP") |>
    dplyr::select(-keep)
}

14.2.3.11 copy_exact_matches()

Copies files whose names match exactly into the source-specific folder and returns the list of copied files.

Code
copy_exact_matches <- function(
  df_stringdist,
  media_without_source,
  target_dir,
  execute = TRUE
) {
  media_match_exactly <- df_stringdist |>
    dplyr::filter(match_exactly == TRUE)

  if (nrow(media_match_exactly) == 0) {
    return(tibble::tibble(file = character(), full_path_to_copy = character()))
  }

  if (execute && !dir.exists(target_dir)) {
    dir.create(target_dir, recursive = TRUE)
  }

  files_found_in_sheet <- df_stringdist |>
    dplyr::filter(match_exactly == TRUE) |>
    dplyr::pull(sheet) |>
    tibble::enframe(value = "file") |>
    dplyr::inner_join(media_without_source, by = "file") |>
    dplyr::mutate(full_path_to_copy = glue::glue("{target_dir}/{file}"))

  if (execute) {
    file.copy(
      from = files_found_in_sheet$value,
      to = files_found_in_sheet$full_path_to_copy,
      overwrite = FALSE,
      copy.date = TRUE
    )
  }

  files_found_in_sheet
}

14.2.3.12 process_dataset()

Runs the full matching logic for one dataset: prepares candidates, copies exact matches, and returns remaining candidates.

Code
process_dataset <- function(
  df,
  dataset_name,
  source,
  media_list,
  media_files_types_list,
  media_root,
  execute_copy = TRUE
) {
  cli::cli_alert_info("Processing dataset {dataset_name}.")

  if (is.null(media_list[[dataset_name]])) {
    cli::cli_abort(
      "Dataset {dataset_name} not found in the media inventory. Check media_root and folder structure."
    )
  }

  filenames_on_sheet <- extract_filenames_on_sheet(df, source)

  if (length(filenames_on_sheet) == 0) {
    return(tibble::tibble())
  }

  media_without_source <- media_candidates(media_list[[dataset_name]], source)

  if (nrow(media_without_source) == 0) {
    return(tibble::tibble())
  }

  media <- media_without_source |>
    dplyr::distinct(file) |>
    dplyr::pull(file)

  df_stringdist <- stringdist_table(filenames_on_sheet, media)

  files_copied <- copy_exact_matches(
    df_stringdist,
    media_without_source,
    glue::glue("{media_root}/{dataset_name}/{source}"),
    execute = execute_copy
  )

  match_candidates <- build_match_candidates(df_stringdist)

  files_in_source <- get_media_type_files(
    media_files_types_list,
    dataset_name,
    source
  )
  dedupe_matches(
    match_candidates,
    files_to_exclude = c(files_copied$file, files_in_source)
  )
}

14.2.3.13 check_match_media()

Applies process_dataset() to every dataset for a given source and returns a combined table.

Code
check_match_media <- function(
  source,
  media_list,
  media_files_types_list,
  media_root,
  path = "Example/12",
  execute_copy = TRUE,
  strict = TRUE
) {
  validate_source(source)
  cli::cli_alert("Starting source {source}")

  datasets <- load_source_data(source, path = path)
  if (strict) {
    ensure_dataset_names_match(datasets, media_list)
  }

  datasets_with_content <- datasets |>
    purrr::keep(~ nrow(.x) > 0)

  res <- purrr::imap(
    datasets_with_content,
    ~ process_dataset(
      .x,
      .y,
      source,
      media_list,
      media_files_types_list,
      media_root,
      execute_copy
    )
  )

  res |>
    purrr::keep(~ nrow(.x) > 0) |>
    dplyr::bind_rows(.id = "dataset") |>
    dplyr::mutate(source = source, .after = dataset)
}

14.2.3.14 save_partial_matches()

Writes only the partial matches to an Excel file inside Example/12/Output.

Code
save_partial_matches <- function(
  result,
  output_path = stringr::str_glue(
    "Example/12/Output/SCRIPT12_check_names_photos_{lubridate::today()}.xlsx"
  )
) {
  if (!dir.exists(dirname(output_path))) {
    dir.create(dirname(output_path), recursive = TRUE)
  }

  partial_matches <- result |>
    purrr::map(
      ~ .x |>
        dplyr::filter(
          match_exactly == FALSE,
          match_partially == TRUE
        )
    ) |>
    dplyr::bind_rows() |>
    tidyr::nest(.by = dataset) |>
    dplyr::arrange(dataset) |>
    dplyr::mutate(data = purrr::set_names(data, dataset)) |>
    dplyr::pull(data)

  if (length(partial_matches) == 0) {
    return(invisible(NULL))
  }

  partial_matches |>
    openxlsx2::write_xlsx(
      output_path,
      as_table = TRUE,
      overwrite = TRUE
    )
}

14.2.3.15 cleanup_media_root()

Removes duplicates from the media root after exact matches were copied to source folders.

Code
cleanup_media_root <- function(
  result,
  media_files,
  sources,
  execute = TRUE
) {
  sources_regex <- glue::glue_collapse(sources, "|")

  files_to_delete <- result |>
    purrr::map(
      ~ .x |>
        dplyr::filter(
          match_exactly == TRUE,
        ) |>
        dplyr::distinct(sheet, .keep_all = TRUE) |>
        dplyr::select(dataset, file = sheet)
    ) |>
    dplyr::bind_rows() |>
    dplyr::inner_join(media_files, by = c("dataset", "file")) |>
    dplyr::filter(
      !stringr::str_detect(value, glue::glue("\\/{sources_regex}\\/"))
    ) |>
    dplyr::pull(value)

  if (execute) {
    file.remove(files_to_delete)
  }

  files_to_delete
}

14.2.3.16 run_check_match_media()

Runs the full pipeline for all sources and returns the combined result invisibly.

Code
run_check_match_media <- function(
  sources,
  media_list,
  media_files,
  media_files_types_list,
  media_root,
  execute_copy = TRUE,
  execute_cleanup = TRUE
) {
  result <- purrr::map(purrr::set_names(sources), function(source) {
    check_match_media(
      source,
      media_list,
      media_files_types_list,
      media_root,
      execute_copy = execute_copy
    )
  })

  save_partial_matches(result)
  cleanup_media_root(result, media_files, sources, execute = execute_cleanup)

  invisible(result)
}

14.2.3.17 preview_match_tables()

Builds the two intermediate tables used for manual inspection: the wide stringdist matrix and the long candidate table.

Code
preview_match_tables <- function(
  source,
  dataset_name,
  media_list,
  media_files_types_list,
  path = "Example/12"
) {
  datasets <- load_source_data(source, path = path)

  if (!dataset_name %in% names(datasets)) {
    cli::cli_abort("Dataset {dataset_name} not found in the spreadsheet.")
  }

  media_without_source <- media_candidates(media_list[[dataset_name]], source)

  media <- media_without_source |>
    dplyr::distinct(file) |>
    dplyr::pull(file)

  filenames_on_sheet <- extract_filenames_on_sheet(
    datasets[[dataset_name]],
    source
  )

  df_stringdist <- stringdist_table(filenames_on_sheet, media)
  match_candidates <- build_match_candidates(df_stringdist)

  files_in_source <- get_media_type_files(
    media_files_types_list,
    dataset_name,
    source
  )
  match_candidates <- dedupe_matches(
    match_candidates,
    files_to_exclude = files_in_source
  )

  list(
    df_stringdist = df_stringdist,
    match_candidates = match_candidates
  )
}

14.2.4 Initial comparison of dataset names

Here we check whether the folder names in Example match the Excel filenames.

Code
media_path <- "Example/12"
folders <- list_dataset_folders(media_path)

names_folders <- basename(folders)
names_excel <- list_excel_dataset_names(path = media_path)

waldo::compare(names_folders, names_excel)
`old`: "Example0" "Example1" "Output"
`new`: "Example0" "Example1"         

14.2.5 Optional organization of raw media

This chunk prepares and executes the copy mapping for files into Example/Media/. If you only want to preview the mapping, set execute = FALSE.

Code
media_map <- organize_media_files(
  media_path = "Example/12",
  media_root = "Example/Media",
  dataset_index = 3,
  execute = TRUE
)
Warning in dir.create(.x, recursive = TRUE): 'Example/Media/Example0/image'
already exists
Warning in dir.create(.x, recursive = TRUE): 'Example/Media/Example1/image'
already exists
Warning in file.copy(from = df$file, to = df$filename, recursive = TRUE):
'recursive' will be ignored as 'to' is not a single existing directory
Code
media_map |>
  dplyr::select(dataset, media_type, file, filename) |>
  dplyr::slice_head(n = 10)
# A tibble: 10 × 4
   dataset  media_type file                                            filename 
   <chr>    <chr>      <chr>                                           <glue>   
 1 Example0 image      Example/12/Example0/PNSB01_PNSB_cam1b_21mar.jpg Example/…
 2 Example0 image      Example/12/Example0/PNSB01_PNSB_cam1c_02mai.jpg Example/…
 3 Example0 image      Example/12/Example0/PNSB01_PNSB_cam1d_13jun.jpg Example/…
 4 Example0 image      Example/12/Example0/PNSB01_PNSB_cam1e_23ago.jpg Example/…
 5 Example0 image      Example/12/Example0/PNSB01_PNSB_cam1f_12jul.jpg Example/…
 6 Example0 image      Example/12/Example0/PNSB01.JPG                  Example/…
 7 Example0 image      Example/12/Example0/PNSB02_PNSB_cam2a_02mai.jpg Example/…
 8 Example0 image      Example/12/Example0/PNSB02_PNSB_cam2b_13jun.jpg Example/…
 9 Example0 image      Example/12/Example0/PNSB02_PNSB_cam2d_12jul.jpg Example/…
10 Example0 image      Example/12/Example0/PNSB02.jpg                  Example/…

14.2.6 Media inventory

We now create an inventory of all files already organized under Example/Media/, separating what is in the root from what has already been classified as ct, under, or over.

Code
media_inventory <- build_media_inventory(media_root = "Example/Media")
media_files <- media_inventory$media_files
media_list <- media_inventory$media_list
media_files_types_list <- media_inventory$media_files_types_list

media_files |>
  dplyr::count(media_type, sort = TRUE)
# A tibble: 4 × 2
  media_type     n
  <chr>      <int>
1 image         47
2 ct            19
3 over          10
4 under          4

14.2.7 Intermediate outputs preview

To make review easier, the chunk below shows two intermediate outputs: (1) the wide stringdist table with match_exactly; (2) the long table with match_partially and other flags. Adjust example_source or example_dataset as needed.

Code
example_source <- "ct"
example_datasets <- load_source_data(example_source)

example_dataset <- intersect(names(example_datasets), names(media_list))[1]

if (!is.na(example_dataset)) {
  preview <- preview_match_tables(
    source = example_source,
    dataset_name = example_dataset,
    media_list = media_list,
    media_files_types_list = media_files_types_list
  )

  preview$df_stringdist |>
    dplyr::slice_head(n = 5)

  preview$match_candidates |>
    dplyr::slice_head(n = 20)
}
# A tibble: 20 × 7
   sheet                   file  stringdist match_exactly match_file_no_extens…¹
   <chr>                   <chr>      <dbl> <lgl>         <lgl>                 
 1 PNSB01_PNSB_cam1b_21ma… PNSB…         20 TRUE          FALSE                 
 2 PNSB01_PNSB_cam1c_02ma… PNSB…         20 TRUE          FALSE                 
 3 PNSB01_PNSB_cam1d_13ju… PNSB…         20 TRUE          FALSE                 
 4 PNSB01_PNSB_cam1e_23ag… PNSB…         20 TRUE          FALSE                 
 5 PNSB02_PNSB_cam2a_02ma… PNSB…         21 TRUE          FALSE                 
 6 PNSB02_PNSB_cam2b_13ju… PNSB…         20 TRUE          FALSE                 
 7 PNSB03_PNSB_cam3b_13ju… PNSB…         20 TRUE          FALSE                 
 8 PNSB04_PNSB_cam4c_21ma… PNSB…         20 TRUE          FALSE                 
 9 PNSB04_PNSB_cam4d_02ma… PNSB…         21 TRUE          FALSE                 
10 PNSB04_PNSB_cam4e_13ju… PNSB…         20 TRUE          FALSE                 
11 PNSB01_PNSB_cam1f_12ju… PNSB…         20 TRUE          FALSE                 
12 PNSB02_PNSB_cam2d_12ju… PNSB…         20 TRUE          FALSE                 
13 PNSB03_PNSB_cam3d_12ju… PNSB…         20 TRUE          FALSE                 
14 PNSB05_PNSB_cam5c_12ju… PNSB…         20 TRUE          FALSE                 
15 PNSB06_PNSB_cam6a_12ju… PNSB…         20 TRUE          FALSE                 
16 PNSB01_PNSB_cam1b_21ma… PNSB…         17 TRUE          FALSE                 
17 PNSB01_PNSB_cam1c_02ma… PNSB…         17 TRUE          FALSE                 
18 PNSB01_PNSB_cam1d_13ju… PNSB…         18 TRUE          FALSE                 
19 PNSB01_PNSB_cam1e_23ag… PNSB…         17 TRUE          FALSE                 
20 PNSB02_PNSB_cam2a_02ma… PNSB…         17 TRUE          FALSE                 
# ℹ abbreviated name: ¹​match_file_no_extension
# ℹ 2 more variables: match_file_diff_capitalization <lgl>,
#   match_partially <lgl>

14.2.8 Run the checker

Finally, we run the full pipeline. It generates the Excel file with partial matches and removes duplicates from the Example/Media/ root. If you only want to simulate cleanup, set execute_cleanup = FALSE.

Code
sources <- c("ct", "under", "over")

result <- run_check_match_media(
  sources = sources,
  media_list = media_list,
  media_files = media_files,
  media_files_types_list = media_files_types_list,
  media_root = "Example/Media",
  execute_copy = TRUE,
  execute_cleanup = TRUE
)

14.2.9 Results summary

This last step summarizes records that remained as candidates (partial matches) or inconsistencies.

Code
result |>
  purrr::map(
    ~ .x |>
      dplyr::filter(
        match_exactly == FALSE,
        match_partially == TRUE
      )
  ) |>
  dplyr::bind_rows(.id = "source") |>
  dplyr::glimpse()
Rows: 15
Columns: 9
$ dataset                        <chr> "Example1", "Example1", "Example1", "Ex…
$ source                         <chr> "ct", "ct", "ct", "ct", "ct", "ct", "ct…
$ sheet                          <chr> "DSCF0028 - frame at 0m9s", "DSCF0033 -…
$ file                           <chr> "DSCF0028 - frame at 0m9s.jpg", "DSCF00…
$ stringdist                     <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 5, 2, 1, 3, 1, …
$ match_exactly                  <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
$ match_file_no_extension        <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRU…
$ match_file_diff_capitalization <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
$ match_partially                <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRU…