Code
source("R/FUNCTIONS.R")This script validates the consistency between the photo/video filenames in the spreadsheets and the files present in the media folders. It also organizes media by dataset and type, identifies exact and partial matches, and generates a review spreadsheet.
We follow a staged workflow: (1) check whether dataset folder names match Excel filenames; (2) organize raw files into Example/Media/{dataset}/{image|video} (optional); (3) build a media inventory; (4) compare names between spreadsheets and files using stringdist; (5) copy exact matches to Example/Media/{dataset}/{ct|under|over}; (6) save a report of partial matches to Example/12/Output; (7) remove duplicates from the media root.
First, we load the project’s utility functions. They include read_sheet, used to read the spreadsheets.
source("R/FUNCTIONS.R")Below we define one function per step of the media organization process. Each function has a single responsibility, which makes the pipeline easier to debug and reuse.
list_dataset_folders()Lists only the first-level folders under the given media_path. These folders represent datasets that should match the Excel filenames.
list_dataset_folders <- function(media_path) {
if (!dir.exists(media_path)) {
cli::cli_abort("Folder {media_path} not found.")
}
list.dirs(path = media_path, recursive = FALSE)
}list_excel_dataset_names()Reads the Excel files in a directory and returns their names without extensions. We use this to validate the dataset list.
list_excel_dataset_names <- function(path = "Example/12") {
read_sheet(path = path, results = FALSE, recurse = FALSE) |>
names()
}extract_dataset_from_path()Extracts the dataset name from a full file path. The dataset_index lets you control which path segment should be interpreted as the dataset name.
extract_dataset_from_path <- function(path, dataset_index = 3) {
stringr::str_split_i(path, pattern = "\\/", dataset_index)
}organize_media_files()Builds the mapping between raw media files and their standardized destinations in Example/Media/{dataset}/{image|video}. When execute = TRUE, it creates the folders and copies the files.
organize_media_files <- function(
media_path = "Example/12",
media_root = "Example/Media",
dataset_index = 3,
execute = FALSE
) {
if (execute && !dir.exists(media_root)) {
dir.create(media_root, recursive = TRUE)
}
folders <- list_dataset_folders(media_path)
files <- list.files(folders, full.names = TRUE, recursive = TRUE)
df <- tibble::tibble(file = files) |>
dplyr::mutate(type = mime::guess_type(file)) |>
dplyr::filter(stringr::str_detect(type, "^image/|^video")) |>
dplyr::mutate(
dataset = extract_dataset_from_path(file, dataset_index = dataset_index),
media_type = stringr::str_split_i(type, pattern = "\\/", 1),
folder = stringr::str_glue("{media_root}/{dataset}/{media_type}/"),
filename = stringr::str_glue("{folder}{basename(file)}")
)
if (execute) {
purrr::walk(unique(df$folder), ~ dir.create(.x, recursive = TRUE))
file.copy(from = df$file, to = df$filename, recursive = TRUE)
}
df
}Now we define one function per comparison step: inventory, validation, matching, and output generation.
build_media_inventory()Scans Example/Media and constructs three structures: the full file list, the list excluding ct/under/over, and a split list of already classified media.
build_media_inventory <- function(
media_root = "Example/Media",
sources = c("ct", "under", "over")
) {
if (!dir.exists(media_root)) {
cli::cli_abort(
"Media folder {media_root} not found. Run organize_media_files() with execute = TRUE first."
)
}
media_files <- list.files(
path = media_root,
recursive = TRUE,
full.names = TRUE
) |>
dplyr::as_tibble() |>
dplyr::mutate(
dataset = stringr::word(value, 3, 3, sep = "\\/"),
media_type = stringr::word(value, 4, 4, sep = "\\/"),
file = basename(value)
)
media_files_types <- media_files |>
dplyr::filter(media_type %in% sources)
media_files_types_list <- media_files_types |>
(\(df) split(df, df$dataset))() |>
purrr::map(\(df) split(df, df$media_type))
media_anti_join <- dplyr::anti_join(media_files, media_files_types)
media_list <- split(media_anti_join, media_anti_join$dataset)
list(
media_files = media_files,
media_list = media_list,
media_files_types_list = media_files_types_list
)
}validate_source()Confirms that the requested source is one of the accepted values.
validate_source <- function(source, allowed = c("ct", "under", "over")) {
if (!source %in% allowed) {
cli::cli_abort(
"Source {source} is not an accepted value. Accepted values are 'ct', 'under', 'over'"
)
}
}load_source_data()Loads the appropriate sheet based on the source and returns a named list of datasets.
load_source_data <- function(source, path = "Example/12") {
source <- as.character(source)
sheet_name <- dplyr::case_match(
source,
"ct" ~ "Camera_trap",
"under" ~ "Underpasses",
"over" ~ "Overpasses"
)
if (is.na(sheet_name)) {
cli::cli_abort("Source {source} is not mapped to a sheet.")
}
read_sheet(
path = path,
sheet = sheet_name,
na = c("NA", "-"),
recurse = FALSE
)
}ensure_dataset_names_match()Checks whether dataset names in the Excel sheets match the dataset folders found in the media inventory.
ensure_dataset_names_match <- function(datasets, media_list) {
if (length(media_list) == 0) {
cli::cli_abort(
"Media inventory is empty. Make sure media files were copied to the Media folder."
)
}
datasets_not_in_common <- dplyr::setdiff(names(datasets), names(media_list))
if (length(datasets_not_in_common) != 0) {
cli::cli_abort(
"There are different datasets between media folder and Excel sheets."
)
}
}extract_filenames_on_sheet()Returns the unique list of filenames referenced in the spreadsheet for a given source.
extract_filenames_on_sheet <- function(df, source) {
column <- if (source == "ct") "Camera_vision_photo" else "Structure_photo"
col_sym <- rlang::sym(column)
df |>
dplyr::distinct(dplyr::across(dplyr::all_of(column))) |>
dplyr::filter(!is.na(!!col_sym)) |>
dplyr::pull(!!col_sym)
}media_candidates()Filters out files that are already inside the source-specific folders (ct, under, over).
media_candidates <- function(dataset_media, source) {
dataset_media |>
dplyr::filter(!stringr::str_detect(value, glue::glue("\\/{source}\\/")))
}stringdist_table()Builds a distance matrix between sheet filenames and media filenames, and flags rows with any exact match.
stringdist_table <- function(filenames_on_sheet, media) {
stringdist::stringdistmatrix(
filenames_on_sheet,
media,
method = "lv",
useNames = "strings"
) |>
as.data.frame() |>
tibble::rownames_to_column("sheet") |>
dplyr::as_tibble() |>
dplyr::mutate(
match_exactly = dplyr::if_any(dplyr::where(is.numeric), ~ . == 0)
)
}build_match_candidates()Converts the matrix to long format and computes matching flags for partial and case-insensitive matches.
build_match_candidates <- function(df_stringdist) {
df_stringdist |>
tidyr::pivot_longer(
cols = -c(sheet, match_exactly),
names_to = "file",
values_to = "stringdist"
) |>
dplyr::mutate(
match_file_no_extension = sheet == tools::file_path_sans_ext(file),
match_file_diff_capitalization = stringr::str_to_upper(sheet) ==
stringr::str_to_upper(file),
match_partially = dplyr::if_any(
dplyr::where(is.numeric),
~ dplyr::between(., 1, 5)
)
) |>
dplyr::relocate(match_exactly, .after = stringdist) |>
dplyr::arrange(desc(match_file_no_extension), file)
}get_media_type_files()Returns the list of files already placed in a given source folder for a dataset.
get_media_type_files <- function(media_files_types_list, dataset, source) {
if (!dataset %in% names(media_files_types_list)) {
return(character())
}
if (!source %in% names(media_files_types_list[[dataset]])) {
return(character())
}
media_files_types_list[[dataset]][[source]]$file
}dedupe_matches()Removes redundant candidate pairs, keeping only the most informative combinations.
dedupe_matches <- function(match_candidates, files_to_exclude = character()) {
filtered_candidates <- match_candidates |>
dplyr::filter(!file %in% files_to_exclude)
dup_sheet_file <- filtered_candidates |>
dplyr::filter(
match_file_no_extension == TRUE |
match_file_diff_capitalization == TRUE
) |>
dplyr::select(sheet, file)
filtered_candidates |>
dplyr::mutate(
keep = dplyr::case_when(
match_file_no_extension == FALSE &
match_file_diff_capitalization == FALSE &
sheet %in% dup_sheet_file$sheet ~
"REMOVE",
match_file_no_extension == FALSE &
match_file_diff_capitalization == FALSE &
file %in% dup_sheet_file$file ~
"REMOVE",
TRUE ~ "KEEP"
)
) |>
dplyr::filter(keep == "KEEP") |>
dplyr::select(-keep)
}copy_exact_matches()Copies files whose names match exactly into the source-specific folder and returns the list of copied files.
copy_exact_matches <- function(
df_stringdist,
media_without_source,
target_dir,
execute = TRUE
) {
media_match_exactly <- df_stringdist |>
dplyr::filter(match_exactly == TRUE)
if (nrow(media_match_exactly) == 0) {
return(tibble::tibble(file = character(), full_path_to_copy = character()))
}
if (execute && !dir.exists(target_dir)) {
dir.create(target_dir, recursive = TRUE)
}
files_found_in_sheet <- df_stringdist |>
dplyr::filter(match_exactly == TRUE) |>
dplyr::pull(sheet) |>
tibble::enframe(value = "file") |>
dplyr::inner_join(media_without_source, by = "file") |>
dplyr::mutate(full_path_to_copy = glue::glue("{target_dir}/{file}"))
if (execute) {
file.copy(
from = files_found_in_sheet$value,
to = files_found_in_sheet$full_path_to_copy,
overwrite = FALSE,
copy.date = TRUE
)
}
files_found_in_sheet
}process_dataset()Runs the full matching logic for one dataset: prepares candidates, copies exact matches, and returns remaining candidates.
process_dataset <- function(
df,
dataset_name,
source,
media_list,
media_files_types_list,
media_root,
execute_copy = TRUE
) {
cli::cli_alert_info("Processing dataset {dataset_name}.")
if (is.null(media_list[[dataset_name]])) {
cli::cli_abort(
"Dataset {dataset_name} not found in the media inventory. Check media_root and folder structure."
)
}
filenames_on_sheet <- extract_filenames_on_sheet(df, source)
if (length(filenames_on_sheet) == 0) {
return(tibble::tibble())
}
media_without_source <- media_candidates(media_list[[dataset_name]], source)
if (nrow(media_without_source) == 0) {
return(tibble::tibble())
}
media <- media_without_source |>
dplyr::distinct(file) |>
dplyr::pull(file)
df_stringdist <- stringdist_table(filenames_on_sheet, media)
files_copied <- copy_exact_matches(
df_stringdist,
media_without_source,
glue::glue("{media_root}/{dataset_name}/{source}"),
execute = execute_copy
)
match_candidates <- build_match_candidates(df_stringdist)
files_in_source <- get_media_type_files(
media_files_types_list,
dataset_name,
source
)
dedupe_matches(
match_candidates,
files_to_exclude = c(files_copied$file, files_in_source)
)
}check_match_media()Applies process_dataset() to every dataset for a given source and returns a combined table.
check_match_media <- function(
source,
media_list,
media_files_types_list,
media_root,
path = "Example/12",
execute_copy = TRUE,
strict = TRUE
) {
validate_source(source)
cli::cli_alert("Starting source {source}")
datasets <- load_source_data(source, path = path)
if (strict) {
ensure_dataset_names_match(datasets, media_list)
}
datasets_with_content <- datasets |>
purrr::keep(~ nrow(.x) > 0)
res <- purrr::imap(
datasets_with_content,
~ process_dataset(
.x,
.y,
source,
media_list,
media_files_types_list,
media_root,
execute_copy
)
)
res |>
purrr::keep(~ nrow(.x) > 0) |>
dplyr::bind_rows(.id = "dataset") |>
dplyr::mutate(source = source, .after = dataset)
}save_partial_matches()Writes only the partial matches to an Excel file inside Example/12/Output.
save_partial_matches <- function(
result,
output_path = stringr::str_glue(
"Example/12/Output/SCRIPT12_check_names_photos_{lubridate::today()}.xlsx"
)
) {
if (!dir.exists(dirname(output_path))) {
dir.create(dirname(output_path), recursive = TRUE)
}
partial_matches <- result |>
purrr::map(
~ .x |>
dplyr::filter(
match_exactly == FALSE,
match_partially == TRUE
)
) |>
dplyr::bind_rows() |>
tidyr::nest(.by = dataset) |>
dplyr::arrange(dataset) |>
dplyr::mutate(data = purrr::set_names(data, dataset)) |>
dplyr::pull(data)
if (length(partial_matches) == 0) {
return(invisible(NULL))
}
partial_matches |>
openxlsx2::write_xlsx(
output_path,
as_table = TRUE,
overwrite = TRUE
)
}cleanup_media_root()Removes duplicates from the media root after exact matches were copied to source folders.
cleanup_media_root <- function(
result,
media_files,
sources,
execute = TRUE
) {
sources_regex <- glue::glue_collapse(sources, "|")
files_to_delete <- result |>
purrr::map(
~ .x |>
dplyr::filter(
match_exactly == TRUE,
) |>
dplyr::distinct(sheet, .keep_all = TRUE) |>
dplyr::select(dataset, file = sheet)
) |>
dplyr::bind_rows() |>
dplyr::inner_join(media_files, by = c("dataset", "file")) |>
dplyr::filter(
!stringr::str_detect(value, glue::glue("\\/{sources_regex}\\/"))
) |>
dplyr::pull(value)
if (execute) {
file.remove(files_to_delete)
}
files_to_delete
}run_check_match_media()Runs the full pipeline for all sources and returns the combined result invisibly.
run_check_match_media <- function(
sources,
media_list,
media_files,
media_files_types_list,
media_root,
execute_copy = TRUE,
execute_cleanup = TRUE
) {
result <- purrr::map(purrr::set_names(sources), function(source) {
check_match_media(
source,
media_list,
media_files_types_list,
media_root,
execute_copy = execute_copy
)
})
save_partial_matches(result)
cleanup_media_root(result, media_files, sources, execute = execute_cleanup)
invisible(result)
}preview_match_tables()Builds the two intermediate tables used for manual inspection: the wide stringdist matrix and the long candidate table.
preview_match_tables <- function(
source,
dataset_name,
media_list,
media_files_types_list,
path = "Example/12"
) {
datasets <- load_source_data(source, path = path)
if (!dataset_name %in% names(datasets)) {
cli::cli_abort("Dataset {dataset_name} not found in the spreadsheet.")
}
media_without_source <- media_candidates(media_list[[dataset_name]], source)
media <- media_without_source |>
dplyr::distinct(file) |>
dplyr::pull(file)
filenames_on_sheet <- extract_filenames_on_sheet(
datasets[[dataset_name]],
source
)
df_stringdist <- stringdist_table(filenames_on_sheet, media)
match_candidates <- build_match_candidates(df_stringdist)
files_in_source <- get_media_type_files(
media_files_types_list,
dataset_name,
source
)
match_candidates <- dedupe_matches(
match_candidates,
files_to_exclude = files_in_source
)
list(
df_stringdist = df_stringdist,
match_candidates = match_candidates
)
}Here we check whether the folder names in Example match the Excel filenames.
media_path <- "Example/12"
folders <- list_dataset_folders(media_path)
names_folders <- basename(folders)
names_excel <- list_excel_dataset_names(path = media_path)
waldo::compare(names_folders, names_excel)`old`: "Example0" "Example1" "Output"
`new`: "Example0" "Example1"
This chunk prepares and executes the copy mapping for files into Example/Media/. If you only want to preview the mapping, set execute = FALSE.
media_map <- organize_media_files(
media_path = "Example/12",
media_root = "Example/Media",
dataset_index = 3,
execute = TRUE
)Warning in dir.create(.x, recursive = TRUE): 'Example/Media/Example0/image'
already exists
Warning in dir.create(.x, recursive = TRUE): 'Example/Media/Example1/image'
already exists
Warning in file.copy(from = df$file, to = df$filename, recursive = TRUE):
'recursive' will be ignored as 'to' is not a single existing directory
media_map |>
dplyr::select(dataset, media_type, file, filename) |>
dplyr::slice_head(n = 10)# A tibble: 10 × 4
dataset media_type file filename
<chr> <chr> <chr> <glue>
1 Example0 image Example/12/Example0/PNSB01_PNSB_cam1b_21mar.jpg Example/…
2 Example0 image Example/12/Example0/PNSB01_PNSB_cam1c_02mai.jpg Example/…
3 Example0 image Example/12/Example0/PNSB01_PNSB_cam1d_13jun.jpg Example/…
4 Example0 image Example/12/Example0/PNSB01_PNSB_cam1e_23ago.jpg Example/…
5 Example0 image Example/12/Example0/PNSB01_PNSB_cam1f_12jul.jpg Example/…
6 Example0 image Example/12/Example0/PNSB01.JPG Example/…
7 Example0 image Example/12/Example0/PNSB02_PNSB_cam2a_02mai.jpg Example/…
8 Example0 image Example/12/Example0/PNSB02_PNSB_cam2b_13jun.jpg Example/…
9 Example0 image Example/12/Example0/PNSB02_PNSB_cam2d_12jul.jpg Example/…
10 Example0 image Example/12/Example0/PNSB02.jpg Example/…
We now create an inventory of all files already organized under Example/Media/, separating what is in the root from what has already been classified as ct, under, or over.
media_inventory <- build_media_inventory(media_root = "Example/Media")
media_files <- media_inventory$media_files
media_list <- media_inventory$media_list
media_files_types_list <- media_inventory$media_files_types_list
media_files |>
dplyr::count(media_type, sort = TRUE)# A tibble: 4 × 2
media_type n
<chr> <int>
1 image 47
2 ct 19
3 over 10
4 under 4
To make review easier, the chunk below shows two intermediate outputs: (1) the wide stringdist table with match_exactly; (2) the long table with match_partially and other flags. Adjust example_source or example_dataset as needed.
example_source <- "ct"
example_datasets <- load_source_data(example_source)
example_dataset <- intersect(names(example_datasets), names(media_list))[1]
if (!is.na(example_dataset)) {
preview <- preview_match_tables(
source = example_source,
dataset_name = example_dataset,
media_list = media_list,
media_files_types_list = media_files_types_list
)
preview$df_stringdist |>
dplyr::slice_head(n = 5)
preview$match_candidates |>
dplyr::slice_head(n = 20)
}# A tibble: 20 × 7
sheet file stringdist match_exactly match_file_no_extens…¹
<chr> <chr> <dbl> <lgl> <lgl>
1 PNSB01_PNSB_cam1b_21ma… PNSB… 20 TRUE FALSE
2 PNSB01_PNSB_cam1c_02ma… PNSB… 20 TRUE FALSE
3 PNSB01_PNSB_cam1d_13ju… PNSB… 20 TRUE FALSE
4 PNSB01_PNSB_cam1e_23ag… PNSB… 20 TRUE FALSE
5 PNSB02_PNSB_cam2a_02ma… PNSB… 21 TRUE FALSE
6 PNSB02_PNSB_cam2b_13ju… PNSB… 20 TRUE FALSE
7 PNSB03_PNSB_cam3b_13ju… PNSB… 20 TRUE FALSE
8 PNSB04_PNSB_cam4c_21ma… PNSB… 20 TRUE FALSE
9 PNSB04_PNSB_cam4d_02ma… PNSB… 21 TRUE FALSE
10 PNSB04_PNSB_cam4e_13ju… PNSB… 20 TRUE FALSE
11 PNSB01_PNSB_cam1f_12ju… PNSB… 20 TRUE FALSE
12 PNSB02_PNSB_cam2d_12ju… PNSB… 20 TRUE FALSE
13 PNSB03_PNSB_cam3d_12ju… PNSB… 20 TRUE FALSE
14 PNSB05_PNSB_cam5c_12ju… PNSB… 20 TRUE FALSE
15 PNSB06_PNSB_cam6a_12ju… PNSB… 20 TRUE FALSE
16 PNSB01_PNSB_cam1b_21ma… PNSB… 17 TRUE FALSE
17 PNSB01_PNSB_cam1c_02ma… PNSB… 17 TRUE FALSE
18 PNSB01_PNSB_cam1d_13ju… PNSB… 18 TRUE FALSE
19 PNSB01_PNSB_cam1e_23ag… PNSB… 17 TRUE FALSE
20 PNSB02_PNSB_cam2a_02ma… PNSB… 17 TRUE FALSE
# ℹ abbreviated name: ¹match_file_no_extension
# ℹ 2 more variables: match_file_diff_capitalization <lgl>,
# match_partially <lgl>
Finally, we run the full pipeline. It generates the Excel file with partial matches and removes duplicates from the Example/Media/ root. If you only want to simulate cleanup, set execute_cleanup = FALSE.
sources <- c("ct", "under", "over")
result <- run_check_match_media(
sources = sources,
media_list = media_list,
media_files = media_files,
media_files_types_list = media_files_types_list,
media_root = "Example/Media",
execute_copy = TRUE,
execute_cleanup = TRUE
)This last step summarizes records that remained as candidates (partial matches) or inconsistencies.
result |>
purrr::map(
~ .x |>
dplyr::filter(
match_exactly == FALSE,
match_partially == TRUE
)
) |>
dplyr::bind_rows(.id = "source") |>
dplyr::glimpse()Rows: 15
Columns: 9
$ dataset <chr> "Example1", "Example1", "Example1", "Ex…
$ source <chr> "ct", "ct", "ct", "ct", "ct", "ct", "ct…
$ sheet <chr> "DSCF0028 - frame at 0m9s", "DSCF0033 -…
$ file <chr> "DSCF0028 - frame at 0m9s.jpg", "DSCF00…
$ stringdist <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 5, 2, 1, 3, 1, …
$ match_exactly <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
$ match_file_no_extension <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRU…
$ match_file_diff_capitalization <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
$ match_partially <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRU…