Commit e50d930

Fix problem with not proper loading cached results in duplicate mode (#1086)
* Fix problem with not proper loading cached results in duplicate mode
* Dbg
1 parent: 0462324 · commit: e50d930

File tree

4 files changed: +33 -18 lines


Changelog.md

+10-2
```diff
@@ -1,9 +1,17 @@
 ## Version 6.1.0 - ?
-- BREAKING CHANGE - Changed cache saving method, deduplicated, optimized and simplified procedure - [#1072](https://github.com/qarmin/czkawka/pull/1072)
+- BREAKING CHANGE - Changed cache saving method, deduplicated, optimized and simplified procedure (all files need to be hashed again) - [#1072](https://github.com/qarmin/czkawka/pull/1072)
 - Remove up to 170ms of delay after ending scan - [#1070](https://github.com/qarmin/czkawka/pull/1070)
 - Added logger with useful info when debugging app (level can be adjusted via e.g. `RUST_LOG=debug` env) - [#1072](https://github.com/qarmin/czkawka/pull/1072), [#1070](https://github.com/qarmin/czkawka/pull/1070)
-- Core code cleanup - [#1072](https://github.com/qarmin/czkawka/pull/1072), [#1070](https://github.com/qarmin/czkawka/pull/1070)
+- Core code cleanup - [#1072](https://github.com/qarmin/czkawka/pull/1072), [#1070](https://github.com/qarmin/czkawka/pull/1070), [#1082](https://github.com/qarmin/czkawka/pull/1082)
 - Updated list of bad extensions and support for finding invalid jar files - [#1070](https://github.com/qarmin/czkawka/pull/1070)
+- More default excluded items on Windows (like pagefile) - [#1074](https://github.com/qarmin/czkawka/pull/1074)
+- Unified printing/saving method to files/terminal and fixed some differences/bugs - [#1082](https://github.com/qarmin/czkawka/pull/1082)
+- Uses fun_time library to print how much time functions take - [#1082](https://github.com/qarmin/czkawka/pull/1082)
+- Added exporting results into json file format - [#1083](https://github.com/qarmin/czkawka/pull/1083)
+- Added new test/regression suite for CI - [#1083](https://github.com/qarmin/czkawka/pull/1083)
+- Added ability to use relative paths - [#1083](https://github.com/qarmin/czkawka/pull/1083)
+- Fixed stability problem that could remove an invalid file in CLI - [#1083](https://github.com/qarmin/czkawka/pull/1083)
+- Fixed problem with invalid cache loading - [#0000]
 - Fix Windows gui crashes by using gtk 4.6 instead 4.8 or 4.10 - [#992](https://github.com/qarmin/czkawka/pull/992)
 - Fixed printing info about duplicated music files - [#1016](https://github.com/qarmin/czkawka/pull/1016)
 - Fixed printing info about duplicated video files - [#1017](https://github.com/qarmin/czkawka/pull/1017)
```

README.md

+1-1
```diff
@@ -20,7 +20,7 @@
 - Temporary Files - Finds temporary files
 - Similar Images - Finds images which are not exactly the same (different resolution, watermarks)
 - Similar Videos - Looks for visually similar videos
-- Same Music - Searches for music with the same artist, album etc.
+- Same Music - Searches for similar music by tags or by reading content and comparing it
 - Invalid Symbolic Links - Shows symbolic links which point to non-existent files/directories
 - Broken Files - Finds files that are invalid or corrupted
 - Bad Extensions - Lists files whose content not match with their extension
```

czkawka_core/src/duplicate.rs

+21-13
```diff
@@ -1,3 +1,4 @@
+use std::collections::HashMap;
 use std::collections::{BTreeMap, HashSet};
 use std::fmt::Debug;
 use std::fs::File;
@@ -424,11 +425,14 @@ impl DuplicateFinder {
         debug!("prehash_load_cache_at_start - started diff between loaded and prechecked files");
         for (size, mut vec_file_entry) in mem::take(&mut self.files_with_identical_size) {
             if let Some(cached_vec_file_entry) = loaded_hash_map.get(&size) {
-                // TODO maybe hashset is not needed when using < 4 elements
-                let cached_path_entries = cached_vec_file_entry.iter().map(|e| &e.path).collect::<HashSet<_>>();
+                // TODO maybe hashmap is not needed when using < 4 elements
+                let mut cached_path_entries: HashMap<&Path, FileEntry> = HashMap::new();
+                for file_entry in cached_vec_file_entry {
+                    cached_path_entries.insert(&file_entry.path, file_entry.clone());
+                }
                 for file_entry in vec_file_entry {
-                    if cached_path_entries.contains(&file_entry.path) {
-                        records_already_cached.entry(size).or_default().push(file_entry);
+                    if let Some(cached_file_entry) = cached_path_entries.remove(file_entry.path.as_path()) {
+                        records_already_cached.entry(size).or_default().push(cached_file_entry);
                     } else {
                         non_cached_files_to_check.entry(size).or_default().push(file_entry);
                     }
```
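This hunk (mirrored below for `full_hashing_load_cache_at_start`) carries the actual fix: the old code collected only the cached paths into a `HashSet`, so on a cache hit it pushed the freshly scanned `FileEntry`, whose hash field was still empty, and the hash loaded from cache was thrown away. Keying a `HashMap` by path lets the code recover the complete cached entry, hash included. A minimal, self-contained sketch of the pattern, using a simplified `FileEntry` and a hypothetical `split_by_cache` helper rather than the real czkawka types:

```rust
use std::collections::HashMap;
use std::path::{Path, PathBuf};

#[derive(Clone, Debug)]
struct FileEntry {
    path: PathBuf,
    hash: String, // populated for entries loaded from cache; empty for fresh scans
}

// Split freshly scanned entries into those already present in the cache
// (returning the cached entry, which carries its hash) and those that
// still need to be hashed.
fn split_by_cache(scanned: Vec<FileEntry>, cached: &[FileEntry]) -> (Vec<FileEntry>, Vec<FileEntry>) {
    // Key the cache by path so the whole cached entry can be recovered,
    // not just a yes/no membership answer as a HashSet would give.
    let mut cached_by_path: HashMap<&Path, &FileEntry> =
        cached.iter().map(|e| (e.path.as_path(), e)).collect();

    let mut already_cached = Vec::new();
    let mut to_check = Vec::new();
    for entry in scanned {
        if let Some(cached_entry) = cached_by_path.remove(entry.path.as_path()) {
            // Use the cached entry: its `hash` is already filled in,
            // so the file does not have to be hashed again.
            already_cached.push(cached_entry.clone());
        } else {
            to_check.push(entry);
        }
    }
    (already_cached, to_check)
}

fn main() {
    let cached = vec![FileEntry { path: PathBuf::from("/tmp/a"), hash: "abc123".into() }];
    let scanned = vec![
        FileEntry { path: PathBuf::from("/tmp/a"), hash: String::new() },
        FileEntry { path: PathBuf::from("/tmp/b"), hash: String::new() },
    ];
    let (already_cached, to_check) = split_by_cache(scanned, &cached);
    assert_eq!(already_cached[0].hash, "abc123"); // hash recovered from cache
    assert_eq!(to_check[0].path, PathBuf::from("/tmp/b")); // must still be hashed
}
```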
```diff
@@ -508,7 +512,7 @@ impl DuplicateFinder {
         debug!("Starting calculating prehash");
         #[allow(clippy::type_complexity)]
         let pre_hash_results: Vec<(u64, BTreeMap<String, Vec<FileEntry>>, Vec<String>)> = non_cached_files_to_check
-            .par_iter()
+            .into_par_iter()
             .map(|(size, vec_file_entry)| {
                 let mut hashmap_with_hash: BTreeMap<String, Vec<FileEntry>> = Default::default();
                 let mut errors: Vec<String> = Vec::new();
@@ -519,15 +523,16 @@
                     check_was_stopped.store(true, Ordering::Relaxed);
                     return None;
                 }
-                for file_entry in vec_file_entry {
-                    match hash_calculation(&mut buffer, file_entry, &check_type, 0) {
+                for mut file_entry in vec_file_entry {
+                    match hash_calculation(&mut buffer, &file_entry, &check_type, 0) {
                         Ok(hash_string) => {
-                            hashmap_with_hash.entry(hash_string.clone()).or_default().push(file_entry.clone());
+                            file_entry.hash = hash_string.clone();
+                            hashmap_with_hash.entry(hash_string.clone()).or_default().push(file_entry);
                         }
                         Err(s) => errors.push(s),
                     }
                 }
-                Some((*size, hashmap_with_hash, errors))
+                Some((size, hashmap_with_hash, errors))
             })
             .while_some()
             .collect();
```
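The prehash step gets a matching change: `into_par_iter()` consumes the map so each `FileEntry` can be taken by value, and the computed hash is now stored on the entry (`file_entry.hash = hash_string`) instead of being used only as a grouping key, so it is present when results are written back to the cache. A reduced sketch of that shape, assuming rayon (which czkawka already uses), a simplified `FileEntry`, and a placeholder `compute_prehash` standing in for the real `hash_calculation`:

```rust
use rayon::prelude::*;
use std::collections::BTreeMap;
use std::path::PathBuf;

#[derive(Clone, Debug)]
struct FileEntry {
    path: PathBuf,
    hash: String,
}

// Placeholder for the real partial-hash routine, which reads the first
// bytes of each file; here it just derives a string from the path.
fn compute_prehash(entry: &FileEntry) -> String {
    format!("prehash-of-{}", entry.path.display())
}

// Group size-buckets of files by their prehash, storing the hash on
// each entry so it survives into the on-disk cache.
fn prehash_groups(groups: BTreeMap<u64, Vec<FileEntry>>) -> Vec<(u64, BTreeMap<String, Vec<FileEntry>>)> {
    groups
        .into_par_iter() // consumes the map: entries are moved, not borrowed
        .map(|(size, entries)| {
            let mut by_hash: BTreeMap<String, Vec<FileEntry>> = BTreeMap::new();
            for mut entry in entries {
                let hash = compute_prehash(&entry);
                entry.hash = hash.clone(); // keep the hash on the entry itself
                by_hash.entry(hash).or_default().push(entry); // moved, no clone needed
            }
            (size, by_hash)
        })
        .collect()
}
```

A side benefit of owning the entries: the old `push(file_entry.clone())` becomes a plain move.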
```diff
@@ -581,11 +586,14 @@ impl DuplicateFinder {
         debug!("full_hashing_load_cache_at_start - started diff between loaded and prechecked files");
         for (size, mut vec_file_entry) in pre_checked_map {
             if let Some(cached_vec_file_entry) = loaded_hash_map.get(&size) {
-                // TODO maybe hashset is not needed when using < 4 elements
-                let cached_path_entries = cached_vec_file_entry.iter().map(|e| &e.path).collect::<HashSet<_>>();
+                // TODO maybe hashmap is not needed when using < 4 elements
+                let mut cached_path_entries: HashMap<&Path, FileEntry> = HashMap::new();
+                for file_entry in cached_vec_file_entry {
+                    cached_path_entries.insert(&file_entry.path, file_entry.clone());
+                }
                 for file_entry in vec_file_entry {
-                    if cached_path_entries.contains(&file_entry.path) {
-                        records_already_cached.entry(size).or_default().push(file_entry);
+                    if let Some(cached_file_entry) = cached_path_entries.remove(file_entry.path.as_path()) {
+                        records_already_cached.entry(size).or_default().push(cached_file_entry);
                     } else {
                         non_cached_files_to_check.entry(size).or_default().push(file_entry);
                     }
```

czkawka_core/src/similar_images.rs

+1-2
```diff
@@ -783,7 +783,7 @@ impl SimilarImages {
         // Validating if group contains duplicated results
         let mut result_hashset: HashSet<String> = Default::default();
         let mut found = false;
-        // dbg!(collected_similar_images.len());
+
         for vec_file_entry in collected_similar_images.values() {
             if vec_file_entry.is_empty() {
                 println!("Empty group");
@@ -1338,7 +1338,6 @@ mod tests {
 
         similar_images.find_similar_hashes(None, None);
         let res = similar_images.get_similar_images();
-        // dbg!(&res);
         assert!(res.is_empty());
     }
 }
```
