diff --git a/LICENSE.txt b/LICENSE.txt index 977ad40e..cc573189 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2024 Elise Landman +Copyright (c) 2025 Elise Landman Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index e66975cc..15e66a14 100644 --- a/README.md +++ b/README.md @@ -127,7 +127,7 @@ search.stats 'seconds_elapsed': 5.14}, 'parameters': {'similarity_mse': 0, 'rotate': True, - 'lazy': True, + 'same_dim': True, 'processes': 5, 'chunksize': None}, 'files_searched': 3232, @@ -143,12 +143,12 @@ difPy supports the following parameters: ```python difPy.build(*directory, recursive=True, in_folder=False, limit_extensions=True, px_size=50, - show_progress=True, processes=None) + show_progress=True, processes=os.cpu_count()) ``` ```python -difPy.search(difpy_obj, similarity='duplicates', rotate=True, lazy=True, show_progress=True, - processes=None, chunksize=None) +difPy.search(difpy_obj, similarity='duplicates', rotate=True, same_dim=True, show_progress=True, + processes=os.cpu_count(), chunksize=None) ``` :notebook: For a **detailed usage guide**, please view the official **[difPy Usage Documentation](https://difpy.readthedocs.io/)**. @@ -172,14 +172,14 @@ difPy CLI supports the following arguments: dif.py [-h] [-D DIRECTORY [DIRECTORY ...]] [-Z OUTPUT_DIRECTORY] [-r {True,False}] [-i {True,False}] [-le {True,False}] [-px PX_SIZE] [-s SIMILARITY] [-ro {True,False}] - [-la {True,False}] [-proc PROCESSES] [-ch CHUNKSIZE] + [-dim {True,False}] [-proc PROCESSES] [-ch CHUNKSIZE] [-mv MOVE_TO] [-d {True,False}] [-sd {True,False}] [-p {True,False}] ``` | | Parameter | | Parameter | | :---: | ------ | :---: | ------ | -| `-D` | directory | `-la` | lazy | +| `-D` | directory | `-dim` | same_dim | | `-Z` | output_directory | `-proc` | processes | | `-r`| recursive | `-ch` | chunksize | | `-i`| in_folder | `-mv` | move_to | diff --git a/difPy/dif.py b/difPy/dif.py index 6ee59885..2028e8a8 100644 --- a/difPy/dif.py +++ b/difPy/dif.py @@ -25,7 +25,7 @@ class build: ''' A class used to initialize difPy and build its image repository ''' - def __init__(self, *directory, recursive=True, in_folder=False, limit_extensions=True, px_size=50, show_progress=True, processes=None, **kwargs): + def __init__(self, *directory, recursive=True, in_folder=False, limit_extensions=True, px_size=50, show_progress=True, processes=os.cpu_count(), **kwargs): ''' Parameters ---------- @@ -85,7 +85,7 @@ def _main(self): _help._progress_bar(count, total_count, task='preparing files') # generate build statistics - stats = _generate_stats().build(start_time=start_time, end_time=end_time, total_files=len(filename_dictionary), invalid_files=invalid_files, skipped_files=skipped_files, directory=self.__directory, recursive=self.__recursive, in_folder=self.__in_folder, limit_extensions=self.__limit_extensions, px_size=self.__px_size, processes=self.__processes) + stats = _generate_stats.build(total_files=len(filename_dictionary), invalid_files=invalid_files, skipped_files=skipped_files, directory=self.__directory, start_time=start_time, end_time=end_time, recursive=self.__recursive, in_folder=self.__in_folder, limit_extensions=self.__limit_extensions, px_size=self.__px_size, processes=self.__processes) if self.__show_progress: count += 1 @@ -95,7 +95,7 @@ def _main(self): def _get_files(self): # Function that searches for files in the input directories - 
valid_files_all = np.array([], dtype=object) # Initialize as empty numpy array + valid_files_all = np.array([], dtype=object) skipped_files_all = np.array([], dtype=object) if self.__in_folder: @@ -251,7 +251,7 @@ class search: ''' A class used to search for matches in a difPy image repository ''' - def __init__(self, difpy_obj, similarity='duplicates', rotate=True, lazy=True, show_progress=True, processes=None, chunksize=None, **kwargs): + def __init__(self, difpy_obj, similarity='duplicates', rotate=True, same_dim=True, show_progress=True, processes=os.cpu_count(), chunksize=None, **kwargs): ''' Parameters ---------- @@ -261,7 +261,7 @@ def __init__(self, difpy_obj, similarity='duplicates', rotate=True, lazy=True, s Image comparison similarity threshold (mse) (default is 'duplicates', 0) rotate : bool (optional) Rotates images on comparison (default is True) - lazy : bool (optional) + same_dim : bool (optional) Only searches for duplicate/similar images that have the same dimensions (width x height in pixels) (default is True) show_progress : bool (optional) Show the difPy progress bar in console (default is True) @@ -275,7 +275,7 @@ def __init__(self, difpy_obj, similarity='duplicates', rotate=True, lazy=True, s self.__difpy_obj = difpy_obj self.__similarity = _validate_param._similarity(similarity) self.__rotate = _validate_param._rotate(rotate) - self.__lazy = _validate_param._lazy(lazy, self.__similarity) + self.__same_dim = _validate_param._same_dim(same_dim, self.__similarity) self.__show_progress = _validate_param._show_progress(show_progress) self.__processes = _validate_param._processes(processes) self.__chunksize = _validate_param._chunksize(chunksize) @@ -309,7 +309,7 @@ def _main(self): end_time = datetime.now() # generate process stats - stats = _generate_stats().search(build_stats=self.__difpy_obj.stats, start_time=start_time, end_time=end_time, similarity = self.__similarity, rotate=self.__rotate, lazy=self.__lazy, processes=self.__processes, files_searched=len(self.__difpy_obj._tensor_dictionary), duplicate_count=duplicate_count, similar_count=similar_count, chunksize=self.__chunksize) + stats = _generate_stats.search(build_stats=self.__difpy_obj.stats, start_time=start_time, end_time=end_time, similarity = self.__similarity, rotate=self.__rotate, same_dim=self.__same_dim, processes=self.__processes, files_searched=len(self.__difpy_obj._tensor_dictionary), duplicate_count=duplicate_count, similar_count=similar_count, chunksize=self.__chunksize) return result, lower_quality, stats @@ -344,7 +344,6 @@ def _search_union(self): result_raw = result_raw + output self.__count += 1 if self.__show_progress: - print(self.__count, end="\r") _help._progress_bar(self.__count, len(self.__difpy_obj._tensor_dictionary.keys())-1, task=f'searching files') # format the end result @@ -439,7 +438,7 @@ def _find_matches(self, ids): tensor_shape_A = self.__difpy_obj._id_to_shape_dictionary[id_A] tensor_shape_B = self.__difpy_obj._id_to_shape_dictionary[id_B] - if self.__lazy: + if self.__same_dim: # check if two tensors have the same dimensions if _compare_imgs._compare_shape(tensor_shape_A, tensor_shape_B): # check if two tensors are equal @@ -470,7 +469,7 @@ def _find_matches_batch(self, ids): ids_B_list = np.asarray([x[1] for x in ids]) tensor_B_list = np.asarray([self.__difpy_obj._tensor_dictionary[x[1]] for x in ids]) - if self.__lazy: + if self.__same_dim: # compare only those that have the same shape shape_A_list = [sorted(self.__difpy_obj._id_to_shape_dictionary[id_A])]*len(ids) 
shape_B_list = [sorted(self.__difpy_obj._id_to_shape_dictionary[id_B]) for id_B in ids_B_list] @@ -746,60 +745,64 @@ class _generate_stats: ''' A class for generating statistics on the difPy processes ''' - def __init__(self): - # Initialize the stats dict - self.stats = dict() - - def build(self, **kwargs): + def build(**kwargs): # Function that generates stats for the Build process - seconds_elapsed = np.round((kwargs['end_time'] - kwargs['start_time']).total_seconds(), 4) + directory = kwargs['directory'] total_files = kwargs['total_files'] invalid_files = kwargs['invalid_files'] for file in kwargs['skipped_files']: invalid_files.update({str(Path(file)) : 'Unsupported file type'}) - self.stats.update({'directory' : kwargs['directory']}) - - self.stats.update({'process' : {'build': {}}}) - self.stats['process']['build'].update({'duration' : {'start': kwargs['start_time'].isoformat(), - 'end' : kwargs['end_time'].isoformat(), - 'seconds_elapsed' : seconds_elapsed - }}) - self.stats['process']['build'].update({'parameters': {'recursive' : kwargs['recursive'], - 'in_folder' : kwargs['in_folder'], - 'limit_extensions' : kwargs['limit_extensions'], - 'px_size' : kwargs['px_size'], - 'processes' : kwargs['processes'] - }}) - self.stats.update({'total_files' : total_files+len(invalid_files)}) - self.stats.update({'invalid_files': {'count' : len(invalid_files), - 'logs' : invalid_files}}) - - return self.stats - - def search(self, **kwargs): - # Function that generates stats for the Search process - stats = kwargs['build_stats'] - seconds_elapsed = np.round((kwargs['end_time'] - kwargs['start_time']).total_seconds(), 4) - stats['process'].update({'search' : {}}) - stats['process']['search'].update({'duration' : {'start': kwargs['start_time'].isoformat(), - 'end' : kwargs['end_time'].isoformat(), - 'seconds_elapsed' : seconds_elapsed - }}) - stats['process']['search'].update({'parameters' : {'similarity_mse': kwargs['similarity'], - 'rotate' : kwargs['rotate'], - 'lazy' : kwargs['lazy'], - 'processes' : kwargs['processes'], - 'chunksize' : kwargs['chunksize'] - }}) - stats['process']['search'].update({'files_searched' : kwargs['files_searched']}) - if kwargs['similarity'] == 0: - matches_output = {'duplicates': kwargs['duplicate_count']} - else: - matches_output = {'duplicates': kwargs['duplicate_count'], - 'similar' : kwargs['similar_count']} - stats['process']['search'].update({'matches_found' : matches_output}) - return stats + build_stats = { + 'directory' : directory, + 'total_files' : total_files+len(invalid_files), + 'invalid_files' : { + 'count' : len(invalid_files), + 'logs' : invalid_files + }, + 'process': { + 'build' : { + 'duration' : { + 'start' : kwargs['start_time'].isoformat(), + 'end' : kwargs['end_time'].isoformat(), + 'seconds_elapsed' : np.round((kwargs['end_time'] - kwargs['start_time']).total_seconds(), 4), + }, + 'parameters' : { + 'recursive' : kwargs['recursive'], + 'in_folder' : kwargs['in_folder'], + 'limit_extensions' : kwargs['limit_extensions'], + 'px_size' : kwargs['px_size'], + 'processes' : kwargs['processes'], + } + } + } + } + return build_stats + + def search(**kwargs): + # Function that generates stats for the Search process + search_stats = { + 'search' : { + 'duration' : { + 'start' : kwargs['start_time'].isoformat(), + 'end' : kwargs['end_time'].isoformat(), + 'seconds_elapsed' : np.round((kwargs['end_time'] - kwargs['start_time']).total_seconds(), 4), + }, + 'parameters' : { + 'similarity_mse' : kwargs['similarity'], + 'rotate' : kwargs['rotate'], + 
'same_dim' : kwargs['same_dim'], + 'processes' : kwargs['processes'], + 'chunksize' : kwargs['chunksize'] + }, + 'files_searched' : kwargs['files_searched'], + 'matches_found' : { + 'duplicates': kwargs['duplicate_count'], + 'similar' : kwargs['similar_count']} + } + } + kwargs['build_stats']['process'].update(search_stats) + return kwargs['build_stats'] class _validate_param: ''' @@ -884,14 +887,14 @@ def _rotate(rotate): raise Exception('Invalid value for "rotate" parameter: must be of type BOOL.') return rotate - def _lazy(lazy, similarity): - # Function that validates the 'lazy' input parameter - if not isinstance(lazy, bool): - raise Exception('Invalid value for "lazy" parameter: must be of type BOOL.') - if lazy: + def _same_dim(same_dim, similarity): + # Function that validates the 'same_dim' input parameter + if not isinstance(same_dim, bool): + raise Exception('Invalid value for "same_dim" parameter: must be of type BOOL.') + if same_dim: if similarity > 0: - lazy = False - return lazy + same_dim = False + return same_dim def _show_progress(show_progress): # Function that validates the 'show_progress' input parameter @@ -902,10 +905,9 @@ def _show_progress(show_progress): def _processes(processes): # Function that validates the 'processes' input parameter if not isinstance(processes, int): - if not processes == None: - raise Exception('Invalid value for "processes" parameter: must be of type INT.') - else: - processes = os.cpu_count() + raise Exception('Invalid value for "processes" parameter: must be of type INT.') + if processes > os.cpu_count(): + raise Exception('Invalid value for "processes" parameter: must be <= the number of CPU cores (os.cpu_count()).') return processes def _chunksize(chunksize): @@ -941,6 +943,9 @@ def _move_to(dir): def _kwargs(kwargs): if "logs" in kwargs: warnings.warn('Parameter "logs" was deprecated with difPy v4.1. Using it might lead to an exception in future versions. Consider updating your script.', FutureWarning, stacklevel=2) + if "lazy" in kwargs: + warnings.warn('Parameter "lazy" was renamed to "same_dim" with difPy v4.2. Using it might lead to an exception in future versions. 
Consider updating your script.', FutureWarning, stacklevel=2) + class _help: ''' @@ -977,31 +982,24 @@ def _strtobool(value: str) -> bool: parser.add_argument('-px', '--px_size', type=int, help='Compression size of images in pixels.', required=False, default=50) parser.add_argument('-s', '--similarity', type=_help._convert_str_to_int, help='Similarity grade (mse).', required=False, default='duplicates') parser.add_argument('-ro', '--rotate', type=lambda x: bool(_help._strtobool(x)), help='Rotate images during comparison process.', required=False, choices=[True, False], default=True) - parser.add_argument('-la', '--lazy', type=lambda x: bool(_help._strtobool(x)), help='Compares image dimensions before comparison process.', required=False, choices=[True, False], default=True) + parser.add_argument('-dim', '--same_dim', type=lambda x: bool(_help._strtobool(x)), help='Only compare image having the same dimensions (width x height)', required=False, choices=[True, False], default=True) parser.add_argument('-mv', '--move_to', type=str, help='Output directory path of lower quality images among matches.', required=False, default=None) parser.add_argument('-d', '--delete', type=lambda x: bool(_help._strtobool(x)), help='Delete lower quality images among matches.', required=False, choices=[True, False], default=False) parser.add_argument('-sd', '--silent_del', type=lambda x: bool(_help._strtobool(x)), help='Suppress the user confirmation when deleting images.', required=False, choices=[True, False], default=False) parser.add_argument('-p', '--show_progress', type=lambda x: bool(_help._strtobool(x)), help='Show the real-time progress of difPy.', required=False, choices=[True, False], default=True) - parser.add_argument('-proc', '--processes', type=_help._convert_str_to_int, help=' Number of worker processes for multiprocessing.', required=False, default=None) + parser.add_argument('-proc', '--processes', type=_help._convert_str_to_int, help=' Number of worker processes for multiprocessing.', required=False, default=os.cpu_count()) parser.add_argument('-ch', '--chunksize', type=_help._convert_str_to_int, help='Only relevant when dataset > 5k images. 
Sets the batch size at which the job is simultaneously processed when multiprocessing.', required=False, default=None) parser.add_argument('-l', '--logs', type=lambda x: bool(_help._strtobool(x)), help='(Deprecated) Collect statistics during the process.', required=False, choices=[True, False], default=None) + parser.add_argument('-la', '--lazy', type=lambda x: bool(_help._strtobool(x)), help='(Deprecated) Only compare image having the same dimensions (width x height).', required=False, choices=[True, False], default=None) args = parser.parse_args() + # validate input arguments if args.logs != None: _validate_param._kwargs(["logs"]) - # initialize difPy - dif = build(args.directory, recursive=args.recursive, in_folder=args.in_folder, limit_extensions=args.limit_extensions, px_size=args.px_size, show_progress=args.show_progress, processes=args.processes, ) - - # perform search - se = search(dif, similarity=args.similarity, rotate=args.rotate, lazy=args.lazy, processes=args.processes, chunksize=args.chunksize) - - # create filenames for the output files - timestamp = datetime.now().strftime("%Y%m%d%H%M%S") - result_file = f'difPy_{timestamp}_results.json' - lq_file = f'difPy_{timestamp}_lower_quality.txt' - stats_file = f'difPy_{timestamp}_stats.json' + if args.lazy != None: + _validate_param._kwargs(["lazy"]) # check if 'output_directory' parameter exists if args.output_directory != None: @@ -1011,23 +1009,35 @@ def _strtobool(value: str) -> bool: else: dir = os.getcwd() + # check if 'move_to' and 'delete' are both given + if args.move_to != None and args.delete != None: + raise Exception(f'"move_to" and "delete" parameter are mutually exclusive. Please select one of them.') + + # run difPy + dif = build(args.directory, recursive=args.recursive, in_folder=args.in_folder, limit_extensions=args.limit_extensions, px_size=args.px_size, show_progress=args.show_progress, processes=args.processes, ) + se = search(dif, similarity=args.similarity, rotate=args.rotate, same_dim=args.same_dim, processes=args.processes, chunksize=args.chunksize) + + # create filenames for the output files + timestamp = datetime.now().strftime("%Y%m%d%H%M%S") + result_file = f'difPy_{timestamp}_results.json' + lq_file = f'difPy_{timestamp}_lower_quality.txt' + stats_file = f'difPy_{timestamp}_stats.json' + # output 'search.results' to file with open(os.path.join(dir, result_file), 'w') as file: json.dump(se.result, file) - # output 'search.stats' to file with open(os.path.join(dir, stats_file), 'w') as file: json.dump(se.stats, file) + # output 'search.lower_quality' to file + with open(os.path.join(dir, lq_file), 'w') as file: + json.dump(se.lower_quality, file) # check 'move_to' parameter if args.move_to != None: # move lower quality files se.move_to(args.move_to) - # output 'search.lower_quality' to file - with open(os.path.join(dir, lq_file), 'w') as file: - json.dump(se.lower_quality, file) - # check 'delete' parameter if args.delete: # delete search.lower_quality files diff --git a/difPy/version.py b/difPy/version.py index c8fc114e..b3302028 100644 --- a/difPy/version.py +++ b/difPy/version.py @@ -1 +1 @@ -__version__ = '4.1.3' \ No newline at end of file +__version__ = '4.2.0' \ No newline at end of file diff --git a/docs/getting_started/basic_usage.rst b/docs/01_getting_started/basic_usage.rst similarity index 100% rename from docs/getting_started/basic_usage.rst rename to docs/01_getting_started/basic_usage.rst diff --git a/docs/getting_started/cli_usage.rst b/docs/01_getting_started/cli_usage.rst similarity 
index 94% rename from docs/getting_started/cli_usage.rst rename to docs/01_getting_started/cli_usage.rst index 3c4f7b24..0b02483f 100644 --- a/docs/getting_started/cli_usage.rst +++ b/docs/01_getting_started/cli_usage.rst @@ -24,7 +24,7 @@ difPy in the CLI supports the following arguments: dif.py [-h] [-D DIRECTORY [DIRECTORY ...]] [-Z OUTPUT_DIRECTORY] [-r {True,False}] [-i {True,False}] [-le {True,False}] [-px PX_SIZE] [-s SIMILARITY] [-ro {True,False}] - [-la {True,False}] [-proc PROCESSES] [-ch CHUNKSIZE] + [-dim {True,False}] [-proc PROCESSES] [-ch CHUNKSIZE] [-mv MOVE_TO] [-d {True,False}] [-sd {True,False}] [-p {True,False}] @@ -33,7 +33,7 @@ difPy in the CLI supports the following arguments: :widths: 5, 10, 5, 10 :class: tight-table - ``-D``,:ref:`directory`,``-la``,:ref:`lazy` + ``-D``,:ref:`directory`,``-dim``,:ref:`same_dim` ``-Z``,output_directory,``-proc``,:ref:`processes` ``-r``,:ref:`recursive`,``-ch``,:ref:`chunksize` ``-i``,:ref:`in_folder`,``-mv``,move_to (see :ref:`search.move_to`) diff --git a/docs/getting_started/installation.rst b/docs/01_getting_started/installation.rst similarity index 100% rename from docs/getting_started/installation.rst rename to docs/01_getting_started/installation.rst diff --git a/docs/getting_started/output.rst b/docs/01_getting_started/output.rst similarity index 98% rename from docs/getting_started/output.rst rename to docs/01_getting_started/output.rst index ee892975..ed8d147f 100644 --- a/docs/getting_started/output.rst +++ b/docs/01_getting_started/output.rst @@ -92,7 +92,7 @@ A **JSON formatted collection** with statistics on the completed difPy process: 'seconds_elapsed': 5.14}, 'parameters': {'similarity_mse': 0, 'rotate': True, - 'lazy': True, + 'same_dim': True, 'processes': 5, 'chunksize': None}, 'files_searched': 3228, diff --git a/docs/methods/build.rst b/docs/02_methods/build.rst similarity index 96% rename from docs/methods/build.rst rename to docs/02_methods/build.rst index ffc3a52f..0f954241 100644 --- a/docs/methods/build.rst +++ b/docs/02_methods/build.rst @@ -20,11 +20,11 @@ Upon completion, ``difPy.build()`` returns a ``dif`` object that can be used in :ref:`directory`,"``str``, ``list``",, :ref:`recursive`,``bool``,``True``,``False`` - :ref:`in_folder`,"``bool``, ``False``",``True`` + :ref:`in_folder`,``bool``,``False``,``True`` :ref:`limit_extensions`,``bool``,``True``,``False`` - :ref:`px_size`,"``int``, ``float``",50, ``int`` + :ref:`px_size`,``int``,50, "``int`` >= 10 and <= 5000" :ref:`show_progress`,``bool``,``True``,``False`` - :ref:`processes`,``int``,``None`` (``os.cpu_count()``), ``int`` + :ref:`processes`,``int``,``os.cpu_count()``, "``int`` >= 1 and <= ``os.cpu_count()``" .. note:: @@ -131,7 +131,7 @@ processes (int) ++++++++++++ .. warning:: - Recommended not to change default value. Only adjust this value if you know what you are doing. + Recommended not to change default value. Only adjust this value if you know what you are doing. See :ref:`Adjusting processes and chunksize`. difPy leverages `Multiprocessing`_ to speed up the image comparison process, meaning multiple comparison tasks will be performed in parallel. The ``processes`` parameter defines the maximum number of worker processes (i. e. parallel tasks) to perform when multiprocessing. The higher the parameter, the more performance can be achieved, but in turn, the more computing resources will be required. To learn more, please refer to the `Python Multiprocessing documentation`_.
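To illustrate the updated ``processes`` handling documented above, a minimal usage sketch (the directory path and the halved worker count are placeholders, not values from this patch):

```python
import os
import difPy

# Build the image repository. 'processes' now defaults to os.cpu_count() and,
# per the new validation, must be an int no larger than os.cpu_count().
# 'C:/my_images' is a placeholder path.
dif = difPy.build('C:/my_images', recursive=True, limit_extensions=True,
                  px_size=50, show_progress=True,
                  processes=max(1, os.cpu_count() // 2))
```

Passing a non-integer, or a value above ``os.cpu_count()``, is expected to raise an exception under the new ``_processes`` validation.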
diff --git a/docs/methods/search.rst b/docs/02_methods/search.rst similarity index 58% rename from docs/methods/search.rst rename to docs/02_methods/search.rst index d2b34bd6..a3a4b036 100644 --- a/docs/methods/search.rst +++ b/docs/02_methods/search.rst @@ -11,7 +11,7 @@ After the search is completed, further actions can be performed using :ref:`sear .. code-block:: python - difPy.search(difPy_obj, similarity='duplicates', lazy=True, rotate=True, processes=None, chunksize=None, show_progress=False, logs=True) + difPy.search(difPy_obj, similarity='duplicates', same_dim=True, rotate=True, processes=None, chunksize=None, show_progress=False, logs=True) ``difPy.search`` supports the following parameters: @@ -21,12 +21,12 @@ After the search is completed, further actions can be performed using :ref:`sear :class: tight-table :ref:`difPy_obj`,"``difPy_obj``",, - :ref:`similarity`,"``str``, ``int``",``'duplicates'``, "``'similar'``, any ``int`` or ``float``" - :ref:`lazy`,``bool``,``True``,``False`` + :ref:`similarity`,"``str``, ``int``, ``float``",``'duplicates'``, "``'similar'``, ``int`` or ``float`` >= 0" + :ref:`same_dim`,``bool``,``True``,``False`` :ref:`rotate`,``bool``,``True``,``False`` - :ref:`show_progress2`,``bool``,``True``,``False`` - :ref:`processes`,``int``,``None`` (``os.cpu_count()``), any ``int`` - :ref:`chunksize`,``int``,``None``, any ``int`` + :ref:`show_progress`,``bool``,``True``,``False`` + :ref:`processes`,``int``,``os.cpu_count()``, "``int`` >= 1 and <= ``os.cpu_count()``" + :ref:`chunksize`,``int``,``None``, "``int`` >= 1" .. _difPy_obj: @@ -37,7 +37,7 @@ The required ``difPy_obj`` parameter should be pointing to the ``dif`` object th .. _similarity: -similarity (str, int) +similarity (str, int, float) ++++++++++++ difPy compares the images to find duplicates or similarities, based on the MSE (Mean Squared Error) between both image tensors. The target similarity rate i. e. MSE value is set with the ``similarity`` parameter. @@ -46,34 +46,30 @@ difPy compares the images to find duplicates or similarities, based on the MSE ( ``"similar"`` = searches for similar images. MSE threshold is set to ``5``. -The search for similar images can be useful when searching for duplicate files that might have different file **types** (i. e. imageA.png has a duplicate imageA.jpg) and/or different file **sizes** (f. e. imageA.png (100MB) has a duplicate imageA.png (50MB)). In these cases, the MSE between the two image tensors might not be exactly == 0, hence they would not be classified as being duplicates even though in reality they are. Setting ``similarity`` to ``"similar"`` searches for duplicates with a certain tolerance, increasing the likelihood of finding duplicate images of different file types and sizes. Depending on which ``similarity`` level is chosen, the ``lazy`` parameter should be adjusted accordingly (see :ref:`lazy`). +The search for similar images can be useful when searching for duplicate files that: -.. figure:: docs/static/assets/choosing_similarity.png - :width: 540 - :height: 390 - :alt: Setting the "similarity" & "lazy" Parameter - :align: center +* have different file **types** (f. e. imageA.png has a duplicate imageA.jpg) +* have different file **sizes** (f. e. imageA.png (100MB) has a duplicate imageA.png (50MB)) +* are **cropped** versions of one another (f. e. 
imageA.png is a cropped version of imageB.png) (in this case, :ref:`same_dim` should be set to ``False``) - Setting the "similarity" and "lazy" parameter +In these cases, the MSE between the two image tensors might not be exactly == 0, hence they would not be classified as being duplicates even though in reality they are. Setting ``similarity`` to ``"similar"`` searches for duplicates with a certain tolerance, increasing the likelihood of finding duplicate images of different file types and sizes. **Manual setting**: the match MSE threshold can be adjusted manually by setting the ``similarity`` parameter to any ``int`` or ``float``. difPy will then search for images that match an MSE threshold **equal to or lower than** the one specified. -.. _lazy: +.. _same_dim: -lazy (bool) +same_dim (bool) ++++++++++++ -By default, difPy searches using a Lazy algorithm. This algorithm assumes that the image matches we are looking for have **the same dimensions**, i. e.duplicate images have the same width and height. If two images do not have the same dimensions, they are automatically assumed to not be duplicates. Therefore, because these images are skipped, this algorithm can provide a significant **improvement in performance**. +By default, when searching for matches, difPy assumes images to have **the same dimensions** (width x height). -``True`` = (default) applies the Lazy algorithm +``True`` = (default) assumes matches have the same dimensions -``False`` = regular algorithm is used +``False`` = assumes matches can have different dimensions -**When should the Lazy algorithm not be used?** -The Lazy algorithm can speed up the comparison process significantly. Nonetheless, the algorithm might not be suited for your use case and might result in missing some matches. Depending on which ``similarity`` level is chosen, the ``lazy`` parameter should be adjusted accordingly (see :ref:`similarity`). Set ``lazy = False`` if you are searching for duplicate images with: - -* different **file types** (i. e. imageA.png is a duplicate of imageA.jpg) -* and/or different **file sizes** (i. e. imageA.png (100MB) is a duplicate of imageA_compressed.png (50MB)) +.. note:: + ``same_dim`` should be set to ``False`` if you are searching for image matches that have different **file types** (i. e. imageA.png is a duplicate of imageA.jpg) + and/or if images are **cropped** versions of one another. .. _rotate: @@ -102,7 +98,7 @@ chunksize (int) ++++++++++++ .. warning:: - Recommended not to change default value. Only adjust this value if you know what you are doing. + Recommended not to change default value. Only adjust this value if you know what you are doing. See :ref:`Adjusting processes and chunksize`. ``chunksize`` is only used when dealing with image datasets of **more than 5k images**. See the ":ref:`Using difPy with Large Datasets`" section for further details. 
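To make the rename of ``lazy`` to ``same_dim`` concrete, a short sketch of a search for re-encoded or cropped duplicates; the folder path is a placeholder and the parameter behaviour follows the descriptions above:

```python
import difPy

dif = difPy.build('C:/my_images')  # placeholder path

# 'similar' applies an MSE threshold of 5; same_dim=False also compares
# images whose dimensions differ, e.g. cropped versions of one another.
search = difPy.search(dif, similarity='similar', same_dim=False, rotate=True)

print(search.result)         # matches grouped per image
print(search.lower_quality)  # lower-quality files among the matches
```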
diff --git a/docs/methods/search_delete.rst b/docs/02_methods/search_delete.rst similarity index 100% rename from docs/methods/search_delete.rst rename to docs/02_methods/search_delete.rst diff --git a/docs/methods/search_moveto.rst b/docs/02_methods/search_moveto.rst similarity index 100% rename from docs/methods/search_moveto.rst rename to docs/02_methods/search_moveto.rst diff --git a/docs/contributing/contributing.rst b/docs/03_contributing/contributing.rst similarity index 100% rename from docs/contributing/contributing.rst rename to docs/03_contributing/contributing.rst diff --git a/docs/contributing/support.rst b/docs/03_contributing/support.rst similarity index 100% rename from docs/contributing/support.rst rename to docs/03_contributing/support.rst diff --git a/docs/resources/desktop.rst b/docs/04_resources/desktop.rst similarity index 84% rename from docs/resources/desktop.rst rename to docs/04_resources/desktop.rst index 8283dc2f..0e48bd07 100644 --- a/docs/resources/desktop.rst +++ b/docs/04_resources/desktop.rst @@ -24,7 +24,7 @@ Installation Basic Usage ^^^^^^^^^^ -To start a new search, open the difPy for Desktop app and click the "New Search". The search process is divided into two steps: (1) import folders and (2) configure search. +To start a new search, open the difPy for Desktop app and click the "New Search" button on the main menu. The search process is divided into two steps: (1) import folders and (2) configure search. .. _dsk_import: @@ -60,7 +60,8 @@ After importing the folder(s), you can configure what type of similarity search .. note:: **How does similarity search work?** - difPy compares the images pixel by pixel and calculates the Mean Squared Error (MSE) between the images. The MSE is a measure of the similarity between two images. The lower the MSE value, the more similar the images are. When similarity is set to "duplicates", difPy will only return matches with an MSE value of 0. When similarity is set to "similar", difPy will return matches with an MSE value of 5 or lower. Currently the similarity MSE value can not be customized in the desktop app. If you need a different MSE value, please use the `difPy Python package / Command Line Version `_. + + difPy compares the images pixel by pixel and calculates the Mean Squared Error (MSE) between the images. The MSE is a measure of the similarity between two images. The lower the MSE value, the more similar the images are. When similarity is set to "duplicates", difPy will only return matches with an MSE value of 0. When similarity is set to "similar", difPy will return matches with an MSE value of 5 or lower. Currently the similarity MSE value can not be customized in the desktop app. If you need a different MSE value, please use the `difPy Python package `_. Additionally, you can configure the following advanced search settings: @@ -98,20 +99,20 @@ Advanced Settings From the difPy settings on the main menu, you can access advanced search settings. .. warning:: - It is not recommended to change the advanced settings unless you know what you are doing. + It is not recommended to change these settings unless you know what you are doing. See :ref:`Adjusting processes and chunksize`. -**Processes**: defines the maximum number of worker processes (i. e. parallel tasks) to perform when multiprocessing. The more processes, the faster the search, but the more processing power the app will use. See :ref:`processes` for more information. +**Processes**: defines the maximum number of worker processes (i. e. 
parallel tasks) to perform when multiprocessing. The more processes, the faster the search, but the more processing power (CPU) the app will use. See :ref:`processes` for more information. -**Chunksize**: defines the number of image sets that should be compared at once per process. The higher the chunksize, the faster the search, but the more memory the app will use. See :ref:`chunksize` for more information. +**Chunksize**: defines the number of image sets that should be compared at once per process. The higher the chunksize, the faster the search, but the more memory (RAM) the app will use. See :ref:`chunksize` for more information. -The ``process`` and ``chunksize`` are only used when difPy receives more than 5k images to process. With large datasets, it can make sense to adjust these parameters. For example, in order to lower the overall CPU overhead, you could lower ``processes``. In order to decrease memory usage, you could decrease ``chunksize``. The higher both parameters, the more performance you will gain, but the more resources the app will use. +The ``process`` and ``chunksize`` become relevant if difPy received more than 5k images to process. With large datasets, it can make sense to adjust these parameters. For example, in order to lower the overall CPU overhead, you could lower ``processes``. In order to decrease memory usage, you could decrease ``chunksize``. The higher both parameters, the more performance you will gain, but the more resources the app will use. See :ref:`Adjusting processes and chunksize` for more information. .. _dsk_limitations: Limitations ^^^^^^^^^^ -* Using the difPy desktop app for large datasets can lead to slower processing times. For better performance, with large datasets (> 10k images) it is recommended to use the `difPy Python package / Command Line Version `_. instead. +* Using the difPy desktop app for large datasets can lead to slower processing times. For better performance, with large datasets (> 10k images) it is recommended to use the `difPy Python package `_ instead. * The desktop app is currently only available to beta testers on Windows. @@ -122,6 +123,6 @@ Limitations Give Feedback / Report Bug ^^^^^^^^^^ -🐞 Did you encounter an issue with the difPy desktop app? Please report it `here `_. +🐞 Did you encounter an issue with the difPy desktop app? `Report it here `_. -🗨️ Do you have feedback about the difPy desktop app? Share your feedback with us `here `_. \ No newline at end of file +🗨️ Do you have feedback about the difPy desktop app? `Share your feedback here `_. \ No newline at end of file diff --git a/docs/resources/large_datasets.rst b/docs/04_resources/large_datasets.rst similarity index 61% rename from docs/resources/large_datasets.rst rename to docs/04_resources/large_datasets.rst index 2e27598e..ba1bce82 100644 --- a/docs/resources/large_datasets.rst +++ b/docs/04_resources/large_datasets.rst @@ -31,4 +31,24 @@ When difPy receives a **"large" dataset** (> 5k images), a different algorithm i The picture above visualizes how chunks are processed by the chunking algorithm. Each of the image columns represent a chunk. -The ``chunksize`` parameter defines **how many of these chunks will be processed at once** (see :ref:`chunksize`). By default, ``chunksize`` is set to ``None`` which implies: ``1'000'000 / number of images in dataset``. This ratio is used to automatically size the ``chunksize`` according to the size of the dataset, with the goal of keeping memory consumption low. 
This is a good technique for datasets smaller than 1 million images. As soon as the number of images will reach more, then heavier memory consumption increase will become inevitable, as the number of potential image combinations (matches) becomes increasingly large. **It is not recommended to adjust this parameter manually**. +The ``chunksize`` parameter defines **how many of these chunks will be processed at once** (see :ref:`chunksize`). By default, ``chunksize`` is set to ``None`` which implies: ``1'000'000 / number of images in dataset``. This ratio is used to automatically size the ``chunksize`` according to the size of the dataset, with the goal of keeping memory consumption low. This is a good technique for datasets smaller than 1 million images. Once the number of images exceeds this, a heavier increase in memory consumption becomes inevitable, as the number of potential image combinations (matches) becomes increasingly large. **It is not recommended to adjust this parameter manually unless you know what you are doing**. + +.. _Adjusting processes and chunksize: + +Adjusting 'processes' and 'chunksize' +^^^^^^^^^^ + +For most use cases, it is **not required** to adjust the :ref:`processes` and :ref:`chunksize` parameters. Nonetheless, depending on the size of your dataset and the specs of your machine, it can make sense to adjust them. + +difPy will consume as much memory and processing power as it can get in order to process the image dataset as fast as possible. Depending on the specs of your machine, this can lead to a **big spike in CPU and memory usage** for large datasets. If you want to avoid such spikes, it is recommended to make the following adjustments: + +* To lower the overhead on your CPU, reduce the ``processes`` parameter. + +* To lower the overhead on your RAM, reduce the ``chunksize`` parameter. + +Reducing these parameters means longer processing times, but keeps your CPU and RAM usage low. The higher both parameters, the more performance you will gain, but the more resources difPy will use. + +.. note:: + Example: You have a dataset of 10k images. Your machine has 16 cores and 32GB of RAM. + + For this scenario, the default value for ``processes`` is ``16`` and ``chunksize`` is ``1'000'000 / number of images in dataset = 100``. To reduce the overhead on your CPU, you could set ``processes`` to ``14`` (or lower). To reduce the overhead on your RAM, you could set ``chunksize`` to ``80`` (or lower).
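Restating that note as code, a sketch under the same assumptions (10k images, a 16-core machine); the directory path is a placeholder and the values mirror the example rather than a recommendation:

```python
import difPy

# Lower both parameters to cap CPU and RAM usage on a large dataset
# (> 5k images). 14 processes and a chunksize of 80 mirror the example
# above, assuming a 16-core machine; 'D:/photo_archive' is a placeholder path.
dif = difPy.build('D:/photo_archive', processes=14)
search = difPy.search(dif, processes=14, chunksize=80)
```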
\ No newline at end of file diff --git a/docs/resources/report_bug.rst b/docs/04_resources/report_bug.rst similarity index 100% rename from docs/resources/report_bug.rst rename to docs/04_resources/report_bug.rst diff --git a/docs/resources/supported_filetypes.rst b/docs/04_resources/supported_filetypes.rst similarity index 100% rename from docs/resources/supported_filetypes.rst rename to docs/04_resources/supported_filetypes.rst diff --git a/docs/conf.py b/docs/conf.py index ee002819..92b2db3d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -3,11 +3,11 @@ # -- Project information project = 'difPy Guide' -copyright = '2024, Elise Landman' +copyright = '2025, Elise Landman' author = 'Elise Landman' -release = 'v4.1.3' -version = 'v4.1.3' +release = 'v4.2.0' +version = 'v4.2.0' # -- General configuration diff --git a/docs/index.rst b/docs/index.rst index ae531266..c88eeb71 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -3,38 +3,38 @@ :hidden: :caption: Getting started - /getting_started/installation - /getting_started/basic_usage - /getting_started/cli_usage - /getting_started/output + /01_getting_started/installation + /01_getting_started/basic_usage + /01_getting_started/cli_usage + /01_getting_started/output .. toctree:: :maxdepth: 2 :hidden: :caption: Methods and parameters - /methods/build - /methods/search - /methods/search_moveto - /methods/search_delete + /02_methods/build + /02_methods/search + /02_methods/search_moveto + /02_methods/search_delete .. toctree:: :maxdepth: 2 :hidden: :caption: Contributing - /contributing/contributing - /contributing/support + /03_contributing/contributing + /03_contributing/support .. toctree:: :maxdepth: 2 :hidden: :caption: Further Resources - /resources/desktop - /resources/large_datasets - /resources/supported_filetypes - /resources/report_bug + /04_resources/desktop + /04_resources/large_datasets + /04_resources/supported_filetypes + /04_resources/report_bug difPy Guide ===================================
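Finally, on the ``lazy`` deprecation added in ``dif.py``: a hedged sketch of what passing the old keyword is expected to do, assuming ``search()`` still routes unknown keyword arguments through its kwargs validation as it does for ``logs``; the path is a placeholder.

```python
import warnings
import difPy

dif = difPy.build('C:/my_images')  # placeholder path

# Passing the renamed keyword should trigger the new FutureWarning rather
# than an error (assumption: **kwargs are still checked via _kwargs()).
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    search = difPy.search(dif, lazy=True)

print([str(w.message) for w in caught])
```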