V4.2.0 updates #116


Merged
merged 45 commits into from
Jan 7, 2025
Commits
795ced2
Remove unnecessary print
elisemercury Jan 6, 2025
d0e5821
Rename of 'lazy' to 'same_dim'
elisemercury Jan 6, 2025
a8b77bc
Remove code comment
elisemercury Jan 6, 2025
9a8b114
Updates 'processes' parameter
elisemercury Jan 6, 2025
98df5f0
Updates stats backend
elisemercury Jan 6, 2025
3189e02
Fixed CLI validation
elisemercury Jan 6, 2025
b74b9cf
Update same_dim param
elisemercury Jan 6, 2025
22ca48c
Update version.py
elisemercury Jan 6, 2025
7588fbc
Update 2025
elisemercury Jan 6, 2025
4f55057
Update README.md
elisemercury Jan 6, 2025
75bc2cc
Updated docs
elisemercury Jan 6, 2025
a393582
Update search.rst
elisemercury Jan 6, 2025
a771848
Update conf.py
elisemercury Jan 6, 2025
1e80599
Updated 'processes' param
elisemercury Jan 6, 2025
69def63
Update search.rst
elisemercury Jan 6, 2025
1fd9162
Update search.rst
elisemercury Jan 6, 2025
814c589
Update search.rst
elisemercury Jan 6, 2025
1aa88d4
Update search.rst
elisemercury Jan 6, 2025
505e2cd
Update search.rst
elisemercury Jan 6, 2025
61f9cf9
Update search.rst
elisemercury Jan 6, 2025
70ed137
Update search.rst
elisemercury Jan 6, 2025
d384c03
Update docs structure
elisemercury Jan 7, 2025
76eb5ed
Update search.rst
elisemercury Jan 7, 2025
ae4b2bc
Update search.rst
elisemercury Jan 7, 2025
81cb7fd
Update search.rst
elisemercury Jan 7, 2025
5c0c669
Update search.rst
elisemercury Jan 7, 2025
14dbd62
Added chunksize and processes section
elisemercury Jan 7, 2025
91cdf64
Update large_datasets.rst
elisemercury Jan 7, 2025
ecb2666
Update large_datasets.rst
elisemercury Jan 7, 2025
8738195
Update large_datasets.rst
elisemercury Jan 7, 2025
d219a8e
Update large_datasets.rst
elisemercury Jan 7, 2025
84f9f2a
Added details for processes and chunksize
elisemercury Jan 7, 2025
4b682bd
Added note for chunksize and processes
elisemercury Jan 7, 2025
15c2a5b
Fixed typo
elisemercury Jan 7, 2025
8f588b4
Updated note about "lazy" renaming
elisemercury Jan 7, 2025
15c68e9
Update desktop.rst
elisemercury Jan 7, 2025
3861b81
Update desktop.rst
elisemercury Jan 7, 2025
43e7493
Update desktop.rst
elisemercury Jan 7, 2025
f190a1a
Updated parameter options
elisemercury Jan 7, 2025
d045dca
Updates parameter options
elisemercury Jan 7, 2025
1813f60
Updated parameter validation
elisemercury Jan 7, 2025
ed771a0
Updated parameter options
elisemercury Jan 7, 2025
6128579
Update search.rst
elisemercury Jan 7, 2025
ffaa437
Updated parameter options
elisemercury Jan 7, 2025
dba97f4
Update parameter options
elisemercury Jan 7, 2025
2 changes: 1 addition & 1 deletion LICENSE.txt
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2024 Elise Landman
Copyright (c) 2025 Elise Landman

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
12 changes: 6 additions & 6 deletions README.md
@@ -127,7 +127,7 @@ search.stats
'seconds_elapsed': 5.14},
'parameters': {'similarity_mse': 0,
'rotate': True,
'lazy': True,
'same_dim': True,
'processes': 5,
'chunksize': None},
'files_searched': 3232,
@@ -143,12 +143,12 @@ difPy supports the following parameters:

```python
difPy.build(*directory, recursive=True, in_folder=False, limit_extensions=True, px_size=50,
show_progress=True, processes=None)
show_progress=True, processes=os.cpu_count())
```

```python
difPy.search(difpy_obj, similarity='duplicates', rotate=True, lazy=True, show_progress=True,
processes=None, chunksize=None)
difPy.search(difpy_obj, similarity='duplicates', rotate=True, same_dim=True, show_progress=True,
processes=os.cpu_count(), chunksize=None)
```

:notebook: For a **detailed usage guide**, please view the official **[difPy Usage Documentation](https://difpy.readthedocs.io/)**.
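
As a quick end-to-end illustration of the updated parameters (a minimal sketch; the folder path is a placeholder and the printed attributes assume the existing `result` and `stats` outputs):

```python
import difPy

# build the image repository (uses all CPU cores by default)
dif = difPy.build("C:/Path/to/Images/")

# search for exact duplicates, assuming matches share the same dimensions
search = difPy.search(dif, similarity="duplicates", same_dim=True)

print(search.result)  # matches grouped by image
print(search.stats)   # search statistics, including the parameters used
```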
@@ -172,14 +172,14 @@ difPy CLI supports the following arguments:
dif.py [-h] [-D DIRECTORY [DIRECTORY ...]] [-Z OUTPUT_DIRECTORY]
[-r {True,False}] [-i {True,False}] [-le {True,False}]
[-px PX_SIZE] [-s SIMILARITY] [-ro {True,False}]
[-la {True,False}] [-proc PROCESSES] [-ch CHUNKSIZE]
[-dim {True,False}] [-proc PROCESSES] [-ch CHUNKSIZE]
[-mv MOVE_TO] [-d {True,False}] [-sd {True,False}]
[-p {True,False}]
```

| | Parameter | | Parameter |
| :---: | ------ | :---: | ------ |
| `-D` | directory | `-la` | lazy |
| `-D` | directory | `-dim` | same_dim |
| `-Z` | output_directory | `-proc` | processes |
| `-r`| recursive | `-ch` | chunksize |
| `-i`| in_folder | `-mv` | move_to |
184 changes: 97 additions & 87 deletions difPy/dif.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion difPy/version.py
@@ -1 +1 @@
__version__ = '4.1.3'
__version__ = '4.2.0'
@@ -24,7 +24,7 @@ difPy in the CLI supports the following arguments:
dif.py [-h] [-D DIRECTORY [DIRECTORY ...]] [-Z OUTPUT_DIRECTORY]
[-r {True,False}] [-i {True,False}] [-le {True,False}]
[-px PX_SIZE] [-s SIMILARITY] [-ro {True,False}]
[-la {True,False}] [-proc PROCESSES] [-ch CHUNKSIZE]
[-dim {True,False}] [-proc PROCESSES] [-ch CHUNKSIZE]
[-mv MOVE_TO] [-d {True,False}] [-sd {True,False}]
[-p {True,False}]

@@ -33,7 +33,7 @@ difPy in the CLI supports the following arguments:
:widths: 5, 10, 5, 10
:class: tight-table

``-D``,:ref:`directory`,``-la``,:ref:`lazy`
``-D``,:ref:`directory`,``-dim``,:ref:`same_dim`
``-Z``,output_directory,``-proc``,:ref:`processes`
``-r``,:ref:`recursive`,``-ch``,:ref:`chunksize`
``-i``,:ref:`in_folder`,``-mv``,move_to (see :ref:`search.move_to`)
@@ -92,7 +92,7 @@ A **JSON formatted collection** with statistics on the completed difPy process:
'seconds_elapsed': 5.14},
'parameters': {'similarity_mse': 0,
'rotate': True,
'lazy': True,
'same_dim': True,
'processes': 5,
'chunksize': None},
'files_searched': 3228,
8 changes: 4 additions & 4 deletions docs/methods/build.rst → docs/02_methods/build.rst
@@ -20,11 +20,11 @@ Upon completion, ``difPy.build()`` returns a ``dif`` object that can be used in

:ref:`directory`,"``str``, ``list``",,
:ref:`recursive`,``bool``,``True``,``False``
:ref:`in_folder`,"``bool``, ``False``",``True``
:ref:`in_folder`,``bool``,``False``,``True``
:ref:`limit_extensions`,``bool``,``True``,``False``
:ref:`px_size`,"``int``, ``float``",50, ``int``
:ref:`px_size`,``int``,50, "``int`` >= 10 and <= 5000"
:ref:`show_progress`,``bool``,``True``,``False``
:ref:`processes`,``int``,``None`` (``os.cpu_count()``), ``int``
:ref:`processes`,``int``,``os.cpu_count()``, "``int`` >= 1 and <= ``os.cpu_count()``"

.. note::

@@ -131,7 +131,7 @@ processes (int)
++++++++++++

.. warning::
Recommended not to change default value. Only adjust this value if you know what you are doing.
It is recommended not to change the default value. Only adjust this value if you know what you are doing. See :ref:`Adjusting processes and chunksize`.

difPy leverages `Multiprocessing`_ to speed up the image comparison process, meaning multiple comparison tasks will be performed in parallel. The ``processes`` parameter defines the maximum number of worker processes (i. e. parallel tasks) to perform when multiprocessing. The higher the parameter, the more performance can be achieved, but in turn, the more computing resources will be required. To learn more, please refer to the `Python Multiprocessing documentation`_.
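
For instance, the number of worker processes could be capped as follows (a minimal sketch; the folder path and the value ``4`` are illustrative placeholders):

.. code-block:: python

   import difPy

   # limit the build step to 4 parallel worker processes
   dif = difPy.build("C:/Path/to/Folder/", processes=4)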

46 changes: 21 additions & 25 deletions docs/methods/search.rst → docs/02_methods/search.rst
@@ -11,7 +11,7 @@ After the search is completed, further actions can be performed using :ref:`sear

.. code-block:: python

difPy.search(difPy_obj, similarity='duplicates', lazy=True, rotate=True, processes=None, chunksize=None, show_progress=False, logs=True)
difPy.search(difPy_obj, similarity='duplicates', same_dim=True, rotate=True, processes=None, chunksize=None, show_progress=False, logs=True)

``difPy.search`` supports the following parameters:

@@ -21,12 +21,12 @@
:class: tight-table

:ref:`difPy_obj`,"``difPy_obj``",,
:ref:`similarity`,"``str``, ``int``",``'duplicates'``, "``'similar'``, any ``int`` or ``float``"
:ref:`lazy`,``bool``,``True``,``False``
:ref:`similarity`,"``str``, ``int``, ``float``",``'duplicates'``, "``'similar'``, ``int`` or ``float`` >= 0"
:ref:`same_dim`,``bool``,``True``,``False``
:ref:`rotate`,``bool``,``True``,``False``
:ref:`show_progress2`,``bool``,``True``,``False``
:ref:`processes`,``int``,``None`` (``os.cpu_count()``), any ``int``
:ref:`chunksize`,``int``,``None``, any ``int``
:ref:`show_progress`,``bool``,``True``,``False``
:ref:`processes`,``int``,``os.cpu_count()``, "``int`` >= 1 and <= ``os.cpu_count()``"
:ref:`chunksize`,``int``,``None``, "``int`` >= 1"

.. _difPy_obj:

@@ -37,7 +37,7 @@ The required ``difPy_obj`` parameter should be pointing to the ``dif`` object th

.. _similarity:

similarity (str, int)
similarity (str, int, float)
++++++++++++

difPy compares the images to find duplicates or similarities, based on the MSE (Mean Squared Error) between both image tensors. The target similarity rate i. e. MSE value is set with the ``similarity`` parameter.
@@ -46,34 +46,30 @@ difPy compares the images to find duplicates or similarities, based on the MSE (

``"similar"`` = searches for similar images. MSE threshold is set to ``5``.

The search for similar images can be useful when searching for duplicate files that might have different file **types** (i. e. imageA.png has a duplicate imageA.jpg) and/or different file **sizes** (f. e. imageA.png (100MB) has a duplicate imageA.png (50MB)). In these cases, the MSE between the two image tensors might not be exactly == 0, hence they would not be classified as being duplicates even though in reality they are. Setting ``similarity`` to ``"similar"`` searches for duplicates with a certain tolerance, increasing the likelihood of finding duplicate images of different file types and sizes. Depending on which ``similarity`` level is chosen, the ``lazy`` parameter should be adjusted accordingly (see :ref:`lazy`).
The search for similar images can be useful when searching for duplicate files that:

.. figure:: docs/static/assets/choosing_similarity.png
:width: 540
:height: 390
:alt: Setting the "similarity" & "lazy" Parameter
:align: center
* have different file **types** (f. e. imageA.png has a duplicate imageA.jpg)
* have different file **sizes** (f. e. imageA.png (100MB) has a duplicate imageA.png (50MB))
* are **cropped** versions of one another (f. e. imageA.png is a cropped version of imageB.png) (in this case, :ref:`same_dim` should be set to ``False``)

Setting the "similarity" and "lazy" parameter
In these cases, the MSE between the two image tensors might not be exactly == 0, hence they would not be classified as being duplicates even though in reality they are. Setting ``similarity`` to ``"similar"`` searches for duplicates with a certain tolerance, increasing the likelihood of finding duplicate images of different file types and sizes.

**Manual setting**: the match MSE threshold can be adjusted manually by setting the ``similarity`` parameter to any ``int`` or ``float``. difPy will then search for images that match an MSE threshold **equal to or lower than** the one specified.
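
For example, a manually set threshold could look like this (a minimal sketch; the folder path and the value ``20`` are illustrative placeholders to be tuned to your dataset):

.. code-block:: python

   import difPy

   dif = difPy.build("C:/Path/to/Folder/")

   # match images whose MSE is equal to or lower than 20
   search = difPy.search(dif, similarity=20)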

.. _lazy:
.. _same_dim:

lazy (bool)
same_dim (bool)
++++++++++++

By default, difPy searches using a Lazy algorithm. This algorithm assumes that the image matches we are looking for have **the same dimensions**, i. e.duplicate images have the same width and height. If two images do not have the same dimensions, they are automatically assumed to not be duplicates. Therefore, because these images are skipped, this algorithm can provide a significant **improvement in performance**.
By default, when searching for matches, difPy assumes images to have **the same dimensions** (width x height).

``True`` = (default) applies the Lazy algorithm
``True`` = (default) assumes matches have the same dimensions

``False`` = regular algorithm is used
``False`` = assumes matches can have different dimensions

**When should the Lazy algorithm not be used?**
The Lazy algorithm can speed up the comparison process significantly. Nonetheless, the algorithm might not be suited for your use case and might result in missing some matches. Depending on which ``similarity`` level is chosen, the ``lazy`` parameter should be adjusted accordingly (see :ref:`similarity`). Set ``lazy = False`` if you are searching for duplicate images with:

* different **file types** (i. e. imageA.png is a duplicate of imageA.jpg)
* and/or different **file sizes** (i. e. imageA.png (100MB) is a duplicate of imageA_compressed.png (50MB))
.. note::
``same_dim`` should be set to ``False`` if you are searching for image matches that have different **file types** (i. e. imageA.png is a duplicate of imageA.jpg)
and/or if images are **cropped** versions of one another.
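
A minimal sketch of such a search (the folder path is a placeholder); combining ``similarity='similar'`` with ``same_dim=False`` widens the search to cropped or differently sized variants:

.. code-block:: python

   import difPy

   dif = difPy.build("C:/Path/to/Folder/")

   # allow matches with different dimensions, e. g. cropped versions
   search = difPy.search(dif, similarity='similar', same_dim=False)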

.. _rotate:

@@ -102,7 +98,7 @@ chunksize (int)
++++++++++++

.. warning::
Recommended not to change default value. Only adjust this value if you know what you are doing.
It is recommended not to change the default value. Only adjust this value if you know what you are doing. See :ref:`Adjusting processes and chunksize`.

``chunksize`` is only used when dealing with image datasets of **more than 5k images**. See the ":ref:`Using difPy with Large Datasets`" section for further details.

File renamed without changes.
File renamed without changes.
File renamed without changes.
19 changes: 10 additions & 9 deletions docs/resources/desktop.rst → docs/04_resources/desktop.rst
@@ -24,7 +24,7 @@ Installation
Basic Usage
^^^^^^^^^^

To start a new search, open the difPy for Desktop app and click the "New Search". The search process is divided into two steps: (1) import folders and (2) configure search.
To start a new search, open the difPy for Desktop app and click the "New Search" button on the main menu. The search process is divided into two steps: (1) import folders and (2) configure search.

.. _dsk_import:

@@ -60,7 +60,8 @@ After importing the folder(s), you can configure what type of similarity search

.. note::
**How does similarity search work?**
difPy compares the images pixel by pixel and calculates the Mean Squared Error (MSE) between the images. The MSE is a measure of the similarity between two images. The lower the MSE value, the more similar the images are. When similarity is set to "duplicates", difPy will only return matches with an MSE value of 0. When similarity is set to "similar", difPy will return matches with an MSE value of 5 or lower. Currently the similarity MSE value can not be customized in the desktop app. If you need a different MSE value, please use the `difPy Python package / Command Line Version <https://pypi.org/project/difPy/>`_.

difPy compares the images pixel by pixel and calculates the Mean Squared Error (MSE) between the images. The MSE is a measure of the similarity between two images. The lower the MSE value, the more similar the images are. When similarity is set to "duplicates", difPy will only return matches with an MSE value of 0. When similarity is set to "similar", difPy will return matches with an MSE value of 5 or lower. Currently the similarity MSE value can not be customized in the desktop app. If you need a different MSE value, please use the `difPy Python package <https://pypi.org/project/difPy/>`_.

Additionally, you can configure the following advanced search settings:

@@ -98,20 +99,20 @@ Advanced Settings
From the difPy settings on the main menu, you can access advanced search settings.

.. warning::
It is not recommended to change the advanced settings unless you know what you are doing.
It is not recommended to change these settings unless you know what you are doing. See :ref:`Adjusting processes and chunksize`.

**Processes**: defines the maximum number of worker processes (i. e. parallel tasks) to perform when multiprocessing. The more processes, the faster the search, but the more processing power the app will use. See :ref:`processes` for more information.
**Processes**: defines the maximum number of worker processes (i. e. parallel tasks) to perform when multiprocessing. The more processes, the faster the search, but the more processing power (CPU) the app will use. See :ref:`processes` for more information.

**Chunksize**: defines the number of image sets that should be compared at once per process. The higher the chunksize, the faster the search, but the more memory the app will use. See :ref:`chunksize` for more information.
**Chunksize**: defines the number of image sets that should be compared at once per process. The higher the chunksize, the faster the search, but the more memory (RAM) the app will use. See :ref:`chunksize` for more information.

The ``process`` and ``chunksize`` are only used when difPy receives more than 5k images to process. With large datasets, it can make sense to adjust these parameters. For example, in order to lower the overall CPU overhead, you could lower ``processes``. In order to decrease memory usage, you could decrease ``chunksize``. The higher both parameters, the more performance you will gain, but the more resources the app will use.
The ``processes`` and ``chunksize`` parameters only become relevant when difPy receives more than 5k images to process. With large datasets, it can make sense to adjust these parameters. For example, in order to lower the overall CPU overhead, you could lower ``processes``. In order to decrease memory usage, you could decrease ``chunksize``. The higher both parameters, the more performance you will gain, but the more resources the app will use. See :ref:`Adjusting processes and chunksize` for more information.

.. _dsk_limitations:

Limitations
^^^^^^^^^^

* Using the difPy desktop app for large datasets can lead to slower processing times. For better performance, with large datasets (> 10k images) it is recommended to use the `difPy Python package / Command Line Version <https://pypi.org/project/difPy/>`_. instead.
* Using the difPy desktop app for large datasets can lead to slower processing times. For better performance with large datasets (> 10k images), it is recommended to use the `difPy Python package <https://pypi.org/project/difPy/>`_ instead.

* The desktop app is currently only available to beta testers on Windows.

@@ -122,6 +123,6 @@ Limitations
Give Feedback / Report Bug
^^^^^^^^^^

🐞 Did you encounter an issue with the difPy desktop app? Please report it `here <https://go.difpy.app/desktop-bug>`_.
🐞 Did you encounter an issue with the difPy desktop app? `Report it here <https://go.difpy.app/desktop-bug>`_.

🗨️ Do you have feedback about the difPy desktop app? Share your feedback with us `here <https://go.difpy.app/desktop-feedback>`_.
🗨️ Do you have feedback about the difPy desktop app? `Share your feedback here <https://go.difpy.app/desktop-feedback>`_.
@@ -31,4 +31,24 @@ When difPy receives a **"large" dataset** (> 5k images), a different algorithm i

The picture above visualizes how chunks are processed by the chunking algorithm. Each of the image columns represents a chunk.

The ``chunksize`` parameter defines **how many of these chunks will be processed at once** (see :ref:`chunksize`). By default, ``chunksize`` is set to ``None`` which implies: ``1'000'000 / number of images in dataset``. This ratio is used to automatically size the ``chunksize`` according to the size of the dataset, with the goal of keeping memory consumption low. This is a good technique for datasets smaller than 1 million images. As soon as the number of images will reach more, then heavier memory consumption increase will become inevitable, as the number of potential image combinations (matches) becomes increasingly large. **It is not recommended to adjust this parameter manually**.
The ``chunksize`` parameter defines **how many of these chunks will be processed at once** (see :ref:`chunksize`). By default, ``chunksize`` is set to ``None``, which implies ``1'000'000 / number of images in dataset``. This ratio automatically sizes the ``chunksize`` according to the size of the dataset, with the goal of keeping memory consumption low. This works well for datasets smaller than 1 million images. As soon as the number of images grows beyond that, a heavier increase in memory consumption becomes inevitable, as the number of potential image combinations (matches) becomes increasingly large. **It is not recommended to adjust this parameter manually unless you know what you are doing**.

.. _Adjusting processes and chunksize:

Adjusting 'processes' and 'chunksize'
^^^^^^^^^^

For most use cases, it is **not required** to adjust the :ref:`processes` and :ref:`chunksize` parameters. Nonetheless, depending on the size of your dataset and the specs of your machine, it can make sense to adjust them.

difPy will consume as much memory and processing power as it can get in order to process the image dataset as fast as possible. Depending on the specs of your machine, this can lead to a **big spike in CPU and memory usage** for large datasets. If you want to avoid such spikes, it is recommended to make the following adjustments:

* To lower the overhead on your CPU, reduce the ``processes`` parameter.

* To lower the overhead on your RAM, reduce the ``chunksize`` parameter.

Reducing these values will result in longer processing times, but will keep your CPU and RAM usage low. The higher both parameters, the more performance you will gain, but the more resources difPy will use.

.. note::
Example: You have a dataset of 10k images. Your machine has 16 cores and 32GB of RAM.

For this scenario, the default value for ``processes`` is ``16`` and ``chunksize`` is ``1'000'000 / number of images in dataset = 100``. To reduce the overhead on your CPU, you could set ``processes`` to ``14`` (or lower). To reduce the overhead on your RAM, you could set ``chunksize`` to ``80`` (or lower).
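
Translated into code, the adjustments from the example above could look like this (a minimal sketch; the exact values are illustrative and depend on your machine and dataset):

.. code-block:: python

   import difPy

   dif = difPy.build("C:/Path/to/Dataset/", processes=14)

   # lower chunksize to reduce memory pressure per worker process
   search = difPy.search(dif, processes=14, chunksize=80)
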
File renamed without changes.
6 changes: 3 additions & 3 deletions docs/conf.py
@@ -3,11 +3,11 @@
# -- Project information

project = 'difPy Guide'
copyright = '2024, Elise Landman'
copyright = '2025, Elise Landman'
author = 'Elise Landman'

release = 'v4.1.3'
version = 'v4.1.3'
release = 'v4.2.0'
version = 'v4.2.0'

# -- General configuration
