diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 172dcd538..185ef70bd 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -1,14 +1,3 @@ -# For most projects, this workflow file will not need changing; you simply need -# to commit it to your repository. -# -# You may wish to alter this file to override the set of languages analyzed, -# or to provide custom queries or build logic. -# -# ******** NOTE ******** -# We have attempted to detect the languages in your repository. Please check -# the `language` matrix defined below to confirm you have the correct set of -# supported CodeQL languages. -# name: "CodeQL" on: @@ -22,11 +11,11 @@ on: jobs: analyze: name: Analyze - runs-on: ${{ (matrix.language == 'c-cpp' && 'ubuntu-20.04-64core') || 'ubuntu-latest' }} - timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }} + runs-on: ubuntu-20.04-64core + timeout-minutes: 360 permissions: - actions: read - contents: read + actions: write + contents: write security-events: write strategy: @@ -36,7 +25,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: lfs: true submodules: 'recursive' @@ -44,19 +33,27 @@ jobs: - if: matrix.language == 'c-cpp' name: Setup environment run: | + sudo apt update -y && sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y && \ sudo apt update -y && sudo apt install -y --no-install-recommends \ git git-lfs gcc-11 g++-11 ninja-build ccache libgtest-dev libgmock-dev \ - shellcheck curl doxygen python3 python3-pip python3-dev python3-distutils + shellcheck curl doxygen python3 python3-pip python3-dev python3-distutils \ + texlive-latex-extra ghostscript graphviz \ + && curl -L https://cmake.org/files/v3.20/cmake-3.20.1-linux-x86_64.tar.gz --output /tmp/cmake-3.20.1.tar.gz \ + && tar -xzf /tmp/cmake-3.20.1.tar.gz -C /tmp/ && sudo cp -r /tmp/cmake-3.20.1-linux-x86_64/bin/ /usr/local/ \ + && sudo cp -r /tmp/cmake-3.20.1-linux-x86_64/share/ /usr/local/ && sudo cp -r /tmp/cmake-3.20.1-linux-x86_64/doc/ /usr/local/ \ + && rm -rf /tmp/cmake-3.20.1* - if: matrix.language == 'c-cpp' name: Install Python Dependencies run: | - python3 -m pip install sphinx-rtd-theme sphinx==4.5.0 breathe exhale \ - recommonmark graphviz numpy==1.24.1 + sudo apt update -y && sudo apt install -y --no-install-recommends \ + python3 python3-pip python3-dev python3-distutils doxygen && sudo rm -rf /var/lib/apt/lists/* \ + && python3 -m pip install sphinx-rtd-theme sphinx breathe exhale recommonmark graphviz \ + && python3 -m pip install numpy==1.24.1 patchelf==0.17.2.1 - if: matrix.language == 'c-cpp' name: Install CUDA Toolkit - uses: Jimver/cuda-toolkit@v0.2.11 + uses: Jimver/cuda-toolkit@v0.2.15 id: cuda-toolkit with: cuda: '11.7.1' @@ -72,22 +69,44 @@ jobs: nvcc -V - name: Initialize CodeQL - uses: github/codeql-action/init@v2 + uses: github/codeql-action/init@v3 with: languages: ${{ matrix.language }} queries: +security-and-quality - if: matrix.language != 'c-cpp' name: Autobuild - uses: github/codeql-action/autobuild@v2 + uses: github/codeql-action/autobuild@v3 - if: matrix.language == 'c-cpp' name: Build CMake project run: | echo "Running CMake project build script" - ./ci/build.sh debug build "-DBUILD_SAMPLES=OFF -DBUILD_TESTS=OFF" $* + ./ci/build.sh debug build "-DBUILD_SAMPLES=OFF -DBUILD_TESTS=OFF -DBUILD_PYTHON=ON" $* - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 + uses: github/codeql-action/analyze@v3 with: category: "/language:${{matrix.language}}" 
+ + - if: matrix.language == 'c-cpp' && github.event_name == 'push' + name: Build Docs and Clean up Sphinx Build Directory + run: | + ./ci/build.sh debug build "-DBUILD_SAMPLES=OFF -DBUILD_TESTS=OFF -DBUILD_DOCS=ON -DBUILD_PYTHON=ON" $* + find build/docs/sphinx -name '*.doctree' -delete + find build/docs/sphinx -name '*.map' -delete + find build/docs/sphinx -name '*.pickle' -delete + find build/docs/sphinx -name '*.inv' -delete + find build/docs/sphinx -name '*.gz' -delete + + - if: matrix.language == 'c-cpp' && github.event_name == 'push' + name: Create .nojekyll file + run: touch build/docs/sphinx/.nojekyll + + - if: matrix.language == 'c-cpp' && github.event_name == 'push' + name: Deploy to GitHub Pages + uses: JamesIves/github-pages-deploy-action@v4 + with: + folder: build/docs/sphinx + branch: gh-pages + clean: true diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index f5197e2ed..000000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,91 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# See https://pre-commit.com for more information -# See https://pre-commit.com/hooks.html for more hooks - -default_stages: - - merge-commit - - commit - - post-rewrite - -repos: -- repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.3.0 - hooks: - - id: check-merge-conflict - - id: trailing-whitespace - - id: end-of-file-fixer - - id: mixed-line-ending - args: ['--fix=lf'] - - id: check-executables-have-shebangs - - id: check-shebang-scripts-are-executable - - id: detect-private-key - - id: check-added-large-files - - id: check-case-conflict - - id: destroyed-symlinks - - id: check-yaml - exclude: '.clang-format' - -- repo: https://github.com/detailyang/pre-commit-shell - rev: 1.0.5 - hooks: - - id: shell-lint - args: ['-x', '--severity=warning'] - -- repo: https://github.com/pre-commit/mirrors-clang-format - rev: v14.0.6 - hooks: - - id: clang-format - -- repo: https://github.com/pryorda/dockerfilelint-precommit-hooks - rev: v0.1.0 - hooks: - - id: dockerfilelint - -- repo: https://github.com/psf/black - rev: 22.10.0 - hooks: - - id: black - -- repo: https://github.com/pycqa/flake8 - rev: 5.0.4 - hooks: - - id: flake8 - args: - - "--max-line-length=110" - -- repo: local - hooks: - - id: copyright_check - name: 'check copyright message' - language: system - types: ['file', 'text', 'markdown'] - exclude_types: ['xml', 'json', 'csv'] - entry: ./lint/copyright_check.sh - exclude: 'models/.*' - - id: lfs_check - name: 'check LFS objects' - language: system - entry: ./lint/lfs_check.sh - require_serial: true - -- repo: https://github.com/alessandrojcm/commitlint-pre-commit-hook - rev: v9.13.0 - hooks: - - id: commitlint - stages: [commit-msg] - args: ['--config','lint/commitlint.config.js'] - additional_dependencies: ['@commitlint/config-conventional'] diff --git 
a/CMakeLists.txt b/CMakeLists.txt index 0f98aedef..841801962 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,7 +23,7 @@ endif() project(cvcuda LANGUAGES C CXX - VERSION 0.7.0 + VERSION 0.8.0 DESCRIPTION "CUDA-accelerated Computer Vision algorithms" ) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 37852a875..ed687adb8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -16,7 +16,7 @@ # Contributing to CV-CUDA -**As of release v0.7.0-beta, CV-CUDA is not accepting outside contribution.** +**Currently, CV-CUDA is not accepting outside contributions.** Contributions to CV-CUDA fall into the following categories: @@ -28,8 +28,8 @@ Contributions to CV-CUDA fall into the following categories: 1. To propose a new feature, please file a new feature request [issue](https://github.com/CVCUDA/CV-CUDA/issues/new/choose). Describe the intended feature and discuss the design and implementation with the team and - community. NOTE: Currently, as of release v0.7.0-beta, CV-CUDA is not accepting - outside contribution. + community. NOTE: Currently, CV-CUDA is not accepting + outside contributions. 1. To ask a general question, please sumbit a question [issue](https://github.com/CVCUDA/CV-CUDA/issues/new/choose). If you need more context on a particular issue, please ask in a comment. diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md index 83e42f22c..1b63db508 100644 --- a/DEVELOPER_GUIDE.md +++ b/DEVELOPER_GUIDE.md @@ -80,6 +80,7 @@ CV-CUDA includes: | Reformat | Converts a planar image into non-planar and vice versa | | Remap | Maps pixels in an image with one projection to another projection in a new image. | | Resize | Changes the size and scale of an image | +| ResizeCropConvertReformat | Performs fused Resize-Crop-Convert-Reformat sequence with optional channel reordering | | Rotate | Rotates a 2D array in multiples of 90 degrees | | SIFT | Identifies and describes features in images that are invariant to scale rotation and affine distortion. | | Thresholding | Chooses a global threshold value that is the same for all pixels across the image. | diff --git a/README.md b/README.md index 4eaf478d2..5a9af904d 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ [![License](https://img.shields.io/badge/License-Apache_2.0-yellogreen.svg)](https://opensource.org/licenses/Apache-2.0) -![Version](https://img.shields.io/badge/Version-v0.7.0--beta-blue) +![Version](https://img.shields.io/badge/Version-v0.8.0--beta-blue) ![Platform](https://img.shields.io/badge/Platform-linux--64_%7C_win--64_wsl2%7C_aarch64-gray) @@ -76,10 +76,10 @@ Choose the installation method that meets your environment needs. Download the appropriate .whl file for your computer architecture, Python and CUDA version from the release assets of current CV-CUDA release. Release information of all CV-CUDA releases can be found [here][CV-CUDA GitHub Releases]. Once downloaded, execute the `pip install` command to install the Python wheel. For example: ```shell - pip install cvcuda_-0.7.0b0-cp-cp-linux_.whl + pip install cvcuda_--cp-cp-linux_.whl ``` -where `` is the desired CUDA version, `` is the desired Python version and `` is the desired architecture. +where `` is the desired CUDA version, `` is the CV-CUDA release version, `` is the desired Python version and `` is the desired architecture. Please note that the Python wheels are standalone, they include both the C++/CUDA libraries and the Python bindings. 
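
For illustration only, the wheel-name placeholders in the README hunk above resolve to a concrete filename as sketched below; the CUDA, Python, and architecture tags (and the 0.8.0b0 tag) are hypothetical examples, and the real names come from the release assets.

```python
# Hypothetical resolution of the README's wheel filename placeholders.
cu_ver = "cu12"         # <cu_ver>: CUDA build the wheel targets
cvcuda_ver = "0.8.0b0"  # <cvcuda_ver>: assumed tag for the v0.8.0-beta release
py_ver = "cp310"        # <py_ver>: CPython version
arch = "x86_64"         # <arch>: computer architecture
wheel = f"cvcuda_{cu_ver}-{cvcuda_ver}-{py_ver}-{py_ver}-linux_{arch}.whl"
print(wheel)  # cvcuda_cu12-0.8.0b0-cp310-cp310-linux_x86_64.whl
```

This matches the concrete `cvcuda_cu12--cp310-cp310-linux_x86_64.whl` pattern shown later in the README, with the release version filled in.
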
@@ -185,8 +185,8 @@ Install the dependencies required to build the documentation: On Ubuntu, install the following packages using `apt` and `pip`: ```shell -apt install -y doxygen graphviz python3 python3-pip -python3 -m pip install sphinx==4.5.0 breathe exhale recommonmark graphviz sphinx-rtd-theme +apt install -y doxygen graphviz python3 python3-pip sphinx +python3 -m pip install breathe exhale recommonmark graphviz sphinx-rtd-theme ``` Build the documentation: @@ -249,7 +249,7 @@ pip install cvcuda_cu12--cp310-cp310-linux_x86_64.whl CV-CUDA is an open source project. As part of the Open Source Community, we are committed to the cycle of learning, improving, and updating that makes this -community thrive. However, as of release v0.7.0-beta, CV-CUDA is not yet ready +community thrive. However, CV-CUDA is not yet ready for external contributions. To understand the process for contributing the CV-CUDA, see our diff --git a/bench/BenchResizeCropConvertReformat.cpp b/bench/BenchResizeCropConvertReformat.cpp new file mode 100644 index 000000000..f058a10e0 --- /dev/null +++ b/bench/BenchResizeCropConvertReformat.cpp @@ -0,0 +1,124 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include +#include + +#include + +template +inline void ResizeCropConvertReformat(nvbench::state &state, nvbench::type_list) +try +{ + long3 srcShape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + NVCVInterpolationType interpType = benchutils::GetInterpolationType(state.get_string("interpolation")); + + int2 cropPos{1, 1}; + + NVCVSize2D resize; + + if (state.get_string("resizeType") == "EXPAND") + { + resize = NVCVSize2D{(int)(srcShape.y * 2), (int)(srcShape.z * 2)}; + } + else if (state.get_string("resizeType") == "CONTRACT") + { + resize = NVCVSize2D{(int)(srcShape.y / 2), (int)(srcShape.z / 2)}; + } + else + { + throw std::invalid_argument("Invalid resizeType = " + state.get_string("resizeType")); + } + + NVCVChannelManip manip; + + if (state.get_string("manip") == "NO_OP") + { + manip = NVCV_CHANNEL_NO_OP; + } + else if (state.get_string("manip") == "REVERSE") + { + manip = NVCV_CHANNEL_REVERSE; + } + else + { + throw std::invalid_argument("Invalid channel manipulation = " + state.get_string("manip")); + } + + using BT = nvcv::cuda::BaseType; + long nc = nvcv::cuda::NumElements; + + long3 dstShape{srcShape.x, resize.h - cropPos.y, resize.w - cropPos.x}; + + if (dstShape.y <= 0 || dstShape.z <= 0) + { + throw std::invalid_argument("Invalid shape and resizeType"); + } + + state.add_global_memory_reads(srcShape.x * srcShape.y * srcShape.z * sizeof(T)); + state.add_global_memory_writes(dstShape.x * dstShape.y * dstShape.z * sizeof(T)); + + cvcuda::ResizeCropConvertReformat op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{srcShape.x, srcShape.y, srcShape.z, nc}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, nc}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &resize, &interpType, &cropPos, &manip](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, resize, interpType, cropPos, manip); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(srcShape.x); + nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, nc}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillImageBatch(src, long2{srcShape.z, srcShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &resize, &interpType, &cropPos, &manip](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, resize, interpType, cropPos, manip); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using ResizeCropConvertReformatTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(ResizeCropConvertReformat, NVBENCH_TYPE_AXES(ResizeCropConvertReformatTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1, 0}) + .add_string_axis("resizeType", {"EXPAND"}) + .add_string_axis("manip", {"NO_OP"}) + .add_string_axis("interpolation", {"LINEAR"}); diff --git a/bench/CMakeLists.txt b/bench/CMakeLists.txt index 3ca000274..a685f08ab 100644 --- a/bench/CMakeLists.txt +++ b/bench/CMakeLists.txt @@ -51,6 +51,7 @@ set(bench_sources BenchConvertTo.cpp BenchCopyMakeBorder.cpp BenchCropFlipNormalizeReformat.cpp + BenchResizeCropConvertReformat.cpp BenchCustomCrop.cpp BenchErase.cpp 
BenchGammaContrast.cpp diff --git a/bench/python/all_ops/op_as_image.py b/bench/python/all_ops/op_as_image.py new file mode 100644 index 000000000..bae98afa5 --- /dev/null +++ b/bench/python/all_ops/op_as_image.py @@ -0,0 +1,38 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import nvcv + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 +from bench_utils import AbstractOpBase + + +class OpAsImageFromNVCVImage(AbstractOpBase): + def setup(self, input): + # dummy run that does not use cache + img = nvcv.Image((128, 128), nvcv.Format.RGBA8) + + self.imglist = [] + for _ in range(10): + img = nvcv.Image((128, 128), nvcv.Format.RGBA8) + self.imglist.append(img.cuda()) + self.cycle = 0 + + def run(self, input): + nvcv.as_image(self.imglist[self.cycle % len(self.imglist)]) + self.cycle += 1 + return diff --git a/bench/python/all_ops/op_as_images.py b/bench/python/all_ops/op_as_images.py new file mode 100644 index 000000000..d5022cf23 --- /dev/null +++ b/bench/python/all_ops/op_as_images.py @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import nvcv + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. 
+import pycuda.driver as cuda # noqa: F401 +from bench_utils import AbstractOpBase + + +class OpAsImagesFromNVCVImage(AbstractOpBase): + def setup(self, input): + # dummy run that does not use cache + nvcv.ImageBatchVarShape(100) + img = nvcv.Image((128, 128), nvcv.Format.RGBA8) + + self.imglists = [] + for _ in range(10): + imglist = [] + for _ in range(100): + img = nvcv.Image((128, 128), nvcv.Format.RGBA8) + imglist.append(img.cuda()) + self.imglists.append(imglist) + self.cycle = 0 + + def run(self, input): + nvcv.as_images(self.imglists[self.cycle % len(self.imglists)]) + self.cycle += 1 + return diff --git a/bench/python/all_ops/op_resize_crop_convert_reformat.py b/bench/python/all_ops/op_resize_crop_convert_reformat.py new file mode 100644 index 000000000..46001f068 --- /dev/null +++ b/bench/python/all_ops/op_resize_crop_convert_reformat.py @@ -0,0 +1,44 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import nvcv +import cvcuda + + +class OpResizeCropConvertReformat(AbstractOpBase): + def setup(self, input): + resize = 256 + crop = 224 + delta_shape = resize - crop + start = delta_shape // 2 + self.resize_dim = (resize, resize) + self.resize_interpolation = cvcuda.Interp.LINEAR + self.crop_rect = cvcuda.RectI(start, start, crop, crop) + + def run(self, input): + return cvcuda.resize_crop_convert_reformat( + input, + self.resize_dim, + self.resize_interpolation, + self.crop_rect, + layout="NHWC", + data_type=nvcv.Type.U8, + manip=cvcuda.ChannelManip.REVERSE, + ) diff --git a/bench/python/bench_utils.py b/bench/python/bench_utils.py index 91c9511a7..23578438e 100644 --- a/bench/python/bench_utils.py +++ b/bench/python/bench_utils.py @@ -266,19 +266,21 @@ def summarize_runs( results = [] - for op in baseline_perf["mean_all_batches"]["run_bench"]: + for op in baseline_perf["data_mean_all_procs"]["run_bench"]: if op.startswith("Op"): op_name = op[2:] row_dict = {} # Fetch the time and parameters from the JSON for baseline run. - baseline_run_time = baseline_perf["mean_all_batches"]["run_bench"][op][ + baseline_run_time = baseline_perf["data_mean_all_procs"]["run_bench"][op][ "run_op" - ]["cpu_time_minus_warmup_per_item"] + ]["cpu_time_minus_warmup_per_item"]["mean"] op_params = list( - baseline_perf["mean_all_batches"]["run_bench"][op]["op_params"].keys() + baseline_perf["data_mean_all_procs"]["run_bench"][op][ + "op_params" + ].keys() )[0] row_dict["operator name"] = op_name @@ -290,13 +292,15 @@ def summarize_runs( # Check if the OP was present. 
if ( op - in compare_perfs[compare_run_name]["mean_all_batches"][ + in compare_perfs[compare_run_name]["data_mean_all_procs"][ "run_bench" ] ): compare_run_time = compare_perfs[compare_run_name][ - "mean_all_batches" - ]["run_bench"][op]["run_op"]["cpu_time_minus_warmup_per_item"] + "data_mean_all_procs" + ]["run_bench"][op]["run_op"]["cpu_time_minus_warmup_per_item"][ + "mean" + ] else: compare_run_time = None diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 41baeada7..addfb8e02 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -19,7 +19,7 @@ # Usage: build_docs.sh [build folder] build_type="release" -build_dir="build" +build_dir="build-rel" if [[ $# -ge 1 ]]; then build_dir=$1 diff --git a/docker/config b/docker/config index 56df16cb9..563a3a99c 100644 --- a/docker/config +++ b/docker/config @@ -23,7 +23,7 @@ IMAGE_URL_BASE='' # change is done, such as removing some package, or updating # packaged versions that introduces incompatibilities. TAG_IMAGE=6 -TAG_IMAGE_SAMPLES=6.1 +TAG_IMAGE_SAMPLES=7 TAG_IMAGE_TEST=5 VER_CUDA=11.7.1 diff --git a/docs/sphinx/content/cvcuda_oplist.csv b/docs/sphinx/content/cvcuda_oplist.csv index 85e45f080..9e286370f 100644 --- a/docs/sphinx/content/cvcuda_oplist.csv +++ b/docs/sphinx/content/cvcuda_oplist.csv @@ -43,6 +43,7 @@ RandomResizedCrop,Crops a random portion of an image and resizes it to a specifi Reformat,Converts a planar image into non-planar and vice versa Remap,Maps pixels in an image with one projection to another projection in a new image. Resize,Changes the size and scale of an image +ResizeCropConvertReformat,Performs fused Resize-Crop-Convert-Reformat sequence with optional channel reordering. Rotate,Rotates a 2D array in multiples of 90 degrees SIFT,Identifies and matches features in images that are invariant to scale rotation and affine distortion. Thresholding,Chooses a global threshold value that is the same for all pixels across the image. diff --git a/docs/sphinx/index.rst b/docs/sphinx/index.rst index 254a0bf63..ceeb3265e 100644 --- a/docs/sphinx/index.rst +++ b/docs/sphinx/index.rst @@ -124,12 +124,13 @@ Copyright :maxdepth: 1 :hidden: - Beta.5 - Beta.4 - Beta.3 - Beta.2 - Beta.1 - Beta - Alpha.1 - Alpha - Pre-Alpha + v0.8.0-beta + v0.7.0-beta + v0.6.0-beta + v0.5.0-beta + v0.4.0-beta + v0.3.1-beta + v0.3.0-beta + v0.2.1-alpha + v0.2.0-alpha + v0.1.0-prealpha diff --git a/docs/sphinx/installation.rst b/docs/sphinx/installation.rst index 6c05a33d8..f1969c8ed 100644 --- a/docs/sphinx/installation.rst +++ b/docs/sphinx/installation.rst @@ -74,13 +74,13 @@ You can download the CV-CUDA tar, deb or wheel packages from `the asset section * Python Wheel File Installation - Download the appropriate .whl file for your computer architecture, Python and CUDA version from `here `_ + Download the appropriate .whl file for your computer architecture, Python and CUDA version from `the asset section of the latest release `_ Execute the following command to install appropriate CV-CUDA Python wheel :: - pip install cvcuda_-0.7.0b0-cp-cp-linux_.whl + pip install cvcuda_--cp-cp-linux_.whl - where is the desired CUDA version, the desired Python version and the desired architecture. + where is the desired CUDA version, the CV-CUDA release version, the desired Python version and the desired architecture. Please note that the Python wheels provided are standalone, they include both the C++/CUDA libraries and the Python bindings. 
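
As a sanity check on the renamed benchmark JSON keys in the `bench_utils.py` hunk above (`data_mean_all_procs` replacing `mean_all_batches`, with the per-item time now nested under `"mean"`), here is a minimal read-only sketch; the results filename and any structure beyond these keys are assumptions.

```python
import json

# Hypothetical results file written by the Python benchmark runner.
with open("baseline_perf.json") as f:
    perf = json.load(f)

for op, data in perf["data_mean_all_procs"]["run_bench"].items():
    if op.startswith("Op"):
        # Per-item CPU time (minus warmup) now lives one level deeper, under "mean".
        run_time = data["run_op"]["cpu_time_minus_warmup_per_item"]["mean"]
        print(op[2:], run_time)
```
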
diff --git a/docs/sphinx/relnotes/v0.1.0-prealpha.rst b/docs/sphinx/relnotes/v0.1.0-prealpha.rst index 474486572..ddb876c0e 100644 --- a/docs/sphinx/relnotes/v0.1.0-prealpha.rst +++ b/docs/sphinx/relnotes/v0.1.0-prealpha.rst @@ -16,7 +16,7 @@ .. _v0.1.0-prealpha: -PreAlpha +v0.1.0-preAlpha ======== CV-CUDA-0.1.0 is the first release of CV-CUDA. This release is for evaluation purposes only. diff --git a/docs/sphinx/relnotes/v0.2.0-alpha.rst b/docs/sphinx/relnotes/v0.2.0-alpha.rst index 0e8687216..b6ef95ade 100644 --- a/docs/sphinx/relnotes/v0.2.0-alpha.rst +++ b/docs/sphinx/relnotes/v0.2.0-alpha.rst @@ -16,7 +16,7 @@ .. _v0.2.0-alpha: -Alpha +v0.2.0-alpha ======== CV-CUDA 0.2.0 is the first open-source release of the project. diff --git a/docs/sphinx/relnotes/v0.2.1-alpha.rst b/docs/sphinx/relnotes/v0.2.1-alpha.rst index b77ee6d65..65d5b8861 100644 --- a/docs/sphinx/relnotes/v0.2.1-alpha.rst +++ b/docs/sphinx/relnotes/v0.2.1-alpha.rst @@ -16,7 +16,7 @@ .. _v0.2.1-alpha: -Alpha.1 +v0.2.1-alpha ======= General diff --git a/docs/sphinx/relnotes/v0.3.0-beta.rst b/docs/sphinx/relnotes/v0.3.0-beta.rst index f63c6d584..6473da539 100644 --- a/docs/sphinx/relnotes/v0.3.0-beta.rst +++ b/docs/sphinx/relnotes/v0.3.0-beta.rst @@ -16,7 +16,7 @@ .. _v0.3.0-beta: -Beta +v0.3.0-beta ==== CV-CUDA 0.3.0 is the next open-source release of the project. diff --git a/docs/sphinx/relnotes/v0.3.1-beta.rst b/docs/sphinx/relnotes/v0.3.1-beta.rst index 970359c98..c04a3d1f7 100644 --- a/docs/sphinx/relnotes/v0.3.1-beta.rst +++ b/docs/sphinx/relnotes/v0.3.1-beta.rst @@ -16,7 +16,7 @@ .. _v0.3.1-beta: -Beta.1 +0.3.1-beta ====== The v0.3.1 release provides several bug fixes along with documentation updates and performance improvements. diff --git a/docs/sphinx/relnotes/v0.4.0-beta.rst b/docs/sphinx/relnotes/v0.4.0-beta.rst index f6d99cb39..0f38b3138 100644 --- a/docs/sphinx/relnotes/v0.4.0-beta.rst +++ b/docs/sphinx/relnotes/v0.4.0-beta.rst @@ -16,7 +16,7 @@ .. _v0.4.0-beta: -Beta.2 +v0.4.0-beta ====== CV-CUDA 0.4.0 is a major release of the library providing multiple new operators, Jetson Orin support, and updated API documentation. diff --git a/docs/sphinx/relnotes/v0.5.0-beta.rst b/docs/sphinx/relnotes/v0.5.0-beta.rst index bd3633197..a15c1b98b 100644 --- a/docs/sphinx/relnotes/v0.5.0-beta.rst +++ b/docs/sphinx/relnotes/v0.5.0-beta.rst @@ -16,7 +16,7 @@ .. _v0.5.0-beta: -Beta.3 +v0.5.0-beta ====== CV-CUDA 0.5.0 is a comprehensive update introducing new security, compliance, and performance enhancements, alongside bug fixes and new features. diff --git a/docs/sphinx/relnotes/v0.6.0-beta.rst b/docs/sphinx/relnotes/v0.6.0-beta.rst index ca0995a67..c199fb2bd 100644 --- a/docs/sphinx/relnotes/v0.6.0-beta.rst +++ b/docs/sphinx/relnotes/v0.6.0-beta.rst @@ -16,7 +16,7 @@ .. _v0.6.0-beta: -Beta.4 +v0.6.0-beta ====== CV-CUDA 0.6.0 is a comprehensive update introducing new packaging and documentation enhancements, along with bug fixes and new features. diff --git a/docs/sphinx/relnotes/v0.7.0-beta.rst b/docs/sphinx/relnotes/v0.7.0-beta.rst index 5ad3ae437..196d236b4 100644 --- a/docs/sphinx/relnotes/v0.7.0-beta.rst +++ b/docs/sphinx/relnotes/v0.7.0-beta.rst @@ -16,10 +16,10 @@ .. _v0.7.0-beta: -Beta.5 +v0.7.0-beta ====== -CV-CUDA 0.7.0 introduces performance and support enhancements, along with bug fixes and new features. +CV-CUDA v0.7.0 introduces performance and support enhancements, along with bug fixes and new features. 
Release Highlights ------------------ diff --git a/docs/sphinx/relnotes/v0.8.0-beta.rst b/docs/sphinx/relnotes/v0.8.0-beta.rst new file mode 100644 index 000000000..59e97fab6 --- /dev/null +++ b/docs/sphinx/relnotes/v0.8.0-beta.rst @@ -0,0 +1,66 @@ +.. + # SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +.. _v0.8.0-beta: + +v0.8.0-beta +====== + +Release Highlights +------------------ + +CV-CUDA v0.8.0 includes the following changes: + +* **New Operator**: + + * Introduced fused 'ResizeCropConvertReformat' operator + +* **New Features**: + + * Improved initialization of Image and ImageBatchVarShape: enabled efficient cache reuse + * Improved benchmarking utilities: added throughput computation and power/clock monitoring + * Added tests to Resize, BilateralFilter, CvtColor, Erase, JointBilateralFilter, PillowResize + +* **Bug Fixes**: + + * Fixed potential crash when using custom streams + * Switched PairwiseMatcher to use L2-norm as default + * Fixed documentation of CropFlipNormalizeReformat + + + +Compatibility and Known Limitations +----------------------------------- + +See main README on `CV-CUDA GitHub `_. + +License +------- + +CV-CUDA is licensed under the `Apache 2.0 `_ license. + +Resources +--------- + +1. `CV-CUDA GitHub `_ +2. `CV-CUDA Increasing Throughput and Reducing Costs for AI-Based Computer Vision with CV-CUDA `_ +3. `NVIDIA Announces Microsoft, Tencent, Baidu Adopting CV-CUDA for Computer Vision AI `_ +4. `CV-CUDA helps Tencent Cloud audio and video PaaS platform achieve full-process GPU acceleration for video enhancement AI `_ + +Acknowledgements +---------------- + +CV-CUDA is developed jointly by NVIDIA and the ByteDance Machine Learning team. diff --git a/python/mod_cvcuda/CMakeLists.txt b/python/mod_cvcuda/CMakeLists.txt index 66b53d87f..b533a105d 100644 --- a/python/mod_cvcuda/CMakeLists.txt +++ b/python/mod_cvcuda/CMakeLists.txt @@ -21,6 +21,8 @@ nvcv_python_add_module( OUTPUT_NAME cvcuda SOURCES Main.cpp + ChannelManipType.cpp + OpResizeCropConvertReformat.cpp OpPairwiseMatcher.cpp PairwiseMatcherType.cpp NormType.cpp diff --git a/python/mod_cvcuda/ChannelManipType.cpp b/python/mod_cvcuda/ChannelManipType.cpp new file mode 100644 index 000000000..e32085100 --- /dev/null +++ b/python/mod_cvcuda/ChannelManipType.cpp @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "ChannelManipType.hpp" + +#include + +namespace cvcudapy { + +void ExportChannelManipType(py::module &m) +{ + py::enum_(m, "ChannelManip") + .value("NO_OP", NVCV_CHANNEL_NO_OP, "No manipulation (i.e., channels are unchanged)") + .value("REVERSE", NVCV_CHANNEL_REVERSE, "Reverse channel order (e.g., RGB to BGR, RGBA to ABGR, etc.)"); +} + +} // namespace cvcudapy diff --git a/python/mod_cvcuda/ChannelManipType.hpp b/python/mod_cvcuda/ChannelManipType.hpp new file mode 100644 index 000000000..ee0a30ff1 --- /dev/null +++ b/python/mod_cvcuda/ChannelManipType.hpp @@ -0,0 +1,29 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_PYTHON_CHANNELMANIPTYPE_HPP +#define NVCV_PYTHON_CHANNELMANIPTYPE_HPP + +#include + +namespace cvcudapy { +namespace py = ::pybind11; + +void ExportChannelManipType(py::module &m); +} // namespace cvcudapy + +#endif // NVCV_PYTHON_CHANNELMANIPTYPE_HPP diff --git a/python/mod_cvcuda/Main.cpp b/python/mod_cvcuda/Main.cpp index aff67174b..f5c26574f 100644 --- a/python/mod_cvcuda/Main.cpp +++ b/python/mod_cvcuda/Main.cpp @@ -17,6 +17,7 @@ #include "AdaptiveThresholdType.hpp" #include "BorderType.hpp" +#include "ChannelManipType.hpp" #include "ColorConversionCode.hpp" #include "ConnectivityType.hpp" #include "InterpolationType.hpp" @@ -77,6 +78,7 @@ PYBIND11_MODULE(cvcuda, m) // Operators' auxiliary entities ExportInterpolationType(m); + ExportChannelManipType(m); ExportBorderType(m); ExportMorphologyType(m); ExportColorConversionCode(m); @@ -92,6 +94,7 @@ PYBIND11_MODULE(cvcuda, m) ExportPairwiseMatcherType(m); // CV-CUDA Operators + ExportOpResizeCropConvertReformat(m); ExportOpPairwiseMatcher(m); ExportOpLabel(m); ExportOpOSD(m); diff --git a/python/mod_cvcuda/OpCropFlipNormalizeReformat.cpp b/python/mod_cvcuda/OpCropFlipNormalizeReformat.cpp index 1eaacfce5..f5f8af139 100644 --- a/python/mod_cvcuda/OpCropFlipNormalizeReformat.cpp +++ b/python/mod_cvcuda/OpCropFlipNormalizeReformat.cpp @@ -102,9 +102,9 @@ void ExportOpCropFlipNormalizeReformat(py::module &m) rect (Tensor): The crop rectangle tensor which has shape of [batch_size, 1, 1, 4] in reference to the input tensor. The crop value of [crop_x, crop_y, crop_width, crop_height] stored in the final dimension of the crop tensor, provided per image. - flip_code (Tensor): A tensor flag to specify how to flip the array; 0 means flipping - around the x-axis and positive value (for example, 1) means flipping - around y-axis. 
Negative value (for example, -1) means flipping around both axes, provided per image. + flip_code (Tensor): A tensor flag to specify how to flip the array; 0 means flipping around the x-axis, + 1 means flipping around the y-axis, -1 means flipping around both axes, + and any other value will result in no flip, provided per image. base (Tensor): Tensor providing base values for normalization. scale (Tensor): Tensor providing scale values for normalization. globalscale (float ,optional): Additional scale value to be used in addition to scale @@ -142,9 +142,9 @@ void ExportOpCropFlipNormalizeReformat(py::module &m) rect (Tensor): The crop rectangle tensor which has shape of [batch_size, 1, 1, 4] in reference to the input tensor. The crop value of [crop_x, crop_y, crop_width, crop_height] stored in the final dimension of the crop tensor, provided per image. - flip_code (Tensor): A tensor flag to specify how to flip the array; 0 means flipping - around the x-axis and positive value (for example, 1) means flipping - around y-axis. Negative value (for example, -1) means flipping around both axes, provided per image. + flip_code (Tensor): A tensor flag to specify how to flip the array; 0 means flipping around the x-axis, + 1 means flipping around the y-axis, -1 means flipping around both axes, + and any other value will result in no flip, provided per image. base (Tensor): Tensor providing base values for normalization. scale (Tensor): Tensor providing scale values for normalization. globalscale (float ,optional): Additional scale value to be used in addition to scale diff --git a/python/mod_cvcuda/OpPairwiseMatcher.cpp b/python/mod_cvcuda/OpPairwiseMatcher.cpp index 195c1f19c..687570a03 100644 --- a/python/mod_cvcuda/OpPairwiseMatcher.cpp +++ b/python/mod_cvcuda/OpPairwiseMatcher.cpp @@ -44,7 +44,7 @@ TupleTensor3 PairwiseMatcherInto(Tensor &matches, std::optional numMatch if (!normType) { - normType = set1.dtype() == nvcv::TYPE_F32 ? NVCV_NORM_L2 : NVCV_NORM_HAMMING; + normType = NVCV_NORM_L2; } auto op = CreateOperator(algoChoice); @@ -149,8 +149,7 @@ void ExportOpPairwiseMatcher(py::module &m) cross_check (bool, optional): Use True to cross check best matches, a best match is only returned if it is the best match (minimum distance) from 1st set to 2nd set and vice versa. matches_per_point (Number, optional): Number of best matches to return per point. - norm_type (cvcuda.Norm, optional): Choice on how distances are normalized. Defaults to cvcuda.Norm.L2 - for float input and cvcuda.Norm.HAMMING for other input data types. + norm_type (cvcuda.Norm, optional): Choice on how distances are normalized. Defaults to cvcuda.Norm.L2. algo_choice (cvcuda.Matcher, optional): Choice of the algorithm to perform the match. stream (Stream, optional): CUDA Stream on which to perform the operation. @@ -186,8 +185,7 @@ void ExportOpPairwiseMatcher(py::module &m) cross_check (bool, optional): Use True to cross check best matches, a best match is only returned if it is the best match (minimum distance) from 1st set to 2nd set and vice versa. matches_per_point (Number, optional): Number of best matches to return per point. - norm_type (cvcuda.Norm, optional): Choice on how distances are normalized. Defaults to cvcuda.Norm.L2 - for float input and cvcuda.Norm.HAMMING for other input data types. + norm_type (cvcuda.Norm, optional): Choice on how distances are normalized. Defaults to cvcuda.Norm.L2. algo_choice (cvcuda.Matcher, optional): Choice of the algorithm to perform the match. 
stream (Stream, optional): CUDA Stream on which to perform the operation. diff --git a/python/mod_cvcuda/OpResizeCropConvertReformat.cpp b/python/mod_cvcuda/OpResizeCropConvertReformat.cpp new file mode 100644 index 000000000..a99c5072f --- /dev/null +++ b/python/mod_cvcuda/OpResizeCropConvertReformat.cpp @@ -0,0 +1,374 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Operators.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cvcudapy { + +namespace { +Tensor ResizeCropConvertReformatInto(Tensor &dst, Tensor &src, const std::tuple resizeDim, + NVCVInterpolationType interp, const std::tuple cropPos, + const NVCVChannelManip manip, std::optional pstream) +{ + if (!pstream) + { + pstream = Stream::Current(); + } + + auto resize = CreateOperator(); + + ResourceGuard guard(*pstream); + guard.add(LockMode::LOCK_MODE_READ, {src}); + guard.add(LockMode::LOCK_MODE_WRITE, {dst}); + guard.add(LockMode::LOCK_MODE_NONE, {*resize}); + + nvcv::Size2D size_wh{std::get<0>(resizeDim), std::get<1>(resizeDim)}; + int2 crop_xy{std::get<0>(cropPos), std::get<1>(cropPos)}; + + resize->submit(pstream->cudaHandle(), src, dst, size_wh, interp, crop_xy, manip); + + return std::move(dst); +} + +Tensor ResizeCropConvertReformat(Tensor &src, const std::tuple resizeDim, NVCVInterpolationType interp, + const NVCVRectI cropRect, const char *layout, nvcv::DataType dataType, + const NVCVChannelManip manip, std::optional pstream) +{ + nvcv::TensorLayout srcLayout = src.layout(); + + if (srcLayout != NVCV_TENSOR_HWC && srcLayout != NVCV_TENSOR_NHWC) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_IMAGE_FORMAT, + "Input tensor must have layout 'HWC' or 'NHWC'."); + } + + nvcv::TensorLayout dstLayout = (layout && *layout ? 
nvcv::TensorLayout(layout) : nvcv::TensorLayout("")); + + if (dstLayout.rank() == 0) + { + dstLayout = srcLayout; + } + + if (dstLayout.rank() != srcLayout.rank()) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_IMAGE_FORMAT, + "Output tensor rank must match input tensor rank."); + } + if (dstLayout != NVCV_TENSOR_HWC && dstLayout != NVCV_TENSOR_NHWC && dstLayout != NVCV_TENSOR_CHW + && dstLayout != NVCV_TENSOR_NCHW) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_IMAGE_FORMAT, + "Output tensor must have layout 'HWC', 'NHWC', 'CHW', or 'NCHW'."); + } + + nvcv::TensorShape srcShape = Permute(src.shape(), NVCV_TENSOR_NHWC); + + nvcv::TensorShape::ShapeType shape = srcShape.shape(); + + shape[2] = cropRect.width; + shape[1] = cropRect.height; + + nvcv::TensorShape dstShape = Permute(nvcv::TensorShape(shape, NVCV_TENSOR_NHWC), dstLayout); + + Tensor dst = Tensor::Create(dstShape, dataType); + + const std::tuple cropPos = std::make_tuple((int)cropRect.x, (int)cropRect.y); + + return ResizeCropConvertReformatInto(dst, src, resizeDim, interp, cropPos, manip, pstream); +} + +Tensor ResizeCropConvertReformatVarShapeInto(Tensor &dst, ImageBatchVarShape &src, const std::tuple resizeDim, + NVCVInterpolationType interp, const std::tuple cropPos, + const NVCVChannelManip manip, std::optional pstream) +{ + if (!pstream) + { + pstream = Stream::Current(); + } + + auto resize = CreateOperator(); + + ResourceGuard guard(*pstream); + guard.add(LockMode::LOCK_MODE_READ, {src}); + guard.add(LockMode::LOCK_MODE_WRITE, {dst}); + guard.add(LockMode::LOCK_MODE_NONE, {*resize}); + + nvcv::Size2D size_wh(std::get<0>(resizeDim), std::get<1>(resizeDim)); + int2 crop_xy{std::get<0>(cropPos), std::get<1>(cropPos)}; + + resize->submit(pstream->cudaHandle(), src, dst, size_wh, interp, crop_xy, manip); + + return std::move(dst); +} + +Tensor ResizeCropConvertReformatVarShape(ImageBatchVarShape &src, const std::tuple resizeDim, + NVCVInterpolationType interp, const NVCVRectI cropRect, const char *layout, + nvcv::DataType dataType, const NVCVChannelManip manip, + std::optional pstream) +{ + const nvcv::ImageFormat srcFrmt = src.uniqueFormat(); + if (!srcFrmt) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input must have same format across all images."); + } + + nvcv::TensorLayout dstLayout = (layout && *layout ? 
nvcv::TensorLayout(layout) : nvcv::TensorLayout("")); + + int channels = srcFrmt.numChannels(); + int images = src.numImages(); + + if (channels != 3) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input must have 3 channels."); + } + + if (srcFrmt != nvcv::FMT_RGB8 && srcFrmt != nvcv::FMT_BGR8) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input must have three interleaved, 8-bit channels in RGB or BGR format."); + } + + nvcv::TensorShape shape; + + if (dstLayout.rank() == 0) + { + if (srcFrmt == nvcv::FMT_RGB8 || srcFrmt == nvcv::FMT_BGR8) + { + shape = nvcv::TensorShape{ + {images, cropRect.height, cropRect.width, channels}, + NVCV_TENSOR_NHWC + }; + } + } + else + { + if (dstLayout == NVCV_TENSOR_NHWC || dstLayout == NVCV_TENSOR_HWC || dstLayout == NVCV_TENSOR_NHW + || dstLayout == NVCV_TENSOR_HW) + shape = nvcv::TensorShape{ + {images, cropRect.height, cropRect.width, channels}, + NVCV_TENSOR_NHWC + }; + else if (dstLayout == NVCV_TENSOR_NCHW || dstLayout == NVCV_TENSOR_CHW) + shape = nvcv::TensorShape{ + {images, channels, cropRect.height, cropRect.width}, + NVCV_TENSOR_NCHW + }; + else + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Destination layout must be 'HWC', 'NHWC', 'NHW', 'HW', 'NCHW', or 'CHW'."); + } + } + + Tensor dst = Tensor::Create(shape, dataType); + + const std::tuple cropPos = std::make_tuple((int)cropRect.x, (int)cropRect.y); + + return ResizeCropConvertReformatVarShapeInto(dst, src, resizeDim, interp, cropPos, manip, pstream); +} + +} // namespace + +void ExportOpResizeCropConvertReformat(py::module &m) +{ + using namespace pybind11::literals; + + py::options options; + options.disable_function_signatures(); + + m.def("resize_crop_convert_reformat", &ResizeCropConvertReformat, "src"_a, "resize_dim"_a, "interp"_a, + "crop_rect"_a, py::kw_only(), "layout"_a = "", "data_type"_a = NVCV_DATA_TYPE_NONE, + "manip"_a = NVCV_CHANNEL_NO_OP, "stream"_a = nullptr, R"pbdoc( + + cvcuda.resize_crop_convert_reformat(src: nvcv.Tensor, + resize_dim: tuple[int,int], + interp: cvcuda.Interp, + crop_rect: nvcv.RectI, + *, + layout: str = "", + data_type: nvcv.Type = 0, + manip: cvcuda.ChannelManip = cvcuda.ChannelManip.NO_OP, + stream: Optional[nvcv.cuda.Stream] = None) -> nvcv.Tensor + + Executes the ResizeCropConvertReformat operation on the given cuda stream. + + See also: + Refer to the CV-CUDA C API reference for the ResizeCropConvertReformat operator + for more details and usage examples. + + Args: + src (nvcv.Tensor): Input tensor containing one or more images. + resize_dim (tuple[int,int]): Dimensions, width & height, of resized tensor (prior to cropping). + interp (cvcuda.Interp): Interpolation type used for resizing. Currently, only NVCV_INTERP_NEAREST and + NVCV_INTERP_LINEAR are available. + crop_rect (nvcv.RectI): Crop rectangle, (top, left, width, height), specifying the top-left corner and + width & height dimensions of the region to crop from the resized images. + layout(string, optional): String specifying output tensor layout (e.g., 'NHWC' or 'CHW'). Empty string + (default) indicates output tensor layout copies input. + data_type(nvcv.Type, optional): Data type of output tensor channel (e.g., uint8 or float). 0 (default) + indicates output tensor data type copies input. + manip(cvcuda.ChannelManip, optional): Channel manipulation (e.g., shuffle RGB to BGR). NO_OP (default) + indicates output tensor channels are unchanged. 
+ stream (nvcv.cuda.Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + nvcv.Tensor: The output tensor. + + Caution: + Restrictions to several arguments may apply. Check the C + API references of the CV-CUDA operator. + )pbdoc"); + + m.def("resize_crop_convert_reformat_into", &ResizeCropConvertReformatInto, "dst"_a, "src"_a, "resize_dim"_a, + "interp"_a, "cropPos"_a, py::kw_only(), "manip"_a = NVCV_CHANNEL_NO_OP, "stream"_a = nullptr, R"pbdoc( + + cvcuda.resize_crop_convert_reformat_into(dst: nvcv.Tensor, + src: nvcv.Tensor, + resize_dim: tuple[int,int], + interp: cvcuda.Interp, + cropPos: tuple[int,int], + *, + manip: cvcuda.ChannelManip = cvcuda.ChannelManip.NO_OP, + stream: Optional[nvcv.cuda.Stream] = None) + + Executes the ResizeCropConvertReformat operation on the given cuda stream. + + See also: + Refer to the CV-CUDA C API reference for the ResizeCropConvertReformat operator + for more details and usage examples. + + Args: + dst (nvcv.Tensor): Output tensor to store the result of the operation. Output tensor also specifies the + crop dimensions (i.e., width & height), as well as the output data type (e.g., uchar3 + or float) and tensor layout (e.g., 'NHWC' or 'NCHW'). + src (nvcv.Tensor): Input tensor containing one or more images. + resize_dim (tuple[int,int]): Dimensions, width & height, of resized tensor (prior to cropping). + interp (cvcuda.Interp): Interpolation type used for resizing. Currently, only NVCV_INTERP_NEAREST and + NVCV_INTERP_LINEAR are available. + cropPos (tuple[int,int]): Crop position, (top, left), specifying the top-left corner of the region to crop + from the resized images. The crop region's width and height is specified by the + output tensor's width & height. + manip(cvcuda.ChannelManip, optional): Channel manipulation (e.g., shuffle RGB to BGR). NO_OP (default) + indicates output tensor channels are unchanged. + stream (nvcv.cuda.Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + None + + Caution: + Restrictions to several arguments may apply. Check the C + API references of the CV-CUDA operator. + )pbdoc"); + + m.def("resize_crop_convert_reformat", &ResizeCropConvertReformatVarShape, "src"_a, "resize_dim"_a, "interp"_a, + "crop_rect"_a, py::kw_only(), "layout"_a = "", "data_type"_a = NVCV_DATA_TYPE_NONE, + "manip"_a = NVCV_CHANNEL_NO_OP, "stream"_a = nullptr, R"pbdoc( + + cvcuda.resizeCropConvertReformat(src: nvcv.ImageBatchVarShape, + resize_dim: tuple[int,int], + interp: cvcuda.Interp, + crop_rect: nvcv.RectI, + *, + layout: str = "", + data_type: nvcv.Type = 0, + manip: cvcuda.ChannelManip = cvcuda.ChannelManip.NO_OP, + stream: Optional[nvcv.cuda.Stream] = None) -> nvcv.Tensor + + Executes the ResizeCropConvertReformat operation on the given cuda stream. + + See also: + Refer to the CV-CUDA C API reference for the ResizeCropConvertReformat operator + for more details and usage examples. + + Args: + src (ImageBatchVarShape): Input image batch containing one or more images of varying sizes, but all images + must have the same data type, channels, and layout. + resize_dim (tuple[int,int]): Dimensions, width & height, of resized tensor (prior to cropping). + interp (cvcuda.Interp): Interpolation type used for resizing. Currently, only NVCV_INTERP_NEAREST and + NVCV_INTERP_LINEAR are available. + crop_rect (nvcv.RectI): Crop rectangle, (top, left, width, height), specifying the top-left corner and + width & height dimensions of the region to crop from the resized images. 
+ layout(string, optional): String specifying output tensor layout (e.g., 'NHWC' or 'CHW'). Empty string + (default) indicates output tensor layout copies input. + data_type(nvcv.Type, optional): Data type of output tensor channel (e.g., uint8 or float). 0 (default) + indicates output tensor data type copies input. + manip(cvcuda.ChannelManip, optional): Channel manipulation (e.g., shuffle RGB to BGR). NO_OP (default) + indicates output tensor channels are unchanged. + stream (nvcv.cuda.Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + nvcv.Tensor: The output tensor. + + Caution: + Restrictions to several arguments may apply. Check the C + API references of the CV-CUDA operator. + )pbdoc"); + + m.def("resize_crop_convert_reformat_into", &ResizeCropConvertReformatVarShapeInto, "dst"_a, "src"_a, "resize_dim"_a, + "interp"_a, "cropPos"_a, py::kw_only(), "manip"_a = NVCV_CHANNEL_NO_OP, "stream"_a = nullptr, R"pbdoc( + + cvcuda.resize_crop_convert_reformat_into(dst: nvcv.Tensor, + src: nvcv.ImageBatchVarShape, + resize_dim: tuple[int,int], + interp: cvcuda.Interp, + cropPos: tuple[int,int], + *, + manip: cvcuda.ChannelManip = cvcuda.ChannelManip.NO_OP, + stream: Optional[nvcv.cuda.Stream] = None) + + Executes the ResizeCropConvertReformat operation on the given cuda stream. + + See also: + Refer to the CV-CUDA C API reference for the ResizeCropConvertReformat operator + for more details and usage examples. + + Args: + dst (nvcv.Tensor): Output tensor to store the result of the operation. Output tensor also specifies the + crop dimensions (i.e., width & height), as well as the output data type (e.g., uchar3 + or float) and tensor layout (e.g., 'NHWC' or 'NCHW'). + src (ImageBatchVarShape): Input image batch containing one or more images of varying sizes, but all images + must have the same data type, channels, and layout. + resize_dim (tuple[int,int]): Dimensions, width & height, of resized tensor (prior to cropping). + interp (cvcuda.Interp): Interpolation type used for resizing. Currently, only NVCV_INTERP_NEAREST and + NVCV_INTERP_LINEAR are available. + cropPos (tuple[int,int]): Crop position, (top, left), specifying the top-left corner of the region to + crop from the resized images. The crop region's width and height is specified by + the output tensor's width & height. + manip(cvcuda.ChannelManip, optional): Channel manipulation (e.g., shuffle RGB to BGR). NO_OP (default) + indicates output tensor channels are unchanged. + stream (nvcv.cuda.Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + None + + Caution: + Restrictions to several arguments may apply. Check the C + API references of the CV-CUDA operator. + )pbdoc"); +} + +} // namespace cvcudapy diff --git a/python/mod_cvcuda/Operators.hpp b/python/mod_cvcuda/Operators.hpp index 6197e43ba..55307fcff 100644 --- a/python/mod_cvcuda/Operators.hpp +++ b/python/mod_cvcuda/Operators.hpp @@ -99,6 +99,7 @@ void ExportOpLabel(py::module &m); void ExportOpPairwiseMatcher(py::module &m); void ExportOpStack(py::module &m); void ExportOpFindHomography(py::module &m); +void ExportOpResizeCropConvertReformat(py::module &m); // Helper class that serves as generic python-side operator class. 
// OP: native operator class diff --git a/python/mod_nvcv/Cache.cpp b/python/mod_nvcv/Cache.cpp index 23053439d..29c6d8d5f 100644 --- a/python/mod_nvcv/Cache.cpp +++ b/python/mod_nvcv/Cache.cpp @@ -151,6 +151,37 @@ std::vector> Cache::fetch(const IKey &key) const return v; } +#ifndef NDEBUG +void Cache::dbgPrintCacheForKey(const IKey &key, const std::string &prefix) +{ + std::vector> v; + std::unique_lock lk(pimpl->mtx); + auto itrange = pimpl->items.equal_range(&key); + + for (auto it = itrange.first; it != itrange.second; ++it) + { + std::cerr << prefix << typeid(*(it->second)).name() << " - " << it->second.use_count() << std::endl; + } +} +#endif + +std::shared_ptr Cache::fetchOne(const IKey &key) const +{ + std::unique_lock lk(pimpl->mtx); + + auto itrange = pimpl->items.equal_range(&key); + + for (auto it = itrange.first; it != itrange.second; ++it) + { + if (!it->second->isInUse()) + { + return it->second; + } + } + + return {}; +} + void Cache::clear() { std::unique_lock lk(pimpl->mtx); diff --git a/python/mod_nvcv/Cache.hpp b/python/mod_nvcv/Cache.hpp index c933c9196..fdc962911 100644 --- a/python/mod_nvcv/Cache.hpp +++ b/python/mod_nvcv/Cache.hpp @@ -76,6 +76,12 @@ class PYBIND11_EXPORT Cache void removeAllNotInUseMatching(const IKey &key); std::vector> fetch(const IKey &key) const; + std::shared_ptr fetchOne(const IKey &key) const; + +#ifndef NDEBUG + // Make this function available only in Debug builds + void dbgPrintCacheForKey(const IKey &key, const std::string &prefix = ""); +#endif template std::vector> fetchAll() const diff --git a/python/mod_nvcv/Image.cpp b/python/mod_nvcv/Image.cpp index 703611db7..393937306 100644 --- a/python/mod_nvcv/Image.cpp +++ b/python/mod_nvcv/Image.cpp @@ -463,28 +463,7 @@ Image::Image(std::vector> bufs, const nvcv::Imag { m_wrapData.emplace(); - NVCV_ASSERT(bufs.size() >= 1); - m_wrapData->devType = bufs[0]->dlTensor().device.device_type; - - if (bufs.size() == 1) - { - m_wrapData->obj = py::cast(bufs[0]); - } - else - { - for (size_t i = 1; i < bufs.size(); ++i) - { - if (bufs[i]->dlTensor().device.device_type != bufs[0]->dlTensor().device.device_type - || bufs[i]->dlTensor().device.device_id != bufs[0]->dlTensor().device.device_id) - { - throw std::runtime_error("All buffers must belong to the same device, but some don't."); - } - } - - m_wrapData->obj = py::cast(std::move(bufs)); - } - - m_impl = nvcv::ImageWrapData(imgData); + this->setWrapData(std::move(bufs), imgData); } Image::Image(std::vector bufs, const nvcv::ImageDataStridedHost &hostData, int rowAlign) @@ -571,6 +550,52 @@ std::shared_ptr Image::WrapExternalBuffer(ExternalBuffer &buffer, nvcv::I return WrapExternalBufferVector({obj}, fmt); } +std::vector> Image::WrapExternalBufferMany(std::vector> &buffers, + nvcv::ImageFormat fmt) +{ + // This is the key of an image wrapper. + // All image wrappers have the same key. + Image::Key key; + + std::vector> items = Cache::Instance().fetch(key); + + std::vector> out; + out.reserve(buffers.size()); + + for (size_t i = 0; i < buffers.size(); ++i) + { + std::vector> spBuffers; + spBuffers.push_back(buffers[i]); + + if (!spBuffers.back()) + throw std::runtime_error("Input buffer doesn't provide cuda_array_interface or DLPack interfaces"); + + std::vector bufinfos; + bufinfos.emplace_back(spBuffers[0]->dlTensor()); + nvcv::ImageDataStridedCuda imgData = CreateNVCVImageDataCuda(bufinfos, fmt); + + // None found? 
+ if (items.empty()) + { + // Need to add wrappers into cache so that they don't get destroyed by + // the cuda stream when they're last used, and python script isn't + // holding a reference to them. If we don't do it, things might break. + std::shared_ptr img(new Image(std::move(spBuffers), imgData)); + Cache::Instance().add(*img); + out.push_back(img); + } + else + { + std::shared_ptr img = std::static_pointer_cast(items.back()); + items.pop_back(); + img->setWrapData(std::move(spBuffers), imgData); + out.push_back(img); + } + } + + return out; +} + std::shared_ptr Image::WrapExternalBufferVector(std::vector buffers, nvcv::ImageFormat fmt) { std::vector> spBuffers; @@ -594,16 +619,55 @@ std::shared_ptr Image::WrapExternalBufferVector(std::vector b // This is the key of an image wrapper. // All image wrappers have the same key. Image::Key key; - // We take this opportunity to remove from cache all wrappers that aren't - // being used. They aren't reusable anyway. - Cache::Instance().removeAllNotInUseMatching(key); - // Need to add wrappers to cache so that they don't get destroyed by - // the cuda stream when they're last used, and python script isn't - // holding a reference to them. If we don't do it, things might break. - std::shared_ptr img(new Image(std::move(spBuffers), imgData)); - Cache::Instance().add(*img); - return img; + std::shared_ptr item = Cache::Instance().fetchOne(key); + + // None found? + if (!item) + { + // Need to add wrappers into cache so that they don't get destroyed by + // the cuda stream when they're last used, and python script isn't + // holding a reference to them. If we don't do it, things might break. + std::shared_ptr img(new Image(std::move(spBuffers), imgData)); + Cache::Instance().add(*img); + return img; + } + else + { + std::shared_ptr img = std::static_pointer_cast(item); + img->setWrapData(std::move(spBuffers), imgData); + return img; + } +} + +void Image::setWrapData(std::vector> bufs, const nvcv::ImageDataStridedCuda &imgData) +{ + NVCV_ASSERT(m_wrapData); + + NVCV_ASSERT(bufs.size() >= 1); + m_wrapData->devType = bufs[0]->dlTensor().device.device_type; + + if (bufs.size() == 1) + { + m_wrapData->obj = py::cast(bufs[0]); + } + else + { + for (size_t i = 1; i < bufs.size(); ++i) + { + if (bufs[i]->dlTensor().device.device_type != bufs[0]->dlTensor().device.device_type + || bufs[i]->dlTensor().device.device_id != bufs[0]->dlTensor().device.device_id) + { + throw std::runtime_error("All buffers must belong to the same device, but some don't."); + } + } + + m_wrapData->obj = py::cast(std::move(bufs)); + } + + //We recreate the nvcv::Image wrapper (m_impl) because it's cheap. + //It's not cheap to create nvcvpy::Image as it might have allocated expensive resources (cudaEvent_t in Resource parent). 
+ m_impl = nvcv::ImageWrapData(imgData); } std::shared_ptr Image::CreateHost(py::buffer buffer, nvcv::ImageFormat fmt, int rowAlign) diff --git a/python/mod_nvcv/Image.hpp b/python/mod_nvcv/Image.hpp index eb3c7f768..4bf10a887 100644 --- a/python/mod_nvcv/Image.hpp +++ b/python/mod_nvcv/Image.hpp @@ -46,7 +46,9 @@ class Image final : public Container static std::shared_ptr CreateHost(py::buffer buffer, nvcv::ImageFormat fmt, int rowAlign); static std::shared_ptr CreateHostVector(std::vector buffer, nvcv::ImageFormat fmt, int rowAlign); - static std::shared_ptr WrapExternalBuffer(ExternalBuffer &buffer, nvcv::ImageFormat fmt); + static std::shared_ptr WrapExternalBuffer(ExternalBuffer &buffer, nvcv::ImageFormat fmt); + static std::vector> WrapExternalBufferMany( + std::vector> &buffer, nvcv::ImageFormat fmt); static std::shared_ptr WrapExternalBufferVector(std::vector buffer, nvcv::ImageFormat fmt); std::shared_ptr shared_from_this(); @@ -106,6 +108,8 @@ class Image final : public Container explicit Image(std::vector> buf, const nvcv::ImageDataStridedCuda &imgData); explicit Image(std::vector buf, const nvcv::ImageDataStridedHost &imgData, int rowalign); + void setWrapData(std::vector> buf, const nvcv::ImageDataStridedCuda &imgData); + nvcv::Image m_impl; // must come before m_key Key m_key; diff --git a/python/mod_nvcv/ImageBatch.cpp b/python/mod_nvcv/ImageBatch.cpp index 58831fe9e..c0595b22b 100644 --- a/python/mod_nvcv/ImageBatch.cpp +++ b/python/mod_nvcv/ImageBatch.cpp @@ -51,7 +51,8 @@ std::shared_ptr ImageBatchVarShape::Create(int capacity) else { // Get the first one - auto batch = std::static_pointer_cast(vcont[0]); + auto batch = std::static_pointer_cast(vcont[0]); + Image::Key key; batch->clear(); // make sure it's in pristine state return batch; } @@ -60,17 +61,22 @@ std::shared_ptr ImageBatchVarShape::Create(int capacity) std::shared_ptr ImageBatchVarShape::WrapExternalBufferVector(std::vector buffers, nvcv::ImageFormat fmt) { - auto batch = Create(buffers.size()); - for (auto &obj : buffers) + std::vector> buflist; + buflist.reserve(buffers.size()); + + for (py::object &obj : buffers) { std::shared_ptr buffer = cast_py_object_as(obj); if (!buffer) { throw std::runtime_error("Input buffer doesn't provide cuda_array_interface or DLPack interfaces"); } - auto image = Image::WrapExternalBuffer(*buffer, fmt); - batch->pushBack(*image); + buflist.push_back(buffer); } + + std::shared_ptr batch = Create(buffers.size()); + batch->pushBackMany(Image::WrapExternalBufferMany(buflist, fmt)); + return batch; } @@ -127,16 +133,24 @@ void ImageBatchVarShape::pushBack(Image &img) m_list.push_back(img.shared_from_this()); } -void ImageBatchVarShape::pushBackMany(std::vector> &imgList) +void ImageBatchVarShape::pushBackMany(const std::vector> &imgList) { - for (auto &img : imgList) + std::vector handlelist; + handlelist.reserve(imgList.size()); + for (size_t i = 0; i < imgList.size(); ++i) { - m_list.push_back(img); - if (img) - m_impl.pushBack(img->impl()); + if (imgList[i]) + { + handlelist.push_back(imgList[i]->impl().handle()); + } else - m_impl.pushBack(nvcv::Image()); + { + handlelist.push_back(nullptr); + } + m_list.push_back(imgList[i]); } + + nvcv::detail::CheckThrow(nvcvImageBatchVarShapePushImages(m_impl.handle(), handlelist.data(), handlelist.size())); } void ImageBatchVarShape::popBack(int imgCount) diff --git a/python/mod_nvcv/ImageBatch.hpp b/python/mod_nvcv/ImageBatch.hpp index 7006d4cfd..ffa58d865 100644 --- a/python/mod_nvcv/ImageBatch.hpp +++ b/python/mod_nvcv/ImageBatch.hpp @@ 
-55,7 +55,7 @@ class ImageBatchVarShape : public Container Size2D maxSize() const; void pushBack(Image &img); - void pushBackMany(std::vector> &imgList); + void pushBackMany(const std::vector> &imgList); void popBack(int imgCount); void clear(); diff --git a/python/mod_nvcv/Main.cpp b/python/mod_nvcv/Main.cpp index d02bf3896..372f02a24 100644 --- a/python/mod_nvcv/Main.cpp +++ b/python/mod_nvcv/Main.cpp @@ -48,24 +48,32 @@ PYBIND11_MODULE(nvcv, m) using namespace nvcvpy::priv; - // Core entities - Cache::Export(m); - - { - py::module_ cuda = m.def_submodule("cuda"); - Stream::Export(cuda); - } + // These will be destroyed in the reverse order here + // Since everything is ref counted the order should not matter + // but it is safer to ini them in order + // Core entities + ExportCAPI(m); + Resource::Export(m); + Container::Export(m); ExternalBuffer::Export(m); + + // Supporting objects ExportImageFormat(m); ExportDataType(m); ExportRect(m); - Resource::Export(m); - Container::Export(m); + ExportColorSpec(m); + + // Objects Tensor::Export(m); TensorBatch::Export(m); Image::Export(m); ImageBatchVarShape::Export(m); - ExportCAPI(m); - ExportColorSpec(m); + + // Cache and Streams + Cache::Export(m); + { + py::module_ cuda = m.def_submodule("cuda"); + Stream::Export(cuda); + } } diff --git a/python/mod_nvcv/Resource.cpp b/python/mod_nvcv/Resource.cpp index a8d0fe67a..5aea1d631 100644 --- a/python/mod_nvcv/Resource.cpp +++ b/python/mod_nvcv/Resource.cpp @@ -31,15 +31,6 @@ Resource::Resource() m_id = idnext++; m_event = nullptr; - try - { - util::CheckThrow(cudaEventCreateWithFlags(&m_event, cudaEventDisableTiming)); - } - catch (...) - { - cudaEventDestroy(m_event); - throw; - } } Resource::~Resource() @@ -52,6 +43,15 @@ uint64_t Resource::id() const return m_id; } +cudaEvent_t Resource::event() +{ + if (m_event == nullptr) + { + util::CheckThrow(cudaEventCreateWithFlags(&m_event, cudaEventDisableTiming)); + } + return m_event; +} + void Resource::submitSync(Stream &stream) { //Check if we have a last stream, if not set it to the current stream @@ -69,8 +69,8 @@ void Resource::submitSync(Stream &stream) // if we are on a different stream we need to wait for that stream to finish // write event on the old stream, the new stream will have to wait for it to be done - util::CheckThrow(cudaEventRecord(m_event, m_lastStream.value()->handle())); - util::CheckThrow(cudaStreamWaitEvent(stream.handle(), m_event)); + util::CheckThrow(cudaEventRecord(event(), m_lastStream.value()->handle())); + util::CheckThrow(cudaStreamWaitEvent(stream.handle(), event())); // update the last stream since we changed streams m_lastStream.reset(); diff --git a/python/mod_nvcv/Resource.hpp b/python/mod_nvcv/Resource.hpp index 010c8b33c..f66b4b5ac 100644 --- a/python/mod_nvcv/Resource.hpp +++ b/python/mod_nvcv/Resource.hpp @@ -91,6 +91,8 @@ class PYBIND11_EXPORT Resource : public virtual Object uint64_t m_id; /**< The unique identifier of the resource. */ cudaEvent_t m_event; /**< The CUDA event used for synchronization. */ std::optional> m_lastStream; /**< Cache the last stream used for this resource. 
*/ + + cudaEvent_t event(); }; } // namespace nvcvpy::priv diff --git a/python/mod_nvcv/Stream.cpp b/python/mod_nvcv/Stream.cpp index bd3c1f9dc..39afa8203 100644 --- a/python/mod_nvcv/Stream.cpp +++ b/python/mod_nvcv/Stream.cpp @@ -436,24 +436,32 @@ void Stream::Export(py::module &m) util::RegisterCleanup(m, [globalStream]() { - for (std::shared_ptr stream : Cache::Instance().fetchAll()) + try { - stream->sync(); + for (std::shared_ptr stream : Cache::Instance().fetchAll()) + { + stream->sync(); + } + globalStream->sync(); + + // There should only be 1 stream in the stack, namely the + // global stream. + auto s = StreamStack::Instance().top(); + if (s != globalStream) + { + std::cerr << "Stream stack leak detected" << std::endl; + } + + // Make sure stream stack is empty + while (auto s = StreamStack::Instance().top()) + { + StreamStack::Instance().pop(); + } } - globalStream->sync(); - - // There should only be 1 stream in the stack, namely the - // global stream. - auto s = StreamStack::Instance().top(); - if (s != globalStream) - { - std::cerr << "Stream stack leak detected" << std::endl; - } - - // Make sure stream stack is empty - while (auto s = StreamStack::Instance().top()) + catch (const std::exception &e) { - StreamStack::Instance().pop(); + //Do nothing here this can happen if someone closes the cuda context prior to exit. + std::cerr << "Warning CVCUDA cleanup may be incomplete due to: " << e.what() << "\n"; } }); } diff --git a/samples/common/python/perf_utils.py b/samples/common/python/perf_utils.py index 7c32a2bdc..1d854337f 100644 --- a/samples/common/python/perf_utils.py +++ b/samples/common/python/perf_utils.py @@ -218,9 +218,12 @@ def finalize(self): # "data" : { # ... # } - # "mean_data" : { + # "data_stats_minus_warmup" : { # ... # } + # "gpu_metrics" : { + # ... + # }, # "batch_info" : { # ... # } @@ -234,8 +237,9 @@ def finalize(self): # # The data field stores timing info of all batches keyed with raw flattened # names of the NVTX push/pop ranges. - # The mean_data stores the mean timing info for NVTX ranges across - # all the batches. + # The data_stats_minus_warmup stores various stats (e.g. mean, median) for + # NVTX ranges across all the batches. + # The gpu_metrics field stores various GPU metrics (e.g. power and utilization) # The batch_info stores the batch index and batch size of each batch. # The inside batch info is list of NVTX range names which executed inside # a batch. @@ -248,7 +252,8 @@ def finalize(self): # benchmark_dict = { "data": self.timing_info, - "mean_data": {}, + "data_stats_minus_warmup": {}, + "gpu_metrics": {}, "batch_info": self.batch_info, "inside_batch_info": self.inside_batch_info, "meta": {}, @@ -296,12 +301,13 @@ def finalize(self): return {} -def maximize_clocks(logger): +def maximize_clocks(logger, device_id=0): """ Maximizes the GPU clocks. Useful to do it before any type of performance benchmarking. + :param device_id: The GPU device ID whose clocks should be maximized. 
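+    :return: Tuple of (did_maximize_clocks, was_persistence_mode_on, current_power_limit).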
""" - logger.info("Trying to maximize the GPU clocks...") + logger.info("Trying to maximize the GPU clocks for device: %d" % device_id) gpu_available = torch.cuda.device_count() > 0 @@ -353,6 +359,7 @@ def maximize_clocks(logger): "nvidia-smi", "--query-gpu=power.limit", "--format=csv,nounits,noheader", + "-i=%d" % device_id, ], stdout=subprocess.PIPE, ) @@ -365,6 +372,7 @@ def maximize_clocks(logger): proc_ret = subprocess.run( [ "nvidia-smi", + "-i=%d" % device_id, "--query-gpu=power.max_limit", "--format=csv,nounits,noheader", ], @@ -378,6 +386,7 @@ def maximize_clocks(logger): proc_ret = subprocess.run( [ "nvidia-smi", + "-i=%d" % device_id, "--power-limit=%f" % max_power_limit, ], stdout=subprocess.PIPE, @@ -392,6 +401,7 @@ def maximize_clocks(logger): proc_ret = subprocess.run( [ "nvidia-smi", + "-i=%d" % device_id, "--query-gpu=clocks.max.graphics", "--format=csv,nounits,noheader", ], @@ -407,6 +417,7 @@ def maximize_clocks(logger): proc_ret = subprocess.run( [ "nvidia-smi", + "-i=%d" % device_id, "--lock-gpu-clocks=%d,%d" % (max_graphics_clock, max_graphics_clock), ], stdout=subprocess.PIPE, @@ -421,6 +432,7 @@ def maximize_clocks(logger): proc_ret = subprocess.run( [ "nvidia-smi", + "-i=%d" % device_id, "--query-gpu=clocks.max.memory", "--format=csv,nounits,noheader", ], @@ -436,6 +448,7 @@ def maximize_clocks(logger): proc_ret = subprocess.run( [ "nvidia-smi", + "-i=%d" % device_id, "--lock-memory-clocks=%d,%d" % (max_memory_clock, max_memory_clock), ], stdout=subprocess.PIPE, @@ -451,6 +464,7 @@ def maximize_clocks(logger): proc_ret = subprocess.run( [ "nvidia-smi", + "-i=%d" % device_id, "--applications-clocks=%d,%d" % (max_memory_clock, max_graphics_clock), ], stdout=subprocess.PIPE, @@ -465,7 +479,12 @@ def maximize_clocks(logger): # 7. Get the GPU Performance State. P0 state means the most performance. proc_ret = subprocess.run( - ["nvidia-smi", "--query-gpu=pstate", "--format=csv,nounits,noheader"], + [ + "nvidia-smi", + "-i=%d" % device_id, + "--query-gpu=pstate", + "--format=csv,nounits,noheader", + ], stdout=subprocess.PIPE, ) if proc_ret.returncode: @@ -477,22 +496,26 @@ def maximize_clocks(logger): logger.info("Current GPU performance state is %s." % gpu_perf_state) if gpu_perf_state == "P0": - logger.info("Clocks are now maximized.") + logger.info("Clocks for device %d are now maximized." % device_id) return True, was_persistence_mode_on, current_power_limit else: - logger.info("Unable to maximize all GPU clocks to reach the P0 state.") + logger.info( + "Unable to maximize GPU clocks of device %d to reach the P0 state." + % device_id + ) return False, was_persistence_mode_on, current_power_limit def reset_clocks( logger, + device_id=0, was_persistence_mode_on=False, current_power_limit=None, ): """ Resets the GPU clocks. """ - logger.info("Trying to reset the GPU clocks...") + logger.info("Trying to reset the GPU clocks for device: %d" % device_id) gpu_available = torch.cuda.device_count() > 0 @@ -501,7 +524,8 @@ def reset_clocks( # 1. Reset the memory clock. proc_ret = subprocess.run( - ["nvidia-smi", "--reset-memory-clocks"], stdout=subprocess.PIPE + ["nvidia-smi", "-i=%d" % device_id, "--reset-memory-clocks"], + stdout=subprocess.PIPE, ) if proc_ret.returncode: logger.warning("Unable to reset the memory clock back to normal.") @@ -510,7 +534,8 @@ def reset_clocks( # 2. Reset GPU clock. 
proc_ret = subprocess.run( - ["nvidia-smi", "--reset-gpu-clocks"], stdout=subprocess.PIPE + ["nvidia-smi", "-i=%d" % device_id, "--reset-gpu-clocks"], + stdout=subprocess.PIPE, ) if proc_ret.returncode: logger.warning("Unable to reset the GPU clock back to normal.") @@ -519,7 +544,8 @@ def reset_clocks( # 3. Reset application clocks. proc_ret = subprocess.run( - ["nvidia-smi", "--reset-applications-clocks"], stdout=subprocess.PIPE + ["nvidia-smi", "-i=%d" % device_id, "--reset-applications-clocks"], + stdout=subprocess.PIPE, ) if proc_ret.returncode: logger.warning("Unable to reset the application clocks back to normal.") @@ -531,6 +557,7 @@ def reset_clocks( proc_ret = subprocess.run( [ "nvidia-smi", + "-i=%d" % device_id, "--power-limit=%f" % current_power_limit, ], stdout=subprocess.PIPE, @@ -562,7 +589,12 @@ def reset_clocks( # 7. Get GPU Performance State proc_ret = subprocess.run( - ["nvidia-smi", "--query-gpu=pstate", "--format=csv,nounits,noheader"], + [ + "nvidia-smi", + "-i=%d" % device_id, + "--query-gpu=pstate", + "--format=csv,nounits,noheader", + ], stdout=subprocess.PIPE, ) if proc_ret.returncode: @@ -774,12 +806,12 @@ def summarize_runs( """ def _parse_json_for_time(json_data): - mean_all_batches = json_data["mean_all_batches"] - sample_name_key = list(mean_all_batches.keys())[0] + mean_all_procs = json_data["data_mean_all_procs"] + sample_name_key = list(mean_all_procs.keys())[0] - cpu_time_minus_warmup_per_item = mean_all_batches[sample_name_key][ - "run_sample" - ]["pipeline"]["cpu_time_minus_warmup_per_item"] + cpu_time_minus_warmup_per_item = mean_all_procs[sample_name_key]["run_sample"][ + "pipeline" + ]["cpu_time_minus_warmup_per_item"]["mean"] return cpu_time_minus_warmup_per_item diff --git a/samples/scripts/benchmark.py b/samples/scripts/benchmark.py index 938f080c8..57507f2e6 100644 --- a/samples/scripts/benchmark.py +++ b/samples/scripts/benchmark.py @@ -20,10 +20,14 @@ import os import sys import json +import time import logging import argparse import subprocess +import numpy as np +import pandas as pd import multiprocessing as mp +import matplotlib.pyplot as plt common_dir = os.path.join( os.path.dirname(os.path.dirname(os.path.abspath(__file__))), @@ -37,7 +41,7 @@ class NvtxRangeTimeInfo: """ - A class encapsulating the time information associated with an NVTX range. + A data class to hold the time information of an NVTX range. """ def __init__(self, start_ms, end_ms): @@ -58,7 +62,7 @@ def duration_ms(self): class NvtxRange: """ - A class representing an NVTX range with its timing information. + A data class representing an NVTX range with its CPU and GPU time information. """ def __init__( @@ -395,116 +399,156 @@ def calc_mean_ranges(all_range_info): return mean_range_info -class MeanDictInfo: +class NumpyValuesEncoder(json.JSONEncoder): """ - A small data structure to help track various stats over multiple dictionaries. - For example, we can use to create one dictionary that represents the sum of - many dictionaries. In that, this data structure can track the total value - (i.e the sum) and how many items were added into making that sum (i.e len). + Helps encode various Numpy data-types correctly in the JSON encoder. 
""" - def __init__(self, value): - self.value = value - self.len = 0 + def default(self, obj): + if isinstance(obj, np.integer): + return int(obj) + if isinstance(obj, np.floating): + return float(obj) + if isinstance(obj, np.ndarray): + return obj.tolist() + return super(NumpyValuesEncoder, self).default(obj) -def recurse_sum_dict(input_dict, target_dict): +def recurse_gather_dict(input_dict, target_dict): """ - Recursively sums up value of all keys of input_dict in another dictionary. - This is useful for computing the total (or the mean eventually) of all the - keys in a dictionary. - This function uses and inserts a special object `MeanDictInfo` as values of - the leaf nodes. That object is useful to track not just the sum total values - but also how many items were summed up in there. + Recursively gathers values of all keys of input_dict in another dictionary. + This is useful for computing various stats on the data such as mean, median or + std-dev of all the keys in a dictionary. + This function collects data in a list. :param input_dict: The dictionary that should be used as input. :param target_dict: The single dictionary in which all the sums should be gathered. """ + assert type(input_dict) is type(target_dict) + assert isinstance(input_dict, dict) # Loop over all the keys in the input dictionary. for key in input_dict: + if key in ["total_items", "total_items_warmup", "total_items_minus_warmup"]: + continue # We skip these key. + # Check if the value is another dictionary. - if isinstance(input_dict[key], dict): + elif isinstance(input_dict[key], dict): # Create this if our target_dict did not already have it. if key not in target_dict: target_dict[key] = {} # Recurse the same function again. - recurse_sum_dict(input_dict[key], target_dict[key]) + recurse_gather_dict(input_dict[key], target_dict[key]) - # Check if the value is a list or tuple. + # Check if the value is a list or tuple. We will store inside a list of lists. elif isinstance(input_dict[key], list) or isinstance(input_dict[key], tuple): - - # Create this if our target_dict did not already have it. Instead - # of saving just the sums, we will save both, the sum and the len - # telling us how many numbers were summed up. We use the MeanDictInfo - # object for this. + # Create this if our target_dict did not already have it. if key not in target_dict: - target_dict[key] = MeanDictInfo(value=[]) + target_dict[key] = [] for _ in range(len(input_dict[key])): - target_dict[key].value.append(0) + target_dict[key].append([]) # This creates list of lists. for i in range(len(input_dict[key])): - target_dict[key].value[i] += input_dict[key][i] - - # Increment the length. - target_dict[key].len += 1 + target_dict[key][i].append(input_dict[key][i]) - # For anything else, we assume it was a number. + # For anything else, we assume it was a number. We will store inside a list. else: if key not in target_dict: - target_dict[key] = MeanDictInfo(value=0) + target_dict[key] = [] - target_dict[key].value += input_dict[key] - target_dict[key].len += 1 + target_dict[key].append(input_dict[key]) -def recurse_divide_dict(input_dict, divide_by=None): +def recurse_calc_stats_dict( + input_dict, + compute_mean_only=False, + compute_throughput=False, + throughput_multiplier=1, +): """ - Recursively divides the value of all keys of input_dict. - The denominator can be a fixed value `divide_by` or it can be dynamically - inferred from the length attributes of the MeanDictInfo object. 
+ Recursively calculates various stats on the value of all keys of input_dict. :param input_dict: The dictionary that should be used as input. - :param divide_by: Optional value to use in the denominator. + :param compute_mean_only: A flag indicating whether only the mean should be computed or not. + Computes a lot of other stats (e.g. median, min, max...) if set to False. + :param compute_throughput: A flag indicating whether throughput should be computed or not. + Only set to True when running in parallel with all resources maximized otherwise throughput + calculation may give incorrect results. + :param throughput_multiplier: A number with which the throughput is multiplied to calculate the + total throughput. Usually set to the number of parallel processes or threads executing in parallel. """ # Loop over all the keys in the input dictionary. - for key in input_dict: + for key in list(input_dict.keys()): # Check if the value is another dictionary. if isinstance(input_dict[key], dict): - recurse_divide_dict(input_dict[key], divide_by) - - # Check if the value is a MeanDictInfo object. - elif isinstance(input_dict[key], MeanDictInfo): - if isinstance(input_dict[key].value, list) or isinstance( - input_dict[key].value, tuple - ): - # Use the user given divide by if supplied or else use the len - divide_by = divide_by if divide_by else input_dict[key].len - - for i in range(len(input_dict[key].value)): - input_dict[key].value[i] /= divide_by - input_dict[key].value[i] = round(input_dict[key].value[i], 4) - else: - divide_by = divide_by if divide_by else input_dict[key].len - - input_dict[key].value /= divide_by - input_dict[key].value = round(input_dict[key].value, 4) - - # Remove the MeanDictInfo object and store the value directly. - input_dict[key] = input_dict[key].value - - elif isinstance(input_dict[key], list) or isinstance(input_dict[key], tuple): - if divide_by is None: - raise ValueError( - "divide_by must not be None when the values of the dictionary are " - "not MeanDictInfo objects." - ) - - for i in range(len(input_dict[key].value)): - input_dict[key][i] /= divide_by - input_dict[key][i] = round(input_dict[key][i], 4) + recurse_calc_stats_dict( + input_dict[key], + compute_mean_only, + compute_throughput, + throughput_multiplier, + ) else: - input_dict[key] /= divide_by - input_dict[key] = round(input_dict[key], 4) + assert isinstance(input_dict[key], list) + + # Compute all stats. + if compute_mean_only: + stats_dict = { + "total_items": len(input_dict[key]), + "mean": round(np.mean(input_dict[key], axis=-1), 4), + } + else: + stats_dict = { + "total_items": len(input_dict[key]), + "min": round(np.min(input_dict[key], axis=-1), 4), + "max": round(np.max(input_dict[key], axis=-1), 4), + "mean": round(np.mean(input_dict[key], axis=-1), 4), + "std": round(np.std(input_dict[key], axis=-1), 4), + "median": round(np.median(input_dict[key], axis=-1), 4), + "percentile_95": round( + np.percentile(input_dict[key], 95, axis=-1), 4 + ), + } + + if compute_throughput: + throughput_unit = ( + "frames_per_second" + if "_per_item" in key + else "batches_per_second" + ) + stats_dict["throughput"] = { + "multiplier": throughput_multiplier, + "unit": throughput_unit, + # NOTE: Minimum throughput corresponds to maximum latency. + "min": round( + 1000 * throughput_multiplier / stats_dict["max"], 2 + ) + if stats_dict["max"] > 0 + else 0, + # NOTE: Maximum throughput corresponds to minimum latency. 
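+                    # For example (hypothetical numbers): a minimum per-item latency of
+                    # 2.0 ms with 8 parallel processes gives a maximum throughput of
+                    # 1000 * 8 / 2.0 = 4000 frames_per_second.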
+ "max": round( + 1000 * throughput_multiplier / stats_dict["min"], 2 + ) + if stats_dict["min"] > 0 + else 0, + "mean": round( + 1000 * throughput_multiplier / stats_dict["mean"], 2 + ) + if stats_dict["mean"] > 0 + else 0, + "median": round( + 1000 * throughput_multiplier / stats_dict["median"], 2 + ) + if stats_dict["median"] > 0 + else 0, + "percentile_95": round( + 1000 * throughput_multiplier / stats_dict["percentile_95"], + 2, + ) + if stats_dict["percentile_95"] > 0 + else 0, + } + + # Assign in-place. + input_dict[key] = stats_dict def unflatten_process_benchmark_dict(benchmark_dict, warmup_batches): @@ -556,7 +600,7 @@ def unflatten_process_benchmark_dict(benchmark_dict, warmup_batches): # } # } # - # 4. Finally, it computes what we call as the mean values of the timings from all + # 4. Finally, it computes various stats (e.g mean, median) of the timings from all # the batches. In other words, it computes how much range X would take on an # average when it is averaged across all the batches. To do this, we again use # the information present inside benchmark.json and apply basic recursion math. @@ -638,6 +682,29 @@ def unflatten_process_benchmark_dict(benchmark_dict, warmup_batches): current_dict[parts[-1]]["gpu_time"] / batch_size, 4 ) + # Pass the total_items information to all the children of this batch + # i.e. keys which were present in inside_batch_info + # unless they already had it before (i.e. very weird case where someone + # inserted a batch in a batch with different inner batch size). + def _recurse_update_children_total_items(in_dict): + for k in list(in_dict.keys()): + if isinstance(in_dict[k], dict): + _recurse_update_children_total_items(in_dict[k]) + else: + # Add total items if not already present. + if "total_items" not in in_dict: + in_dict["total_items"] = batch_size + in_dict["cpu_time_per_item"] = round( + in_dict["cpu_time"] / batch_size, 4 + ) + in_dict["gpu_time_per_item"] = round( + in_dict["gpu_time"] / batch_size, 4 + ) + + # Apply it. + if batch_size > 0: + _recurse_update_children_total_items(current_dict[parts[-1]]) + # Maintain global counts of various batch level stats # for example, counting the total items seen at this # batch level. @@ -673,7 +740,7 @@ def unflatten_process_benchmark_dict(benchmark_dict, warmup_batches): elif path in benchmark_dict["inside_batch_info"]: # We could be inside a batch. Nothing to do here. We won't have the total_items yet - # at this level + # at this level. We will update it when we come on a parent level. pass else: # We are one or more levels outside/above the batch level. @@ -725,6 +792,14 @@ def unflatten_process_benchmark_dict(benchmark_dict, warmup_batches): for batch_level_prefix in batch_dicts: batch_dict = batch_dicts[batch_level_prefix] + batch_dict["total_items_warmup"] = ( + batch_dict["total_items"] - total_items_minus_warmup[batch_level_prefix] + ) + + batch_dict["total_items_minus_warmup"] = total_items_minus_warmup[ + batch_level_prefix + ] + batch_dict["cpu_time_minus_warmup"] = round( (batch_dict["cpu_time"] - total_warmup_cpu_time[batch_level_prefix]), 4 ) @@ -747,15 +822,11 @@ def unflatten_process_benchmark_dict(benchmark_dict, warmup_batches): 4, ) - batch_dict["total_items_minus_warmup"] = total_items_minus_warmup[ - batch_level_prefix - ] - # The processing is over. So we assign the expanded version of data into the # original benchmark dictionary. benchmark_dict["data"] = unfltten_data_dict - # Finally, process the batches to find out the mean batch timings. 
+ # Finally, process the batches to calculate various stats on the batch timings. # i.e. how much did range X took on an average across all the batches. # Again, we will not use any batches that are warm-up batches in this calculation. # For this to happen, we need to rely on the batch_info keys. Those are the @@ -772,8 +843,7 @@ def unflatten_process_benchmark_dict(benchmark_dict, warmup_batches): # during all the batches. Our division logic takes care of properly # dividing with the current count anyway. # - mean_data = {} - total_batches_used = 0 + data_stats = {} for batch_range_name in benchmark_dict["batch_info"]: batch_idx, batch_size = benchmark_dict["batch_info"][batch_range_name] # Next, we find out the batch level prefix. This is the key in which @@ -801,28 +871,26 @@ def unflatten_process_benchmark_dict(benchmark_dict, warmup_batches): and (batch_idx + 1 + warmup_batches) <= benchmark_dict["meta"]["total_batches"][batch_level_prefix] ): - # Keep on updating the mean_data dictionary. This will - # create a dictionary that is union of all the dicts of the batch level. + # Keep on updating the data_stats dictionary. This will + # create a dictionary that is union of all the dictionaries of the batch level. nested_keys = batch_range_name.split("/") - target_dict = benchmark_dict["data"] + source_dict = benchmark_dict["data"] for k in nested_keys: - target_dict = target_dict[k] + source_dict = source_dict[k] - # Need to recursively update the mean_data based on - # the target_dict. We will sum the values up. - if batch_level_prefix not in mean_data: - mean_data[batch_level_prefix] = {} + # Need to recursively update the data_stats based on + # the source_dict. We will sum the values up. + if batch_level_prefix not in data_stats: + data_stats[batch_level_prefix] = {} - recurse_sum_dict(target_dict, mean_data[batch_level_prefix]) - total_batches_used += 1 + recurse_gather_dict(source_dict, target_dict=data_stats[batch_level_prefix]) - # Once all the sums are calculated, we need to divide by the length to figure + # Once all the numbers are gathered, we need to divide by the length to figure # out the mean values. - recurse_divide_dict(mean_data) - benchmark_dict["mean_data"] = mean_data + recurse_calc_stats_dict(data_stats) + benchmark_dict["data_stats_minus_warmup"] = data_stats # Remove the batch_info and inside_batch_info keys as they are no longer needed. - del benchmark_dict["batch_info"] del benchmark_dict["inside_batch_info"] @@ -862,7 +930,7 @@ def benchmark_script( # Setup the command that will launch nsys and ask it to benchmark the script # that we were interested in. - nsys_root_path = "/opt/nvidia/nsight-systems/2023.2.1/" + nsys_root_path = "/opt/nvidia/nsight-systems/2024.2.1/" nsys_binary_path = os.path.join(nsys_root_path, "bin/nsys") nsys_reports_path = os.path.join(nsys_root_path, "target-linux-x64/reports") nsys_gpu_proj_trace_report_path = os.path.join( @@ -874,7 +942,7 @@ def benchmark_script( if not os.path.isfile(nsys_binary_path): raise ValueError( - "Unable to locate nsys binary at %s. Make sure you have nsight-systems 2023.2.1 installed." + "Unable to locate nsys binary at %s. Make sure you have nsight-systems 2024.2.1 installed." % nsys_binary_path ) @@ -953,9 +1021,6 @@ def benchmark_script( all_range_info = merge_cpu_and_gpu_ranges(cpu_range_info, gpu_range_info) # Calculate averages across processes/threads mean_ranges_info = calc_mean_ranges(all_range_info) - # and save it. 
- with open(os.path.join(output_dir, "perf_report_nvtx_mean.json"), "w") as f: - f.write(json.dumps(mean_ranges_info, indent=4)) # Final step is to pull the data from the all_range_info we generated above and fill # it in the benchmark_dict. @@ -974,14 +1039,98 @@ def benchmark_script( # Write the updated benchmark dictionary. with open(benchmark_json_path, "w") as f: - f.write(json.dumps(benchmark_dict, indent=4)) + f.write(json.dumps(benchmark_dict, indent=4, cls=NumpyValuesEncoder)) + + # Delete the temporary files. + os.remove(os.path.join(output_dir, "perf_report_nvtx_pushpop_trace.json")) + os.remove(os.path.join(output_dir, "perf_report_nvtx_gpu_proj_trace.json")) + os.remove(os.path.join(output_dir, "perf_report.sqlite")) return 0, output_dir +def monitor_gpu_metrics(list_of_device_ids, terminate_event, gpu_metrics_info): + """ + Monitors various GPU metrics directly from NVIDIA-smi. These metrics are not accessible + from NSYS as of now. The monitoring stays on until an event is received. + :param list_of_device_ids: A list of string values of the GPU-ids that are being used in this + benchmark run. + :param terminate_event: An event that can mark an end of the monitoring process. + :param gpu_metrics_info: A multiprocessing share dictionary to store the results of monitoring. + """ + # Initialize the gpu_metrics_info dictionary for the first time. We will save the + # following pieces of information per GPU. + # 1) The total GPU power drawn in Watts + # 2) The GPU utilization in %. + + # We will work in a local dictionary first. Only when we are done that we would + # transfer its contents to the mp managed dictionary. Because otherwise the mp + # managed dictionary has no way of knowing when a nested key-value changes and it + # won't update/save it. + gpu_metrics_info_local = {"power.draw.watts": {}, "utilization.gpu": {}} + for device_id in list_of_device_ids: + gpu_metrics_info_local["power.draw.watts"]["GPU: %s" % device_id] = [] + gpu_metrics_info_local["utilization.gpu"]["GPU: %s" % device_id] = [] + + # Begin the monitoring loop. Continue till we are asked to stopped by the event. + while not terminate_event.is_set(): + # Use nvidia-smi to get power draw and GPU utilization numbers for all GPUs. + proc_ret = subprocess.run( + [ + "nvidia-smi", + "-i=%s" % ",".join(list_of_device_ids), + "--query-gpu=power.draw,utilization.gpu", + "--format=csv,nounits,noheader", + ], + stdout=subprocess.PIPE, + ) + if proc_ret.returncode == 0: + outputs = proc_ret.stdout.decode().strip().split("\n") + for idx, device_id in enumerate(list_of_device_ids): + power_draw, gpu_util = outputs[idx].split(",") + power_draw = float(power_draw) + gpu_util = float(gpu_util) + + gpu_metrics_info_local["power.draw.watts"][ + "GPU: %s" % device_id + ].append(power_draw) + gpu_metrics_info_local["utilization.gpu"]["GPU: %s" % device_id].append( + gpu_util + ) + else: + for device_id in list_of_device_ids: + gpu_metrics_info_local["power.draw.watts"][ + "GPU: %s" % device_id + ].append(0.0) + gpu_metrics_info_local["utilization.gpu"]["GPU: %s" % device_id].append( + 0.0 + ) + + # Sleep a bit + time.sleep(0.5) # 500 milliseconds + + # Update the mp managed dictionary + gpu_metrics_info.update(gpu_metrics_info_local) + + +def plot_gpu_metrics(gpu_metrics_info, output_dir): + """ + Plots GPU metrics as a matplotlib plot. + """ + for metric_name in gpu_metrics_info: + # Create pandas data frame. 
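+        # Each key under this metric names one GPU (e.g. "GPU: 0"), so the frame has
+        # one column per GPU and one row per sample (monitor_gpu_metrics records a
+        # sample roughly every 0.5 seconds).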
+ df = pd.DataFrame(gpu_metrics_info[metric_name]) + ax = df.plot(title=metric_name) + ax.set_xlabel("Execution time") + ax.set_ylabel(metric_name) + fig = ax.get_figure() + fig.savefig(os.path.join(output_dir, "plot.%s.jpg" % metric_name)) + plt.close(fig) + + def main(): parser = argparse.ArgumentParser( - "Performance benchmarking script for CV-CUDA samples.", + "Performance benchmarking script for CV-CUDA.", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) @@ -1072,17 +1221,38 @@ def main(): ) # Maximize the clocks. - if args.maximize_clocks: - ( - did_maximize_clocks, - was_persistence_mode_on, - current_power_limit, - ) = maximize_clocks(logger) - - # Start multiple processes, per num_processes per num_gpus. + clocks_info = [] + all_device_ids = [] + for gpu_idx in range(args.num_gpus): + device_id = args.gpu_offset_id + gpu_idx + + all_device_ids.append(str(device_id)) + if args.maximize_clocks: + ( + did_maximize_clocks, + was_persistence_mode_on, + current_power_limit, + ) = maximize_clocks(logger, device_id) + clocks_info.append((was_persistence_mode_on, current_power_limit)) + + # We will start multiple processes, per num_processes per num_gpus in Pool to run the benchmarks. pool = mp.Pool() + # Create an event to signal other processes (e.g. monitor_gpu_metrics to stop when main pool has stopped) + pool_terminate_event = mp.Event() + # Create a shared dict to retrieve the results of monitor_gpu_metrics + mp_manager = mp.Manager() + gpu_metrics_info = mp_manager.dict() + # Finally allocate a list to store the Pool's process's results. results = [] + # Begin by starting one process to keep on monitoring various GPU metrics that are not available via NSYS. + gpu_metric_monitor_proc = mp.Process( + target=monitor_gpu_metrics, + args=(all_device_ids, pool_terminate_event, gpu_metrics_info), + ) + gpu_metric_monitor_proc.start() + + # Then we start the multiprocessing Pool. for gpu_idx in range(args.num_gpus): for process_idx in range(args.num_processes): # Since each the output of each process needs to be stored in a different directory, @@ -1124,58 +1294,162 @@ def main(): pool.close() pool.join() + # Set the terminate event so other processes know that the Pool has finished. + pool_terminate_event.set() + # Wait for the gpu_metric_monitor process to finish. + gpu_metric_monitor_proc.join() + # Reset the clocks. if args.maximize_clocks: - reset_clocks( - logger, - was_persistence_mode_on, - current_power_limit, - ) + for gpu_idx in range(args.num_gpus): + device_id = args.gpu_offset_id + gpu_idx + + was_persistence_mode_on, current_power_limit = clocks_info[gpu_idx] + + reset_clocks( + logger, + device_id, + was_persistence_mode_on, + current_power_limit, + ) else: logger.warning("Clocks were not maximized during this run.") - # Now we need to calculate the average of all the perf-numbers. - # This means average across all the processes that we had launched. - # This can only be done if all processes finished without error. - # So we will check that first and if that is the case, we will - # read their benchmark.json data in a list to later calculate - # the average. - all_benchmark_data_dicts = [] - all_benchmark_mean_batch_dicts = [] + # We must create a copy of gpu_metrics_info to detach it from multiprocessing. + gpu_metrics_info = gpu_metrics_info.copy() + + # Plot the GPU metrics. 
+ plot_gpu_metrics(gpu_metrics_info, args.output_dir) + + # Now we need to : + # 1) Write the gpu_metrics_info in to the benchmark.json files stored per + # process and + # 2) Calculate various stats at the all processes level. + # e.g. If we ran 1 or more processes, there will be a benchmark_mean.json + # created in the output root folder with mean and other stats computed from + # all benchmark.json files of all the processes. + # This can only be done if all processes finished without error. + # So we will check that first and if that is the case, we will + # read their benchmark.json data in a list to later calculate various stats. + all_data_dicts = [] for r in results: # Grab the return result from the pool. proc_ret_code, proc_output_dir = r.get() if proc_ret_code: # Any non-zero return code mean the process failed. raise Exception( - "Failed to execute process: %d on gpu: %d" % (process_idx, gpu_idx) + "Process: %d on gpu: %d exited with a non-zero return code: %d" + % (process_idx, gpu_idx, proc_ret_code) ) else: # Zero return code means success. Read the benchmark.json. with open(os.path.join(proc_output_dir, "benchmark.json"), "r") as f: benchmark_dict = json.loads(f.read()) - # Append to our list of data dict and mean_batch data dict - all_benchmark_data_dicts.append(benchmark_dict["data"]) - all_benchmark_mean_batch_dicts.append(benchmark_dict["mean_data"]) + # Update this benchmark dict with GPU metrics for this GPU id. + for metric_name in gpu_metrics_info: + device_id_of_this_proc = benchmark_dict["meta"]["device"]["id"] + benchmark_dict["gpu_metrics"][metric_name] = gpu_metrics_info[ + metric_name + ]["GPU: %d" % device_id_of_this_proc] + + with open(os.path.join(proc_output_dir, "benchmark.json"), "w") as f: + f.write(json.dumps(benchmark_dict, indent=4, cls=NumpyValuesEncoder)) + + # Append to our list of data dict. + all_data_dicts.append(benchmark_dict["data"]) + + # 1) Compute mean of the data field from all processes... + data_mean_all_procs = {} + # First recursively collect all values from all the data dictionaries of all processes. + for data_dict in all_data_dicts: + recurse_gather_dict(data_dict, data_mean_all_procs) + # And then compute just the mean over this. + recurse_calc_stats_dict(data_mean_all_procs, compute_mean_only=True) + + # 2) Compute various stats of the data_stats_minus_warmup field from all processes... + # Now compute all the stats (such as mean, median etc) for all processes from all numbers. + # NOTE: We have already computed these stats per process in the benchmark.json's + # data_stats_minus_warmup field. This time, we want to do it over all the processes. Instead + # of taking mean of those numbers, we will calculate the freshly, combining all data points. + # This results in much accurate statistics. + # We will use last process's benchmark_dict to use query some important fields such as + # batch_info and total_batches etc. This assumes that all processes ran the same code. + data_stats_all_procs = {} + for batch_range_name in benchmark_dict["batch_info"]: + batch_idx, batch_size = benchmark_dict["batch_info"][batch_range_name] + # Next, we find out the batch level prefix. This is the key in which + # the batches are nested. One profiling session can have multiple levels + # at which batches may be used. + # e.g. + # program_X: + # method_A: + # batch_1 + # batch_2 + # method_B: + # batch_1 + # batch_2 + # + # We need to find mean at these two levels (i.e. method_A and method_B) + # in this case. 
+ # programA/method_A and program_A/method_B are the batch level prefix here. + # We can easily get those by using the dirname method since those are like + # the directory names in a path. + batch_level_prefix = os.path.dirname(batch_range_name) + + if ( + batch_size > 0 + and batch_idx + 1 > args.warmup_batches + and (batch_idx + 1 + args.warmup_batches) + <= benchmark_dict["meta"]["total_batches"][batch_level_prefix] + ): + # Keep on updating the data_stats dictionary. This will create a dictionary + # that is union of all the dictionaries at the batch level for all processes. + nested_keys = batch_range_name.split("/") + + for data_dict in all_data_dicts: + source_dict = data_dict - mean_all_batch_data = {} - mean_data = {} + # Go deep down the nested key path from the root. + for k in nested_keys: + source_dict = source_dict[k] - # First recursively sum up all values from all dictionaries. - for data_dict in all_benchmark_data_dicts: - recurse_sum_dict(data_dict, mean_all_batch_data) - # And then divide by the length to get the mean values. - recurse_divide_dict(mean_all_batch_data) + # Need to recursively update the data_stats_all_procs based on + # the source_dict. We will sum the values up. + if batch_level_prefix not in data_stats_all_procs: + data_stats_all_procs[batch_level_prefix] = {} - for mean_batch_dict in all_benchmark_mean_batch_dicts: - recurse_sum_dict(mean_batch_dict, mean_data) - # And then divide by the length to get the mean values. - recurse_divide_dict(mean_data) + recurse_gather_dict( + source_dict, target_dict=data_stats_all_procs[batch_level_prefix] + ) + + # Once all the data points are gathered, we need to divide by the length to figure + # out the mean values. + recurse_calc_stats_dict( + data_stats_all_procs, + compute_throughput=True, + throughput_multiplier=args.num_gpus * args.num_processes, + ) + + # 3). Compute stats of of all GPU metrics for all GPUs involved. + gpu_metrics_all_procs = {} + for metric_name in gpu_metrics_info: + gpu_metrics_all_procs[metric_name] = [] + for device_id in gpu_metrics_info[metric_name]: + # Gather all + gpu_metrics_all_procs[metric_name].extend( + gpu_metrics_info[metric_name][device_id] + ) + + # Compute stats. + recurse_calc_stats_dict( + gpu_metrics_all_procs, + ) mean_benchmark_data = { - "mean_all_batches": mean_all_batch_data, - "mean_data": mean_data, + "data_mean_all_procs": data_mean_all_procs, + "data_stats_minus_warmup_all_procs": data_stats_all_procs, + "gpu_metrics_all_procs": gpu_metrics_all_procs, "meta": {"args": {}}, } for arg in vars(args): @@ -1184,7 +1458,7 @@ def main(): # Write it in a file. mean_benchmark_json_path = os.path.join(args.output_dir, "benchmark_mean.json") with open(mean_benchmark_json_path, "w") as f: - f.write(json.dumps(mean_benchmark_data, indent=4)) + f.write(json.dumps(mean_benchmark_data, indent=4, cls=NumpyValuesEncoder)) logger.info( "Benchmarking completed successfully. 
Results saved at: %s" % mean_benchmark_json_path diff --git a/samples/scripts/benchmark_samples.sh b/samples/scripts/benchmark_samples.sh index 7b97c3f78..669db058e 100755 --- a/samples/scripts/benchmark_samples.sh +++ b/samples/scripts/benchmark_samples.sh @@ -31,7 +31,7 @@ if [ "$#" -lt 1 ]; then exit 1 fi -SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) SAMPLES_ROOT="$(dirname "$SCRIPT_DIR")" # removes the scripts dir OUTPUT_DIR="$1" USE_TRT=${2:-True} diff --git a/samples/scripts/build_samples.sh b/samples/scripts/build_samples.sh index 59b4b4986..b54d07ff3 100755 --- a/samples/scripts/build_samples.sh +++ b/samples/scripts/build_samples.sh @@ -18,10 +18,11 @@ # Builds samples # Usage: build_samples.sh [build folder] +SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" build_dir="build" mkdir -p $build_dir cd $build_dir -cmake .. && make +cmake "$SCRIPT_DIR/.." && make diff --git a/samples/scripts/install_dependencies.sh b/samples/scripts/install_dependencies.sh index cd2e6fb72..236f5753c 100755 --- a/samples/scripts/install_dependencies.sh +++ b/samples/scripts/install_dependencies.sh @@ -1,6 +1,6 @@ #!/bin/bash -e -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -99,7 +99,7 @@ chmod a+x /usr/local/bin/tao-converter # Install NVIDIA NSIGHT 2023.2.1 cd /tmp -wget https://developer.download.nvidia.com/devtools/nsight-systems/nsight-systems-2023.2.1_2023.2.1.122-1_amd64.deb +wget https://developer.download.nvidia.com/devtools/nsight-systems/nsight-systems-2024.2.1_2024.2.1.106-1_amd64.deb apt-get update && apt-get install -y \ libsm6 \ libxrender1 \ @@ -107,7 +107,7 @@ apt-get update && apt-get install -y \ libxext6 \ libx11-dev \ libxkbfile-dev \ - /tmp/nsight-systems-2023.2.1_2023.2.1.122-1_amd64.deb \ + /tmp/nsight-systems-2024.2.1_2024.2.1.106-1_amd64.deb \ && rm -rf /var/lib/apt/lists/* echo "export PATH=$PATH:/opt/tensorrt/bin" >> ~/.bashrc diff --git a/samples/scripts/run_samples.sh b/samples/scripts/run_samples.sh index 7a8fc3025..40fae3cb9 100755 --- a/samples/scripts/run_samples.sh +++ b/samples/scripts/run_samples.sh @@ -23,7 +23,7 @@ set -e export CUDA_MODULE_LOADING="LAZY" -SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" +SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" SAMPLES_DIR="$(dirname "$SCRIPT_DIR")" CLASSIFICATION_OUT_DIR=/tmp/classification SEGMENTATION_OUT_DIR="/tmp/segmentation" @@ -79,9 +79,9 @@ python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/videos # Run the classification C++ sample. Since the Python sample was already run, we can reuse the TensorRT model # and the labels file generated by it. 
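# Note: the engine and labels paths below now point into "$CLASSIFICATION_OUT_DIR/1/",
# matching where the Python classification run above writes its artifacts.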
# Batch size 1 -LD_LIBRARY_PATH=$SAMPLES_DIR/lib $SAMPLES_DIR/build/classification/cvcuda_sample_classification -e /tmp/classification/model.1.224.224.trtmodel -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -l /tmp/classification/labels.txt -b 1 +LD_LIBRARY_PATH=$SAMPLES_DIR/lib $SAMPLES_DIR/build/classification/cvcuda_sample_classification -e "$CLASSIFICATION_OUT_DIR/1/model.4.224.224.trtmodel" -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -l "$CLASSIFICATION_OUT_DIR/1/labels.txt" -b 1 # Batch size 2 -LD_LIBRARY_PATH=$SAMPLES_DIR/lib $SAMPLES_DIR/build/classification/cvcuda_sample_classification -e /tmp/classification/model.2.224.224.trtmodel -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -l /tmp/classification/labels.txt -b 2 +LD_LIBRARY_PATH=$SAMPLES_DIR/lib $SAMPLES_DIR/build/classification/cvcuda_sample_classification -e "$CLASSIFICATION_OUT_DIR/1/model.4.224.224.trtmodel" -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -l "$CLASSIFICATION_OUT_DIR/1/labels.txt" -b 2 # Run the segmentation Python sample with default settings, without any command-line args. rm -rf "$SEGMENTATION_OUT_DIR" diff --git a/src/cvcuda/CMakeLists.txt b/src/cvcuda/CMakeLists.txt index 202caf756..b2b3b1245 100644 --- a/src/cvcuda/CMakeLists.txt +++ b/src/cvcuda/CMakeLists.txt @@ -69,6 +69,7 @@ set(CV_CUDA_OP_FILES OpPairwiseMatcher.cpp OpFindHomography.cpp OpStack.cpp + OpResizeCropConvertReformat.cpp ) # filter only one that matches the patern (case insensitive), should be set on the global level diff --git a/src/cvcuda/OpResizeCropConvertReformat.cpp b/src/cvcuda/OpResizeCropConvertReformat.cpp new file mode 100644 index 000000000..2db154d6f --- /dev/null +++ b/src/cvcuda/OpResizeCropConvertReformat.cpp @@ -0,0 +1,71 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "priv/OpResizeCropConvertReformat.hpp" + +#include "priv/SymbolVersioning.hpp" + +#include +#include +#include +#include + +namespace priv = cvcuda::priv; + +CVCUDA_DEFINE_API(0, 8, NVCVStatus, cvcudaResizeCropConvertReformatCreate, (NVCVOperatorHandle * handle)) +{ + return nvcv::ProtectCall( + [&] + { + if (handle == nullptr) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Pointer to NVCVOperator handle must not be NULL"); + } + + *handle = reinterpret_cast(new priv::ResizeCropConvertReformat()); + }); +} + +CVCUDA_DEFINE_API(0, 8, NVCVStatus, cvcudaResizeCropConvertReformatSubmit, + (NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle in, NVCVTensorHandle out, + const NVCVSize2D resizeDim, const NVCVInterpolationType interpolation, const int2 cropPos, + const NVCVChannelManip manip)) +{ + return nvcv::ProtectCall( + [&] + { + nvcv::TensorWrapHandle input(in), output(out); + priv::ToDynamicRef(handle)(stream, input, output, resizeDim, interpolation, + cropPos, manip); + }); +} + +CVCUDA_DEFINE_API(0, 8, NVCVStatus, cvcudaResizeCropConvertReformatVarShapeSubmit, + (NVCVOperatorHandle handle, cudaStream_t stream, NVCVImageBatchHandle in, NVCVTensorHandle out, + const NVCVSize2D resizeDim, const NVCVInterpolationType interpolation, const int2 cropPos, + const NVCVChannelManip manip)) +{ + return nvcv::ProtectCall( + [&] + { + nvcv::ImageBatchVarShapeWrapHandle input(in); + nvcv::TensorWrapHandle output(out); + priv::ToDynamicRef(handle)(stream, input, output, resizeDim, interpolation, + cropPos, manip); + }); +} diff --git a/src/cvcuda/include/cvcuda/OpCropFlipNormalizeReformat.h b/src/cvcuda/include/cvcuda/OpCropFlipNormalizeReformat.h index 7366c0c6d..80d2ee002 100644 --- a/src/cvcuda/include/cvcuda/OpCropFlipNormalizeReformat.h +++ b/src/cvcuda/include/cvcuda/OpCropFlipNormalizeReformat.h @@ -105,7 +105,7 @@ CVCUDA_PUBLIC NVCVStatus cvcudaCropFlipNormalizeReformatCreate(NVCVOperatorHandl * 64bit Float | No * * Output: - * Data Layout: [kNHWC, kHWC, kNCHW, KCHW] + * Data Layout: [kNHWC, kNCHW] * Channels: [1, 3, 4] * * Data Type | Allowed @@ -145,10 +145,9 @@ CVCUDA_PUBLIC NVCVStatus cvcudaCropFlipNormalizeReformatCreate(NVCVOperatorHandl * * @param [in] borderValue Border value to be used for constant border mode \p NVCV_BORDER_CONSTANT. * - * @param [in] flipCode a tensor flag to specify how to flip the array; 0 means flipping - * around the x-axis and positive value (for example, 1) means flipping - * around y-axis. Negative value (for example, -1) means flipping around - * both axes. + * @param [in] flipCode a tensor flag to specify how to flip the array; 0 means flipping around the x-axis, + * 1 means flipping around the y-axis, -1 means flipping around both axes, and any other value will result + * in no flip. * * @param [in] base Tensor providing base values for normalization. * diff --git a/src/cvcuda/include/cvcuda/OpResizeCropConvertReformat.h b/src/cvcuda/include/cvcuda/OpResizeCropConvertReformat.h new file mode 100644 index 000000000..041ff8003 --- /dev/null +++ b/src/cvcuda/include/cvcuda/OpResizeCropConvertReformat.h @@ -0,0 +1,176 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file OpResizeCropConvertReformat.h + * + * @brief Defines functions that fuses resize, crop, data type conversion, channel manipulation, and layout reformat operations to optimize pipelines. + * @defgroup NVCV_C_ALGORITHM__RESIZE_CROP Resize Crop + * @{ + */ + +#ifndef CVCUDA__RESIZE_CROP_H +#define CVCUDA__RESIZE_CROP_H + +#include "Operator.h" +#include "Types.h" +#include "detail/Export.h" + +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** Constructs and an instance of the ResizeCropConvertReformat operator. + * + * @param [out] handle Where the image instance handle will be written to. + * + Must not be NULL. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Handle is null. + * @retval #NVCV_ERROR_OUT_OF_MEMORY Not enough memory to create the operator. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +CVCUDA_PUBLIC NVCVStatus cvcudaResizeCropConvertReformatCreate(NVCVOperatorHandle *handle); + +/** Executes the ResizeCropConvertReformat operation on the given cuda stream. This operation + * does not wait for completion. + * + * ResizeCropConvertReformat performs the following operations in order: + * 1) Resize either a single tensor or each image in an ImageBatchVarShape + * to a specified width and height (other dimensions are unchanged). + * 2) Crops a specified region of size width x height (determined by the + * output tensor's width & height) starting at the pixel position + * (cropPos.x, cropPos.y) out of the resized tensor. + * 3) Convert the element data type to the output tensor's data type. For + * example, convert uchar elements to float. Limited options availble. + * 4) Optional channel manipulation--i.e., re-order the channels + * of a tensor (e.g., RGB to BGR). Limited options available. + * 5) If output tensor's layout doesn't match the input's layout, reshape + * the layout to match output layout (e.g., NHWC to NCHW). Limited + * options available. + * NOTE: Since all images in an ImageBatchVarShape are resized to the + * same size, the resulting collection now fits in a single tensor. 
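+ *
+ * For example (illustrative): resizing a batch of 8 RGB HWC uchar images to
+ * 256 x 256, cropping a 224 x 224 region at position (16, 16), converting to
+ * float, and reformatting to NCHW produces a single 8 x 3 x 224 x 224 float
+ * output tensor.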
+ *
+ *  Limitations:
+ *
+ *  Input:
+ *       Data Layout:    [NVCV_TENSOR_HWC, NVCV_TENSOR_NHWC]
+ *       Channels:       [1, 3]
+ *
+ *       Data Type      | Allowed
+ *       -------------- | -------------
+ *       8bit  Unsigned | Yes
+ *       8bit  Signed   | No
+ *       16bit Unsigned | No
+ *       16bit Signed   | No
+ *       32bit Unsigned | No
+ *       32bit Signed   | No
+ *       32bit Float    | No
+ *       64bit Float    | No
+ *
+ *  Output:
+ *       Data Layout:    [NVCV_TENSOR_NHWC, NVCV_TENSOR_HWC,
+ *                        NVCV_TENSOR_NCHW, NVCV_TENSOR_CHW]
+ *       Channels:       [1, 3]
+ *
+ *       Data Type      | Allowed
+ *       -------------- | -------------
+ *       8bit  Unsigned | Yes
+ *       8bit  Signed   | No
+ *       16bit Unsigned | No
+ *       16bit Signed   | No
+ *       32bit Unsigned | No
+ *       32bit Signed   | No
+ *       32bit Float    | Yes
+ *       64bit Float    | No
+ *
+ *  Input/Output dependency
+ *
+ *       Property       | Input == Output
+ *       -------------- | -------------
+ *       Data Layout    | No (Limited)
+ *       Data Type      | No (Limited)
+ *       Number         | Yes
+ *       Channels       | Yes
+ *       Width          | No
+ *       Height         | No
+ *
+ * @param [in] handle Handle to the operator.
+ *                    + Must not be NULL.
+ * @param [in] stream Handle to a valid CUDA stream.
+ *
+ * @param [in] in Input tensor or image batch. The images in an image batch can be of different
+ *                sizes, but all images must have the same data type, channels, and layout.
+ *
+ * @param [in] resizeDim Dimensions, {width, height}, to which the input is resized before
+ *                       cropping; see \ref NVCVSize2D for more details.
+ *
+ * @param [in] interpolation Interpolation method to be used, see \ref NVCVInterpolationType for
+ *                           more details. Currently, only NVCV_INTERP_NEAREST and NVCV_INTERP_LINEAR
+ *                           are available.
+ *
+ * @param [in] cropPos Crop position, (x, y), specifying the top-left corner of the crop region.
+ *                     The crop region's width and height are specified by the output tensor's
+ *                     width & height.
+ *                     @note: The crop must fall within the resized image. If (x, y, w, h)
+ *                     represents the crop rectangle, where x & y are the cropPos coordinates
+ *                     and w & h are the output tensor's width and height, then the following
+ *                     must all be true:
+ *                        x >= 0
+ *                        y >= 0
+ *                        x + w <= resizeDim.w
+ *                        y + h <= resizeDim.h
+ *
+ * @param [in] manip Channel manipulation to be used (e.g., reshuffle RGB to BGR),
+ *                   see \ref NVCVChannelManip for more details.
+ *
+ * @param [out] out Output tensor. In addition to determining the crop width and height, the
+ *                  output tensor also specifies the data type (e.g., uchar3 or float) and
+ *                  tensor layout (NHWC or NCHW), with limitations.
+ *
+ * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range.
+ * @retval #NVCV_ERROR_INTERNAL         Internal error in the operator, invalid types passed in.
+ * @retval #NVCV_SUCCESS                Operation executed successfully.
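+ *
+ *  Example (a minimal sketch; stream and tensor creation, as well as error checking, are
+ *  assumed to be handled elsewhere, and the handle names shown are hypothetical):
+ *  @code
+ *      NVCVOperatorHandle op;
+ *      cvcudaResizeCropConvertReformatCreate(&op);
+ *
+ *      // Resize to 256x256, then crop a region whose size is taken from 'dstTensor'
+ *      // (e.g., 224x224) starting at (16, 16), reversing the channel order.
+ *      NVCVSize2D resizeDim = {256, 256};
+ *      int2       cropPos   = {16, 16};
+ *      cvcudaResizeCropConvertReformatSubmit(op, stream, srcTensor, dstTensor, resizeDim,
+ *                                            NVCV_INTERP_LINEAR, cropPos, NVCV_CHANNEL_REVERSE);
+ *  @endcode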
+ */
+/** @{ */
+CVCUDA_PUBLIC NVCVStatus cvcudaResizeCropConvertReformatSubmit(NVCVOperatorHandle handle, cudaStream_t stream,
+                                                               NVCVTensorHandle in, NVCVTensorHandle out,
+                                                               const NVCVSize2D resizeDim,
+                                                               const NVCVInterpolationType interpolation,
+                                                               const int2 cropPos, const NVCVChannelManip manip);
+
+CVCUDA_PUBLIC NVCVStatus cvcudaResizeCropConvertReformatVarShapeSubmit(NVCVOperatorHandle handle, cudaStream_t stream,
+                                                                       NVCVImageBatchHandle in, NVCVTensorHandle out,
+                                                                       const NVCVSize2D resizeDim,
+                                                                       const NVCVInterpolationType interpolation,
+                                                                       const int2 cropPos,
+                                                                       const NVCVChannelManip manip);
+/** @} */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CVCUDA__RESIZE_CROP_H */
diff --git a/src/cvcuda/include/cvcuda/OpResizeCropConvertReformat.hpp b/src/cvcuda/include/cvcuda/OpResizeCropConvertReformat.hpp
new file mode 100644
index 000000000..1e7fb143f
--- /dev/null
+++ b/src/cvcuda/include/cvcuda/OpResizeCropConvertReformat.hpp
@@ -0,0 +1,97 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file OpResizeCropConvertReformat.hpp
+ *
+ * @brief Defines the public C++ class that fuses resize, crop, data type conversion, channel manipulation, and layout reformat operations to optimize pipelines.
+ * @defgroup NVCV_CPP_ALGORITHM__RESIZE_CROP ResizeCropConvertReformat + * @{ + */ + +#ifndef CVCUDA__RESIZE_CROP_HPP +#define CVCUDA__RESIZE_CROP_HPP + +#include "IOperator.hpp" +#include "OpResizeCropConvertReformat.h" + +#include +#include +#include +#include +#include + +namespace cvcuda { + +class ResizeCropConvertReformat final : public IOperator +{ +public: + explicit ResizeCropConvertReformat(); + + ~ResizeCropConvertReformat(); + + void operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, const NVCVSize2D resizeDim, + const NVCVInterpolationType interpolation, const int2 cropPos, + const NVCVChannelManip manip = NVCV_CHANNEL_NO_OP); + + void operator()(cudaStream_t stream, const nvcv::ImageBatchVarShape &in, const nvcv::Tensor &out, + const NVCVSize2D resizeDim, const NVCVInterpolationType interpolation, const int2 cropPos, + const NVCVChannelManip manip = NVCV_CHANNEL_NO_OP); + + virtual NVCVOperatorHandle handle() const noexcept override; + +private: + NVCVOperatorHandle m_handle; +}; + +inline ResizeCropConvertReformat::ResizeCropConvertReformat() +{ + nvcv::detail::CheckThrow(cvcudaResizeCropConvertReformatCreate(&m_handle)); + assert(m_handle); +} + +inline ResizeCropConvertReformat::~ResizeCropConvertReformat() +{ + nvcvOperatorDestroy(m_handle); + m_handle = nullptr; +} + +inline void ResizeCropConvertReformat::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, + const NVCVSize2D resizeDim, const NVCVInterpolationType interpolation, + const int2 cropPos, const NVCVChannelManip manip) +{ + nvcv::detail::CheckThrow(cvcudaResizeCropConvertReformatSubmit(m_handle, stream, in.handle(), out.handle(), + resizeDim, interpolation, cropPos, manip)); +} + +inline void ResizeCropConvertReformat::operator()(cudaStream_t stream, const nvcv::ImageBatchVarShape &in, + const nvcv::Tensor &out, const NVCVSize2D resizeDim, + const NVCVInterpolationType interpolation, const int2 cropPos, + const NVCVChannelManip manip) +{ + nvcv::detail::CheckThrow(cvcudaResizeCropConvertReformatVarShapeSubmit(m_handle, stream, in.handle(), out.handle(), + resizeDim, interpolation, cropPos, manip)); +} + +inline NVCVOperatorHandle ResizeCropConvertReformat::handle() const noexcept +{ + return m_handle; +} + +} // namespace cvcuda + +#endif // CVCUDA__RESIZE_CROP_HPP diff --git a/src/cvcuda/include/cvcuda/Types.h b/src/cvcuda/include/cvcuda/Types.h index 8dc5131f9..d222b38eb 100644 --- a/src/cvcuda/include/cvcuda/Types.h +++ b/src/cvcuda/include/cvcuda/Types.h @@ -58,6 +58,16 @@ typedef enum NVCV_CLOSE = 3, } NVCVMorphologyType; +// clang-format off +// @brief Select how channel data is manipulated--both interleaved (NHWC) and planar (NCHW) +typedef enum +{ + NVCV_CHANNEL_NO_OP = 0, //!< do not manipulate channel data + NVCV_CHANNEL_REVERSE = 1, //!< reverse channel order (e.g., RGB to BGR, BGR to RGB, RGBA to ABGR, etc.) 
+} NVCVChannelManip; + +// clang-format on + // @brief Flag to choose the color conversion to be used typedef enum { diff --git a/src/cvcuda/priv/CMakeLists.txt b/src/cvcuda/priv/CMakeLists.txt index fa0e8c390..85683e409 100644 --- a/src/cvcuda/priv/CMakeLists.txt +++ b/src/cvcuda/priv/CMakeLists.txt @@ -68,6 +68,7 @@ set(CV_CUDA_PRIV_OP_FILES OpPairwiseMatcher.cu OpStack.cpp OpFindHomography.cu + OpResizeCropConvertReformat.cu ) # filter only one that matches the patern (case insensitive), should be set on the global level diff --git a/src/cvcuda/priv/OpResizeCropConvertReformat.cu b/src/cvcuda/priv/OpResizeCropConvertReformat.cu new file mode 100644 index 000000000..624624284 --- /dev/null +++ b/src/cvcuda/priv/OpResizeCropConvertReformat.cu @@ -0,0 +1,584 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "OpResizeCropConvertReformat.hpp" +#include "legacy/CvCudaLegacy.h" +#include "legacy/CvCudaLegacyHelpers.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuda = nvcv::cuda; +namespace util = nvcv::util; + +namespace cuda_op = nvcv::legacy::cuda_op; +namespace helpers = nvcv::legacy::helpers; + +namespace { + +//******************** NN = Nearest Neighbor (TensorWrap src) + +template +__global__ void resizeCrop_NN(SrcWrapper src, DstT *dst, const int src_w, const int src_h, const int dst_w, + const int dst_h, const float scale_x, const float scale_y, const int crop_x, + const int crop_y, const size_t incrN, const size_t incrH, const size_t incrW, + const size_t incrC, const uchar4 mapC) +{ + const int dst_x = blockIdx.x * blockDim.x + threadIdx.x; + const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; + + if ((dst_x < dst_w) && (dst_y < dst_h)) + { // Generic copy pixel to pixel. + const int sample = blockIdx.z; + + dst += sample * incrN + dst_y * incrH + dst_x * incrW; + + const int sx = cuda::min(cuda::round((dst_x + crop_x) * scale_x), src_w - 1); + const int sy = cuda::min(cuda::round((dst_y + crop_y) * scale_y), src_h - 1); + + SrcT v = *src.ptr(sample, sy, sx); + + // Channel manipulation, convert type, and reformat. 
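+        // The three writes below handle both interleaved and planar outputs:
+        //  - interleaved (NHWC/HWC): incrW == channels and incrC == 1, so the channels land
+        //    next to each other within the destination pixel;
+        //  - planar (NCHW/CHW): incrW == 1 and incrC == plane stride (in elements), so each
+        //    channel lands in its own plane.
+        // mapC permutes the channel order (e.g., RGB -> BGR for NVCV_CHANNEL_REVERSE) and the
+        // source value is cast to the destination type.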
+ dst[mapC.x * incrC] = (DstT)v.x; + dst[mapC.y * incrC] = (DstT)v.y; + dst[mapC.z * incrC] = (DstT)v.z; + } +} // resizeCrop_NN + +//******************** Bilinear (TensorWrap src) + +template +__global__ void resizeCrop_bilinear(SrcWrapper src, DstT *dst, const int src_w, const int src_h, const int dst_w, + const int dst_h, const float scale_x, const float scale_y, const int crop_x, + const int crop_y, const size_t incrN, const size_t incrH, const size_t incrW, + const size_t incrC, const uchar4 mapC) +{ + const int dst_x = blockIdx.x * blockDim.x + threadIdx.x; + const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; + + if (dst_x < dst_w && dst_y < dst_h) + { + const int sample = blockIdx.z; + + // Float space for weighted addition. + // Compute y coordinate. + float fy = (float)((dst_y + crop_y + 0.5f) * scale_y - 0.5f); + int sy = cuda::round(fy); + fy -= sy; + sy = cuda::max(0, cuda::min(sy, src_h - 2)); + + // Row pointers. + const SrcT *aPtr = src.ptr(sample, sy, 0); // Start of upper row. + const SrcT *bPtr = src.ptr(sample, sy + 1, 0); // Start of lower row. + + dst += sample * incrN + dst_y * incrH + dst_x * incrW; + + { // Compute source data position and weight for [x0] components. + float fx = (float)((dst_x + crop_x + 0.5f) * scale_x - 0.5f); + int sx = cuda::round(fx); + fx -= sx; + + fx *= ((sx >= 0) && (sx < src_w - 1)); + sx = cuda::max(0, cuda::min(sx, src_w - 2)); + + SrcT v = cuda::SaturateCast((1.0f - fx) * (aPtr[sx] * (1.0f - fy) + bPtr[sx] * fy) + + fx * (aPtr[sx + 1] * (1.0f - fy) + bPtr[sx + 1] * fy)); + // Channel manipulation, convert type, and reformat. + dst[mapC.x * incrC] = (DstT)v.x; + dst[mapC.y * incrC] = (DstT)v.y; + dst[mapC.z * incrC] = (DstT)v.z; + } + } +} // resizeCrop_bilinear + +//******************** NN = Nearest Neighbor (ImageBatchVarShape src) + +template +__global__ void resizeCrop_NN_varShape(SrcWrapper src, DstT *dst, const int dst_w, const int dst_h, + const float resize_w, const float resize_h, const int crop_x, const int crop_y, + const size_t incrN, const size_t incrH, const size_t incrW, const size_t incrC, + const uchar4 mapC) +{ + const int dst_x = blockIdx.x * blockDim.x + threadIdx.x; + const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; + + if (dst_x < dst_w && dst_y < dst_h) + { // Generic copy pixel to pixel. + const int sample = blockIdx.z; + const int src_w = src.width(sample); + const int src_h = src.height(sample); + + const float scale_x = static_cast(src_w) / resize_w; + const float scale_y = static_cast(src_h) / resize_h; + + dst += sample * incrN + dst_y * incrH + dst_x * incrW; + + const int sx = cuda::min(cuda::round((dst_x + crop_x) * scale_x), src_w - 1); + const int sy = cuda::min(cuda::round((dst_y + crop_y) * scale_y), src_h - 1); + + SrcT v = *src.ptr(sample, sy, sx); + + // Channel manipulation, convert type, and reformat. 
+ dst[mapC.x * incrC] = (DstT)v.x; + dst[mapC.y * incrC] = (DstT)v.y; + dst[mapC.z * incrC] = (DstT)v.z; + } +} // resizeCrop_NN_varShape + +//******************** Bilinear (ImageBatchVarShape src) + +template +__global__ void resizeCrop_bilinear_varShape(SrcWrapper src, DstT *dst, const int dst_w, const int dst_h, + const float resize_w, const float resize_h, const int crop_x, + const int crop_y, const size_t incrN, const size_t incrH, + const size_t incrW, const size_t incrC, const uchar4 mapC) +{ + const int dst_x = blockIdx.x * blockDim.x + threadIdx.x; + const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; + + if ((dst_x < dst_w) && (dst_y < dst_h)) + { + const int sample = blockIdx.z; + const int src_w = src.width(sample); + const int src_h = src.height(sample); + + // Float space for weighted addition. + float scale_x = static_cast(src_w) / resize_w; + float scale_y = static_cast(src_h) / resize_h; + + // Compute y coordinate. + float fy = (float)((dst_y + crop_y + 0.5f) * scale_y - 0.5f); + int sy = cuda::round(fy); + fy -= sy; + sy = cuda::max(0, cuda::min(sy, src_h - 2)); + + // Row pointers. + const SrcT *aPtr = src.ptr(sample, sy, 0); // Start of upper row. + const SrcT *bPtr = src.ptr(sample, sy + 1, 0); // Start of lower row. + + dst += sample * incrN + dst_y * incrH + dst_x * incrW; + + { // Cimpute source data position and weight for [x0] components. + float fx = (float)((dst_x + crop_x + 0.5f) * scale_x - 0.5f); + int sx = cuda::round(fx); + fx -= sx; + + fx *= ((sx >= 0) && (sx < src_w - 1)); + sx = cuda::max(0, cuda::min(sx, src_w - 2)); + + SrcT v = cuda::SaturateCast((1.0f - fx) * (aPtr[sx] * (1.0f - fy) + bPtr[sx] * fy) + + fx * (aPtr[sx + 1] * (1.0f - fy) + bPtr[sx + 1] * fy)); + // Channel manipulation, convert type, and reformat. + dst[mapC.x * incrC] = (DstT)v.x; + dst[mapC.y * incrC] = (DstT)v.y; + dst[mapC.z * incrC] = (DstT)v.z; + } + } +} // resizeCrop_bilinear_varShape + +#define MAP(m, i, v) ((uint8_t *)&(m))[i] = (v) + +inline uchar4 remapChannels(const NVCVChannelManip manip, int channels) +{ + uchar4 map = make_uchar4(0, 1, 2, 3); + + if (manip == NVCV_CHANNEL_REVERSE) + { + for (int c = 0; c < channels; ++c) MAP(map, c, channels - c - 1); + } + return map; +} + +#undef MAP + +template +void resizeCropConvertReformat(const nvcv::TensorDataStridedCuda &srcData, const nvcv::TensorDataStridedCuda &dstData, + const NVCVSize2D resizeDim, NVCVInterpolationType interpolation, const int2 cropPos, + const NVCVChannelManip manip, cudaStream_t stream) + +{ + using SrcBaseT = cuda::BaseType; + using DstBaseT = cuda::BaseType; + + auto srcAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(srcData); + NVCV_ASSERT(srcAccess); + + auto dstAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(dstData); + NVCV_ASSERT(dstAccess); + + const int samples = srcAccess->numSamples(); + const int channels = srcAccess->numChannels(); + const int src_w = srcAccess->numCols(); + const int src_h = srcAccess->numRows(); + const int dst_w = dstAccess->numCols(); + const int dst_h = dstAccess->numRows(); + + NVCV_ASSERT(samples == dstAccess->numSamples()); + NVCV_ASSERT(channels == dstAccess->numChannels()); + + float scale_x = (float)src_w / resizeDim.w; + float scale_y = (float)src_h / resizeDim.h; + + const int planes = dstAccess->numPlanes(); + + const uchar4 remap = remapChannels(manip, channels); + + const size_t incrC = (planes > 1 ? dstAccess->planeStride() / sizeof(DstBaseT) : 1); + const size_t incrW = channels / planes; // 1 if planar; channels if not. 
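+    // Destination increments are in elements (not bytes). For a hypothetical packed 224x224
+    // 3-channel output: NCHW float gives incrC = 224*224, incrW = 1, incrH = 224 and
+    // incrN = 3*224*224; NHWC uchar gives incrC = 1, incrW = 3, incrH = 3*224 and
+    // incrN = 3*224*224.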
+ const size_t incrH = dstAccess->rowStride() / sizeof(DstBaseT); + const size_t incrN = dstAccess->rowStride() * dst_h * dstAccess->numPlanes() / sizeof(DstBaseT); + + const int THREADS_PER_BLOCK = 256; //256? 64? + const int BLOCK_WIDTH = 16; //as in 32x4 or 32x8. 16x8 and 16x16 are also viable + + const dim3 blockSize(BLOCK_WIDTH, THREADS_PER_BLOCK / BLOCK_WIDTH, 1); + const dim3 gridSize(util::DivUp(dst_w, blockSize.x), util::DivUp(dst_h, blockSize.y), samples); + + auto src = cuda::CreateTensorWrapNHW(srcData); + + DstBaseT *dst = reinterpret_cast(dstData.basePtr()); + + //Note: resize is fundamentally a gather memory operation, with a little bit of compute + // our goals are to (a) maximize throughput, and (b) minimize occupancy for the same performance + + switch (interpolation) + { + case NVCV_INTERP_NEAREST: + resizeCrop_NN<<>>(src, dst, src_w, src_h, dst_w, dst_h, scale_x, scale_y, + cropPos.x, cropPos.y, incrN, incrH, incrW, incrC, remap); + break; + + case NVCV_INTERP_LINEAR: + resizeCrop_bilinear<<>>(src, dst, src_w, src_h, dst_w, dst_h, scale_x, scale_y, + cropPos.x, cropPos.y, incrN, incrH, incrW, incrC, + remap); + break; + + case NVCV_INTERP_CUBIC: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Interpolation not implemented: NVCV_INTERP_CUBIC"); + break; + + case NVCV_INTERP_AREA: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Interpolation not implemented: NVCV_INTERP_AREA"); + break; + + default: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid interpolation"); + break; + } //switch +} //resize + +template +void resizeCropConvertReformat(const nvcv::ImageBatchVarShapeDataStridedCuda &srcData, + const nvcv::TensorDataStridedCuda &dstData, const NVCVSize2D resizeDim, + const NVCVInterpolationType interpolation, const int2 cropPos, + const NVCVChannelManip manip, cudaStream_t stream) +{ + using SrcBaseT = cuda::BaseType; + using DstBaseT = cuda::BaseType; + + auto dstAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(dstData); + NVCV_ASSERT(dstAccess); + + const nvcv::ImageFormat srcFrmt = srcData.uniqueFormat(); + NVCV_ASSERT(srcFrmt); + + const int samples = srcData.numImages(); + const int channels = srcFrmt.numChannels(); + const int dst_w = dstAccess->numCols(); + const int dst_h = dstAccess->numRows(); + + NVCV_ASSERT(samples == dstAccess->numSamples()); + NVCV_ASSERT(channels == dstAccess->numChannels()); + + const int planes = dstAccess->numPlanes(); + const uchar4 remap = remapChannels(manip, channels); + + const size_t incrC = (planes > 1 ? dstAccess->planeStride() / sizeof(DstBaseT) : 1); + const size_t incrW = channels / planes; // 1 if planar; channels if not. + const size_t incrH = dstAccess->rowStride() / sizeof(DstBaseT); + const size_t incrN = dstAccess->rowStride() * dst_h * dstAccess->numPlanes() / sizeof(DstBaseT); + + const int THREADS_PER_BLOCK = 256; //Performance degrades above 256 and below 16 (GMEM speed limited) + const int BLOCK_WIDTH = 8; //as in 32x4 or 32x8 or 8x32. 
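+    // Each 2D thread block covers a BLOCK_WIDTH x (THREADS_PER_BLOCK / BLOCK_WIDTH) tile of the
+    // output and the grid's z dimension indexes the sample. For a hypothetical 224x224 output
+    // with 8x32 blocks, the grid below works out to 28 x 7 x numImages.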
+ + const dim3 blockSize(BLOCK_WIDTH, THREADS_PER_BLOCK / BLOCK_WIDTH, 1); + const dim3 gridSize(util::DivUp(dst_w, blockSize.x), util::DivUp(dst_h, blockSize.y), samples); + + cuda::ImageBatchVarShapeWrap src(srcData); + + DstBaseT *dst = reinterpret_cast(dstData.basePtr()); + + switch (interpolation) + { + case NVCV_INTERP_NEAREST: + resizeCrop_NN_varShape<<>>( + src, dst, dst_w, dst_h, resizeDim.w, resizeDim.h, cropPos.x, cropPos.y, incrN, incrH, incrW, incrC, remap); + break; + + case NVCV_INTERP_LINEAR: + resizeCrop_bilinear_varShape<<>>( + src, dst, dst_w, dst_h, resizeDim.w, resizeDim.h, cropPos.x, cropPos.y, incrN, incrH, incrW, incrC, remap); + break; + + case NVCV_INTERP_CUBIC: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Interpolation not implemented: NVCV_INTERP_CUBIC"); + break; + + case NVCV_INTERP_AREA: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Interpolation not implemented: NVCV_INTERP_AREA"); + break; + + default: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid interpolation"); + break; + + } //switch interpolation +} + +} // anonymous namespace + +// clang-format off +namespace cvcuda::priv { +ResizeCropConvertReformat::ResizeCropConvertReformat() { } + +// clang-format on + +void ResizeCropConvertReformat::operator()(cudaStream_t stream, const nvcv::Tensor &src, const nvcv::Tensor &dst, + const NVCVSize2D resizeDim, const NVCVInterpolationType interpolation, + const int2 cropPos, const NVCVChannelManip manip) const +{ + auto srcData = src.exportData(); + if (!srcData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input must be a cuda-accessible, pitch-linear tensor"); + } + + auto dstData = dst.exportData(); + if (!dstData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Output must be a cuda-accessible, pitch-linear tensor"); + } + + auto srcAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*srcData); + auto dstAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*dstData); + + const int samples = srcAccess->numSamples(); + const int channels = srcAccess->numChannels(); + + const int dst_w = dstAccess->numCols(); + const int dst_h = dstAccess->numRows(); + + if (samples != dstAccess->numSamples()) + { + std::string msg = "Input and output must have the same batch size (i.e., same number of images): Provided " + + std::to_string(samples) + " input and " + std::to_string(dstAccess->numSamples()) + + " output images / samples"; + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "%s", msg.c_str()); + } + + if (channels != dstAccess->numChannels()) + { + std::string msg = "Input and output must have same number of channels: Provided " + std::to_string(channels) + + " input and " + std::to_string(dstAccess->numChannels()) + " output channels"; + throw nvcv::Exception(nvcv::Status::ERROR_NOT_COMPATIBLE, "%s", msg.c_str()); + } + + if (channels != 3) + { + std::string msg = "Only three-channel input is currently supported: Provided " + std::to_string(channels) + + " input channels"; + throw nvcv::Exception(nvcv::Status::ERROR_NOT_COMPATIBLE, "%s", msg.c_str()); + } + + cuda_op::DataType srcType = helpers::GetLegacyDataType((*srcData).dtype()); + cuda_op::DataType dstType = helpers::GetLegacyDataType((*dstData).dtype()); + + if (srcType != cuda_op::kCV_8U) + { + throw nvcv::Exception(nvcv::Status::ERROR_NOT_COMPATIBLE, "Input must be of data type uchar."); + } + + if (dstType != cuda_op::kCV_8U && dstType != cuda_op::kCV_32F) + { + throw 
nvcv::Exception(nvcv::Status::ERROR_NOT_COMPATIBLE, "Output must be of data type uchar or float."); + } + + nvcv::TensorLayout srcLayout = srcData->layout(); + nvcv::TensorLayout dstLayout = dstData->layout(); + + if (!(srcLayout == NVCV_TENSOR_NHWC || srcLayout == NVCV_TENSOR_HWC)) + { + const char *layout = nvcvTensorLayoutGetName(&srcLayout.m_layout); + std::string msg = "Input tensor must have 'NHWC' or 'HWC' layout: Layout provided " + std::string(layout); + throw nvcv::Exception(nvcv::Status::ERROR_NOT_COMPATIBLE, "%s", msg.c_str()); + } + + if (!(dstLayout == NVCV_TENSOR_NHWC || dstLayout == NVCV_TENSOR_HWC || dstLayout == NVCV_TENSOR_NCHW + || dstLayout == NVCV_TENSOR_CHW)) + { + const char *layout = nvcvTensorLayoutGetName(&dstLayout.m_layout); + std::string msg + = "Output tensor must have 'NHWC', 'NCHW', 'HWC', or 'CHW' layout: Layout provided " + std::string(layout); + throw nvcv::Exception(nvcv::Status::ERROR_NOT_COMPATIBLE, "%s", msg.c_str()); + } + + if (cropPos.x < 0 || cropPos.y < 0 || cropPos.x + dst_w > resizeDim.w || cropPos.y + dst_h > resizeDim.h) + { + std::string msg = "Invalid crop region: crop region(x, y, w, h) = (" + std::to_string(cropPos.x) + ", " + + std::to_string(cropPos.y) + ", " + std::to_string(dst_w) + ", " + std::to_string(dst_h) + + ") extends beyond bounds of resized tensor"; + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "%s", msg.c_str()); + } + + if (srcType == cuda_op::kCV_8U) + { + if (dstType == cuda_op::kCV_8U) + { + resizeCropConvertReformat(*srcData, *dstData, resizeDim, interpolation, cropPos, manip, + stream); + } + else if (dstType == cuda_op::kCV_32F) + { + resizeCropConvertReformat(*srcData, *dstData, resizeDim, interpolation, cropPos, manip, + stream); + } + } +} + +void ResizeCropConvertReformat::operator()(cudaStream_t stream, const nvcv::ImageBatchVarShape &src, + const nvcv::Tensor &dst, const NVCVSize2D resizeDim, + const NVCVInterpolationType interpolation, const int2 cropPos, + const NVCVChannelManip manip) const +{ + auto srcData = src.exportData(stream); + if (!srcData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input data must be a cuda-accessible, varshape image batch"); + } + + auto dstData = dst.exportData(); + if (!dstData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Output must be a cuda-accessible, pitch-linear tensor"); + } + + const nvcv::ImageFormat srcFrmt = src.uniqueFormat(); + + if (!srcFrmt) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "All input images in a batch must have the same format (including number of channels)"); + } + + auto dstAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*dstData); + + const int samples = srcData->numImages(); + const int channels = srcFrmt.numChannels(); + + const int dst_w = dstAccess->numCols(); + const int dst_h = dstAccess->numRows(); + + if (samples != dstAccess->numSamples()) + { + std::string msg = "Input and output must have the same batch size (i.e., same number of images): Provided " + + std::to_string(samples) + " input and " + std::to_string(dstAccess->numChannels()) + + " output channels"; + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "%s", msg.c_str()); + } + + if (channels != dstAccess->numChannels()) + { + std::string msg = "Input and output must have same number of channels: Provided " + std::to_string(channels) + + " input and " + std::to_string(dstAccess->numChannels()) + " output channels"; + throw 
nvcv::Exception(nvcv::Status::ERROR_NOT_COMPATIBLE, "%s", msg.c_str()); + } + + if (channels != 3) + { + std::string msg = "Only three-channel input is currently supported: Provided " + std::to_string(channels) + + " input channels"; + throw nvcv::Exception(nvcv::Status::ERROR_NOT_COMPATIBLE, "%s", msg.c_str()); + } + + cuda_op::DataType srcType = helpers::GetLegacyDataType(srcFrmt); + cuda_op::DataType dstType = helpers::GetLegacyDataType((*dstData).dtype()); + + if (srcType != cuda_op::kCV_8U) + { + throw nvcv::Exception(nvcv::Status::ERROR_NOT_COMPATIBLE, "Input must be of data type uchar."); + } + + if (dstType != cuda_op::kCV_8U && dstType != cuda_op::kCV_32F) + { + throw nvcv::Exception(nvcv::Status::ERROR_NOT_COMPATIBLE, "Output must be of data type uchar or float."); + } + + nvcv::TensorLayout dstLayout = dstData->layout(); + + if (srcFrmt.numPlanes() > 1) + { + throw nvcv::Exception(nvcv::Status::ERROR_NOT_COMPATIBLE, "Input must be non-planar (i.e., interleaved)."); + } + + if (!(dstLayout == NVCV_TENSOR_NHWC || dstLayout == NVCV_TENSOR_HWC || dstLayout == NVCV_TENSOR_NCHW + || dstLayout == NVCV_TENSOR_CHW)) + { + const char *layout = nvcvTensorLayoutGetName(&dstLayout.m_layout); + std::string msg + = "Output tensor must have 'NHWC', 'NCHW', 'HWC', or 'CHW' layout: Layout provided " + std::string(layout); + throw nvcv::Exception(nvcv::Status::ERROR_NOT_COMPATIBLE, "%s", msg.c_str()); + } + + if (cropPos.x < 0 || cropPos.y < 0 || cropPos.x + dst_w > resizeDim.w || cropPos.y + dst_h > resizeDim.h) + { + std::string msg = "Invalid crop region: crop region(x, y, w, h) = (" + std::to_string(cropPos.x) + ", " + + std::to_string(cropPos.y) + ", " + std::to_string(dst_w) + ", " + std::to_string(dst_h) + + ") extends beyond bounds of resized tensor"; + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "%s", msg.c_str()); + } + + if (srcType == cuda_op::kCV_8U) + { + if (dstType == cuda_op::kCV_8U) + { + resizeCropConvertReformat(*srcData, *dstData, resizeDim, interpolation, cropPos, manip, + stream); + } + else if (dstType == cuda_op::kCV_32F) + { + resizeCropConvertReformat(*srcData, *dstData, resizeDim, interpolation, cropPos, manip, + stream); + } + } +} +} // namespace cvcuda::priv diff --git a/src/cvcuda/priv/OpResizeCropConvertReformat.hpp b/src/cvcuda/priv/OpResizeCropConvertReformat.hpp new file mode 100644 index 000000000..eea8d7df5 --- /dev/null +++ b/src/cvcuda/priv/OpResizeCropConvertReformat.hpp @@ -0,0 +1,54 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file OpResizeCropConvertReformat.hpp + * + * @brief Defines the private C++ class for that fuses resize, crop, data type conversion, channel manipulation, and layout reformat operations to optimize pipelines. 
+ */ + +#ifndef CVCUDA_PRIV__RESIZE_CROP_HPP +#define CVCUDA_PRIV__RESIZE_CROP_HPP + +#include "IOperator.hpp" +#include "legacy/CvCudaLegacy.h" + +#include // for NVCVInterpolationType, NVCVChannelManip, etc. +#include +#include + +#include + +namespace cvcuda::priv { + +class ResizeCropConvertReformat final : public IOperator +{ +public: + explicit ResizeCropConvertReformat(); + + void operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, const NVCVSize2D resizeDim, + const NVCVInterpolationType interpolation, const int2 cropPos, + const NVCVChannelManip manip = NVCV_CHANNEL_NO_OP) const; + + void operator()(cudaStream_t stream, const nvcv::ImageBatchVarShape &in, const nvcv::Tensor &out, + const NVCVSize2D resizeDim, const NVCVInterpolationType interpolation, const int2 cropPos, + const NVCVChannelManip manip = NVCV_CHANNEL_NO_OP) const; +}; + +} // namespace cvcuda::priv + +#endif // CVCUDA_PRIV__RESIZE_CROP_HPP diff --git a/src/cvcuda/priv/legacy/custom_crop.cu b/src/cvcuda/priv/legacy/custom_crop.cu index eea4ebfe7..3695007a3 100644 --- a/src/cvcuda/priv/legacy/custom_crop.cu +++ b/src/cvcuda/priv/legacy/custom_crop.cu @@ -123,7 +123,7 @@ ErrorCode CustomCrop::infer(const TensorDataStridedCuda &inData, const TensorDat if (start_x < 0 || start_y < 0 || end_x >= cols || end_y >= rows) { - LOG_ERROR("Invliad Roi range x " << roi.x << " y " << roi.y << " width " << roi.width << " height " + LOG_ERROR("Invalid Roi range x " << roi.x << " y " << roi.y << " width " << roi.width << " height " << roi.height); return ErrorCode::INVALID_PARAMETER; } diff --git a/src/nvcv_types/include/nvcv/cuda/TensorWrap.hpp b/src/nvcv_types/include/nvcv/cuda/TensorWrap.hpp index b925ce60a..e4afd3ca0 100644 --- a/src/nvcv_types/include/nvcv/cuda/TensorWrap.hpp +++ b/src/nvcv_types/include/nvcv/cuda/TensorWrap.hpp @@ -175,7 +175,7 @@ class TensorWrap * * @return The const array (as a pointer) containing run-time pitches in bytes. */ - __host__ __device__ const int *strides() const + const __host__ __device__ int *strides() const { return m_strides; } @@ -483,6 +483,34 @@ __host__ auto CreateTensorWrapNHWC(const TensorDataStridedCuda &tensor) static_cast(tensorAccess->rowStride()), static_cast(tensorAccess->colStride())); } +/** + * Factory function to create an NCHW tensor wrap given a tensor data. + * + * The output \ref TensorWrap is an NCHW 4D tensor allowing to access data per batch (N), per channel (C), per row (H), and per column + * (W) of the input tensor. The input tensor data must have either NCHW or CHW layout, where + * the channel C is of type \p T, e.g. T=uchar for each channel of either RGB8 or RGBA8. + * + * @sa NVCV_CPP_CUDATOOLS_TENSORWRAP + * + * @tparam T Type of the values to be accessed in the tensor wrap. + * + * @param[in] tensor Reference to the tensor that will be wrapped. + * + * @return Tensor wrap useful to access tensor data in CUDA kernels. 
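+ *
+ * Example (a minimal sketch; the tensor data object and the in-kernel access shown are
+ * illustrative assumptions):
+ * @code
+ * // Wrap an NCHW float tensor so kernels can index it per sample, channel and row.
+ * auto dstWrap = nvcv::cuda::CreateTensorWrapNCHW<float>(tensorData);
+ * // Inside a kernel, row y of channel c in sample n could then be addressed as:
+ * //     float *row = dstWrap.ptr(n, c, y); // row[x] selects a column
+ * @endcode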
+ */ +template>> +__host__ auto CreateTensorWrapNCHW(const TensorDataStridedCuda &tensor) +{ + auto tensorAccess = TensorDataAccessStridedImagePlanar::Create(tensor); + assert(tensorAccess); + assert(tensorAccess->sampleStride() <= TypeTraits::max); + assert(tensorAccess->chStride() <= TypeTraits::max); + assert(tensorAccess->rowStride() <= TypeTraits::max); + + return Tensor4DWrap(tensor.basePtr(), static_cast(tensorAccess->sampleStride()), + static_cast(tensorAccess->chStride()), static_cast(tensorAccess->rowStride())); +} + } // namespace nvcv::cuda #endif // NVCV_CUDA_TENSOR_WRAP_HPP diff --git a/tests/cvcuda/python/test_opresizecropconvertreformat.py b/tests/cvcuda/python/test_opresizecropconvertreformat.py new file mode 100644 index 000000000..0f9f2a796 --- /dev/null +++ b/tests/cvcuda/python/test_opresizecropconvertreformat.py @@ -0,0 +1,422 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest as t +import nvcv +import cvcuda +import torch + +# NOTE: The following tests for resize_crop_convert_reformat DO NOT TEST: +# 1. The correctness of the output data +# 2. Whether the format conversion actually worked correctly w.r.t. the data +# 3. Whether the channel swapping actually worked correctly w.r.t. 
the data + + +@t.mark.parametrize( + "tensor_params, resize_dim, resize_interpolation, crop_rect_params, " + "out_layout, out_dtype, manip, out_expected_shape, is_positive_test", + [ + ( + ((4, 512, 512, 3), np.uint8, "NHWC"), # Basic test + (256, 256), + cvcuda.Interp.LINEAR, + (0, 0, 224, 224), + "NCHW", + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (4, 3, 224, 224), + True, + ), + ( + ((4, 512, 512, 3), np.uint8, "NHWC"), + (256, 256), + cvcuda.Interp.NEAREST, # With NEAREST Interpolation + (0, 0, 224, 224), + "NCHW", + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (4, 3, 224, 224), + True, + ), + ( + ((4, 512, 512, 3), np.uint8, "NHWC"), + (256, 256), + cvcuda.Interp.NEAREST, + (0, 0, 224, 224), + "", # Empty output layout means keep the same as input + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (4, 224, 224, 3), + True, + ), + ( + ((4, 512, 512, 3), np.uint8, "NHWC"), + (256, 256), + cvcuda.Interp.NEAREST, + (0, 0, 224, 224), + "", # Empty output layout means keep the same layout as input + 0, # Zero means keep the same dtype as input + cvcuda.ChannelManip.REVERSE, + (4, 224, 224, 3), + True, + ), + ( + ((17, 678, 1027, 3), np.uint8, "NHWC"), # Odd sizes + (251, 256), # Odd sizes + cvcuda.Interp.LINEAR, + (0, 0, 200, 22), + "NCHW", + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (17, 3, 22, 200), + True, + ), + ( + ((17, 678, 1027, 3), np.uint8, "NHWC"), + (251, 256), + cvcuda.Interp.LINEAR, + (0, 0, 200, 22), + "NHWC", # Same output layout as the input tensor + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (17, 22, 200, 3), + True, + ), + ( + ((3, 40, 20, 3), np.uint8, "NHWC"), + (160, 160), + cvcuda.Interp.NEAREST, + (10, 20, 20, 35), + "NCHW", + nvcv.Type.U8, # Same dtype as the input tensor + cvcuda.ChannelManip.NO_OP, # No op here + (3, 3, 35, 20), + True, + ), + ( + ((512, 512, 3), np.uint8, "HWC"), # Single image case. + (256, 256), + cvcuda.Interp.LINEAR, + (0, 0, 224, 224), + "CHW", + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (3, 224, 224), + True, + ), + ( + ((3, 512, 512), np.uint8, "CHW"), # Unsupported input CHW + (256, 256), + cvcuda.Interp.LINEAR, + (0, 0, 224, 224), + "CHW", + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (3, 224, 224), + False, # Negative test + ), + ( + ((512, 1024, 3), np.uint8, "HWC"), # Large sizes + (1024, 256), # Unchanged resize width + cvcuda.Interp.LINEAR, + (0, 0, 1024, 224), # Unchanged crop width + "CHW", + nvcv.Type.F32, + cvcuda.ChannelManip.NO_OP, + (3, 224, 1024), + True, + ), + ( + ((512, 1024, 3), np.float32, "HWC"), # Unsupported input dtype + (1024, 256), + cvcuda.Interp.LINEAR, + (0, 0, 1024, 224), + "CHW", + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (3, 224, 1024), + False, # Negative test + ), + ( + ((1, 2, 3), np.uint8, "HWC"), # Very small sizes + (60, 5), + cvcuda.Interp.LINEAR, + (0, 0, 59, 5), + "CHW", + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (3, 5, 59), + True, + ), + ( + ((1, 2, 3), np.uint8, "HWC"), + (60, 5), + cvcuda.Interp.LINEAR, + (0, 0, 61, 5), # Out of range crop. 
+ "CHW", + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (3, 5, 59), + False, # Negative test + ), + ( + ((4, 512, 512, 3), np.uint8, "NHWC"), + (256, 256), + cvcuda.Interp.AREA, # With Area Interpolation + (0, 0, 224, 224), + "NCHW", + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (4, 3, 224, 224), + False, # Negative test + ), + ], +) +def test_op_resize_crop_convert_reformat( + tensor_params, + resize_dim, + resize_interpolation, + crop_rect_params, + out_layout, + out_dtype, + manip, + out_expected_shape, + is_positive_test, +): + + inputTensor = cvcuda.Tensor(*tensor_params) + out_layout = out_layout if out_layout else str(inputTensor.layout) + out_dtype = out_dtype if out_dtype else inputTensor.dtype + + try: + out1 = cvcuda.resize_crop_convert_reformat( + inputTensor, + resize_dim, + resize_interpolation, + cvcuda.RectI(*crop_rect_params), + layout=out_layout, + data_type=out_dtype, + manip=manip, + ) + except Exception as e: + if is_positive_test: + raise e + else: + # This is pass for a negative test. + pass + + if is_positive_test: + assert out1.layout == out_layout + assert out1.shape == out_expected_shape + assert out1.dtype == out_dtype + + out2 = cvcuda.Tensor(out_expected_shape, out_dtype, out_layout) + + try: + tmp = cvcuda.resize_crop_convert_reformat_into( + out2, + inputTensor, + resize_dim, + resize_interpolation, + [crop_rect_params[1], crop_rect_params[0]], + manip=manip, + ) + except Exception as e: + if is_positive_test: + raise e + else: + # This is pass for a negative test. + pass + + if is_positive_test: + assert tmp is out2 + assert out2.layout == out_layout + assert out2.shape == out_expected_shape + assert out2.dtype == out_dtype + + # Compare the two + if is_positive_test: + out1 = torch.as_tensor(out1.cuda()) + out2 = torch.as_tensor(out2.cuda()) + assert torch.equal(out1, out2) + + +@t.mark.parametrize( + "num_images, min_size, max_size, resize_dim, resize_interpolation, crop_rect_params, " + "out_layout, out_dtype, manip, out_expected_shape, is_positive_test", + [ + ( + 10, # Basic test + (50, 50), + (512, 512), + (256, 256), + cvcuda.Interp.LINEAR, + (0, 0, 224, 224), + "NCHW", + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (10, 3, 224, 224), + True, + ), + ( + 1, # Only one image + (500, 300), + (800, 700), # Bigger sizes + (400, 200), + cvcuda.Interp.LINEAR, + (0, 0, 224, 190), + "NHWC", # Same output layout as the input + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (1, 190, 224, 3), + True, + ), + ( + 50, # More images + (1, 1), # Very small image + (50, 70), + (400, 200), + cvcuda.Interp.LINEAR, + (0, 0, 224, 190), + "NHWC", + nvcv.Type.F32, + cvcuda.ChannelManip.NO_OP, # No channels swapping + (50, 190, 224, 3), + True, + ), + ( + 50, + (1, 1), + (50, 70), + (400, 200), + cvcuda.Interp.LINEAR, + (0, 0, 224, 190), + "NCHW", + nvcv.Type.U8, # Same uint8 dtype as the input + cvcuda.ChannelManip.REVERSE, + (50, 3, 190, 224), + True, + ), + ( + 50, + (1, 1), + (50, 70), + (400, 200), + cvcuda.Interp.LINEAR, + (0, 0, 224, 190), + "NCHW", + nvcv.Type.U8, + cvcuda.ChannelManip.NO_OP, # NO_OP + (50, 3, 190, 224), + True, + ), + ( + 50, + (1, 1), + (50, 70), + (400, 200), + cvcuda.Interp.LINEAR, + (0, 0, 224, 190), + "", # Same uint8 dtype as the input + 0, # Same uint8 dtype as the input + cvcuda.ChannelManip.REVERSE, + (50, 190, 224, 3), + True, + ), + ], +) +def test_op_resize_crop_convert_reformat_varshape( + num_images, + min_size, + max_size, + resize_dim, + resize_interpolation, + crop_rect_params, + out_layout, + out_dtype, + manip, + 
out_expected_shape, + is_positive_test, +): + + inputVarShape = cvcuda.ImageBatchVarShape(num_images) + out_layout = out_layout if out_layout else "NHWC" + out_dtype = out_dtype if out_dtype else nvcv.Type.U8 + + inputVarShape.pushback( + [ + cvcuda.Image( + ( + min_size[0] + (max_size[0] - min_size[0]) * i // num_images, + min_size[1] + (max_size[1] - min_size[1]) * i // num_images, + ), + cvcuda.Format.RGB8, + ) + for i in range(num_images) + ] + ) + + try: + out1 = cvcuda.resize_crop_convert_reformat( + inputVarShape, + resize_dim=resize_dim, + interp=resize_interpolation, + crop_rect=cvcuda.RectI(*crop_rect_params), + layout=out_layout, + data_type=out_dtype, + manip=manip, + ) + except Exception as e: + if is_positive_test: + raise e + else: + # This is pass for a negative test. + pass + + if is_positive_test: + assert out1.layout == out_layout + assert out1.shape == out_expected_shape + assert out1.dtype == out_dtype + + out2 = cvcuda.Tensor(out_expected_shape, out_dtype, out_layout) + + try: + tmp = cvcuda.resize_crop_convert_reformat_into( + out2, + inputVarShape, + resize_dim, + resize_interpolation, + [crop_rect_params[1], crop_rect_params[0]], + manip=manip, + ) + except Exception as e: + if is_positive_test: + raise e + else: + # This is pass for a negative test. + pass + + if is_positive_test: + assert tmp is out2 + assert out2.layout == out_layout + assert out2.shape == out_expected_shape + assert out2.dtype == out_dtype + + if is_positive_test: + # Compare the two + out1 = torch.as_tensor(out1.cuda()) + out2 = torch.as_tensor(out2.cuda()) + assert torch.equal(out1, out2) diff --git a/tests/cvcuda/system/CMakeLists.txt b/tests/cvcuda/system/CMakeLists.txt index e82b17070..0b3375c3d 100644 --- a/tests/cvcuda/system/CMakeLists.txt +++ b/tests/cvcuda/system/CMakeLists.txt @@ -31,6 +31,7 @@ endif() # system core ------------------------------------------------- add_executable(cvcuda_test_system + TestOpResizeCropConvertReformat.cpp TestOpPairwiseMatcher.cpp TestOpStack.cpp TestOpLabel.cpp diff --git a/tests/cvcuda/system/ResizeUtils.cpp b/tests/cvcuda/system/ResizeUtils.cpp index 401ed59f8..98bfa2792 100644 --- a/tests/cvcuda/system/ResizeUtils.cpp +++ b/tests/cvcuda/system/ResizeUtils.cpp @@ -28,50 +28,42 @@ namespace nvcv::test { -void Resize(std::vector &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector &hSrc, - int srcRowStride, nvcv::Size2D srcSize, nvcv::ImageFormat fmt, NVCVInterpolationType interpolation, - bool isVarshape) +template +void _Resize(std::vector &hDst, int dstStep, nvcv::Size2D dstSize, const std::vector &hSrc, int srcStep, + nvcv::Size2D srcSize, nvcv::ImageFormat fmt, NVCVInterpolationType interp, bool isVarshape) { - if (interpolation == NVCV_INTERP_NEAREST || interpolation == NVCV_INTERP_LINEAR - || interpolation == NVCV_INTERP_CUBIC) - { - ResizedCrop(hDst, dstRowStride, dstSize, hSrc, srcRowStride, srcSize, 0, 0, srcSize.h, srcSize.w, fmt, - interpolation); - return; - } - - double iScale = static_cast(srcSize.h) / dstSize.h; - double jScale = static_cast(srcSize.w) / dstSize.w; + double scaleH = static_cast(srcSize.h) / dstSize.h; + double scaleW = static_cast(srcSize.w) / dstSize.w; assert(fmt.numPlanes() == 1); - int elementsPerPixel = fmt.numChannels(); + int channels = fmt.numChannels(); - uint8_t *dstPtr = hDst.data(); - const uint8_t *srcPtr = hSrc.data(); + T *dstPtr = hDst.data(); + const T *srcPtr = hSrc.data(); - for (int di = 0; di < dstSize.h; di++) + for (int dy = 0; dy < dstSize.h; dy++) { - for (int dj = 0; dj < dstSize.w; 
dj++) + for (int dx = 0; dx < dstSize.w; dx++) { - if (interpolation == NVCV_INTERP_AREA) + if (interp == NVCV_INTERP_AREA) { - double fsx1 = dj * jScale; - double fsx2 = fsx1 + jScale; - double fsy1 = di * iScale; - double fsy2 = fsy1 + iScale; + double fsx1 = dx * scaleW; + double fsx2 = fsx1 + scaleW; + double fsy1 = dy * scaleH; + double fsy2 = fsy1 + scaleH; int sx1 = cuda::round(fsx1); int sx2 = cuda::round(fsx2); int sy1 = cuda::round(fsy1); int sy2 = cuda::round(fsy2); - for (int k = 0; k < elementsPerPixel; k++) + for (int c = 0; c < channels; c++) { double out = 0.0; - if (std::ceil(jScale) == jScale && std::ceil(iScale) == iScale) + if (std::ceil(scaleW) == scaleW && std::ceil(scaleH) == scaleH) { - double invscale = 1.f / (jScale * iScale); + double invscale = 1.f / (scaleW * scaleH); for (int dy = sy1; dy < sy2; ++dy) { @@ -79,90 +71,88 @@ void Resize(std::vector &hDst, int dstRowStride, nvcv::Size2D dstSize, { if (dy >= 0 && dy < srcSize.h && dx >= 0 && dx < srcSize.w) { - out = out + srcPtr[dy * srcRowStride + dx * elementsPerPixel + k] * invscale; + out = out + srcPtr[dy * srcStep + dx * channels + c] * invscale; } } } } else { - if (!isVarshape || (iScale >= 1.0f && jScale >= 1.0f)) + if (!isVarshape || (scaleH >= 1.0f && scaleW >= 1.0f)) { double invscale - = 1.f / (std::min(jScale, srcSize.w - fsx1) * std::min(iScale, srcSize.h - fsy1)); + = 1.f / (std::min(scaleW, srcSize.w - fsx1) * std::min(scaleH, srcSize.h - fsy1)); for (int dy = sy1; dy < sy2; ++dy) { for (int dx = sx1; dx < sx2; ++dx) if (dy >= 0 && dy < srcSize.h && dx >= 0 && dx < srcSize.w) - out = out + srcPtr[dy * srcRowStride + dx * elementsPerPixel + k] * invscale; + out = out + srcPtr[dy * srcStep + dx * channels + c] * invscale; if (sx1 > fsx1) if (dy >= 0 && dy < srcSize.h && sx1 - 1 >= 0 && sx1 - 1 < srcSize.w) out = out - + srcPtr[dy * srcRowStride + (sx1 - 1) * elementsPerPixel + k] + + srcPtr[dy * srcStep + (sx1 - 1) * channels + c] * ((sx1 - fsx1) * invscale); if (sx2 < fsx2) if (dy >= 0 && dy < srcSize.h && sx2 >= 0 && sx2 < srcSize.w) out = out - + srcPtr[dy * srcRowStride + sx2 * elementsPerPixel + k] - * ((fsx2 - sx2) * invscale); + + srcPtr[dy * srcStep + sx2 * channels + c] * ((fsx2 - sx2) * invscale); } if (sy1 > fsy1) for (int dx = sx1; dx < sx2; ++dx) if (sy1 - 1 >= 0 && sy1 - 1 < srcSize.h && dx >= 0 && dx < srcSize.w) out = out - + srcPtr[(sy1 - 1) * srcRowStride + dx * elementsPerPixel + k] + + srcPtr[(sy1 - 1) * srcStep + dx * channels + c] * ((sy1 - fsy1) * invscale); if (sy2 < fsy2) for (int dx = sx1; dx < sx2; ++dx) if (sy2 >= 0 && sy2 < srcSize.h && dx >= 0 && dx < srcSize.w) out = out - + srcPtr[sy2 * srcRowStride + dx * elementsPerPixel + k] - * ((fsy2 - sy2) * invscale); + + srcPtr[sy2 * srcStep + dx * channels + c] * ((fsy2 - sy2) * invscale); if ((sy1 > fsy1) && (sx1 > fsx1)) if (sy1 - 1 >= 0 && sy1 - 1 < srcSize.h && sx1 - 1 >= 0 && sx1 - 1 < srcSize.w) out = out - + srcPtr[(sy1 - 1) * srcRowStride + (sx1 - 1) * elementsPerPixel + k] + + srcPtr[(sy1 - 1) * srcStep + (sx1 - 1) * channels + c] * ((sy1 - fsy1) * (sx1 - fsx1) * invscale); if ((sy1 > fsy1) && (sx2 < fsx2)) if (sy1 - 1 >= 0 && sy1 - 1 < srcSize.h && sx2 >= 0 && sx2 < srcSize.w) out = out - + srcPtr[(sy1 - 1) * srcRowStride + sx2 * elementsPerPixel + k] + + srcPtr[(sy1 - 1) * srcStep + sx2 * channels + c] * ((sy1 - fsy1) * (fsx2 - sx2) * invscale); if ((sy2 < fsy2) && (sx2 < fsx2)) if (sy2 >= 0 && sy2 < srcSize.h && sx2 >= 0 && sx2 < srcSize.w) out = out - + srcPtr[sy2 * srcRowStride + sx2 * elementsPerPixel + k] + + 
srcPtr[sy2 * srcStep + sx2 * channels + c] * ((fsy2 - sy2) * (fsx2 - sx2) * invscale); if ((sy2 < fsy2) && (sx1 > fsx1)) if (sy2 >= 0 && sy2 < srcSize.h && sx1 - 1 >= 0 && sx1 - 1 < srcSize.w) out = out - + srcPtr[sy2 * srcRowStride + (sx1 - 1) * elementsPerPixel + k] + + srcPtr[sy2 * srcStep + (sx1 - 1) * channels + c] * ((fsy2 - sy2) * (sx1 - fsx1) * invscale); } else // zoom in for varshape { - double iScale_inv = 1.0 / iScale; - double jScale_inv = 1.0 / jScale; + double scaleH_inv = 1.0 / scaleH; + double scaleW_inv = 1.0 / scaleW; sy1 = cuda::round(fsy1); sx1 = cuda::round(fsx1); - float fy = (float)(float(di + 1) - float(sy1 + 1) * iScale_inv); + float fy = (float)(float(dy + 1) - float(sy1 + 1) * scaleH_inv); fy = fy <= 0 ? 0.f : fy - cuda::round(fy); float cbufy[2]; cbufy[0] = 1.f - fy; cbufy[1] = fy; - float fx = (float)(float(dj + 1) - float(sx1 + 1) * jScale_inv); + float fx = (float)(float(dx + 1) - float(sx1 + 1) * scaleW_inv); fx = fx <= 0 ? 0.f : fx - cuda::round(fx); if (sx1 < 0) @@ -181,144 +171,205 @@ void Resize(std::vector &hDst, int dstRowStride, nvcv::Size2D dstSize, float cbufx[2]; cbufx[0] = 1.f - fx; cbufx[1] = fx; - out = srcPtr[sy1 * srcRowStride + sx1 * elementsPerPixel + k] * cbufx[0] * cbufy[0] - + srcPtr[(sy1 + 1) * srcRowStride + sx1 * elementsPerPixel + k] * cbufx[0] * cbufy[1] - + srcPtr[sy1 * srcRowStride + (sx1 + 1) * elementsPerPixel + k] * cbufx[1] * cbufy[0] - + srcPtr[(sy1 + 1) * srcRowStride + (sx1 + 1) * elementsPerPixel + k] * cbufx[1] - * cbufy[1]; + out = srcPtr[sy1 * srcStep + sx1 * channels + c] * cbufx[0] * cbufy[0] + + srcPtr[(sy1 + 1) * srcStep + sx1 * channels + c] * cbufx[0] * cbufy[1] + + srcPtr[sy1 * srcStep + (sx1 + 1) * channels + c] * cbufx[1] * cbufy[0] + + srcPtr[(sy1 + 1) * srcStep + (sx1 + 1) * channels + c] * cbufx[1] * cbufy[1]; } } out = std::rint(std::abs(out)); - dstPtr[di * dstRowStride + dj * elementsPerPixel + k] = out < 0 ? 0 : (out > 255 ? 255 : out); + dstPtr[dy * dstStep + dx * channels + c] = out < MinVal ? MinVal : (out > MaxVal ? 
MaxVal : out); } } } } } -void ResizedCrop(std::vector &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector &hSrc, - int srcRowStride, nvcv::Size2D srcSize, int top, int left, int crop_rows, int crop_cols, - nvcv::ImageFormat fmt, NVCVInterpolationType interpolation) +template +void _ResizedCrop(std::vector &hDst, int dstStep, nvcv::Size2D dstSize, const std::vector &hSrc, int srcStep, + nvcv::Size2D srcSize, int top, int left, int crop_rows, int crop_cols, nvcv::ImageFormat fmt, + NVCVInterpolationType interp) { - double iScale = static_cast(crop_rows) / dstSize.h; - double jScale = static_cast(crop_cols) / dstSize.w; + double scaleH = static_cast(crop_rows) / dstSize.h; + double scaleW = static_cast(crop_cols) / dstSize.w; assert(fmt.numPlanes() == 1); - int elementsPerPixel = fmt.numChannels(); + int channels = fmt.numChannels(); - uint8_t *dstPtr = hDst.data(); - const uint8_t *srcPtr = hSrc.data(); + T *dstPtr = hDst.data(); + const T *srcPtr = hSrc.data(); - for (int di = 0; di < dstSize.h; di++) + for (int dy = 0; dy < dstSize.h; dy++) { - for (int dj = 0; dj < dstSize.w; dj++) + for (int dx = 0; dx < dstSize.w; dx++) { - if (interpolation == NVCV_INTERP_NEAREST) + if (interp == NVCV_INTERP_NEAREST) { - double fi = iScale * di + top; - double fj = jScale * dj + left; + double fy = scaleH * dy + top; + double fx = scaleW * dx + left; - int si = std::floor(fi); - int sj = std::floor(fj); + int sy = std::floor(fy); + int sx = std::floor(fx); - si = std::min(si, srcSize.h - 1); - sj = std::min(sj, srcSize.w - 1); + sy = std::min(sy, srcSize.h - 1); + sx = std::min(sx, srcSize.w - 1); - for (int k = 0; k < elementsPerPixel; k++) + for (int c = 0; c < channels; c++) { - dstPtr[di * dstRowStride + dj * elementsPerPixel + k] - = srcPtr[si * srcRowStride + sj * elementsPerPixel + k]; + dstPtr[dy * dstStep + dx * channels + c] = srcPtr[sy * srcStep + sx * channels + c]; } } - else if (interpolation == NVCV_INTERP_LINEAR) + else if (interp == NVCV_INTERP_LINEAR) { - double fi = iScale * (di + 0.5) - 0.5 + top; - double fj = jScale * (dj + 0.5) - 0.5 + left; + double fy = scaleH * (dy + 0.5) - 0.5 + top; + double fx = scaleW * (dx + 0.5) - 0.5 + left; - int si = std::floor(fi); - int sj = std::floor(fj); + int sy = std::floor(fy); + int sx = std::floor(fx); - fi -= si; - fj -= sj; + fy -= sy; + fx -= sx; - fj = (sj < 0 || sj >= srcSize.w - 1) ? 0 : fj; + fx = (sx < 0 || sx >= srcSize.w - 1) ? 0 : fx; - si = std::max(0, std::min(si, srcSize.h - 2)); - sj = std::max(0, std::min(sj, srcSize.w - 2)); + sy = std::max(0, std::min(sy, srcSize.h - 2)); + sx = std::max(0, std::min(sx, srcSize.w - 2)); - double iWeights[2] = {1 - fi, fi}; - double jWeights[2] = {1 - fj, fj}; + double wghtY[2] = {1 - fy, fy}; + double wghtX[2] = {1 - fx, fx}; - for (int k = 0; k < elementsPerPixel; k++) + for (int c = 0; c < channels; c++) { - double res = std::rint(std::abs( - srcPtr[(si + 0) * srcRowStride + (sj + 0) * elementsPerPixel + k] * iWeights[0] * jWeights[0] - + srcPtr[(si + 1) * srcRowStride + (sj + 0) * elementsPerPixel + k] * iWeights[1] * jWeights[0] - + srcPtr[(si + 0) * srcRowStride + (sj + 1) * elementsPerPixel + k] * iWeights[0] * jWeights[1] - + srcPtr[(si + 1) * srcRowStride + (sj + 1) * elementsPerPixel + k] * iWeights[1] - * jWeights[1])); - - dstPtr[di * dstRowStride + dj * elementsPerPixel + k] = res < 0 ? 0 : (res > 255 ? 
255 : res); + double res = std::rint( + std::abs(srcPtr[(sy + 0) * srcStep + (sx + 0) * channels + c] * wghtY[0] * wghtX[0] + + srcPtr[(sy + 1) * srcStep + (sx + 0) * channels + c] * wghtY[1] * wghtX[0] + + srcPtr[(sy + 0) * srcStep + (sx + 1) * channels + c] * wghtY[0] * wghtX[1] + + srcPtr[(sy + 1) * srcStep + (sx + 1) * channels + c] * wghtY[1] * wghtX[1])); + + dstPtr[dy * dstStep + dx * channels + c] = res < MinVal ? MinVal : (res > MaxVal ? MaxVal : res); } } - else if (interpolation == NVCV_INTERP_CUBIC) + else if (interp == NVCV_INTERP_CUBIC) { - double fi = iScale * (di + 0.5) - 0.5 + top; - double fj = jScale * (dj + 0.5) - 0.5 + left; + double fy = scaleH * (dy + 0.5) - 0.5 + top; + double fx = scaleW * (dx + 0.5) - 0.5 + left; - int si = std::floor(fi); - int sj = std::floor(fj); + int sy = std::floor(fy); + int sx = std::floor(fx); - fi -= si; - fj -= sj; + fy -= sy; + fx -= sx; - fj = (sj < 1 || sj >= srcSize.w - 3) ? 0 : fj; + fx = (sx < 1 || sx >= srcSize.w - 3) ? 0 : fx; - si = std::max(1, std::min(si, srcSize.h - 3)); - sj = std::max(1, std::min(sj, srcSize.w - 3)); + sy = std::max(1, std::min(sy, srcSize.h - 3)); + sx = std::max(1, std::min(sx, srcSize.w - 3)); const double A = -0.75; - double iWeights[4]; - iWeights[0] = ((A * (fi + 1) - 5 * A) * (fi + 1) + 8 * A) * (fi + 1) - 4 * A; - iWeights[1] = ((A + 2) * fi - (A + 3)) * fi * fi + 1; - iWeights[2] = ((A + 2) * (1 - fi) - (A + 3)) * (1 - fi) * (1 - fi) + 1; - iWeights[3] = 1 - iWeights[0] - iWeights[1] - iWeights[2]; - - double jWeights[4]; - jWeights[0] = ((A * (fj + 1) - 5 * A) * (fj + 1) + 8 * A) * (fj + 1) - 4 * A; - jWeights[1] = ((A + 2) * fj - (A + 3)) * fj * fj + 1; - jWeights[2] = ((A + 2) * (1 - fj) - (A + 3)) * (1 - fj) * (1 - fj) + 1; - jWeights[3] = 1 - jWeights[0] - jWeights[1] - jWeights[2]; - - for (int k = 0; k < elementsPerPixel; k++) + double wghtY[4]; + wghtY[0] = ((A * (fy + 1) - 5 * A) * (fy + 1) + 8 * A) * (fy + 1) - 4 * A; + wghtY[1] = ((A + 2) * fy - (A + 3)) * fy * fy + 1; + wghtY[2] = ((A + 2) * (1 - fy) - (A + 3)) * (1 - fy) * (1 - fy) + 1; + wghtY[3] = 1 - wghtY[0] - wghtY[1] - wghtY[2]; + + double wghtX[4]; + wghtX[0] = ((A * (fx + 1) - 5 * A) * (fx + 1) + 8 * A) * (fx + 1) - 4 * A; + wghtX[1] = ((A + 2) * fx - (A + 3)) * fx * fx + 1; + wghtX[2] = ((A + 2) * (1 - fx) - (A + 3)) * (1 - fx) * (1 - fx) + 1; + wghtX[3] = 1 - wghtX[0] - wghtX[1] - wghtX[2]; + + for (int c = 0; c < channels; c++) { - double res = std::rint(std::abs( - srcPtr[(si - 1) * srcRowStride + (sj - 1) * elementsPerPixel + k] * jWeights[0] * iWeights[0] - + srcPtr[(si + 0) * srcRowStride + (sj - 1) * elementsPerPixel + k] * jWeights[0] * iWeights[1] - + srcPtr[(si + 1) * srcRowStride + (sj - 1) * elementsPerPixel + k] * jWeights[0] * iWeights[2] - + srcPtr[(si + 2) * srcRowStride + (sj - 1) * elementsPerPixel + k] * jWeights[0] * iWeights[3] - + srcPtr[(si - 1) * srcRowStride + (sj + 0) * elementsPerPixel + k] * jWeights[1] * iWeights[0] - + srcPtr[(si + 0) * srcRowStride + (sj + 0) * elementsPerPixel + k] * jWeights[1] * iWeights[1] - + srcPtr[(si + 1) * srcRowStride + (sj + 0) * elementsPerPixel + k] * jWeights[1] * iWeights[2] - + srcPtr[(si + 2) * srcRowStride + (sj + 0) * elementsPerPixel + k] * jWeights[1] * iWeights[3] - + srcPtr[(si - 1) * srcRowStride + (sj + 1) * elementsPerPixel + k] * jWeights[2] * iWeights[0] - + srcPtr[(si + 0) * srcRowStride + (sj + 1) * elementsPerPixel + k] * jWeights[2] * iWeights[1] - + srcPtr[(si + 1) * srcRowStride + (sj + 1) * elementsPerPixel + k] * jWeights[2] * iWeights[2] - + 
srcPtr[(si + 2) * srcRowStride + (sj + 1) * elementsPerPixel + k] * jWeights[2] * iWeights[3] - + srcPtr[(si - 1) * srcRowStride + (sj + 2) * elementsPerPixel + k] * jWeights[3] * iWeights[0] - + srcPtr[(si + 0) * srcRowStride + (sj + 2) * elementsPerPixel + k] * jWeights[3] * iWeights[1] - + srcPtr[(si + 1) * srcRowStride + (sj + 2) * elementsPerPixel + k] * jWeights[3] * iWeights[2] - + srcPtr[(si + 2) * srcRowStride + (sj + 2) * elementsPerPixel + k] * jWeights[3] - * iWeights[3])); - - dstPtr[di * dstRowStride + dj * elementsPerPixel + k] = res < 0 ? 0 : (res > 255 ? 255 : res); + double res = std::rint( + std::abs(srcPtr[(sy - 1) * srcStep + (sx - 1) * channels + c] * wghtX[0] * wghtY[0] + + srcPtr[(sy + 0) * srcStep + (sx - 1) * channels + c] * wghtX[0] * wghtY[1] + + srcPtr[(sy + 1) * srcStep + (sx - 1) * channels + c] * wghtX[0] * wghtY[2] + + srcPtr[(sy + 2) * srcStep + (sx - 1) * channels + c] * wghtX[0] * wghtY[3] + + srcPtr[(sy - 1) * srcStep + (sx + 0) * channels + c] * wghtX[1] * wghtY[0] + + srcPtr[(sy + 0) * srcStep + (sx + 0) * channels + c] * wghtX[1] * wghtY[1] + + srcPtr[(sy + 1) * srcStep + (sx + 0) * channels + c] * wghtX[1] * wghtY[2] + + srcPtr[(sy + 2) * srcStep + (sx + 0) * channels + c] * wghtX[1] * wghtY[3] + + srcPtr[(sy - 1) * srcStep + (sx + 1) * channels + c] * wghtX[2] * wghtY[0] + + srcPtr[(sy + 0) * srcStep + (sx + 1) * channels + c] * wghtX[2] * wghtY[1] + + srcPtr[(sy + 1) * srcStep + (sx + 1) * channels + c] * wghtX[2] * wghtY[2] + + srcPtr[(sy + 2) * srcStep + (sx + 1) * channels + c] * wghtX[2] * wghtY[3] + + srcPtr[(sy - 1) * srcStep + (sx + 2) * channels + c] * wghtX[3] * wghtY[0] + + srcPtr[(sy + 0) * srcStep + (sx + 2) * channels + c] * wghtX[3] * wghtY[1] + + srcPtr[(sy + 1) * srcStep + (sx + 2) * channels + c] * wghtX[3] * wghtY[2] + + srcPtr[(sy + 2) * srcStep + (sx + 2) * channels + c] * wghtX[3] * wghtY[3])); + + dstPtr[dy * dstStep + dx * channels + c] = res < MinVal ? MinVal : (res > MaxVal ? 
MaxVal : res);
                 }
             }
         }
     }
 }
+void Resize(std::vector<uint8_t> &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector<uint8_t> &hSrc,
+            int srcRowStride, nvcv::Size2D srcSize, nvcv::ImageFormat fmt, NVCVInterpolationType interp,
+            bool isVarShape)
+{
+    int dstStep = dstRowStride / sizeof(uint8_t);
+    int srcStep = srcRowStride / sizeof(uint8_t);
+
+    if (interp == NVCV_INTERP_NEAREST || interp == NVCV_INTERP_LINEAR || interp == NVCV_INTERP_CUBIC)
+    {
+        _ResizedCrop(hDst, dstStep, dstSize, hSrc, srcStep, srcSize, 0, 0, srcSize.h, srcSize.w, fmt,
+                     interp);
+    }
+    else if (interp == NVCV_INTERP_AREA)
+    {
+        _Resize(hDst, dstStep, dstSize, hSrc, srcStep, srcSize, fmt, interp, isVarShape);
+    }
+}
+
+void ResizedCrop(std::vector<uint8_t> &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector<uint8_t> &hSrc,
+                 int srcRowStride, nvcv::Size2D srcSize, int top, int left, int crop_rows, int crop_cols,
+                 nvcv::ImageFormat fmt, NVCVInterpolationType interp)
+{
+    int dstStep = dstRowStride / sizeof(uint8_t);
+    int srcStep = srcRowStride / sizeof(uint8_t);
+
+    if (interp == NVCV_INTERP_NEAREST || interp == NVCV_INTERP_LINEAR || interp == NVCV_INTERP_CUBIC)
+    {
+        _ResizedCrop(hDst, dstStep, dstSize, hSrc, srcStep, srcSize, top, left, crop_rows, crop_cols,
+                     fmt, interp);
+    }
+}
+
+void Resize(std::vector<float> &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector<float> &hSrc,
+            int srcRowStride, nvcv::Size2D srcSize, nvcv::ImageFormat fmt, NVCVInterpolationType interp,
+            bool isVarShape)
+{
+    int dstStep = dstRowStride / sizeof(float);
+    int srcStep = srcRowStride / sizeof(float);
+
+    if (interp == NVCV_INTERP_NEAREST || interp == NVCV_INTERP_LINEAR || interp == NVCV_INTERP_CUBIC)
+    {
+        _ResizedCrop(hDst, dstStep, dstSize, hSrc, srcStep, srcSize, 0, 0, srcSize.h, srcSize.w,
+                     fmt, interp);
+    }
+    else if (interp == NVCV_INTERP_AREA)
+    {
+        _Resize(hDst, dstStep, dstSize, hSrc, srcStep, srcSize, fmt, interp, isVarShape);
+    }
+}
+
+void ResizedCrop(std::vector<float> &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector<float> &hSrc,
+                 int srcRowStride, nvcv::Size2D srcSize, int top, int left, int crop_rows, int crop_cols,
+                 nvcv::ImageFormat fmt, NVCVInterpolationType interp)
+{
+    int dstStep = dstRowStride / sizeof(float);
+    int srcStep = srcRowStride / sizeof(float);
+
+    if (interp == NVCV_INTERP_NEAREST || interp == NVCV_INTERP_LINEAR || interp == NVCV_INTERP_CUBIC)
+    {
+        _ResizedCrop(hDst, dstStep, dstSize, hSrc, srcStep, srcSize, top, left, crop_rows,
+                     crop_cols, fmt, interp);
+    }
+}
+
 } // namespace nvcv::test
diff --git a/tests/cvcuda/system/ResizeUtils.hpp b/tests/cvcuda/system/ResizeUtils.hpp
index d8c27f7ca..ad1ffd482 100644
--- a/tests/cvcuda/system/ResizeUtils.hpp
+++ b/tests/cvcuda/system/ResizeUtils.hpp
@@ -31,13 +31,21 @@ namespace nvcv::test {
 // support NVCV_INTERP_NEAREST/NVCV_INTERP_LINEAR/NVCV_INTERP_CUBIC/NVCV_INTERP_AREA
 void Resize(std::vector<uint8_t> &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector<uint8_t> &hSrc,
             int srcRowStride, nvcv::Size2D srcSize, nvcv::ImageFormat fmt, NVCVInterpolationType interpolation,
-            bool isVarshape);
+            bool isVarShape);
+
+void Resize(std::vector<float> &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector<float> &hSrc,
+            int srcRowStride, nvcv::Size2D srcSize, nvcv::ImageFormat fmt, NVCVInterpolationType interpolation,
+            bool isVarShape);
 
 // only support NVCV_INTERP_NEAREST/NVCV_INTERP_LINEAR/NVCV_INTERP_CUBIC
 void ResizedCrop(std::vector<uint8_t> &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector<uint8_t> &hSrc,
                  int srcRowStride, nvcv::Size2D srcSize, int top, int left, int crop_rows,
int crop_cols, nvcv::ImageFormat fmt, NVCVInterpolationType interpolation); +void ResizedCrop(std::vector &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector &hSrc, + int srcRowStride, nvcv::Size2D srcSize, int top, int left, int crop_rows, int crop_cols, + nvcv::ImageFormat fmt, NVCVInterpolationType interpolation); + } // namespace nvcv::test #endif // NVCV_TEST_COMMON_FLIP_UTILS_HPP diff --git a/tests/cvcuda/system/TestOpBilateralFilter.cpp b/tests/cvcuda/system/TestOpBilateralFilter.cpp index e16668c08..10ef192b2 100644 --- a/tests/cvcuda/system/TestOpBilateralFilter.cpp +++ b/tests/cvcuda/system/TestOpBilateralFilter.cpp @@ -29,6 +29,8 @@ #include #include +#define NVCV_IMAGE_FORMAT_2U8 NVCV_DETAIL_MAKE_NONCOLOR_FMT1(PL, UNSIGNED, XY00, ASSOCIATED, X8_Y8) + namespace gt = ::testing; namespace test = nvcv::test; @@ -37,17 +39,21 @@ static uint32_t saturate_cast(float n) return static_cast(std::min(255.0f, std::round(n))); } -static bool CompareImages(uint8_t *pTest, uint8_t *pGold, size_t columns, size_t rows, size_t rowStride, float delta) +static bool CompareImages(uint8_t *pTest, uint8_t *pGold, size_t columns, size_t rows, size_t rowStride, + size_t channels, float delta) { for (size_t j = 0; j < rows; j++) { for (size_t k = 0; k < columns; k++) { - size_t offset = j * rowStride + k; - float diff = std::abs(static_cast(pTest[offset]) - static_cast(pGold[offset])); - if (diff > delta) + for (size_t c = 0; c < channels; ++c) { - return false; + size_t offset = j * rowStride + k * channels + c; + float diff = std::abs(static_cast(pTest[offset]) - static_cast(pGold[offset])); + if (diff > delta) + { + return false; + } } } } @@ -55,13 +61,13 @@ static bool CompareImages(uint8_t *pTest, uint8_t *pGold, size_t columns, size_t } static bool CompareTensors(std::vector &vTest, std::vector &vGold, size_t columns, size_t rows, - size_t batch, size_t rowStride, size_t sampleStride, float delta) + size_t batch, size_t rowStride, size_t channels, size_t sampleStride, float delta) { for (size_t i = 0; i < batch; i++) { uint8_t *pTest = vTest.data() + i * sampleStride; uint8_t *pGold = vGold.data() + i * sampleStride; - if (!CompareImages(pTest, pGold, columns, rows, rowStride, delta)) + if (!CompareImages(pTest, pGold, columns, rows, rowStride, channels, delta)) return false; } return true; @@ -69,27 +75,31 @@ static bool CompareTensors(std::vector &vTest, std::vector &vG static bool CompareVarShapes(std::vector> &vTest, std::vector> &vGold, std::vector &vColumns, std::vector &vRows, std::vector &vRowStride, - float delta) + std::vector &vChannels, float delta) { for (size_t i = 0; i < vTest.size(); i++) { - if (!CompareImages(vTest[i].data(), vGold[i].data(), vColumns[i], vRows[i], vRowStride[i], delta)) + if (!CompareImages(vTest[i].data(), vGold[i].data(), vColumns[i], vRows[i], vRowStride[i], vChannels[i], delta)) return false; } return true; } -static void CPUBilateralFilter(uint8_t *pIn, uint8_t *pOut, int columns, int rows, int rowStride, int radius, - float colorCoefficient, float spaceCoefficient) +static void CPUBilateralFilter(uint8_t *pIn, uint8_t *pOut, int columns, int rows, int rowStride, int channels, + int radius, float colorCoefficient, float spaceCoefficient) { float radiusSquared = radius * radius; for (int j = 0; j < rows; j++) { for (int k = 0; k < columns; k++) { - float numerator = 0.0f; - float denominator = 0.0f; - float center = static_cast(pIn[j * rowStride + k]); + std::vector numerators(channels, 0.0f); + float denominator = 0.0f; + std::vector 
centers{static_cast(pIn[j * rowStride + k * channels]), + static_cast(pIn[j * rowStride + k * channels + 1]), + static_cast(pIn[j * rowStride + k * channels + 2]), + static_cast(pIn[j * rowStride + k * channels + 3])}; + for (int y = j - radius; y <= j + radius; y++) { for (int x = k - radius; x <= k + radius; x++) @@ -97,26 +107,44 @@ static void CPUBilateralFilter(uint8_t *pIn, uint8_t *pOut, int columns, int row float distanceSquared = (k - x) * (k - x) + (j - y) * (j - y); if (distanceSquared <= radiusSquared) { - float pixel = ((x >= 0) && (x < columns) && (y >= 0) && (y < rows)) - ? static_cast(pIn[y * rowStride + x]) - : 0.0f; - float e_space = distanceSquared * spaceCoefficient; - float one_norm_size = std::abs(pixel - center); - float e_color = one_norm_size * one_norm_size * colorCoefficient; - float weight = std::exp(e_space + e_color); + std::vector pixels; + for (auto c = 0; c < channels; ++c) + { + float pixel = ((x >= 0) && (x < columns) && (y >= 0) && (y < rows)) + ? static_cast(pIn[y * rowStride + x * channels + c]) + : 0.0f; + pixels.emplace_back(pixel); + } + float e_space = distanceSquared * spaceCoefficient; + float e_color = 0.0f; + + for (auto c = 0; c < channels; ++c) + { + e_color += std::abs(pixels[c] - centers[c]); + } + e_color = e_color * e_color * colorCoefficient; + + float weight = std::exp(e_space + e_color); denominator += weight; - numerator += weight * pixel; + for (auto c = 0; c < channels; ++c) + { + numerators[c] += weight * pixels[c]; + } } } } - pOut[j * rowStride + k] = saturate_cast(numerator / denominator); + + for (auto c = 0; c < channels; ++c) + { + pOut[j * rowStride + k * channels + c] = saturate_cast(numerators[c] / denominator); + } } } } static void CPUBilateralFilterTensor(std::vector &vIn, std::vector &vOut, int columns, int rows, - int batch, int rowStride, int sampleStride, int diameter, float sigmaColor, - float sigmaSpace) + int batch, int rowStride, int channels, int sampleStride, int diameter, + float sigmaColor, float sigmaSpace) { int radius = diameter / 2; float spaceCoefficient = -1 / (2 * sigmaSpace * sigmaSpace); @@ -125,21 +153,22 @@ static void CPUBilateralFilterTensor(std::vector &vIn, std::vector> &vIn, std::vector> &vOut, std::vector &vColumns, std::vector &vRows, - std::vector &vRowStride, std::vector &vDiameter, - std::vector &vSigmaColor, std::vector &vSigmaSpace) + std::vector &vRowStride, std::vector &vChannels, + std::vector &vDiameter, std::vector &vSigmaColor, + std::vector &vSigmaSpace) { for (size_t i = 0; i < vIn.size(); i++) { int radius = vDiameter[i] / 2; float spaceCoefficient = -1 / (2 * vSigmaSpace[i] * vSigmaSpace[i]); float colorCoefficient = -1 / (2 * vSigmaColor[i] * vSigmaColor[i]); - CPUBilateralFilter(vIn[i].data(), vOut[i].data(), vColumns[i], vRows[i], vRowStride[i], radius, + CPUBilateralFilter(vIn[i].data(), vOut[i].data(), vColumns[i], vRows[i], vRowStride[i], vChannels[i], radius, colorCoefficient, spaceCoefficient); } } @@ -171,177 +200,189 @@ TEST_P(OpBilateralFilter, BilateralFilter_packed) { cudaStream_t stream; EXPECT_EQ(cudaSuccess, cudaStreamCreate(&stream)); - int width = GetParamValue<0>(); - int height = GetParamValue<1>(); - int d = GetParamValue<2>(); - float sigmaColor = GetParamValue<3>(); - float sigmaSpace = GetParamValue<4>(); - int numberOfImages = GetParamValue<5>(); - - nvcv::Tensor imgOut = nvcv::util::CreateTensor(numberOfImages, width, height, nvcv::FMT_U8); - nvcv::Tensor imgIn = nvcv::util::CreateTensor(numberOfImages, width, height, nvcv::FMT_U8); + int width = 
GetParamValue<0>(); + int height = GetParamValue<1>(); + int d = GetParamValue<2>(); + float sigmaColor = GetParamValue<3>(); + float sigmaSpace = GetParamValue<4>(); + int numberOfImages = GetParamValue<5>(); + std::vector fmts{nvcv::FMT_U8, nvcv::ImageFormat{NVCV_IMAGE_FORMAT_2U8}, nvcv::FMT_RGB8, + nvcv::FMT_RGBA8}; + for (nvcv::ImageFormat fmt : fmts) + { + nvcv::Tensor imgOut = nvcv::util::CreateTensor(numberOfImages, width, height, fmt); + nvcv::Tensor imgIn = nvcv::util::CreateTensor(numberOfImages, width, height, fmt); + const int channels = fmt.numChannels(); - auto inData = imgIn.exportData(); - auto outData = imgOut.exportData(); + auto inData = imgIn.exportData(); + auto outData = imgOut.exportData(); - ASSERT_NE(nullptr, inData); - ASSERT_NE(nullptr, outData); + ASSERT_NE(nullptr, inData); + ASSERT_NE(nullptr, outData); - auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*inData); - ASSERT_TRUE(inAccess); + auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*inData); + ASSERT_TRUE(inAccess); - auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*outData); - ASSERT_TRUE(outAccess); + auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*outData); + ASSERT_TRUE(outAccess); - int inSampleStride = inAccess->numRows() * inAccess->rowStride(); - int outSampleStride = outAccess->numRows() * outAccess->rowStride(); + int inSampleStride = inAccess->numRows() * inAccess->rowStride(); + int outSampleStride = outAccess->numRows() * outAccess->rowStride(); - int inBufSize = inSampleStride * inAccess->numSamples(); - int outBufSize = outSampleStride * outAccess->numSamples(); + int inBufSize = inSampleStride * inAccess->numSamples(); + int outBufSize = outSampleStride * outAccess->numSamples(); - std::vector vIn(inBufSize); - std::vector vOut(outBufSize); + std::vector vIn(inBufSize); + std::vector vOut(outBufSize); - std::vector inGold(inBufSize, 0); - std::vector outGold(outBufSize, 0); - for (int i = 0; i < inBufSize; i++) inGold[i] = i % 113; // Use prime number to prevent weird tiling patterns + std::vector inGold(inBufSize, 0); + std::vector outGold(outBufSize, 0); + for (int i = 0; i < inBufSize; i++) inGold[i] = i % 113; // Use prime number to prevent weird tiling patterns - EXPECT_EQ(cudaSuccess, cudaMemcpy(inData->basePtr(), inGold.data(), inBufSize, cudaMemcpyHostToDevice)); - CPUBilateralFilterTensor(inGold, outGold, inAccess->numCols(), inAccess->numRows(), inAccess->numSamples(), - inAccess->rowStride(), inSampleStride, d, sigmaColor, sigmaSpace); + EXPECT_EQ(cudaSuccess, cudaMemcpy(inData->basePtr(), inGold.data(), inBufSize, cudaMemcpyHostToDevice)); + CPUBilateralFilterTensor(inGold, outGold, inAccess->numCols(), inAccess->numRows(), inAccess->numSamples(), + inAccess->rowStride(), channels, inSampleStride, d, sigmaColor, sigmaSpace); - // run operator - cvcuda::BilateralFilter bilateralFilterOp; + // run operator + cvcuda::BilateralFilter bilateralFilterOp; - EXPECT_NO_THROW(bilateralFilterOp(stream, imgIn, imgOut, d, sigmaColor, sigmaSpace, NVCV_BORDER_CONSTANT)); + EXPECT_NO_THROW(bilateralFilterOp(stream, imgIn, imgOut, d, sigmaColor, sigmaSpace, NVCV_BORDER_CONSTANT)); - // check cdata - std::vector outTest(outBufSize); + // check cdata + std::vector outTest(outBufSize); - EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); - EXPECT_EQ(cudaSuccess, cudaMemcpy(outTest.data(), outData->basePtr(), outBufSize, cudaMemcpyDeviceToHost)); + EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + 
EXPECT_EQ(cudaSuccess, cudaMemcpy(outTest.data(), outData->basePtr(), outBufSize, cudaMemcpyDeviceToHost)); + ASSERT_TRUE(CompareTensors(outTest, outGold, inAccess->numCols(), inAccess->numRows(), inAccess->numSamples(), + inAccess->rowStride(), channels, inSampleStride, 0.9f)); + } EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); - ASSERT_TRUE(CompareTensors(outTest, outGold, inAccess->numCols(), inAccess->numRows(), inAccess->numSamples(), - inAccess->rowStride(), inSampleStride, 0.9f)); } TEST_P(OpBilateralFilter, BilateralFilter_VarShape) { cudaStream_t stream; EXPECT_EQ(cudaSuccess, cudaStreamCreate(&stream)); - int width = GetParamValue<0>(); - int height = GetParamValue<1>(); - int diameter = GetParamValue<2>(); - float sigmaColor = GetParamValue<3>(); - float sigmaSpace = GetParamValue<4>(); - int numberOfImages = GetParamValue<5>(); - nvcv::ImageFormat format{NVCV_IMAGE_FORMAT_U8}; - - // Create input varshape - std::default_random_engine rng; - std::uniform_int_distribution udistWidth(width * 0.8, width * 1.1); - std::uniform_int_distribution udistHeight(height * 0.8, height * 1.1); - - std::vector imgSrc; - - std::vector> srcVec(numberOfImages); - std::vector srcVecRowStride(numberOfImages); - std::vector srcVecRows(numberOfImages); - std::vector srcVecColumns(numberOfImages); - std::vector> goldVec(numberOfImages); - std::vector> dstVec(numberOfImages); - for (int i = 0; i < numberOfImages; ++i) + int width = GetParamValue<0>(); + int height = GetParamValue<1>(); + int diameter = GetParamValue<2>(); + float sigmaColor = GetParamValue<3>(); + float sigmaSpace = GetParamValue<4>(); + int numberOfImages = GetParamValue<5>(); + std::vector fmts{nvcv::FMT_U8, nvcv::ImageFormat{NVCV_IMAGE_FORMAT_2U8}, nvcv::FMT_RGB8, + nvcv::FMT_RGBA8}; + for (nvcv::ImageFormat fmt : fmts) { - imgSrc.emplace_back(nvcv::Size2D{udistWidth(rng), udistHeight(rng)}, format); - int srcRowStride = imgSrc[i].size().w * format.planePixelStrideBytes(0); - srcVecRowStride[i] = srcRowStride; - srcVecRows[i] = imgSrc[i].size().h; - srcVecColumns[i] = imgSrc[i].size().w; - std::uniform_int_distribution udist(0, 255); - - srcVec[i].resize(imgSrc[i].size().h * srcRowStride); - goldVec[i].resize(imgSrc[i].size().h * srcRowStride); - dstVec[i].resize(imgSrc[i].size().h * srcRowStride); - std::generate(srcVec[i].begin(), srcVec[i].end(), [&]() { return udist(rng); }); - std::generate(goldVec[i].begin(), goldVec[i].end(), [&]() { return 0; }); - std::generate(dstVec[i].begin(), dstVec[i].end(), [&]() { return 0; }); - auto imgData = imgSrc[i].exportData(); - ASSERT_NE(imgData, nvcv::NullOpt); - - // Copy input data to the GPU - ASSERT_EQ(cudaSuccess, - cudaMemcpy2DAsync(imgData->plane(0).basePtr, imgData->plane(0).rowStride, srcVec[i].data(), - srcRowStride, srcRowStride, imgSrc[i].size().h, cudaMemcpyHostToDevice, stream)); - } + // Create input varshape + std::default_random_engine rng; + std::uniform_int_distribution udistWidth(width * 0.8, width * 1.1); + std::uniform_int_distribution udistHeight(height * 0.8, height * 1.1); + + std::vector imgSrc; + + std::vector> srcVec(numberOfImages); + std::vector srcVecRowStride(numberOfImages); + std::vector srcVecRows(numberOfImages); + std::vector srcVecColumns(numberOfImages); + std::vector channelsVec(numberOfImages); + std::vector> goldVec(numberOfImages); + std::vector> dstVec(numberOfImages); + for (int i = 0; i < numberOfImages; ++i) + { + imgSrc.emplace_back(nvcv::Size2D{udistWidth(rng), udistHeight(rng)}, fmt); + int srcRowStride = imgSrc[i].size().w * 
fmt.planePixelStrideBytes(0); + srcVecRowStride[i] = srcRowStride; + srcVecRows[i] = imgSrc[i].size().h; + srcVecColumns[i] = imgSrc[i].size().w; + channelsVec[i] = fmt.numChannels(); + std::uniform_int_distribution udist(0, 255); + + srcVec[i].resize(imgSrc[i].size().h * srcRowStride); + goldVec[i].resize(imgSrc[i].size().h * srcRowStride); + dstVec[i].resize(imgSrc[i].size().h * srcRowStride); + std::generate(srcVec[i].begin(), srcVec[i].end(), [&]() { return udist(rng); }); + std::generate(goldVec[i].begin(), goldVec[i].end(), [&]() { return 0; }); + std::generate(dstVec[i].begin(), dstVec[i].end(), [&]() { return 0; }); + auto imgData = imgSrc[i].exportData(); + ASSERT_NE(imgData, nvcv::NullOpt); + + // Copy input data to the GPU + ASSERT_EQ(cudaSuccess, cudaMemcpy2DAsync(imgData->plane(0).basePtr, imgData->plane(0).rowStride, + srcVec[i].data(), srcRowStride, srcRowStride, imgSrc[i].size().h, + cudaMemcpyHostToDevice, stream)); + } - nvcv::ImageBatchVarShape batchSrc(numberOfImages); - batchSrc.pushBack(imgSrc.begin(), imgSrc.end()); + nvcv::ImageBatchVarShape batchSrc(numberOfImages); + batchSrc.pushBack(imgSrc.begin(), imgSrc.end()); - // Create output varshape - std::vector imgDst; - for (int i = 0; i < numberOfImages; ++i) - { - imgDst.emplace_back(imgSrc[i].size(), imgSrc[i].format()); - } - nvcv::ImageBatchVarShape batchDst(numberOfImages); - batchDst.pushBack(imgDst.begin(), imgDst.end()); + // Create output varshape + std::vector imgDst; + for (int i = 0; i < numberOfImages; ++i) + { + imgDst.emplace_back(imgSrc[i].size(), imgSrc[i].format()); + } + nvcv::ImageBatchVarShape batchDst(numberOfImages); + batchDst.pushBack(imgDst.begin(), imgDst.end()); - // Create diameter tensor - std::vector vDiameter(numberOfImages, diameter); - nvcv::Tensor diameterTensor({{numberOfImages}, "N"}, nvcv::TYPE_S32); - { - auto dev = diameterTensor.exportData(); - ASSERT_NE(dev, nullptr); + // Create diameter tensor + std::vector vDiameter(numberOfImages, diameter); + nvcv::Tensor diameterTensor({{numberOfImages}, "N"}, nvcv::TYPE_S32); + { + auto dev = diameterTensor.exportData(); + ASSERT_NE(dev, nullptr); - ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(dev->basePtr(), vDiameter.data(), vDiameter.size() * sizeof(int), - cudaMemcpyHostToDevice, stream)); - } + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(dev->basePtr(), vDiameter.data(), vDiameter.size() * sizeof(int), + cudaMemcpyHostToDevice, stream)); + } - // Create sigmaColor tensor - std::vector vSigmaColor(numberOfImages, sigmaColor); - nvcv::Tensor sigmaColorTensor({{numberOfImages}, "N"}, nvcv::TYPE_F32); - { - auto dev = sigmaColorTensor.exportData(); - ASSERT_NE(dev, nullptr); + // Create sigmaColor tensor + std::vector vSigmaColor(numberOfImages, sigmaColor); + nvcv::Tensor sigmaColorTensor({{numberOfImages}, "N"}, nvcv::TYPE_F32); + { + auto dev = sigmaColorTensor.exportData(); + ASSERT_NE(dev, nullptr); - ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(dev->basePtr(), vSigmaColor.data(), vSigmaColor.size() * sizeof(float), - cudaMemcpyHostToDevice, stream)); - } + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(dev->basePtr(), vSigmaColor.data(), + vSigmaColor.size() * sizeof(float), cudaMemcpyHostToDevice, stream)); + } - // Create sigmaSpace tensor - std::vector vSigmaSpace(numberOfImages, sigmaSpace); - nvcv::Tensor sigmaSpaceTensor({{numberOfImages}, "N"}, nvcv::TYPE_F32); - { - auto dev = sigmaSpaceTensor.exportData(); - ASSERT_NE(dev, nullptr); + // Create sigmaSpace tensor + std::vector vSigmaSpace(numberOfImages, sigmaSpace); + nvcv::Tensor 
sigmaSpaceTensor({{numberOfImages}, "N"}, nvcv::TYPE_F32); + { + auto dev = sigmaSpaceTensor.exportData(); + ASSERT_NE(dev, nullptr); - ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(dev->basePtr(), vSigmaSpace.data(), vSigmaSpace.size() * sizeof(float), - cudaMemcpyHostToDevice, stream)); - } + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(dev->basePtr(), vSigmaSpace.data(), + vSigmaSpace.size() * sizeof(float), cudaMemcpyHostToDevice, stream)); + } - // Create gold data - CPUBilateralFilterVarShape(srcVec, goldVec, srcVecColumns, srcVecRows, srcVecRowStride, vDiameter, vSigmaColor, - vSigmaSpace); + // Create gold data + CPUBilateralFilterVarShape(srcVec, goldVec, srcVecColumns, srcVecRows, srcVecRowStride, channelsVec, vDiameter, + vSigmaColor, vSigmaSpace); - // Run operator - cvcuda::BilateralFilter bilateralFilterOp; - EXPECT_NO_THROW(bilateralFilterOp(stream, batchSrc, batchDst, diameterTensor, sigmaColorTensor, sigmaSpaceTensor, - NVCV_BORDER_CONSTANT)); + // Run operator + cvcuda::BilateralFilter bilateralFilterOp; + EXPECT_NO_THROW(bilateralFilterOp(stream, batchSrc, batchDst, diameterTensor, sigmaColorTensor, + sigmaSpaceTensor, NVCV_BORDER_CONSTANT)); - // Retrieve data from GPU - for (int i = 0; i < numberOfImages; i++) - { - auto imgData = imgDst[i].exportData(); - ASSERT_NE(imgData, nvcv::NullOpt); + // Retrieve data from GPU + for (int i = 0; i < numberOfImages; i++) + { + auto imgData = imgDst[i].exportData(); + ASSERT_NE(imgData, nvcv::NullOpt); - // Copy input data to the GPU - ASSERT_EQ(cudaSuccess, cudaMemcpy2DAsync(dstVec[i].data(), srcVecRowStride[i], imgData->plane(0).basePtr, - imgData->plane(0).rowStride, srcVecRowStride[i], imgDst[i].size().h, - cudaMemcpyDeviceToHost, stream)); + // Copy input data to the GPU + ASSERT_EQ(cudaSuccess, cudaMemcpy2DAsync(dstVec[i].data(), srcVecRowStride[i], imgData->plane(0).basePtr, + imgData->plane(0).rowStride, srcVecRowStride[i], + imgDst[i].size().h, cudaMemcpyDeviceToHost, stream)); + } + EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + + // Compare data + ASSERT_TRUE(CompareVarShapes(dstVec, goldVec, srcVecColumns, srcVecRows, srcVecRowStride, channelsVec, 0.9f)); } - EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); - - // Compare data - ASSERT_TRUE(CompareVarShapes(dstVec, goldVec, srcVecColumns, srcVecRows, srcVecRowStride, 0.9f)); } + +#undef NVCV_IMAGE_FORMAT_2U8 diff --git a/tests/cvcuda/system/TestOpCvtColor.cpp b/tests/cvcuda/system/TestOpCvtColor.cpp index bb6bca005..833cfa92a 100644 --- a/tests/cvcuda/system/TestOpCvtColor.cpp +++ b/tests/cvcuda/system/TestOpCvtColor.cpp @@ -171,7 +171,7 @@ test::ValueList()}; nvcv::ImageFormat dstFormat{GetParamValue<4>()}; + // Waive the formats that have subsampled planes + if (srcFormat.chromaSubsampling() != nvcv::ChromaSubsampling::CSS_444 + || dstFormat.chromaSubsampling() != nvcv::ChromaSubsampling::CSS_444) + { + GTEST_SKIP() << "Waived the formats that have subsampled planes for OpCvtColor varshape test"; + } + NVCVDataType nvcvDataType; ASSERT_EQ(NVCV_SUCCESS, nvcvImageFormatGetPlaneDataType(srcFormat, 0, &nvcvDataType)); diff --git a/tests/cvcuda/system/TestOpErase.cpp b/tests/cvcuda/system/TestOpErase.cpp index 905a6f0d7..a7126dcbf 100644 --- a/tests/cvcuda/system/TestOpErase.cpp +++ b/tests/cvcuda/system/TestOpErase.cpp @@ -27,11 +27,19 @@ #include -NVCV_TEST_SUITE_P(OpErase, nvcv::test::ValueList{{1}, {2}}); +NVCV_TEST_SUITE_P(OpErase, nvcv::test::ValueList{ + {1, false}, + {2, false}, + {1, true}, + {2, true} +}); 
TEST_P(OpErase, correct_output) { - int N = GetParam(); + int N = GetParamValue<0>(); + bool random = GetParamValue<1>(); + int max_num_erasing_area = 2; + unsigned int seed = 0; cudaStream_t stream; EXPECT_EQ(cudaSuccess, cudaStreamCreate(&stream)); @@ -115,9 +123,6 @@ TEST_P(OpErase, correct_output) cudaMemcpyHostToDevice, stream)); // Call operator - unsigned int seed = 0; - bool random = false; - int max_num_erasing_area = 2; cvcuda::Erase eraseOp(max_num_erasing_area); EXPECT_NO_THROW(eraseOp(stream, imgIn, imgOut, anchor, erasing, values, imgIdx, random, seed)); @@ -126,17 +131,21 @@ TEST_P(OpErase, correct_output) std::vector test(outBufferSize, 0xA); //Check data + if (!random) + { + EXPECT_EQ(cudaSuccess, cudaMemcpy(test.data(), outData->basePtr(), outBufferSize, cudaMemcpyDeviceToHost)); + + EXPECT_EQ(test[0], 1); + EXPECT_EQ(test[9], 1); + EXPECT_EQ(test[10], 0); + EXPECT_EQ(test[9 * 640], 1); + EXPECT_EQ(test[9 * 640 + 9], 1); + EXPECT_EQ(test[9 * 640 + 10], 0); + EXPECT_EQ(test[10 * 640], 0); + EXPECT_EQ(test[10 * 640 + 10], 1); + } EXPECT_EQ(cudaSuccess, cudaMemcpy(test.data(), outData->basePtr(), outBufferSize, cudaMemcpyDeviceToHost)); - EXPECT_EQ(test[0], 1); - EXPECT_EQ(test[9], 1); - EXPECT_EQ(test[10], 0); - EXPECT_EQ(test[9 * 640], 1); - EXPECT_EQ(test[9 * 640 + 9], 1); - EXPECT_EQ(test[9 * 640 + 10], 0); - EXPECT_EQ(test[10 * 640], 0); - EXPECT_EQ(test[10 * 640 + 10], 1); - EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); } @@ -232,17 +241,21 @@ TEST(OpErase, OpErase_Varshape) std::vector test(dstHeight * dstRowStride, 0xFF); // Copy output data to Host - ASSERT_EQ(cudaSuccess, cudaMemcpy2D(test.data(), dstRowStride, dstData->plane(0).basePtr, - dstData->plane(0).rowStride, dstRowStride, dstHeight, cudaMemcpyDeviceToHost)); - - EXPECT_EQ(test[0], 1); - EXPECT_EQ(test[9], 1); - EXPECT_EQ(test[10], 0); - EXPECT_EQ(test[9 * 640], 1); - EXPECT_EQ(test[9 * 640 + 9], 1); - EXPECT_EQ(test[9 * 640 + 10], 0); - EXPECT_EQ(test[10 * 640], 0); - EXPECT_EQ(test[10 * 640 + 10], 1); + if (!random) + { + ASSERT_EQ(cudaSuccess, + cudaMemcpy2D(test.data(), dstRowStride, dstData->plane(0).basePtr, dstData->plane(0).rowStride, + dstRowStride, dstHeight, cudaMemcpyDeviceToHost)); + + EXPECT_EQ(test[0], 1); + EXPECT_EQ(test[9], 1); + EXPECT_EQ(test[10], 0); + EXPECT_EQ(test[9 * 640], 1); + EXPECT_EQ(test[9 * 640 + 9], 1); + EXPECT_EQ(test[9 * 640 + 10], 0); + EXPECT_EQ(test[10 * 640], 0); + EXPECT_EQ(test[10 * 640 + 10], 1); + } EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); } diff --git a/tests/cvcuda/system/TestOpJointBilateralFilter.cpp b/tests/cvcuda/system/TestOpJointBilateralFilter.cpp index 44b0a4eea..2be0529bf 100644 --- a/tests/cvcuda/system/TestOpJointBilateralFilter.cpp +++ b/tests/cvcuda/system/TestOpJointBilateralFilter.cpp @@ -29,6 +29,8 @@ #include #include +#define NVCV_IMAGE_FORMAT_2U8 NVCV_DETAIL_MAKE_NONCOLOR_FMT1(PL, UNSIGNED, XY00, ASSOCIATED, X8_Y8) + namespace gt = ::testing; namespace test = nvcv::test; @@ -37,21 +39,26 @@ static uint32_t saturate_cast(float n) return static_cast(std::min(255.0f, std::round(n))); } -static bool CompareImages(uint8_t *pTest, uint8_t *pGold, size_t columns, size_t rows, size_t rowStride, float delta) +static bool CompareImages(uint8_t *pTest, uint8_t *pGold, size_t columns, size_t rows, size_t rowStride, + size_t channels, float delta) { for (size_t j = 0; j < rows; j++) { for (size_t k = 0; k < columns; k++) { - size_t offset = j * rowStride + k; - float diff = std::abs(static_cast(pTest[offset]) - static_cast(pGold[offset])); 
- if (diff > delta) + for (size_t c = 0; c < channels; ++c) { - std::cout << " o = " << offset << " j = " << j << " k = " << k << " rowS = " << rowStride << std::endl; - std::cout << " test = " << static_cast(pTest[offset]) - << " gold = " << static_cast(pGold[offset]) << std::endl; + size_t offset = j * rowStride + k * channels + c; + float diff = std::abs(static_cast(pTest[offset]) - static_cast(pGold[offset])); + if (diff > delta) + { + std::cout << " o = " << offset << " j = " << j << " k = " << k << " rowS = " << rowStride + << std::endl; + std::cout << " test = " << static_cast(pTest[offset]) + << " gold = " << static_cast(pGold[offset]) << std::endl; - return false; + return false; + } } } } @@ -59,13 +66,13 @@ static bool CompareImages(uint8_t *pTest, uint8_t *pGold, size_t columns, size_t } static bool CompareTensors(std::vector &vTest, std::vector &vGold, size_t columns, size_t rows, - size_t batch, size_t rowStride, size_t sampleStride, float delta) + size_t batch, size_t rowStride, size_t channels, size_t sampleStride, float delta) { for (size_t i = 0; i < batch; i++) { uint8_t *pTest = vTest.data() + i * sampleStride; uint8_t *pGold = vGold.data() + i * sampleStride; - if (!CompareImages(pTest, pGold, columns, rows, rowStride, delta)) + if (!CompareImages(pTest, pGold, columns, rows, rowStride, channels, delta)) return false; } return true; @@ -73,11 +80,11 @@ static bool CompareTensors(std::vector &vTest, std::vector &vG static bool CompareVarShapes(std::vector> &vTest, std::vector> &vGold, std::vector &vColumns, std::vector &vRows, std::vector &vRowStride, - float delta) + std::vector &vChannels, float delta) { for (size_t i = 0; i < vTest.size(); i++) { - if (!CompareImages(vTest[i].data(), vGold[i].data(), vColumns[i], vRows[i], vRowStride[i], delta)) + if (!CompareImages(vTest[i].data(), vGold[i].data(), vColumns[i], vRows[i], vRowStride[i], vChannels[i], delta)) { return false; } @@ -86,16 +93,21 @@ static bool CompareVarShapes(std::vector> &vTest, std::vect } static void CPUJointBilateralFilter(uint8_t *pIn, uint8_t *pInColor, uint8_t *pOut, int columns, int rows, - int rowStride, int radius, float colorCoefficient, float spaceCoefficient) + int rowStride, int channels, int radius, float colorCoefficient, + float spaceCoefficient) { float radiusSquared = radius * radius; for (int j = 0; j < rows; j++) { for (int k = 0; k < columns; k++) { - float numerator = 0; - float denominator = 0; - float centerColor = static_cast(pInColor[j * rowStride + k]); + std::vector numerators(channels, 0.0f); + float denominator = 0; + std::vector centerColors{static_cast(pInColor[j * rowStride + k * channels]), + static_cast(pInColor[j * rowStride + k * channels + 1]), + static_cast(pInColor[j * rowStride + k * channels + 2]), + static_cast(pInColor[j * rowStride + k * channels + 3])}; + for (int y = j - radius; y <= j + radius; y++) { for (int x = k - radius; x <= k + radius; x++) @@ -103,30 +115,50 @@ static void CPUJointBilateralFilter(uint8_t *pIn, uint8_t *pInColor, uint8_t *pO float distanceSquared = (k - x) * (k - x) + (j - y) * (j - y); if (distanceSquared <= radiusSquared) { - float pixel = ((x >= 0) && (x < columns) && (y >= 0) && (y < rows)) - ? static_cast(pIn[y * rowStride + x]) - : 0.0f; - float pixelColor = ((x >= 0) && (x < columns) && (y >= 0) && (y < rows)) - ? 
static_cast(pInColor[y * rowStride + x]) - : 0.0f; - float e_space = distanceSquared * spaceCoefficient; - float one_norm_size = std::abs(pixelColor - centerColor); - float e_color = one_norm_size * one_norm_size * colorCoefficient; - float weight = std::exp(e_space + e_color); + std::vector pixels; + std::vector pixelColors; + for (auto c = 0; c < channels; ++c) + { + float pixel = ((x >= 0) && (x < columns) && (y >= 0) && (y < rows)) + ? static_cast(pIn[y * rowStride + x * channels + c]) + : 0.0f; + float pixelColor = ((x >= 0) && (x < columns) && (y >= 0) && (y < rows)) + ? static_cast(pInColor[y * rowStride + x * channels + c]) + : 0.0f; + pixels.emplace_back(pixel); + pixelColors.emplace_back(pixelColor); + } + + float e_space = distanceSquared * spaceCoefficient; + float e_color = 0.0f; + for (auto c = 0; c < channels; ++c) + { + e_color += std::abs(pixelColors[c] - centerColors[c]); + } + e_color = e_color * e_color * colorCoefficient; + + float weight = std::exp(e_space + e_color); denominator += weight; - numerator += weight * pixel; + for (auto c = 0; c < channels; ++c) + { + numerators[c] += weight * pixels[c]; + } } } } - denominator = (denominator != 0) ? denominator : 1.0f; - pOut[j * rowStride + k] = saturate_cast(numerator / denominator); + denominator = (denominator != 0) ? denominator : 1.0f; + for (auto c = 0; c < channels; ++c) + { + pOut[j * rowStride + k * channels + c] = saturate_cast(numerators[c] / denominator); + } } } } static void CPUJointBilateralFilterTensor(std::vector &vIn, std::vector &vInColor, std::vector &vOut, int columns, int rows, int batch, int rowStride, - int sampleStride, int diameter, float sigmaColor, float sigmaSpace) + int channels, int sampleStride, int diameter, float sigmaColor, + float sigmaSpace) { int radius = diameter / 2; float spaceCoefficient = -1 / (2 * sigmaSpace * sigmaSpace); @@ -136,7 +168,7 @@ static void CPUJointBilateralFilterTensor(std::vector &vIn, std::vector uint8_t *pIn = vIn.data() + i * sampleStride; uint8_t *pInColor = vInColor.data() + i * sampleStride; uint8_t *pOut = vOut.data() + i * sampleStride; - CPUJointBilateralFilter(pIn, pInColor, pOut, columns, rows, rowStride, radius, colorCoefficient, + CPUJointBilateralFilter(pIn, pInColor, pOut, columns, rows, rowStride, channels, radius, colorCoefficient, spaceCoefficient); } } @@ -145,8 +177,8 @@ static void CPUJointBilateralFilterVarShape(std::vector> &v std::vector> &vInColor, std::vector> &vOut, std::vector &vColumns, std::vector &vRows, std::vector &vRowStride, - std::vector &vDiameter, std::vector &vSigmaColor, - std::vector &vSigmaSpace) + std::vector &vChannels, std::vector &vDiameter, + std::vector &vSigmaColor, std::vector &vSigmaSpace) { for (size_t i = 0; i < vIn.size(); i++) { @@ -154,7 +186,7 @@ static void CPUJointBilateralFilterVarShape(std::vector> &v float spaceCoefficient = -1 / (2 * vSigmaSpace[i] * vSigmaSpace[i]); float colorCoefficient = -1 / (2 * vSigmaColor[i] * vSigmaColor[i]); CPUJointBilateralFilter(vIn[i].data(), vInColor[i].data(), vOut[i].data(), vColumns[i], vRows[i], vRowStride[i], - radius, colorCoefficient, spaceCoefficient); + vChannels[i], radius, colorCoefficient, spaceCoefficient); } } @@ -185,208 +217,222 @@ TEST_P(OpJointBilateralFilter, JointBilateralFilter_packed) { cudaStream_t stream; EXPECT_EQ(cudaSuccess, cudaStreamCreate(&stream)); - int width = GetParamValue<0>(); - int height = GetParamValue<1>(); - int d = GetParamValue<2>(); - float sigmaColor = GetParamValue<3>(); - float sigmaSpace = GetParamValue<4>(); - int 
numberOfImages = GetParamValue<5>(); - - nvcv::Tensor imgOut = nvcv::util::CreateTensor(numberOfImages, width, height, nvcv::FMT_U8); - nvcv::Tensor imgIn = nvcv::util::CreateTensor(numberOfImages, width, height, nvcv::FMT_U8); - nvcv::Tensor imgInColor = nvcv::util::CreateTensor(numberOfImages, width, height, nvcv::FMT_U8); - - auto inData = imgIn.exportData(); - auto inColorData = imgInColor.exportData(); - auto outData = imgOut.exportData(); - - ASSERT_NE(nullptr, inData); - ASSERT_NE(nullptr, inColorData); - ASSERT_NE(nullptr, outData); - - auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*inData); - ASSERT_TRUE(inAccess); - - auto inColorAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*inColorData); - ASSERT_TRUE(inColorAccess); - - auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*outData); - ASSERT_TRUE(outAccess); - - int inSampleStride = inAccess->numRows() * inAccess->rowStride(); - int inColorSampleStride = inColorAccess->numRows() * inColorAccess->rowStride(); - int outSampleStride = outAccess->numRows() * outAccess->rowStride(); - - int inBufSize = inSampleStride * inAccess->numSamples(); - int inColorBufSize = inColorSampleStride * inColorAccess->numSamples(); - int outBufSize = outSampleStride * outAccess->numSamples(); - - std::vector vIn(inBufSize); - std::vector vInColor(inColorBufSize); - std::vector vOut(outBufSize); - - std::vector inGold(inBufSize, 0); - std::vector inColorGold(inColorBufSize, 0); - std::vector outGold(outBufSize, 0); - for (int i = 0; i < inBufSize; i++) inGold[i] = i % 113; // Use prime number to prevent weird tiling patterns - for (int i = 0; i < inColorBufSize; i++) - inColorGold[i] = i % 109; // Use prime number to prevent weird tiling patterns - EXPECT_EQ(cudaSuccess, cudaMemcpy(inData->basePtr(), inGold.data(), inBufSize, cudaMemcpyHostToDevice)); - EXPECT_EQ(cudaSuccess, - cudaMemcpy(inColorData->basePtr(), inColorGold.data(), inColorBufSize, cudaMemcpyHostToDevice)); - CPUJointBilateralFilterTensor(inGold, inColorGold, outGold, inAccess->numCols(), inAccess->numRows(), - inAccess->numSamples(), inAccess->rowStride(), inSampleStride, d, sigmaColor, - sigmaSpace); - - // run operator - cvcuda::JointBilateralFilter jointBilateralFilterOp; - - EXPECT_NO_THROW( - jointBilateralFilterOp(stream, imgIn, imgInColor, imgOut, d, sigmaColor, sigmaSpace, NVCV_BORDER_CONSTANT)); - - // check cdata - std::vector outTest(outBufSize); - - EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); - EXPECT_EQ(cudaSuccess, cudaMemcpy(outTest.data(), outData->basePtr(), outBufSize, cudaMemcpyDeviceToHost)); + int width = GetParamValue<0>(); + int height = GetParamValue<1>(); + int d = GetParamValue<2>(); + float sigmaColor = GetParamValue<3>(); + float sigmaSpace = GetParamValue<4>(); + int numberOfImages = GetParamValue<5>(); + std::vector fmts{nvcv::FMT_U8, nvcv::ImageFormat{NVCV_IMAGE_FORMAT_2U8}, nvcv::FMT_RGB8, + nvcv::FMT_RGBA8}; + + for (nvcv::ImageFormat fmt : fmts) + { + nvcv::Tensor imgOut = nvcv::util::CreateTensor(numberOfImages, width, height, fmt); + nvcv::Tensor imgIn = nvcv::util::CreateTensor(numberOfImages, width, height, fmt); + nvcv::Tensor imgInColor = nvcv::util::CreateTensor(numberOfImages, width, height, fmt); + const int channels = fmt.numChannels(); + + auto inData = imgIn.exportData(); + auto inColorData = imgInColor.exportData(); + auto outData = imgOut.exportData(); + + ASSERT_NE(nullptr, inData); + ASSERT_NE(nullptr, inColorData); + ASSERT_NE(nullptr, outData); + + auto inAccess = 
nvcv::TensorDataAccessStridedImagePlanar::Create(*inData); + ASSERT_TRUE(inAccess); + + auto inColorAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*inColorData); + ASSERT_TRUE(inColorAccess); + + auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*outData); + ASSERT_TRUE(outAccess); + + int inSampleStride = inAccess->numRows() * inAccess->rowStride(); + int inColorSampleStride = inColorAccess->numRows() * inColorAccess->rowStride(); + int outSampleStride = outAccess->numRows() * outAccess->rowStride(); + + int inBufSize = inSampleStride * inAccess->numSamples(); + int inColorBufSize = inColorSampleStride * inColorAccess->numSamples(); + int outBufSize = outSampleStride * outAccess->numSamples(); + + std::vector vIn(inBufSize); + std::vector vInColor(inColorBufSize); + std::vector vOut(outBufSize); + + std::vector inGold(inBufSize, 0); + std::vector inColorGold(inColorBufSize, 0); + std::vector outGold(outBufSize, 0); + for (int i = 0; i < inBufSize; i++) inGold[i] = i % 113; // Use prime number to prevent weird tiling patterns + for (int i = 0; i < inColorBufSize; i++) + inColorGold[i] = i % 109; // Use prime number to prevent weird tiling patterns + EXPECT_EQ(cudaSuccess, cudaMemcpy(inData->basePtr(), inGold.data(), inBufSize, cudaMemcpyHostToDevice)); + EXPECT_EQ(cudaSuccess, + cudaMemcpy(inColorData->basePtr(), inColorGold.data(), inColorBufSize, cudaMemcpyHostToDevice)); + CPUJointBilateralFilterTensor(inGold, inColorGold, outGold, inAccess->numCols(), inAccess->numRows(), + inAccess->numSamples(), inAccess->rowStride(), channels, inSampleStride, d, + sigmaColor, sigmaSpace); + + // run operator + cvcuda::JointBilateralFilter jointBilateralFilterOp; + + EXPECT_NO_THROW( + jointBilateralFilterOp(stream, imgIn, imgInColor, imgOut, d, sigmaColor, sigmaSpace, NVCV_BORDER_CONSTANT)); + + // check cdata + std::vector outTest(outBufSize); + + EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + EXPECT_EQ(cudaSuccess, cudaMemcpy(outTest.data(), outData->basePtr(), outBufSize, cudaMemcpyDeviceToHost)); + ASSERT_TRUE(CompareTensors(outTest, outGold, inAccess->numCols(), inAccess->numRows(), inAccess->numSamples(), + inAccess->rowStride(), channels, inSampleStride, 0.9f)); + } EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); - ASSERT_TRUE(CompareTensors(outTest, outGold, inAccess->numCols(), inAccess->numRows(), inAccess->numSamples(), - inAccess->rowStride(), inSampleStride, 0.9f)); } TEST_P(OpJointBilateralFilter, JointBilateralFilter_VarShape) { cudaStream_t stream; EXPECT_EQ(cudaSuccess, cudaStreamCreate(&stream)); - int width = GetParamValue<0>(); - int height = GetParamValue<1>(); - int diameter = GetParamValue<2>(); - float sigmaColor = GetParamValue<3>(); - float sigmaSpace = GetParamValue<4>(); - int numberOfImages = GetParamValue<5>(); - nvcv::ImageFormat format{NVCV_IMAGE_FORMAT_U8}; - - // Create input varshape - std::default_random_engine rng; - std::uniform_int_distribution udistWidth(width * 0.8, width * 1.1); - std::uniform_int_distribution udistHeight(height * 0.8, height * 1.1); - - std::vector imgSrc; - std::vector imgSrcColor; - - std::vector> srcVec(numberOfImages); - std::vector> srcColorVec(numberOfImages); - std::vector srcVecRowStride(numberOfImages); - std::vector srcVecRows(numberOfImages); - std::vector srcVecColumns(numberOfImages); - std::vector> goldVec(numberOfImages); - std::vector> dstVec(numberOfImages); - for (int i = 0; i < numberOfImages; ++i) + int width = GetParamValue<0>(); + int height = GetParamValue<1>(); + int diameter = 
GetParamValue<2>(); + float sigmaColor = GetParamValue<3>(); + float sigmaSpace = GetParamValue<4>(); + int numberOfImages = GetParamValue<5>(); + std::vector fmts{nvcv::FMT_U8, nvcv::ImageFormat{NVCV_IMAGE_FORMAT_2U8}, nvcv::FMT_RGB8, + nvcv::FMT_RGBA8}; + + for (nvcv::ImageFormat fmt : fmts) { - int w = udistWidth(rng); - int h = udistHeight(rng); - nvcv::Size2D sz(w, h); - imgSrc.emplace_back(sz, format); - imgSrcColor.emplace_back(sz, format); - int srcRowStride = imgSrc[i].size().w * format.planePixelStrideBytes(0); - srcVecRowStride[i] = srcRowStride; - srcVecRows[i] = imgSrc[i].size().h; - srcVecColumns[i] = imgSrc[i].size().w; - std::uniform_int_distribution udist(0, 255); - - srcVec[i].resize(imgSrc[i].size().h * srcRowStride); - srcColorVec[i].resize(imgSrcColor[i].size().h * srcRowStride); - goldVec[i].resize(imgSrc[i].size().h * srcRowStride); - dstVec[i].resize(imgSrc[i].size().h * srcRowStride); - std::generate(srcVec[i].begin(), srcVec[i].end(), [&]() { return udist(rng); }); - std::generate(srcColorVec[i].begin(), srcColorVec[i].end(), [&]() { return udist(rng); }); - std::generate(goldVec[i].begin(), goldVec[i].end(), [&]() { return 0; }); - std::generate(dstVec[i].begin(), dstVec[i].end(), [&]() { return 0; }); - auto imgData = imgSrc[i].exportData(); - ASSERT_NE(imgData, nvcv::NullOpt); - auto imgColorData = imgSrcColor[i].exportData(); - ASSERT_NE(imgColorData, nvcv::NullOpt); - - // Copy input data to the GPU - ASSERT_EQ(cudaSuccess, - cudaMemcpy2DAsync(imgData->plane(0).basePtr, imgData->plane(0).rowStride, srcVec[i].data(), - srcRowStride, srcRowStride, imgSrc[i].size().h, cudaMemcpyHostToDevice, stream)); - - ASSERT_EQ(cudaSuccess, cudaMemcpy2DAsync(imgColorData->plane(0).basePtr, imgColorData->plane(0).rowStride, - srcColorVec[i].data(), srcRowStride, srcRowStride, - imgSrcColor[i].size().h, cudaMemcpyHostToDevice, stream)); - } + // Create input varshape + std::default_random_engine rng; + std::uniform_int_distribution udistWidth(width * 0.8, width * 1.1); + std::uniform_int_distribution udistHeight(height * 0.8, height * 1.1); + + std::vector imgSrc; + std::vector imgSrcColor; + + std::vector> srcVec(numberOfImages); + std::vector> srcColorVec(numberOfImages); + std::vector srcVecRowStride(numberOfImages); + std::vector srcVecRows(numberOfImages); + std::vector srcVecColumns(numberOfImages); + std::vector channelsVec(numberOfImages); + std::vector> goldVec(numberOfImages); + std::vector> dstVec(numberOfImages); + for (int i = 0; i < numberOfImages; ++i) + { + int w = udistWidth(rng); + int h = udistHeight(rng); + nvcv::Size2D sz(w, h); + imgSrc.emplace_back(sz, fmt); + imgSrcColor.emplace_back(sz, fmt); + int srcRowStride = imgSrc[i].size().w * fmt.planePixelStrideBytes(0); + srcVecRowStride[i] = srcRowStride; + srcVecRows[i] = imgSrc[i].size().h; + srcVecColumns[i] = imgSrc[i].size().w; + channelsVec[i] = fmt.numChannels(); + std::uniform_int_distribution udist(0, 255); + + srcVec[i].resize(imgSrc[i].size().h * srcRowStride); + srcColorVec[i].resize(imgSrcColor[i].size().h * srcRowStride); + goldVec[i].resize(imgSrc[i].size().h * srcRowStride); + dstVec[i].resize(imgSrc[i].size().h * srcRowStride); + std::generate(srcVec[i].begin(), srcVec[i].end(), [&]() { return udist(rng); }); + std::generate(srcColorVec[i].begin(), srcColorVec[i].end(), [&]() { return udist(rng); }); + std::generate(goldVec[i].begin(), goldVec[i].end(), [&]() { return 0; }); + std::generate(dstVec[i].begin(), dstVec[i].end(), [&]() { return 0; }); + auto imgData = imgSrc[i].exportData(); + 
ASSERT_NE(imgData, nvcv::NullOpt); + auto imgColorData = imgSrcColor[i].exportData(); + ASSERT_NE(imgColorData, nvcv::NullOpt); + + // Copy input data to the GPU + ASSERT_EQ(cudaSuccess, cudaMemcpy2DAsync(imgData->plane(0).basePtr, imgData->plane(0).rowStride, + srcVec[i].data(), srcRowStride, srcRowStride, imgSrc[i].size().h, + cudaMemcpyHostToDevice, stream)); + + ASSERT_EQ(cudaSuccess, cudaMemcpy2DAsync(imgColorData->plane(0).basePtr, imgColorData->plane(0).rowStride, + srcColorVec[i].data(), srcRowStride, srcRowStride, + imgSrcColor[i].size().h, cudaMemcpyHostToDevice, stream)); + } - nvcv::ImageBatchVarShape batchSrc(numberOfImages); - batchSrc.pushBack(imgSrc.begin(), imgSrc.end()); - nvcv::ImageBatchVarShape batchSrcColor(numberOfImages); - batchSrcColor.pushBack(imgSrcColor.begin(), imgSrcColor.end()); + nvcv::ImageBatchVarShape batchSrc(numberOfImages); + batchSrc.pushBack(imgSrc.begin(), imgSrc.end()); + nvcv::ImageBatchVarShape batchSrcColor(numberOfImages); + batchSrcColor.pushBack(imgSrcColor.begin(), imgSrcColor.end()); - // Create output varshape - std::vector imgDst; - for (int i = 0; i < numberOfImages; ++i) - { - imgDst.emplace_back(imgSrc[i].size(), imgSrc[i].format()); - } - nvcv::ImageBatchVarShape batchDst(numberOfImages); - batchDst.pushBack(imgDst.begin(), imgDst.end()); + // Create output varshape + std::vector imgDst; + for (int i = 0; i < numberOfImages; ++i) + { + imgDst.emplace_back(imgSrc[i].size(), imgSrc[i].format()); + } + nvcv::ImageBatchVarShape batchDst(numberOfImages); + batchDst.pushBack(imgDst.begin(), imgDst.end()); - // Create diameter tensor - std::vector vDiameter(numberOfImages, diameter); - nvcv::Tensor diameterTensor({{numberOfImages}, "N"}, nvcv::TYPE_S32); - { - auto dev = diameterTensor.exportData(); - ASSERT_NE(dev, nullptr); + // Create diameter tensor + std::vector vDiameter(numberOfImages, diameter); + nvcv::Tensor diameterTensor({{numberOfImages}, "N"}, nvcv::TYPE_S32); + { + auto dev = diameterTensor.exportData(); + ASSERT_NE(dev, nullptr); - ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(dev->basePtr(), vDiameter.data(), vDiameter.size() * sizeof(int), - cudaMemcpyHostToDevice, stream)); - } + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(dev->basePtr(), vDiameter.data(), vDiameter.size() * sizeof(int), + cudaMemcpyHostToDevice, stream)); + } - // Create sigmaColor tensor - std::vector vSigmaColor(numberOfImages, sigmaColor); - nvcv::Tensor sigmaColorTensor({{numberOfImages}, "N"}, nvcv::TYPE_F32); - { - auto dev = sigmaColorTensor.exportData(); - ASSERT_NE(dev, nullptr); + // Create sigmaColor tensor + std::vector vSigmaColor(numberOfImages, sigmaColor); + nvcv::Tensor sigmaColorTensor({{numberOfImages}, "N"}, nvcv::TYPE_F32); + { + auto dev = sigmaColorTensor.exportData(); + ASSERT_NE(dev, nullptr); - ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(dev->basePtr(), vSigmaColor.data(), vSigmaColor.size() * sizeof(float), - cudaMemcpyHostToDevice, stream)); - } + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(dev->basePtr(), vSigmaColor.data(), + vSigmaColor.size() * sizeof(float), cudaMemcpyHostToDevice, stream)); + } - // Create sigmaSpace tensor - std::vector vSigmaSpace(numberOfImages, sigmaSpace); - nvcv::Tensor sigmaSpaceTensor({{numberOfImages}, "N"}, nvcv::TYPE_F32); - { - auto dev = sigmaSpaceTensor.exportData(); - ASSERT_NE(dev, nullptr); + // Create sigmaSpace tensor + std::vector vSigmaSpace(numberOfImages, sigmaSpace); + nvcv::Tensor sigmaSpaceTensor({{numberOfImages}, "N"}, nvcv::TYPE_F32); + { + auto dev = sigmaSpaceTensor.exportData(); + 
ASSERT_NE(dev, nullptr); - ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(dev->basePtr(), vSigmaSpace.data(), vSigmaSpace.size() * sizeof(float), - cudaMemcpyHostToDevice, stream)); - } + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(dev->basePtr(), vSigmaSpace.data(), + vSigmaSpace.size() * sizeof(float), cudaMemcpyHostToDevice, stream)); + } - // Create gold data - CPUJointBilateralFilterVarShape(srcVec, srcColorVec, goldVec, srcVecColumns, srcVecRows, srcVecRowStride, vDiameter, - vSigmaColor, vSigmaSpace); + // Create gold data + CPUJointBilateralFilterVarShape(srcVec, srcColorVec, goldVec, srcVecColumns, srcVecRows, srcVecRowStride, + channelsVec, vDiameter, vSigmaColor, vSigmaSpace); - // Run operator - cvcuda::JointBilateralFilter jointBilateralFilterOp; - EXPECT_NO_THROW(jointBilateralFilterOp(stream, batchSrc, batchSrcColor, batchDst, diameterTensor, sigmaColorTensor, - sigmaSpaceTensor, NVCV_BORDER_CONSTANT)); + // Run operator + cvcuda::JointBilateralFilter jointBilateralFilterOp; + EXPECT_NO_THROW(jointBilateralFilterOp(stream, batchSrc, batchSrcColor, batchDst, diameterTensor, + sigmaColorTensor, sigmaSpaceTensor, NVCV_BORDER_CONSTANT)); - // Retrieve data from GPU - for (int i = 0; i < numberOfImages; i++) - { - auto imgData = imgDst[i].exportData(); - ASSERT_NE(imgData, nvcv::NullOpt); + // Retrieve data from GPU + for (int i = 0; i < numberOfImages; i++) + { + auto imgData = imgDst[i].exportData(); + ASSERT_NE(imgData, nvcv::NullOpt); - // Copy input data to the GPU - ASSERT_EQ(cudaSuccess, cudaMemcpy2DAsync(dstVec[i].data(), srcVecRowStride[i], imgData->plane(0).basePtr, - imgData->plane(0).rowStride, srcVecRowStride[i], imgDst[i].size().h, - cudaMemcpyDeviceToHost, stream)); + // Copy input data to the GPU + ASSERT_EQ(cudaSuccess, cudaMemcpy2DAsync(dstVec[i].data(), srcVecRowStride[i], imgData->plane(0).basePtr, + imgData->plane(0).rowStride, srcVecRowStride[i], + imgDst[i].size().h, cudaMemcpyDeviceToHost, stream)); + } + EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + + // Compare data + ASSERT_TRUE(CompareVarShapes(dstVec, goldVec, srcVecColumns, srcVecRows, srcVecRowStride, channelsVec, 1.0f)); } - EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); - - // Compare data - ASSERT_TRUE(CompareVarShapes(dstVec, goldVec, srcVecColumns, srcVecRows, srcVecRowStride, 1.0f)); } + +#undef NVCV_IMAGE_FORMAT_2U8 diff --git a/tests/cvcuda/system/TestOpPillowResize.cpp b/tests/cvcuda/system/TestOpPillowResize.cpp index 4f276dba6..4cc2c46f2 100644 --- a/tests/cvcuda/system/TestOpPillowResize.cpp +++ b/tests/cvcuda/system/TestOpPillowResize.cpp @@ -1091,11 +1091,14 @@ void StartVarShapeTest(int srcWidthBase, int srcHeightBase, int dstWidthBase, in std::uniform_int_distribution rndDstWidth(dstWidthBase * 0.8, dstWidthBase * 1.1); std::uniform_int_distribution rndDstHeight(dstHeightBase * 0.8, dstHeightBase * 1.1); - std::vector imgSrc, imgDst; + std::vector imgSrc, imgDst; + std::vector srcSizes, dstSizes; for (int i = 0; i < numberOfImages; ++i) { imgSrc.emplace_back(nvcv::Size2D{rndSrcWidth(randEng), rndSrcHeight(randEng)}, fmt); imgDst.emplace_back(nvcv::Size2D{rndDstWidth(randEng), rndDstHeight(randEng)}, fmt); + srcSizes.emplace_back(imgSrc.back().size()); + dstSizes.emplace_back(imgDst.back().size()); } nvcv::ImageBatchVarShape batchSrc(numberOfImages); @@ -1142,14 +1145,11 @@ void StartVarShapeTest(int srcWidthBase, int srcHeightBase, int dstWidthBase, in srcHeight, cudaMemcpyHostToDevice)); } - nvcv::Size2D maxSrcSize = 
batchSrc.maxSize(); - nvcv::Size2D maxDstSize = batchDst.maxSize(); - // Generate test result cvcuda::PillowResize pillowResizeOp; cvcuda::UniqueWorkspace ws = cvcuda::AllocateWorkspace( - pillowResizeOp.getWorkspaceRequirements(numberOfImages, maxSrcSize, maxDstSize, fmt)); + pillowResizeOp.getWorkspaceRequirements(numberOfImages, srcSizes.data(), dstSizes.data(), fmt)); EXPECT_NO_THROW(pillowResizeOp(stream, ws.get(), batchSrc, batchDst, interpolation)); // Get test data back @@ -1218,3 +1218,18 @@ TEST_P(OpPillowResize, varshape_correct_output) else if (nvcv::FMT_RGBf32 == fmt || nvcv::FMT_RGBAf32 == fmt) StartVarShapeTest(srcWidth, srcHeight, dstWidth, dstHeight, interpolation, numberOfImages, fmt); } + +TEST(OpPillowResize, invalidGetWorkSpace) +{ + NVCVOperatorHandle pillowResizeHandle; + ASSERT_EQ(NVCV_SUCCESS, cvcudaPillowResizeCreate(&pillowResizeHandle)); + NVCVSize2D inputSizesWH[1] = { + {224, 224} + }; + NVCVSize2D outputSizesWH[1] = { + {112, 112} + }; + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + cvcudaPillowResizeVarShapeGetWorkspaceRequirements(pillowResizeHandle, 1, inputSizesWH, outputSizesWH, + NVCV_IMAGE_FORMAT_U8, nullptr)); +} diff --git a/tests/cvcuda/system/TestOpResize.cpp b/tests/cvcuda/system/TestOpResize.cpp index 26140de44..eef44e285 100644 --- a/tests/cvcuda/system/TestOpResize.cpp +++ b/tests/cvcuda/system/TestOpResize.cpp @@ -37,20 +37,19 @@ namespace test = nvcv::test; namespace t = ::testing; // clang-format off - NVCV_TEST_SUITE_P(OpResize, test::ValueList { // srcWidth, srcHeight, dstWidth, dstHeight, interpolation, numberImages - { 42, 48, 23, 24, NVCV_INTERP_NEAREST, 1}, - { 113, 12, 12, 36, NVCV_INTERP_NEAREST, 1}, - { 421, 148, 223, 124, NVCV_INTERP_NEAREST, 2}, - { 313, 212, 412, 336, NVCV_INTERP_NEAREST, 3}, - { 42, 40, 21, 20, NVCV_INTERP_LINEAR, 1}, - { 21, 21, 42, 42, NVCV_INTERP_LINEAR, 1}, - { 420, 420, 210, 210, NVCV_INTERP_LINEAR, 4}, - { 210, 210, 420, 420, NVCV_INTERP_LINEAR, 5}, - { 42, 40, 21, 20, NVCV_INTERP_CUBIC, 1}, - { 21, 21, 42, 42, NVCV_INTERP_CUBIC, 6}, + { 42, 48, 23, 24, NVCV_INTERP_NEAREST, 1}, + { 113, 12, 12, 36, NVCV_INTERP_NEAREST, 1}, + { 421, 148, 223, 124, NVCV_INTERP_NEAREST, 2}, + { 313, 212, 412, 336, NVCV_INTERP_NEAREST, 3}, + { 42, 40, 21, 20, NVCV_INTERP_LINEAR, 1}, + { 21, 21, 42, 42, NVCV_INTERP_LINEAR, 1}, + { 420, 420, 210, 210, NVCV_INTERP_LINEAR, 4}, + { 210, 210, 420, 420, NVCV_INTERP_LINEAR, 5}, + { 42, 40, 21, 20, NVCV_INTERP_CUBIC, 1}, + { 21, 21, 42, 42, NVCV_INTERP_CUBIC, 6}, { 420, 420, 420, 420, NVCV_INTERP_CUBIC, 2}, { 420, 420, 420, 420, NVCV_INTERP_CUBIC, 1}, { 420, 420, 40, 42, NVCV_INTERP_CUBIC, 1}, @@ -59,9 +58,17 @@ NVCV_TEST_SUITE_P(OpResize, test::ValueList mae(testVec.size()); + for (size_t i = 0; i < mae.size(); ++i) + { + mae[i] = abs(static_cast(goldVec[i]) - static_cast(testVec[i])); + } + + int maeThreshold = 1; + + EXPECT_THAT(mae, t::Each(t::Le(maeThreshold))); } } diff --git a/tests/cvcuda/system/TestOpResizeCropConvertReformat.cpp b/tests/cvcuda/system/TestOpResizeCropConvertReformat.cpp new file mode 100644 index 000000000..81208d38a --- /dev/null +++ b/tests/cvcuda/system/TestOpResizeCropConvertReformat.cpp @@ -0,0 +1,491 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Definitions.hpp" +#include "ResizeUtils.hpp" + +#include +#include +// #include +#include +#include +#include +#include +#include +#include +#include // for NVCVInterpolationType, NVCVChannelManip, etc. +#include +#include +#include +#include + +#include +#include +#include + +namespace gt = ::testing; +namespace test = nvcv::test; +namespace cuda = nvcv::cuda; +namespace ttype = test::type; + +static std::default_random_engine randEng(std::random_device{}()); + +template +using uniform_dist + = std::conditional_t, std::uniform_int_distribution, std::uniform_real_distribution>; + +inline NVCVChannelManip ChannelManip(nvcv::ImageFormat srcFormat, nvcv::ImageFormat dstFormat) +{ + const int srcChannels = srcFormat.numChannels(); + const nvcv::Swizzle srcSwizzle = srcFormat.swizzle(); + const nvcv::Swizzle dstSwizzle = dstFormat.swizzle(); + + NVCVChannelManip manip = NVCV_CHANNEL_NO_OP; + + if (srcChannels > 2 && srcSwizzle != dstSwizzle) + { + int srcSwap = static_cast(srcSwizzle), dstSwap = static_cast(dstSwizzle); + bool srcRGB = (srcSwap == NVCV_SWIZZLE_XYZ0 || srcSwap == NVCV_SWIZZLE_XYZW || srcSwap == NVCV_SWIZZLE_XYZ1), + srcBGR = (srcSwap == NVCV_SWIZZLE_ZYX0 || srcSwap == NVCV_SWIZZLE_ZYXW || srcSwap == NVCV_SWIZZLE_ZYX1); + bool dstRGB = (dstSwap == NVCV_SWIZZLE_XYZ0 || dstSwap == NVCV_SWIZZLE_XYZW || dstSwap == NVCV_SWIZZLE_XYZ1), + dstBGR = (dstSwap == NVCV_SWIZZLE_ZYX0 || dstSwap == NVCV_SWIZZLE_ZYXW || dstSwap == NVCV_SWIZZLE_ZYX1); + bool swapRB = ((srcRGB && dstBGR) || (srcBGR && dstRGB)); + + if (swapRB && srcChannels == 3) + { + manip = NVCV_CHANNEL_REVERSE; + } + } + return manip; +} + +template +void CropConvert(DstT *dst, const nvcv::Size2D dstSize, const nvcv::ImageFormat dstFormat, const SrcT *src, + const nvcv::Size2D srcSize, const nvcv::ImageFormat srcFormat, const int numImages, const int2 cropPos, + const NVCVChannelManip manip, const double scale = 1.0, const double offst = 0.0) +{ + int srcPlanes = srcFormat.numPlanes(); + int dstPlanes = dstFormat.numPlanes(); + int srcChannels = srcFormat.numChannels(); + int dstChannels = dstFormat.numChannels(); + + size_t srcIncrX = srcChannels / srcPlanes; // 1 if planar; srcChannels if not. + size_t dstIncrX = dstChannels / dstPlanes; // 1 if planar; dstChannels if not. + size_t srcIncrY = srcIncrX * srcSize.w; + size_t dstIncrY = dstIncrX * dstSize.w; + size_t srcIncrC = (srcPlanes > 1 ? srcSize.w * srcSize.h : 1); + size_t dstIncrC = (dstPlanes > 1 ? dstSize.w * dstSize.h : 1); + size_t srcIncrN = srcSize.w * srcSize.h * srcChannels; + size_t dstIncrN = dstSize.w * dstSize.h * dstChannels; + size_t srcOffst = cropPos.y * srcIncrY + cropPos.x * srcIncrX; + + int channelMap[4] = {0, 1, 2, 3}; + + int channels = (srcChannels < dstChannels ? 
srcChannels : dstChannels); + + if (manip == NVCV_CHANNEL_REVERSE) + { + for (int c = 0; c < channels; ++c) channelMap[c] = channels - c - 1; + } + + for (int i = 0; i < numImages; i++) + { + const SrcT *srcBase = src + i * srcIncrN + srcOffst; + DstT *dstBase = dst + i * dstIncrN; + + for (int y = 0; y < dstSize.h; y++) + { + const SrcT *srcRow = srcBase + y * srcIncrY; + DstT *dstRow = dstBase + y * dstIncrY; + + for (int x = 0; x < dstSize.w; x++) + { + const SrcT *srcPtr = srcRow + x * srcIncrX; + DstT *dstPtr = dstRow + x * dstIncrX; + + for (int c = 0; c < channels; c++) + { + dstPtr[channelMap[c] * dstIncrC] = static_cast(srcPtr[c * srcIncrC] * scale + offst); + } + } + } + } +} + +template +void CropConvert(std::vector &dst, const nvcv::Size2D dstSize, const nvcv::ImageFormat dstFormat, + const std::vector src, const nvcv::Size2D srcSize, const nvcv::ImageFormat srcFormat, + const int numImages, const int2 cropPos, const NVCVChannelManip manip, const double scale = 1.0, + const double offst = 0.0) +{ + CropConvert(dst.data(), dstSize, dstFormat, src.data(), srcSize, srcFormat, numImages, cropPos, manip, scale, + offst); +} + +template +void CropConvert(DstT *dst, const nvcv::Size2D dstSize, const nvcv::ImageFormat dstFormat, const std::vector src, + const nvcv::Size2D srcSize, const nvcv::ImageFormat srcFormat, const int numImages, const int2 cropPos, + const NVCVChannelManip manip, const double scale = 1.0, const double offst = 0.0) +{ + CropConvert(dst, dstSize, dstFormat, src.data(), srcSize, srcFormat, numImages, cropPos, manip, scale, offst); +} + +template +void fillVec(std::vector &vec, const nvcv::Size2D size, const nvcv::ImageFormat frmt, size_t offst = 0) +{ + int planes = frmt.numPlanes(); + int channels = frmt.numChannels(); + size_t incrX = channels / planes; // 1 if planar; dstChannels if not. + size_t incrY = incrX * size.w; + size_t incrC = (planes > 1 ? size.w * size.h : 1); + + for (int y = 0; y < size.h; y++) + { + size_t yIncr = offst + y * incrY; + + for (int x = 0; x < size.w; x++) + { + size_t xIncr = yIncr + x * incrX; + + for (int c = 0; c < channels; c++) + { + vec[xIncr + c * incrC] = static_cast((x + y + c) & 255); + } + } + } +} + +#define _SHAPE(w, h, n) (int3{w, h, n}) + +#define _TEST_ROW(SrcShape, ResizeDim, Interp, DstSize, CropPos, SrcFrmt, DstFrmt, SrcType, DstType) \ + ttype::Types, ttype::Value, ttype::Value, ttype::Value, \ + ttype::Value, ttype::Value, ttype::Value, SrcType, DstType> + +// clang-format off + +NVCV_TYPED_TEST_SUITE( + OpResizeCropConvertReformat, ttype::Types< + // Test cases: RGB (interleaved) -> BGR (planar); linear interpolation; float and uchar output. 
+ _TEST_ROW(_SHAPE( 8, 8, 1), int2( 8, 8), NVCV_INTERP_LINEAR, int2( 6, 6), int2( 1, 1), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 0 + _TEST_ROW(_SHAPE( 8, 8, 1), int2( 16, 16), NVCV_INTERP_LINEAR, int2( 12, 12), int2( 2, 2), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 1 + _TEST_ROW(_SHAPE( 42, 48, 1), int2( 23, 24), NVCV_INTERP_LINEAR, int2( 15, 13), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 2 + _TEST_ROW(_SHAPE( 42, 40, 3), int2( 21, 20), NVCV_INTERP_LINEAR, int2( 17, 13), int2( 1, 1), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 3 + _TEST_ROW(_SHAPE( 21, 21, 5), int2( 42, 42), NVCV_INTERP_LINEAR, int2( 32, 32), int2( 10, 10), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 4 + _TEST_ROW(_SHAPE( 113, 12, 7), int2( 12, 36), NVCV_INTERP_LINEAR, int2( 7, 13), int2( 3, 11), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 5 + _TEST_ROW(_SHAPE( 17, 151, 7), int2( 48, 16), NVCV_INTERP_LINEAR, int2( 32, 16), int2( 4, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 6 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 7 + _TEST_ROW(_SHAPE(1080, 1920, 13), int2(800, 600), NVCV_INTERP_LINEAR, int2( 640, 480), int2(101, 64), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 8 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 9 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8p, uchar3, uint8_t), // 10 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8p, uchar3, uint8_t), // 11 + + // Test cases: RGB (interleaved) -> RGB (planar); linear interpolation; float and uchar output. + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float), // 12 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float), // 13 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t), // 14 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t), // 15 + + // Test cases: BGR (interleaved) -> RGB (planar); linear interpolation; float and uchar output. 
+ _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float), // 16 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float), // 17 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t), // 18 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t), // 19 + + // Test cases: BGR (interleaved) -> BGR (planar); linear interpolation; float and uchar output. + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 20 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 21 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGR8p, uchar3, uint8_t), // 22 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGR8p, uchar3, uint8_t), // 23 + + // Test cases: RGB (interleaved) -> BGR (interleaved); linear interpolation; float and uchar output. + _TEST_ROW(_SHAPE( 8, 8, 1), int2( 8, 8), NVCV_INTERP_LINEAR, int2( 6, 6), int2( 1, 1), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32, uchar3, float3), // 24 + _TEST_ROW(_SHAPE( 8, 8, 1), int2( 16, 16), NVCV_INTERP_LINEAR, int2( 12, 12), int2( 2, 2), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32, uchar3, float3), // 25 + _TEST_ROW(_SHAPE( 113, 12, 7), int2( 12, 36), NVCV_INTERP_LINEAR, int2( 7, 13), int2( 3, 11), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32, uchar3, float3), // 26 + _TEST_ROW(_SHAPE( 17, 151, 7), int2( 48, 16), NVCV_INTERP_LINEAR, int2( 32, 16), int2( 4, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32, uchar3, float3), // 27 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32, uchar3, float3), // 28 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32, uchar3, float3), // 29 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8, uchar3, uchar3), // 30 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8, uchar3, uchar3), // 31 + + // Test cases: RGB (interleaved) -> RGB (interleaved); linear interpolation; float and uchar output. 
+ _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBf32, uchar3, float3), // 32 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBf32, uchar3, float3), // 33 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGB8, uchar3, uchar3), // 34 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGB8, uchar3, uchar3), // 35 + + // Test cases: BGR (interleaved) -> RGB (interleaved); linear interpolation; float and uchar output. + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBf32, uchar3, float3), // 36 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBf32, uchar3, float3), // 37 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGB8, uchar3, uchar3), // 38 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGB8, uchar3, uchar3), // 39 + + // Test cases: RGB (interleaved) -> BGR (planar); nearest-neighbor interpolation; float and uchar output. + _TEST_ROW(_SHAPE( 8, 8, 1), int2( 8, 8), NVCV_INTERP_NEAREST, int2( 6, 6), int2( 1, 1), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 40 + _TEST_ROW(_SHAPE( 8, 8, 5), int2( 16, 16), NVCV_INTERP_NEAREST, int2( 12, 12), int2( 2, 2), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 41 + _TEST_ROW(_SHAPE( 42, 48, 1), int2( 23, 24), NVCV_INTERP_NEAREST, int2( 15, 13), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 42 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_NEAREST, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 43 + _TEST_ROW(_SHAPE(1080, 1920, 13), int2(800, 600), NVCV_INTERP_NEAREST, int2( 640, 480), int2(101, 64), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 44 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_NEAREST, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 45 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_NEAREST, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8p, uchar3, uint8_t), // 46 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_NEAREST, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8p, uchar3, uint8_t), // 47 + + // Test cases: BGR (interleaved) -> RGB (planar); nearest-neighbor interpolation; float and uchar output. 
+ _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_NEAREST, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float), // 48 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_NEAREST, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float), // 49 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_NEAREST, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t), // 50 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_NEAREST, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t) // 51 + +>); +#undef _TEST_ROW + +// clang-format on + +TYPED_TEST(OpResizeCropConvertReformat, tensor_correct_output) +{ + int3 srcShape = ttype::GetValue; + int2 resize = ttype::GetValue; + + NVCVInterpolationType interp = ttype::GetValue; + + int2 cropDim = ttype::GetValue; + int2 cropPos = ttype::GetValue; + + nvcv::ImageFormat srcFormat{ttype::GetValue}; + nvcv::ImageFormat dstFormat{ttype::GetValue}; + + using SrcVT = typename ttype::GetType; + using DstVT = typename ttype::GetType; + using SrcBT = typename cuda::BaseType; + using DstBT = typename cuda::BaseType; + + int srcW = srcShape.x; + int srcH = srcShape.y; + int dstW = cropDim.x; + int dstH = cropDim.y; + int tmpW = resize.x; + int tmpH = resize.y; + + int numImages = srcShape.z; + int srcChannels = srcFormat.numChannels(); + int dstChannels = dstFormat.numChannels(); + int srcPlanes = srcFormat.numPlanes(); + int dstPlanes = dstFormat.numPlanes(); + int srcPixElems = srcChannels / srcPlanes; + int dstPixElems = dstChannels / dstPlanes; + + ASSERT_LE(srcChannels, 4); + ASSERT_EQ(srcChannels, dstChannels); + + NVCVSize2D resizeDim{resize.x, resize.y}; + + NVCVChannelManip manip = ChannelManip(srcFormat, dstFormat); + + // Create input and output tensors. + nvcv::Tensor srcTensor = nvcv::util::CreateTensor(numImages, srcW, srcH, srcFormat); + nvcv::Tensor dstTensor = nvcv::util::CreateTensor(numImages, dstW, dstH, dstFormat); + + auto src = srcTensor.exportData(); + auto dst = dstTensor.exportData(); + + ASSERT_NE(src, nullptr); + ASSERT_NE(dst, nullptr); + + auto srcAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*src); + ASSERT_TRUE(srcAccess); + + auto dstAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*dst); + ASSERT_TRUE(dstAccess); + + int srcRowElems = srcPixElems * srcW; + int tmpRowElems = srcPixElems * tmpW; + int dstRowElems = dstPixElems * dstW; + + size_t srcElems = (size_t)srcRowElems * (size_t)srcH * (size_t)srcPlanes * (size_t)numImages; + size_t tmpElems = (size_t)tmpRowElems * (size_t)tmpH * (size_t)srcPlanes * (size_t)numImages; + size_t dstElems = (size_t)dstRowElems * (size_t)dstH * (size_t)dstPlanes * (size_t)numImages; + + nvcv::Size2D srcSize{srcW, srcH}; + nvcv::Size2D tmpSize{tmpW, tmpH}; + nvcv::Size2D dstSize{dstW, dstH}; + + size_t srcPitch = srcW * sizeof(SrcVT); + size_t tmpPitch = tmpW * sizeof(SrcVT); + size_t dstPitch = dstW * sizeof(DstVT); + + std::vector srcVec(srcElems); + std::vector tmpVec(tmpElems); + std::vector refVec(dstElems); + + // Populate source tensor. + fillVec(srcVec, srcSize, srcFormat); + + // Generate "gold" result for image and place in reference vector. 
+ test::Resize(tmpVec, tmpPitch, tmpSize, srcVec, srcPitch, srcSize, srcFormat, interp, false); + CropConvert(refVec, dstSize, dstFormat, tmpVec, tmpSize, srcFormat, numImages, cropPos, manip); + + // Copy source tensor to device. + ASSERT_EQ(cudaSuccess, cudaMemcpy2D(src->basePtr(), srcAccess->rowStride(), srcVec.data(), srcPitch, srcPitch, + srcH * srcPlanes, cudaMemcpyHostToDevice)); + + // Run fused ResizeCropConvertReformat operator. + cudaStream_t stream; + ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); + + cvcuda::ResizeCropConvertReformat resizeCrop; + EXPECT_NO_THROW(resizeCrop(stream, srcTensor, dstTensor, resizeDim, interp, cropPos, manip)); + + ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); + + // Copy destination tensor back to host. + std::vector dstVec(dstElems); + ASSERT_EQ(cudaSuccess, cudaMemcpy2D(dstVec.data(), dstPitch, dst->basePtr(), dstAccess->rowStride(), dstPitch, + dstH * dstPlanes, cudaMemcpyDeviceToHost)); + + // Compare "gold" reference to computed output. + VEC_EXPECT_NEAR(refVec, dstVec, 1); +} + +TYPED_TEST(OpResizeCropConvertReformat, varshape_correct_output) +{ + int3 srcShape = ttype::GetValue; + int2 resize = ttype::GetValue; + + NVCVInterpolationType interp = ttype::GetValue; + + int2 cropDim = ttype::GetValue; + int2 cropPos = ttype::GetValue; + + nvcv::ImageFormat srcFormat{ttype::GetValue}; + nvcv::ImageFormat dstFormat{ttype::GetValue}; + + using SrcVT = typename ttype::GetType; + using DstVT = typename ttype::GetType; + using SrcBT = typename cuda::BaseType; + using DstBT = typename cuda::BaseType; + + int srcW = srcShape.x; + int srcH = srcShape.y; + int dstW = cropDim.x; + int dstH = cropDim.y; + int tmpW = resize.x; + int tmpH = resize.y; + + int numImages = srcShape.z; + int srcChannels = srcFormat.numChannels(); + int dstChannels = dstFormat.numChannels(); + int srcPlanes = srcFormat.numPlanes(); + int dstPlanes = dstFormat.numPlanes(); + int srcPixElems = srcChannels / srcPlanes; + int dstPixElems = dstChannels / dstPlanes; + + ASSERT_LE(srcChannels, 4); + ASSERT_EQ(srcChannels, dstChannels); + + NVCVSize2D resizeDim{resize.x, resize.y}; + + NVCVChannelManip manip = ChannelManip(srcFormat, dstFormat); + + std::vector srcImg; + + uniform_dist randVal(std::is_integral_v ? cuda::TypeTraits::min : SrcBT{0}, + std::is_integral_v ? cuda::TypeTraits::max : SrcBT{1}); + + std::uniform_int_distribution randW(srcW * 0.8, srcW * 1.2); + std::uniform_int_distribution randH(srcH * 0.8, srcH * 1.2); + + int tmpRowElems = srcPixElems * tmpW; + int dstRowElems = dstPixElems * dstW; + + size_t tmpElems = (size_t)tmpRowElems * (size_t)tmpH * (size_t)srcPlanes; + size_t refIncr = (size_t)dstRowElems * (size_t)dstH * (size_t)dstPlanes; + size_t dstElems = refIncr * (size_t)numImages; + + nvcv::Size2D tmpSize{tmpW, tmpH}; + nvcv::Size2D dstSize{dstW, dstH}; + + std::vector tmpVec(tmpElems); + std::vector refVec(dstElems); + + size_t tmpPitch = tmpW * sizeof(SrcVT); + size_t dstPitch = dstW * sizeof(DstVT); + + for (int i = 0; i < numImages; ++i) + { + int imgW = (interp ? randW(randEng) : srcW); + int imgH = (interp ? 
randH(randEng) : srcH); + + srcImg.emplace_back(nvcv::Size2D{imgW, imgH}, srcFormat); + + auto srcData = srcImg[i].exportData(); + ASSERT_TRUE(srcData); + + int imgRowElems = srcPixElems * imgW; + + size_t imgPitch = imgW * sizeof(SrcVT); + size_t imgElems = (size_t)imgRowElems * (size_t)imgH * (size_t)srcPlanes; + + nvcv::Size2D imgSize{imgW, imgH}; + + std::vector imgVec(imgElems); + + // Populate image tensor . + fillVec(imgVec, imgSize, srcFormat); + + // Generate "gold" result for image and place in reference image plane. + DstBT *refPlane = refVec.data() + i * refIncr; + + test::Resize(tmpVec, tmpPitch, tmpSize, imgVec, imgPitch, imgSize, srcFormat, interp, true); + CropConvert(refPlane, dstSize, dstFormat, tmpVec, tmpSize, srcFormat, 1, cropPos, manip); + + // Copy source tensor to device. + ASSERT_EQ(cudaSuccess, cudaMemcpy2D(srcData->plane(0).basePtr, srcData->plane(0).rowStride, imgVec.data(), + imgPitch, imgPitch, imgH * srcPlanes, cudaMemcpyHostToDevice)); + } + + nvcv::ImageBatchVarShape src(numImages); + + src.pushBack(srcImg.begin(), srcImg.end()); + + // Create output tensor. + nvcv::Tensor dstTensor = nvcv::util::CreateTensor(numImages, dstW, dstH, dstFormat); + + auto dst = dstTensor.exportData(); + + ASSERT_NE(dst, nullptr); + + auto dstAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*dst); + ASSERT_TRUE(dstAccess); + + // Run fused ResizeCropConvertReformat operator. + cudaStream_t stream; + ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); + + cvcuda::ResizeCropConvertReformat resizeCrop; + EXPECT_NO_THROW(resizeCrop(stream, src, dstTensor, resizeDim, interp, cropPos, manip)); + + ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); + + // Copy destination tensor back to host. + std::vector dstVec(dstElems); + ASSERT_EQ(cudaSuccess, cudaMemcpy2D(dstVec.data(), dstPitch, dst->basePtr(), dstAccess->rowStride(), dstPitch, + dstH * dstPlanes * numImages, cudaMemcpyDeviceToHost)); + + // Compare "gold" reference to computed output. + VEC_EXPECT_NEAR(refVec, dstVec, 1); +} diff --git a/tests/nvcv_types/python/test_image.py b/tests/nvcv_types/python/test_image.py index 4615ca0de..b9e5efae6 100644 --- a/tests/nvcv_types/python/test_image.py +++ b/tests/nvcv_types/python/test_image.py @@ -13,11 +13,28 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import torch import numpy as np -import pytest as t import nvcv import nvcv_util as util +import pytest as t +import torch + +import cvcuda + + +def test_image_is_cached(): + created_ids = set() + + pt_img = torch.rand((1, 1), dtype=torch.float32, device="cuda") + img = cvcuda.as_image(pt_img) + created_ids.add(img.id) + del img # delete img such that only cache has a reference to it + + for i in range(50): + pt_img = torch.rand((1 + i, i + 2), dtype=torch.float32, device="cuda") + img = cvcuda.as_image(pt_img) + assert img.id in created_ids + del img def test_image_creation_works(): @@ -110,10 +127,7 @@ def test_wrap_host_buffer_infer_imgformat_multiple_planes(buffers, format): assert img.format == format img = nvcv.as_image( - [ - torch.zeros(size=buf[0], dtype=buf[2], device="cuda").cuda() - for buf in buffers - ] + [torch.zeros(size=buf[0], dtype=buf[2], device="cuda") for buf in buffers] ) assert img.width == 8 assert img.height == 6 @@ -128,10 +142,7 @@ def test_wrap_host_buffer_explicit_format2(buffers, format): assert img.format == format img = nvcv.as_image( - [ - torch.zeros(size=buf[0], dtype=buf[2], device="cuda").cuda() - for buf in buffers - ], + [torch.zeros(size=buf[0], dtype=buf[2], device="cuda") for buf in buffers], format, ) assert img.width == 8 @@ -447,7 +458,7 @@ def test_image_is_kept_alive_by_cuda_array_interface(): del img2 # remove img2 from cache, but not img1, as it's being - # held by iface + # held by iface1 nvcv.clear_cache() # now img1 is free for reuse @@ -455,3 +466,21 @@ def test_image_is_kept_alive_by_cuda_array_interface(): img3 = nvcv.Image((640, 480), nvcv.Format.U8) assert img3.cuda().__cuda_array_interface__["data"][0] == data_buffer1 + + +def test_image_wrapper_nodeletion(): + """ + Check if image wrappers deletes memory that's not ours. + """ + # run twice, first run is without cache re-usage, second is with cache re-usage + for i in range(2): + np_img = np.random.rand(1 + i, 2 + i).astype(np.float32) + pt_img = torch.from_numpy(np_img).cuda() + + nv_img = cvcuda.as_image(pt_img) + del nv_img + + try: + assert (pt_img.cpu().numpy() == np_img).all() + except RuntimeError: + assert False, "Invalid memory" diff --git a/tests/nvcv_types/python/test_imgbatchvarshape.py b/tests/nvcv_types/python/test_imgbatchvarshape.py index 0caaedb38..cb0af3306 100644 --- a/tests/nvcv_types/python/test_imgbatchvarshape.py +++ b/tests/nvcv_types/python/test_imgbatchvarshape.py @@ -13,10 +13,44 @@ # See the License for the specific language governing permissions and # limitations under the License. 
 
-import nvcv
-import pytest as t
 import numpy as np
+import nvcv
 import nvcv_util as util
+import pytest as t
+import torch
+
+import cvcuda
+
+
+def test_imgbatchvarshape_are_cached():
+    created_ids = set()
+
+    # Create first VarShape
+    pt_imgs = []
+    for n in range(2):
+        pt_img = torch.rand((1 + n, 2 + n), dtype=torch.float32, device="cuda")
+        pt_imgs.append(pt_img)
+
+    batch = cvcuda.as_images(pt_imgs)
+
+    for img in batch:
+        created_ids.add(img.id)
+        del img
+    del batch
+
+    # Create more VarShapes that reuse the cache
+    for i in range(50):
+        pt_imgs = []
+        for _ in range(2):
+            pt_img = torch.rand((1 + i, 2 + i), dtype=torch.float32, device="cuda")
+            pt_imgs.append(pt_img)
+
+        batch = cvcuda.as_images(pt_imgs)
+
+        for img in batch:
+            assert img.id in created_ids
+            del img
+        del batch
 
 
 def test_imgbatchvarshape_creation_works():
@@ -151,3 +185,22 @@ def test_wrap_buffer_list(base_shape, dt, format):
         assert images[i].width == sh[1]
         assert images[i].height == sh[0]
         assert images[i].format == format
+
+
+def test_imgbatchvarshape_wrapper_nodeletion():
+    """
+    Check that the imgbatchvarshape wrapper does not delete memory it does not own.
+    """
+
+    # run twice: first run is without cache re-usage, second is with cache re-usage
+    for i in range(2):
+        np_img = np.random.rand(1 + i, 2 + i).astype(np.float32)
+        pt_img = torch.from_numpy(np_img).cuda()
+
+        batch = cvcuda.as_images([pt_img])
+        del batch
+
+        try:
+            assert (pt_img.cpu().numpy() == np_img).all()
+        except RuntimeError:
+            assert False, "Invalid memory"
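# Illustrative sketch (not part of the patch): the behaviour the new Python tests
# above exercise, assuming cvcuda and a CUDA-enabled torch build are available.
# cvcuda.as_image()/as_images() create zero-copy wrappers around externally owned
# device memory; deleting the wrappers must leave the torch tensors untouched, and
# wrapper objects may be served from an internal cache (observable via img.id).

import numpy as np
import torch
import cvcuda


def wrap_and_release():
    # torch owns the device memory; cvcuda only wraps it.
    pt_img = torch.rand((480, 640), dtype=torch.float32, device="cuda")

    nv_img = cvcuda.as_image(pt_img)    # zero-copy wrapper around the torch tensor
    wrapper_id = nv_img.id              # cached wrappers can be handed out again later
    del nv_img                          # only the cache still references the wrapper

    batch = cvcuda.as_images([pt_img])  # same idea for a variable-shape batch
    del batch

    # The underlying tensor is still valid and readable after the wrappers are gone.
    assert pt_img.cpu().numpy().shape == (480, 640)
    return wrapper_id


if __name__ == "__main__":
    wrap_and_release()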