diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 172dcd538..185ef70bd 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -1,14 +1,3 @@ -# For most projects, this workflow file will not need changing; you simply need -# to commit it to your repository. -# -# You may wish to alter this file to override the set of languages analyzed, -# or to provide custom queries or build logic. -# -# ******** NOTE ******** -# We have attempted to detect the languages in your repository. Please check -# the `language` matrix defined below to confirm you have the correct set of -# supported CodeQL languages. -# name: "CodeQL" on: @@ -22,11 +11,11 @@ on: jobs: analyze: name: Analyze - runs-on: ${{ (matrix.language == 'c-cpp' && 'ubuntu-20.04-64core') || 'ubuntu-latest' }} - timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }} + runs-on: ubuntu-20.04-64core + timeout-minutes: 360 permissions: - actions: read - contents: read + actions: write + contents: write security-events: write strategy: @@ -36,7 +25,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: lfs: true submodules: 'recursive' @@ -44,19 +33,27 @@ jobs: - if: matrix.language == 'c-cpp' name: Setup environment run: | + sudo apt update -y && sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y && \ sudo apt update -y && sudo apt install -y --no-install-recommends \ git git-lfs gcc-11 g++-11 ninja-build ccache libgtest-dev libgmock-dev \ - shellcheck curl doxygen python3 python3-pip python3-dev python3-distutils + shellcheck curl doxygen python3 python3-pip python3-dev python3-distutils \ + texlive-latex-extra ghostscript graphviz \ + && curl -L https://cmake.org/files/v3.20/cmake-3.20.1-linux-x86_64.tar.gz --output /tmp/cmake-3.20.1.tar.gz \ + && tar -xzf /tmp/cmake-3.20.1.tar.gz -C /tmp/ && sudo cp -r /tmp/cmake-3.20.1-linux-x86_64/bin/ /usr/local/ \ + && sudo cp -r /tmp/cmake-3.20.1-linux-x86_64/share/ /usr/local/ && sudo cp -r /tmp/cmake-3.20.1-linux-x86_64/doc/ /usr/local/ \ + && rm -rf /tmp/cmake-3.20.1* - if: matrix.language == 'c-cpp' name: Install Python Dependencies run: | - python3 -m pip install sphinx-rtd-theme sphinx==4.5.0 breathe exhale \ - recommonmark graphviz numpy==1.24.1 + sudo apt update -y && sudo apt install -y --no-install-recommends \ + python3 python3-pip python3-dev python3-distutils doxygen && sudo rm -rf /var/lib/apt/lists/* \ + && python3 -m pip install sphinx-rtd-theme sphinx breathe exhale recommonmark graphviz \ + && python3 -m pip install numpy==1.24.1 patchelf==0.17.2.1 - if: matrix.language == 'c-cpp' name: Install CUDA Toolkit - uses: Jimver/cuda-toolkit@v0.2.11 + uses: Jimver/cuda-toolkit@v0.2.15 id: cuda-toolkit with: cuda: '11.7.1' @@ -72,22 +69,44 @@ jobs: nvcc -V - name: Initialize CodeQL - uses: github/codeql-action/init@v2 + uses: github/codeql-action/init@v3 with: languages: ${{ matrix.language }} queries: +security-and-quality - if: matrix.language != 'c-cpp' name: Autobuild - uses: github/codeql-action/autobuild@v2 + uses: github/codeql-action/autobuild@v3 - if: matrix.language == 'c-cpp' name: Build CMake project run: | echo "Running CMake project build script" - ./ci/build.sh debug build "-DBUILD_SAMPLES=OFF -DBUILD_TESTS=OFF" $* + ./ci/build.sh debug build "-DBUILD_SAMPLES=OFF -DBUILD_TESTS=OFF -DBUILD_PYTHON=ON" $* - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 + uses: github/codeql-action/analyze@v3 with: category: "/language:${{matrix.language}}" 
+ + - if: matrix.language == 'c-cpp' && github.event_name == 'push' + name: Build Docs and Clean up Sphinx Build Directory + run: | + ./ci/build.sh debug build "-DBUILD_SAMPLES=OFF -DBUILD_TESTS=OFF -DBUILD_DOCS=ON -DBUILD_PYTHON=ON" $* + find build/docs/sphinx -name '*.doctree' -delete + find build/docs/sphinx -name '*.map' -delete + find build/docs/sphinx -name '*.pickle' -delete + find build/docs/sphinx -name '*.inv' -delete + find build/docs/sphinx -name '*.gz' -delete + + - if: matrix.language == 'c-cpp' && github.event_name == 'push' + name: Create .nojekyll file + run: touch build/docs/sphinx/.nojekyll + + - if: matrix.language == 'c-cpp' && github.event_name == 'push' + name: Deploy to GitHub Pages + uses: JamesIves/github-pages-deploy-action@v4 + with: + folder: build/docs/sphinx + branch: gh-pages + clean: true diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index f5197e2ed..000000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,91 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# See https://pre-commit.com for more information -# See https://pre-commit.com/hooks.html for more hooks - -default_stages: - - merge-commit - - commit - - post-rewrite - -repos: -- repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.3.0 - hooks: - - id: check-merge-conflict - - id: trailing-whitespace - - id: end-of-file-fixer - - id: mixed-line-ending - args: ['--fix=lf'] - - id: check-executables-have-shebangs - - id: check-shebang-scripts-are-executable - - id: detect-private-key - - id: check-added-large-files - - id: check-case-conflict - - id: destroyed-symlinks - - id: check-yaml - exclude: '.clang-format' - -- repo: https://github.com/detailyang/pre-commit-shell - rev: 1.0.5 - hooks: - - id: shell-lint - args: ['-x', '--severity=warning'] - -- repo: https://github.com/pre-commit/mirrors-clang-format - rev: v14.0.6 - hooks: - - id: clang-format - -- repo: https://github.com/pryorda/dockerfilelint-precommit-hooks - rev: v0.1.0 - hooks: - - id: dockerfilelint - -- repo: https://github.com/psf/black - rev: 22.10.0 - hooks: - - id: black - -- repo: https://github.com/pycqa/flake8 - rev: 5.0.4 - hooks: - - id: flake8 - args: - - "--max-line-length=110" - -- repo: local - hooks: - - id: copyright_check - name: 'check copyright message' - language: system - types: ['file', 'text', 'markdown'] - exclude_types: ['xml', 'json', 'csv'] - entry: ./lint/copyright_check.sh - exclude: 'models/.*' - - id: lfs_check - name: 'check LFS objects' - language: system - entry: ./lint/lfs_check.sh - require_serial: true - -- repo: https://github.com/alessandrojcm/commitlint-pre-commit-hook - rev: v9.13.0 - hooks: - - id: commitlint - stages: [commit-msg] - args: ['--config','lint/commitlint.config.js'] - additional_dependencies: ['@commitlint/config-conventional'] diff --git 
a/CMakeLists.txt b/CMakeLists.txt index 0f98aedef..841801962 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,7 +23,7 @@ endif() project(cvcuda LANGUAGES C CXX - VERSION 0.7.0 + VERSION 0.8.0 DESCRIPTION "CUDA-accelerated Computer Vision algorithms" ) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 37852a875..ed687adb8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -16,7 +16,7 @@ # Contributing to CV-CUDA -**As of release v0.7.0-beta, CV-CUDA is not accepting outside contribution.** +**Currently, CV-CUDA is not accepting outside contributions.** Contributions to CV-CUDA fall into the following categories: @@ -28,8 +28,8 @@ Contributions to CV-CUDA fall into the following categories: 1. To propose a new feature, please file a new feature request [issue](https://github.com/CVCUDA/CV-CUDA/issues/new/choose). Describe the intended feature and discuss the design and implementation with the team and - community. NOTE: Currently, as of release v0.7.0-beta, CV-CUDA is not accepting - outside contribution. + community. NOTE: Currently, CV-CUDA is not accepting + outside contributions. 1. To ask a general question, please sumbit a question [issue](https://github.com/CVCUDA/CV-CUDA/issues/new/choose). If you need more context on a particular issue, please ask in a comment. diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md index 83e42f22c..1b63db508 100644 --- a/DEVELOPER_GUIDE.md +++ b/DEVELOPER_GUIDE.md @@ -80,6 +80,7 @@ CV-CUDA includes: | Reformat | Converts a planar image into non-planar and vice versa | | Remap | Maps pixels in an image with one projection to another projection in a new image. | | Resize | Changes the size and scale of an image | +| ResizeCropConvertReformat | Performs fused Resize-Crop-Convert-Reformat sequence with optional channel reordering | | Rotate | Rotates a 2D array in multiples of 90 degrees | | SIFT | Identifies and describes features in images that are invariant to scale rotation and affine distortion. | | Thresholding | Chooses a global threshold value that is the same for all pixels across the image. | diff --git a/README.md b/README.md index 4eaf478d2..5a9af904d 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ [![License](https://img.shields.io/badge/License-Apache_2.0-yellogreen.svg)](https://opensource.org/licenses/Apache-2.0) -![Version](https://img.shields.io/badge/Version-v0.7.0--beta-blue) +![Version](https://img.shields.io/badge/Version-v0.8.0--beta-blue) ![Platform](https://img.shields.io/badge/Platform-linux--64_%7C_win--64_wsl2%7C_aarch64-gray) @@ -76,10 +76,10 @@ Choose the installation method that meets your environment needs. Download the appropriate .whl file for your computer architecture, Python and CUDA version from the release assets of current CV-CUDA release. Release information of all CV-CUDA releases can be found [here][CV-CUDA GitHub Releases]. Once downloaded, execute the `pip install` command to install the Python wheel. For example: ```shell - pip install cvcuda_-0.7.0b0-cp-cp-linux_.whl + pip install cvcuda_--cp-cp-linux_.whl ``` -where `` is the desired CUDA version, `` is the desired Python version and `` is the desired architecture. +where `` is the desired CUDA version, `` is the CV-CUDA release version, `` is the desired Python version and `` is the desired architecture. Please note that the Python wheels are standalone, they include both the C++/CUDA libraries and the Python bindings. 
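
For illustration only, the wheel-name placeholders in the README hunk above resolve to a concrete filename as sketched below; the CUDA, Python, and architecture tags (and the 0.8.0b0 tag) are hypothetical examples, and the real names come from the release assets.

```python
# Hypothetical resolution of the README's wheel filename placeholders.
cu_ver = "cu12"         # <cu_ver>: CUDA build the wheel targets
cvcuda_ver = "0.8.0b0"  # <cvcuda_ver>: assumed tag for the v0.8.0-beta release
py_ver = "cp310"        # <py_ver>: CPython version
arch = "x86_64"         # <arch>: computer architecture
wheel = f"cvcuda_{cu_ver}-{cvcuda_ver}-{py_ver}-{py_ver}-linux_{arch}.whl"
print(wheel)  # cvcuda_cu12-0.8.0b0-cp310-cp310-linux_x86_64.whl
```

This matches the concrete `cvcuda_cu12--cp310-cp310-linux_x86_64.whl` pattern shown later in the README, with the release version filled in.
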
@@ -185,8 +185,8 @@ Install the dependencies required to build the documentation: On Ubuntu, install the following packages using `apt` and `pip`: ```shell -apt install -y doxygen graphviz python3 python3-pip -python3 -m pip install sphinx==4.5.0 breathe exhale recommonmark graphviz sphinx-rtd-theme +apt install -y doxygen graphviz python3 python3-pip sphinx +python3 -m pip install breathe exhale recommonmark graphviz sphinx-rtd-theme ``` Build the documentation: @@ -249,7 +249,7 @@ pip install cvcuda_cu12--cp310-cp310-linux_x86_64.whl CV-CUDA is an open source project. As part of the Open Source Community, we are committed to the cycle of learning, improving, and updating that makes this -community thrive. However, as of release v0.7.0-beta, CV-CUDA is not yet ready +community thrive. However, CV-CUDA is not yet ready for external contributions. To understand the process for contributing the CV-CUDA, see our diff --git a/bench/BenchResizeCropConvertReformat.cpp b/bench/BenchResizeCropConvertReformat.cpp new file mode 100644 index 000000000..f058a10e0 --- /dev/null +++ b/bench/BenchResizeCropConvertReformat.cpp @@ -0,0 +1,124 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BenchUtils.hpp" + +#include +#include + +#include + +template +inline void ResizeCropConvertReformat(nvbench::state &state, nvbench::type_list) +try +{ + long3 srcShape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + + NVCVInterpolationType interpType = benchutils::GetInterpolationType(state.get_string("interpolation")); + + int2 cropPos{1, 1}; + + NVCVSize2D resize; + + if (state.get_string("resizeType") == "EXPAND") + { + resize = NVCVSize2D{(int)(srcShape.y * 2), (int)(srcShape.z * 2)}; + } + else if (state.get_string("resizeType") == "CONTRACT") + { + resize = NVCVSize2D{(int)(srcShape.y / 2), (int)(srcShape.z / 2)}; + } + else + { + throw std::invalid_argument("Invalid resizeType = " + state.get_string("resizeType")); + } + + NVCVChannelManip manip; + + if (state.get_string("manip") == "NO_OP") + { + manip = NVCV_CHANNEL_NO_OP; + } + else if (state.get_string("manip") == "REVERSE") + { + manip = NVCV_CHANNEL_REVERSE; + } + else + { + throw std::invalid_argument("Invalid channel manipulation = " + state.get_string("manip")); + } + + using BT = nvcv::cuda::BaseType; + long nc = nvcv::cuda::NumElements; + + long3 dstShape{srcShape.x, resize.h - cropPos.y, resize.w - cropPos.x}; + + if (dstShape.y <= 0 || dstShape.z <= 0) + { + throw std::invalid_argument("Invalid shape and resizeType"); + } + + state.add_global_memory_reads(srcShape.x * srcShape.y * srcShape.z * sizeof(T)); + state.add_global_memory_writes(dstShape.x * dstShape.y * dstShape.z * sizeof(T)); + + cvcuda::ResizeCropConvertReformat op; + + // clang-format off + + if (varShape < 0) // negative var shape means use Tensor + { + nvcv::Tensor src({{srcShape.x, srcShape.y, srcShape.z, nc}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, nc}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &resize, &interpType, &cropPos, &manip](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, resize, interpType, cropPos, manip); + }); + } + else // zero and positive var shape means use ImageBatchVarShape + { + nvcv::ImageBatchVarShape src(srcShape.x); + nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, nc}, "NHWC"}, benchutils::GetDataType()); + + benchutils::FillImageBatch(src, long2{srcShape.z, srcShape.y}, long2{varShape, varShape}, + benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &resize, &interpType, &cropPos, &manip](nvbench::launch &launch) + { + op(launch.get_stream(), src, dst, resize, interpType, cropPos, manip); + }); + } +} +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +// clang-format on + +using ResizeCropConvertReformatTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(ResizeCropConvertReformat, NVBENCH_TYPE_AXES(ResizeCropConvertReformatTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_int64_axis("varShape", {-1, 0}) + .add_string_axis("resizeType", {"EXPAND"}) + .add_string_axis("manip", {"NO_OP"}) + .add_string_axis("interpolation", {"LINEAR"}); diff --git a/bench/CMakeLists.txt b/bench/CMakeLists.txt index 3ca000274..a685f08ab 100644 --- a/bench/CMakeLists.txt +++ b/bench/CMakeLists.txt @@ -51,6 +51,7 @@ set(bench_sources BenchConvertTo.cpp BenchCopyMakeBorder.cpp BenchCropFlipNormalizeReformat.cpp + BenchResizeCropConvertReformat.cpp BenchCustomCrop.cpp BenchErase.cpp 
BenchGammaContrast.cpp diff --git a/bench/python/all_ops/op_as_image.py b/bench/python/all_ops/op_as_image.py new file mode 100644 index 000000000..bae98afa5 --- /dev/null +++ b/bench/python/all_ops/op_as_image.py @@ -0,0 +1,38 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import nvcv + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 +from bench_utils import AbstractOpBase + + +class OpAsImageFromNVCVImage(AbstractOpBase): + def setup(self, input): + # dummy run that does not use cache + img = nvcv.Image((128, 128), nvcv.Format.RGBA8) + + self.imglist = [] + for _ in range(10): + img = nvcv.Image((128, 128), nvcv.Format.RGBA8) + self.imglist.append(img.cuda()) + self.cycle = 0 + + def run(self, input): + nvcv.as_image(self.imglist[self.cycle % len(self.imglist)]) + self.cycle += 1 + return diff --git a/bench/python/all_ops/op_as_images.py b/bench/python/all_ops/op_as_images.py new file mode 100644 index 000000000..d5022cf23 --- /dev/null +++ b/bench/python/all_ops/op_as_images.py @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import nvcv + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. 
+import pycuda.driver as cuda # noqa: F401 +from bench_utils import AbstractOpBase + + +class OpAsImagesFromNVCVImage(AbstractOpBase): + def setup(self, input): + # dummy run that does not use cache + nvcv.ImageBatchVarShape(100) + img = nvcv.Image((128, 128), nvcv.Format.RGBA8) + + self.imglists = [] + for _ in range(10): + imglist = [] + for _ in range(100): + img = nvcv.Image((128, 128), nvcv.Format.RGBA8) + imglist.append(img.cuda()) + self.imglists.append(imglist) + self.cycle = 0 + + def run(self, input): + nvcv.as_images(self.imglists[self.cycle % len(self.imglists)]) + self.cycle += 1 + return diff --git a/bench/python/all_ops/op_resize_crop_convert_reformat.py b/bench/python/all_ops/op_resize_crop_convert_reformat.py new file mode 100644 index 000000000..46001f068 --- /dev/null +++ b/bench/python/all_ops/op_resize_crop_convert_reformat.py @@ -0,0 +1,44 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import nvcv +import cvcuda + + +class OpResizeCropConvertReformat(AbstractOpBase): + def setup(self, input): + resize = 256 + crop = 224 + delta_shape = resize - crop + start = delta_shape // 2 + self.resize_dim = (resize, resize) + self.resize_interpolation = cvcuda.Interp.LINEAR + self.crop_rect = cvcuda.RectI(start, start, crop, crop) + + def run(self, input): + return cvcuda.resize_crop_convert_reformat( + input, + self.resize_dim, + self.resize_interpolation, + self.crop_rect, + layout="NHWC", + data_type=nvcv.Type.U8, + manip=cvcuda.ChannelManip.REVERSE, + ) diff --git a/bench/python/bench_utils.py b/bench/python/bench_utils.py index 91c9511a7..23578438e 100644 --- a/bench/python/bench_utils.py +++ b/bench/python/bench_utils.py @@ -266,19 +266,21 @@ def summarize_runs( results = [] - for op in baseline_perf["mean_all_batches"]["run_bench"]: + for op in baseline_perf["data_mean_all_procs"]["run_bench"]: if op.startswith("Op"): op_name = op[2:] row_dict = {} # Fetch the time and parameters from the JSON for baseline run. - baseline_run_time = baseline_perf["mean_all_batches"]["run_bench"][op][ + baseline_run_time = baseline_perf["data_mean_all_procs"]["run_bench"][op][ "run_op" - ]["cpu_time_minus_warmup_per_item"] + ]["cpu_time_minus_warmup_per_item"]["mean"] op_params = list( - baseline_perf["mean_all_batches"]["run_bench"][op]["op_params"].keys() + baseline_perf["data_mean_all_procs"]["run_bench"][op][ + "op_params" + ].keys() )[0] row_dict["operator name"] = op_name @@ -290,13 +292,15 @@ def summarize_runs( # Check if the OP was present. 
if ( op - in compare_perfs[compare_run_name]["mean_all_batches"][ + in compare_perfs[compare_run_name]["data_mean_all_procs"][ "run_bench" ] ): compare_run_time = compare_perfs[compare_run_name][ - "mean_all_batches" - ]["run_bench"][op]["run_op"]["cpu_time_minus_warmup_per_item"] + "data_mean_all_procs" + ]["run_bench"][op]["run_op"]["cpu_time_minus_warmup_per_item"][ + "mean" + ] else: compare_run_time = None diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 41baeada7..addfb8e02 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -19,7 +19,7 @@ # Usage: build_docs.sh [build folder] build_type="release" -build_dir="build" +build_dir="build-rel" if [[ $# -ge 1 ]]; then build_dir=$1 diff --git a/docker/config b/docker/config index 56df16cb9..563a3a99c 100644 --- a/docker/config +++ b/docker/config @@ -23,7 +23,7 @@ IMAGE_URL_BASE='' # change is done, such as removing some package, or updating # packaged versions that introduces incompatibilities. TAG_IMAGE=6 -TAG_IMAGE_SAMPLES=6.1 +TAG_IMAGE_SAMPLES=7 TAG_IMAGE_TEST=5 VER_CUDA=11.7.1 diff --git a/docs/sphinx/content/cvcuda_oplist.csv b/docs/sphinx/content/cvcuda_oplist.csv index 85e45f080..9e286370f 100644 --- a/docs/sphinx/content/cvcuda_oplist.csv +++ b/docs/sphinx/content/cvcuda_oplist.csv @@ -43,6 +43,7 @@ RandomResizedCrop,Crops a random portion of an image and resizes it to a specifi Reformat,Converts a planar image into non-planar and vice versa Remap,Maps pixels in an image with one projection to another projection in a new image. Resize,Changes the size and scale of an image +ResizeCropConvertReformat,Performs fused Resize-Crop-Convert-Reformat sequence with optional channel reordering. Rotate,Rotates a 2D array in multiples of 90 degrees SIFT,Identifies and matches features in images that are invariant to scale rotation and affine distortion. Thresholding,Chooses a global threshold value that is the same for all pixels across the image. diff --git a/docs/sphinx/index.rst b/docs/sphinx/index.rst index 254a0bf63..ceeb3265e 100644 --- a/docs/sphinx/index.rst +++ b/docs/sphinx/index.rst @@ -124,12 +124,13 @@ Copyright :maxdepth: 1 :hidden: - Beta.5 - Beta.4 - Beta.3 - Beta.2 - Beta.1 - Beta - Alpha.1 - Alpha - Pre-Alpha + v0.8.0-beta + v0.7.0-beta + v0.6.0-beta + v0.5.0-beta + v0.4.0-beta + v0.3.1-beta + v0.3.0-beta + v0.2.1-alpha + v0.2.0-alpha + v0.1.0-prealpha diff --git a/docs/sphinx/installation.rst b/docs/sphinx/installation.rst index 6c05a33d8..f1969c8ed 100644 --- a/docs/sphinx/installation.rst +++ b/docs/sphinx/installation.rst @@ -74,13 +74,13 @@ You can download the CV-CUDA tar, deb or wheel packages from `the asset section * Python Wheel File Installation - Download the appropriate .whl file for your computer architecture, Python and CUDA version from `here `_ + Download the appropriate .whl file for your computer architecture, Python and CUDA version from `the asset section of the latest release `_ Execute the following command to install appropriate CV-CUDA Python wheel :: - pip install cvcuda_-0.7.0b0-cp-cp-linux_.whl + pip install cvcuda_--cp-cp-linux_.whl - where is the desired CUDA version, the desired Python version and the desired architecture. + where is the desired CUDA version, the CV-CUDA release version, the desired Python version and the desired architecture. Please note that the Python wheels provided are standalone, they include both the C++/CUDA libraries and the Python bindings. 
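
As a sanity check on the renamed benchmark JSON keys in the `bench_utils.py` hunk above (`data_mean_all_procs` replacing `mean_all_batches`, with the per-item time now nested under `"mean"`), here is a minimal read-only sketch; the results filename and any structure beyond these keys are assumptions.

```python
import json

# Hypothetical results file written by the Python benchmark runner.
with open("baseline_perf.json") as f:
    perf = json.load(f)

for op, data in perf["data_mean_all_procs"]["run_bench"].items():
    if op.startswith("Op"):
        # Per-item CPU time (minus warmup) now lives one level deeper, under "mean".
        run_time = data["run_op"]["cpu_time_minus_warmup_per_item"]["mean"]
        print(op[2:], run_time)
```
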
diff --git a/docs/sphinx/relnotes/v0.1.0-prealpha.rst b/docs/sphinx/relnotes/v0.1.0-prealpha.rst index 474486572..ddb876c0e 100644 --- a/docs/sphinx/relnotes/v0.1.0-prealpha.rst +++ b/docs/sphinx/relnotes/v0.1.0-prealpha.rst @@ -16,7 +16,7 @@ .. _v0.1.0-prealpha: -PreAlpha +v0.1.0-preAlpha ======== CV-CUDA-0.1.0 is the first release of CV-CUDA. This release is for evaluation purposes only. diff --git a/docs/sphinx/relnotes/v0.2.0-alpha.rst b/docs/sphinx/relnotes/v0.2.0-alpha.rst index 0e8687216..b6ef95ade 100644 --- a/docs/sphinx/relnotes/v0.2.0-alpha.rst +++ b/docs/sphinx/relnotes/v0.2.0-alpha.rst @@ -16,7 +16,7 @@ .. _v0.2.0-alpha: -Alpha +v0.2.0-alpha ======== CV-CUDA 0.2.0 is the first open-source release of the project. diff --git a/docs/sphinx/relnotes/v0.2.1-alpha.rst b/docs/sphinx/relnotes/v0.2.1-alpha.rst index b77ee6d65..65d5b8861 100644 --- a/docs/sphinx/relnotes/v0.2.1-alpha.rst +++ b/docs/sphinx/relnotes/v0.2.1-alpha.rst @@ -16,7 +16,7 @@ .. _v0.2.1-alpha: -Alpha.1 +v0.2.1-alpha ======= General diff --git a/docs/sphinx/relnotes/v0.3.0-beta.rst b/docs/sphinx/relnotes/v0.3.0-beta.rst index f63c6d584..6473da539 100644 --- a/docs/sphinx/relnotes/v0.3.0-beta.rst +++ b/docs/sphinx/relnotes/v0.3.0-beta.rst @@ -16,7 +16,7 @@ .. _v0.3.0-beta: -Beta +v0.3.0-beta ==== CV-CUDA 0.3.0 is the next open-source release of the project. diff --git a/docs/sphinx/relnotes/v0.3.1-beta.rst b/docs/sphinx/relnotes/v0.3.1-beta.rst index 970359c98..c04a3d1f7 100644 --- a/docs/sphinx/relnotes/v0.3.1-beta.rst +++ b/docs/sphinx/relnotes/v0.3.1-beta.rst @@ -16,7 +16,7 @@ .. _v0.3.1-beta: -Beta.1 +0.3.1-beta ====== The v0.3.1 release provides several bug fixes along with documentation updates and performance improvements. diff --git a/docs/sphinx/relnotes/v0.4.0-beta.rst b/docs/sphinx/relnotes/v0.4.0-beta.rst index f6d99cb39..0f38b3138 100644 --- a/docs/sphinx/relnotes/v0.4.0-beta.rst +++ b/docs/sphinx/relnotes/v0.4.0-beta.rst @@ -16,7 +16,7 @@ .. _v0.4.0-beta: -Beta.2 +v0.4.0-beta ====== CV-CUDA 0.4.0 is a major release of the library providing multiple new operators, Jetson Orin support, and updated API documentation. diff --git a/docs/sphinx/relnotes/v0.5.0-beta.rst b/docs/sphinx/relnotes/v0.5.0-beta.rst index bd3633197..a15c1b98b 100644 --- a/docs/sphinx/relnotes/v0.5.0-beta.rst +++ b/docs/sphinx/relnotes/v0.5.0-beta.rst @@ -16,7 +16,7 @@ .. _v0.5.0-beta: -Beta.3 +v0.5.0-beta ====== CV-CUDA 0.5.0 is a comprehensive update introducing new security, compliance, and performance enhancements, alongside bug fixes and new features. diff --git a/docs/sphinx/relnotes/v0.6.0-beta.rst b/docs/sphinx/relnotes/v0.6.0-beta.rst index ca0995a67..c199fb2bd 100644 --- a/docs/sphinx/relnotes/v0.6.0-beta.rst +++ b/docs/sphinx/relnotes/v0.6.0-beta.rst @@ -16,7 +16,7 @@ .. _v0.6.0-beta: -Beta.4 +v0.6.0-beta ====== CV-CUDA 0.6.0 is a comprehensive update introducing new packaging and documentation enhancements, along with bug fixes and new features. diff --git a/docs/sphinx/relnotes/v0.7.0-beta.rst b/docs/sphinx/relnotes/v0.7.0-beta.rst index 5ad3ae437..196d236b4 100644 --- a/docs/sphinx/relnotes/v0.7.0-beta.rst +++ b/docs/sphinx/relnotes/v0.7.0-beta.rst @@ -16,10 +16,10 @@ .. _v0.7.0-beta: -Beta.5 +v0.7.0-beta ====== -CV-CUDA 0.7.0 introduces performance and support enhancements, along with bug fixes and new features. +CV-CUDA v0.7.0 introduces performance and support enhancements, along with bug fixes and new features. 
Release Highlights ------------------ diff --git a/docs/sphinx/relnotes/v0.8.0-beta.rst b/docs/sphinx/relnotes/v0.8.0-beta.rst new file mode 100644 index 000000000..59e97fab6 --- /dev/null +++ b/docs/sphinx/relnotes/v0.8.0-beta.rst @@ -0,0 +1,66 @@ +.. + # SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +.. _v0.8.0-beta: + +v0.8.0-beta +====== + +Release Highlights +------------------ + +CV-CUDA v0.8.0 includes the following changes: + +* **New Operator**: + + * Introduced fused 'ResizeCropConvertReformat' operator + +* **New Features**: + + * Improved initialization of Image and ImageBatchVarShape: enabled efficient cache reuse + * Improved benchmarking utilities: added throughput computation and power/clock monitoring + * Added tests to Resize, BilateralFilter, CvtColor, Erase, JointBilateralFilter, PillowResize + +* **Bug Fixes**: + + * Fixed potential crash when using custom streams + * Switched PairwiseMatcher to use L2-norm as default + * Fixed documentation of CropFlipNormalizeReformat + + + +Compatibility and Known Limitations +----------------------------------- + +See main README on `CV-CUDA GitHub `_. + +License +------- + +CV-CUDA is licensed under the `Apache 2.0 `_ license. + +Resources +--------- + +1. `CV-CUDA GitHub `_ +2. `CV-CUDA Increasing Throughput and Reducing Costs for AI-Based Computer Vision with CV-CUDA `_ +3. `NVIDIA Announces Microsoft, Tencent, Baidu Adopting CV-CUDA for Computer Vision AI `_ +4. `CV-CUDA helps Tencent Cloud audio and video PaaS platform achieve full-process GPU acceleration for video enhancement AI `_ + +Acknowledgements +---------------- + +CV-CUDA is developed jointly by NVIDIA and the ByteDance Machine Learning team. diff --git a/python/mod_cvcuda/CMakeLists.txt b/python/mod_cvcuda/CMakeLists.txt index 66b53d87f..b533a105d 100644 --- a/python/mod_cvcuda/CMakeLists.txt +++ b/python/mod_cvcuda/CMakeLists.txt @@ -21,6 +21,8 @@ nvcv_python_add_module( OUTPUT_NAME cvcuda SOURCES Main.cpp + ChannelManipType.cpp + OpResizeCropConvertReformat.cpp OpPairwiseMatcher.cpp PairwiseMatcherType.cpp NormType.cpp diff --git a/python/mod_cvcuda/ChannelManipType.cpp b/python/mod_cvcuda/ChannelManipType.cpp new file mode 100644 index 000000000..e32085100 --- /dev/null +++ b/python/mod_cvcuda/ChannelManipType.cpp @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "ChannelManipType.hpp" + +#include + +namespace cvcudapy { + +void ExportChannelManipType(py::module &m) +{ + py::enum_(m, "ChannelManip") + .value("NO_OP", NVCV_CHANNEL_NO_OP, "No manipulation (i.e., channels are unchanged)") + .value("REVERSE", NVCV_CHANNEL_REVERSE, "Reverse channel order (e.g., RGB to BGR, RGBA to ABGR, etc.)"); +} + +} // namespace cvcudapy diff --git a/python/mod_cvcuda/ChannelManipType.hpp b/python/mod_cvcuda/ChannelManipType.hpp new file mode 100644 index 000000000..ee0a30ff1 --- /dev/null +++ b/python/mod_cvcuda/ChannelManipType.hpp @@ -0,0 +1,29 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NVCV_PYTHON_CHANNELMANIPTYPE_HPP +#define NVCV_PYTHON_CHANNELMANIPTYPE_HPP + +#include + +namespace cvcudapy { +namespace py = ::pybind11; + +void ExportChannelManipType(py::module &m); +} // namespace cvcudapy + +#endif // NVCV_PYTHON_CHANNELMANIPTYPE_HPP diff --git a/python/mod_cvcuda/Main.cpp b/python/mod_cvcuda/Main.cpp index aff67174b..f5c26574f 100644 --- a/python/mod_cvcuda/Main.cpp +++ b/python/mod_cvcuda/Main.cpp @@ -17,6 +17,7 @@ #include "AdaptiveThresholdType.hpp" #include "BorderType.hpp" +#include "ChannelManipType.hpp" #include "ColorConversionCode.hpp" #include "ConnectivityType.hpp" #include "InterpolationType.hpp" @@ -77,6 +78,7 @@ PYBIND11_MODULE(cvcuda, m) // Operators' auxiliary entities ExportInterpolationType(m); + ExportChannelManipType(m); ExportBorderType(m); ExportMorphologyType(m); ExportColorConversionCode(m); @@ -92,6 +94,7 @@ PYBIND11_MODULE(cvcuda, m) ExportPairwiseMatcherType(m); // CV-CUDA Operators + ExportOpResizeCropConvertReformat(m); ExportOpPairwiseMatcher(m); ExportOpLabel(m); ExportOpOSD(m); diff --git a/python/mod_cvcuda/OpCropFlipNormalizeReformat.cpp b/python/mod_cvcuda/OpCropFlipNormalizeReformat.cpp index 1eaacfce5..f5f8af139 100644 --- a/python/mod_cvcuda/OpCropFlipNormalizeReformat.cpp +++ b/python/mod_cvcuda/OpCropFlipNormalizeReformat.cpp @@ -102,9 +102,9 @@ void ExportOpCropFlipNormalizeReformat(py::module &m) rect (Tensor): The crop rectangle tensor which has shape of [batch_size, 1, 1, 4] in reference to the input tensor. The crop value of [crop_x, crop_y, crop_width, crop_height] stored in the final dimension of the crop tensor, provided per image. - flip_code (Tensor): A tensor flag to specify how to flip the array; 0 means flipping - around the x-axis and positive value (for example, 1) means flipping - around y-axis. 
Negative value (for example, -1) means flipping around both axes, provided per image. + flip_code (Tensor): A tensor flag to specify how to flip the array; 0 means flipping around the x-axis, + 1 means flipping around the y-axis, -1 means flipping around both axes, + and any other value will result in no flip, provided per image. base (Tensor): Tensor providing base values for normalization. scale (Tensor): Tensor providing scale values for normalization. globalscale (float ,optional): Additional scale value to be used in addition to scale @@ -142,9 +142,9 @@ void ExportOpCropFlipNormalizeReformat(py::module &m) rect (Tensor): The crop rectangle tensor which has shape of [batch_size, 1, 1, 4] in reference to the input tensor. The crop value of [crop_x, crop_y, crop_width, crop_height] stored in the final dimension of the crop tensor, provided per image. - flip_code (Tensor): A tensor flag to specify how to flip the array; 0 means flipping - around the x-axis and positive value (for example, 1) means flipping - around y-axis. Negative value (for example, -1) means flipping around both axes, provided per image. + flip_code (Tensor): A tensor flag to specify how to flip the array; 0 means flipping around the x-axis, + 1 means flipping around the y-axis, -1 means flipping around both axes, + and any other value will result in no flip, provided per image. base (Tensor): Tensor providing base values for normalization. scale (Tensor): Tensor providing scale values for normalization. globalscale (float ,optional): Additional scale value to be used in addition to scale diff --git a/python/mod_cvcuda/OpPairwiseMatcher.cpp b/python/mod_cvcuda/OpPairwiseMatcher.cpp index 195c1f19c..687570a03 100644 --- a/python/mod_cvcuda/OpPairwiseMatcher.cpp +++ b/python/mod_cvcuda/OpPairwiseMatcher.cpp @@ -44,7 +44,7 @@ TupleTensor3 PairwiseMatcherInto(Tensor &matches, std::optional numMatch if (!normType) { - normType = set1.dtype() == nvcv::TYPE_F32 ? NVCV_NORM_L2 : NVCV_NORM_HAMMING; + normType = NVCV_NORM_L2; } auto op = CreateOperator(algoChoice); @@ -149,8 +149,7 @@ void ExportOpPairwiseMatcher(py::module &m) cross_check (bool, optional): Use True to cross check best matches, a best match is only returned if it is the best match (minimum distance) from 1st set to 2nd set and vice versa. matches_per_point (Number, optional): Number of best matches to return per point. - norm_type (cvcuda.Norm, optional): Choice on how distances are normalized. Defaults to cvcuda.Norm.L2 - for float input and cvcuda.Norm.HAMMING for other input data types. + norm_type (cvcuda.Norm, optional): Choice on how distances are normalized. Defaults to cvcuda.Norm.L2. algo_choice (cvcuda.Matcher, optional): Choice of the algorithm to perform the match. stream (Stream, optional): CUDA Stream on which to perform the operation. @@ -186,8 +185,7 @@ void ExportOpPairwiseMatcher(py::module &m) cross_check (bool, optional): Use True to cross check best matches, a best match is only returned if it is the best match (minimum distance) from 1st set to 2nd set and vice versa. matches_per_point (Number, optional): Number of best matches to return per point. - norm_type (cvcuda.Norm, optional): Choice on how distances are normalized. Defaults to cvcuda.Norm.L2 - for float input and cvcuda.Norm.HAMMING for other input data types. + norm_type (cvcuda.Norm, optional): Choice on how distances are normalized. Defaults to cvcuda.Norm.L2. algo_choice (cvcuda.Matcher, optional): Choice of the algorithm to perform the match. 
stream (Stream, optional): CUDA Stream on which to perform the operation. diff --git a/python/mod_cvcuda/OpResizeCropConvertReformat.cpp b/python/mod_cvcuda/OpResizeCropConvertReformat.cpp new file mode 100644 index 000000000..a99c5072f --- /dev/null +++ b/python/mod_cvcuda/OpResizeCropConvertReformat.cpp @@ -0,0 +1,374 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Operators.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cvcudapy { + +namespace { +Tensor ResizeCropConvertReformatInto(Tensor &dst, Tensor &src, const std::tuple resizeDim, + NVCVInterpolationType interp, const std::tuple cropPos, + const NVCVChannelManip manip, std::optional pstream) +{ + if (!pstream) + { + pstream = Stream::Current(); + } + + auto resize = CreateOperator(); + + ResourceGuard guard(*pstream); + guard.add(LockMode::LOCK_MODE_READ, {src}); + guard.add(LockMode::LOCK_MODE_WRITE, {dst}); + guard.add(LockMode::LOCK_MODE_NONE, {*resize}); + + nvcv::Size2D size_wh{std::get<0>(resizeDim), std::get<1>(resizeDim)}; + int2 crop_xy{std::get<0>(cropPos), std::get<1>(cropPos)}; + + resize->submit(pstream->cudaHandle(), src, dst, size_wh, interp, crop_xy, manip); + + return std::move(dst); +} + +Tensor ResizeCropConvertReformat(Tensor &src, const std::tuple resizeDim, NVCVInterpolationType interp, + const NVCVRectI cropRect, const char *layout, nvcv::DataType dataType, + const NVCVChannelManip manip, std::optional pstream) +{ + nvcv::TensorLayout srcLayout = src.layout(); + + if (srcLayout != NVCV_TENSOR_HWC && srcLayout != NVCV_TENSOR_NHWC) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_IMAGE_FORMAT, + "Input tensor must have layout 'HWC' or 'NHWC'."); + } + + nvcv::TensorLayout dstLayout = (layout && *layout ? 
nvcv::TensorLayout(layout) : nvcv::TensorLayout("")); + + if (dstLayout.rank() == 0) + { + dstLayout = srcLayout; + } + + if (dstLayout.rank() != srcLayout.rank()) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_IMAGE_FORMAT, + "Output tensor rank must match input tensor rank."); + } + if (dstLayout != NVCV_TENSOR_HWC && dstLayout != NVCV_TENSOR_NHWC && dstLayout != NVCV_TENSOR_CHW + && dstLayout != NVCV_TENSOR_NCHW) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_IMAGE_FORMAT, + "Output tensor must have layout 'HWC', 'NHWC', 'CHW', or 'NCHW'."); + } + + nvcv::TensorShape srcShape = Permute(src.shape(), NVCV_TENSOR_NHWC); + + nvcv::TensorShape::ShapeType shape = srcShape.shape(); + + shape[2] = cropRect.width; + shape[1] = cropRect.height; + + nvcv::TensorShape dstShape = Permute(nvcv::TensorShape(shape, NVCV_TENSOR_NHWC), dstLayout); + + Tensor dst = Tensor::Create(dstShape, dataType); + + const std::tuple cropPos = std::make_tuple((int)cropRect.x, (int)cropRect.y); + + return ResizeCropConvertReformatInto(dst, src, resizeDim, interp, cropPos, manip, pstream); +} + +Tensor ResizeCropConvertReformatVarShapeInto(Tensor &dst, ImageBatchVarShape &src, const std::tuple resizeDim, + NVCVInterpolationType interp, const std::tuple cropPos, + const NVCVChannelManip manip, std::optional pstream) +{ + if (!pstream) + { + pstream = Stream::Current(); + } + + auto resize = CreateOperator(); + + ResourceGuard guard(*pstream); + guard.add(LockMode::LOCK_MODE_READ, {src}); + guard.add(LockMode::LOCK_MODE_WRITE, {dst}); + guard.add(LockMode::LOCK_MODE_NONE, {*resize}); + + nvcv::Size2D size_wh(std::get<0>(resizeDim), std::get<1>(resizeDim)); + int2 crop_xy{std::get<0>(cropPos), std::get<1>(cropPos)}; + + resize->submit(pstream->cudaHandle(), src, dst, size_wh, interp, crop_xy, manip); + + return std::move(dst); +} + +Tensor ResizeCropConvertReformatVarShape(ImageBatchVarShape &src, const std::tuple resizeDim, + NVCVInterpolationType interp, const NVCVRectI cropRect, const char *layout, + nvcv::DataType dataType, const NVCVChannelManip manip, + std::optional pstream) +{ + const nvcv::ImageFormat srcFrmt = src.uniqueFormat(); + if (!srcFrmt) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input must have same format across all images."); + } + + nvcv::TensorLayout dstLayout = (layout && *layout ? 
nvcv::TensorLayout(layout) : nvcv::TensorLayout("")); + + int channels = srcFrmt.numChannels(); + int images = src.numImages(); + + if (channels != 3) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input must have 3 channels."); + } + + if (srcFrmt != nvcv::FMT_RGB8 && srcFrmt != nvcv::FMT_BGR8) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input must have three interleaved, 8-bit channels in RGB or BGR format."); + } + + nvcv::TensorShape shape; + + if (dstLayout.rank() == 0) + { + if (srcFrmt == nvcv::FMT_RGB8 || srcFrmt == nvcv::FMT_BGR8) + { + shape = nvcv::TensorShape{ + {images, cropRect.height, cropRect.width, channels}, + NVCV_TENSOR_NHWC + }; + } + } + else + { + if (dstLayout == NVCV_TENSOR_NHWC || dstLayout == NVCV_TENSOR_HWC || dstLayout == NVCV_TENSOR_NHW + || dstLayout == NVCV_TENSOR_HW) + shape = nvcv::TensorShape{ + {images, cropRect.height, cropRect.width, channels}, + NVCV_TENSOR_NHWC + }; + else if (dstLayout == NVCV_TENSOR_NCHW || dstLayout == NVCV_TENSOR_CHW) + shape = nvcv::TensorShape{ + {images, channels, cropRect.height, cropRect.width}, + NVCV_TENSOR_NCHW + }; + else + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Destination layout must be 'HWC', 'NHWC', 'NHW', 'HW', 'NCHW', or 'CHW'."); + } + } + + Tensor dst = Tensor::Create(shape, dataType); + + const std::tuple cropPos = std::make_tuple((int)cropRect.x, (int)cropRect.y); + + return ResizeCropConvertReformatVarShapeInto(dst, src, resizeDim, interp, cropPos, manip, pstream); +} + +} // namespace + +void ExportOpResizeCropConvertReformat(py::module &m) +{ + using namespace pybind11::literals; + + py::options options; + options.disable_function_signatures(); + + m.def("resize_crop_convert_reformat", &ResizeCropConvertReformat, "src"_a, "resize_dim"_a, "interp"_a, + "crop_rect"_a, py::kw_only(), "layout"_a = "", "data_type"_a = NVCV_DATA_TYPE_NONE, + "manip"_a = NVCV_CHANNEL_NO_OP, "stream"_a = nullptr, R"pbdoc( + + cvcuda.resize_crop_convert_reformat(src: nvcv.Tensor, + resize_dim: tuple[int,int], + interp: cvcuda.Interp, + crop_rect: nvcv.RectI, + *, + layout: str = "", + data_type: nvcv.Type = 0, + manip: cvcuda.ChannelManip = cvcuda.ChannelManip.NO_OP, + stream: Optional[nvcv.cuda.Stream] = None) -> nvcv.Tensor + + Executes the ResizeCropConvertReformat operation on the given cuda stream. + + See also: + Refer to the CV-CUDA C API reference for the ResizeCropConvertReformat operator + for more details and usage examples. + + Args: + src (nvcv.Tensor): Input tensor containing one or more images. + resize_dim (tuple[int,int]): Dimensions, width & height, of resized tensor (prior to cropping). + interp (cvcuda.Interp): Interpolation type used for resizing. Currently, only NVCV_INTERP_NEAREST and + NVCV_INTERP_LINEAR are available. + crop_rect (nvcv.RectI): Crop rectangle, (top, left, width, height), specifying the top-left corner and + width & height dimensions of the region to crop from the resized images. + layout(string, optional): String specifying output tensor layout (e.g., 'NHWC' or 'CHW'). Empty string + (default) indicates output tensor layout copies input. + data_type(nvcv.Type, optional): Data type of output tensor channel (e.g., uint8 or float). 0 (default) + indicates output tensor data type copies input. + manip(cvcuda.ChannelManip, optional): Channel manipulation (e.g., shuffle RGB to BGR). NO_OP (default) + indicates output tensor channels are unchanged. 
+ stream (nvcv.cuda.Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + nvcv.Tensor: The output tensor. + + Caution: + Restrictions to several arguments may apply. Check the C + API references of the CV-CUDA operator. + )pbdoc"); + + m.def("resize_crop_convert_reformat_into", &ResizeCropConvertReformatInto, "dst"_a, "src"_a, "resize_dim"_a, + "interp"_a, "cropPos"_a, py::kw_only(), "manip"_a = NVCV_CHANNEL_NO_OP, "stream"_a = nullptr, R"pbdoc( + + cvcuda.resize_crop_convert_reformat_into(dst: nvcv.Tensor, + src: nvcv.Tensor, + resize_dim: tuple[int,int], + interp: cvcuda.Interp, + cropPos: tuple[int,int], + *, + manip: cvcuda.ChannelManip = cvcuda.ChannelManip.NO_OP, + stream: Optional[nvcv.cuda.Stream] = None) + + Executes the ResizeCropConvertReformat operation on the given cuda stream. + + See also: + Refer to the CV-CUDA C API reference for the ResizeCropConvertReformat operator + for more details and usage examples. + + Args: + dst (nvcv.Tensor): Output tensor to store the result of the operation. Output tensor also specifies the + crop dimensions (i.e., width & height), as well as the output data type (e.g., uchar3 + or float) and tensor layout (e.g., 'NHWC' or 'NCHW'). + src (nvcv.Tensor): Input tensor containing one or more images. + resize_dim (tuple[int,int]): Dimensions, width & height, of resized tensor (prior to cropping). + interp (cvcuda.Interp): Interpolation type used for resizing. Currently, only NVCV_INTERP_NEAREST and + NVCV_INTERP_LINEAR are available. + cropPos (tuple[int,int]): Crop position, (top, left), specifying the top-left corner of the region to crop + from the resized images. The crop region's width and height is specified by the + output tensor's width & height. + manip(cvcuda.ChannelManip, optional): Channel manipulation (e.g., shuffle RGB to BGR). NO_OP (default) + indicates output tensor channels are unchanged. + stream (nvcv.cuda.Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + None + + Caution: + Restrictions to several arguments may apply. Check the C + API references of the CV-CUDA operator. + )pbdoc"); + + m.def("resize_crop_convert_reformat", &ResizeCropConvertReformatVarShape, "src"_a, "resize_dim"_a, "interp"_a, + "crop_rect"_a, py::kw_only(), "layout"_a = "", "data_type"_a = NVCV_DATA_TYPE_NONE, + "manip"_a = NVCV_CHANNEL_NO_OP, "stream"_a = nullptr, R"pbdoc( + + cvcuda.resizeCropConvertReformat(src: nvcv.ImageBatchVarShape, + resize_dim: tuple[int,int], + interp: cvcuda.Interp, + crop_rect: nvcv.RectI, + *, + layout: str = "", + data_type: nvcv.Type = 0, + manip: cvcuda.ChannelManip = cvcuda.ChannelManip.NO_OP, + stream: Optional[nvcv.cuda.Stream] = None) -> nvcv.Tensor + + Executes the ResizeCropConvertReformat operation on the given cuda stream. + + See also: + Refer to the CV-CUDA C API reference for the ResizeCropConvertReformat operator + for more details and usage examples. + + Args: + src (ImageBatchVarShape): Input image batch containing one or more images of varying sizes, but all images + must have the same data type, channels, and layout. + resize_dim (tuple[int,int]): Dimensions, width & height, of resized tensor (prior to cropping). + interp (cvcuda.Interp): Interpolation type used for resizing. Currently, only NVCV_INTERP_NEAREST and + NVCV_INTERP_LINEAR are available. + crop_rect (nvcv.RectI): Crop rectangle, (top, left, width, height), specifying the top-left corner and + width & height dimensions of the region to crop from the resized images. 
+ layout(string, optional): String specifying output tensor layout (e.g., 'NHWC' or 'CHW'). Empty string + (default) indicates output tensor layout copies input. + data_type(nvcv.Type, optional): Data type of output tensor channel (e.g., uint8 or float). 0 (default) + indicates output tensor data type copies input. + manip(cvcuda.ChannelManip, optional): Channel manipulation (e.g., shuffle RGB to BGR). NO_OP (default) + indicates output tensor channels are unchanged. + stream (nvcv.cuda.Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + nvcv.Tensor: The output tensor. + + Caution: + Restrictions to several arguments may apply. Check the C + API references of the CV-CUDA operator. + )pbdoc"); + + m.def("resize_crop_convert_reformat_into", &ResizeCropConvertReformatVarShapeInto, "dst"_a, "src"_a, "resize_dim"_a, + "interp"_a, "cropPos"_a, py::kw_only(), "manip"_a = NVCV_CHANNEL_NO_OP, "stream"_a = nullptr, R"pbdoc( + + cvcuda.resize_crop_convert_reformat_into(dst: nvcv.Tensor, + src: nvcv.ImageBatchVarShape, + resize_dim: tuple[int,int], + interp: cvcuda.Interp, + cropPos: tuple[int,int], + *, + manip: cvcuda.ChannelManip = cvcuda.ChannelManip.NO_OP, + stream: Optional[nvcv.cuda.Stream] = None) + + Executes the ResizeCropConvertReformat operation on the given cuda stream. + + See also: + Refer to the CV-CUDA C API reference for the ResizeCropConvertReformat operator + for more details and usage examples. + + Args: + dst (nvcv.Tensor): Output tensor to store the result of the operation. Output tensor also specifies the + crop dimensions (i.e., width & height), as well as the output data type (e.g., uchar3 + or float) and tensor layout (e.g., 'NHWC' or 'NCHW'). + src (ImageBatchVarShape): Input image batch containing one or more images of varying sizes, but all images + must have the same data type, channels, and layout. + resize_dim (tuple[int,int]): Dimensions, width & height, of resized tensor (prior to cropping). + interp (cvcuda.Interp): Interpolation type used for resizing. Currently, only NVCV_INTERP_NEAREST and + NVCV_INTERP_LINEAR are available. + cropPos (tuple[int,int]): Crop position, (top, left), specifying the top-left corner of the region to + crop from the resized images. The crop region's width and height is specified by + the output tensor's width & height. + manip(cvcuda.ChannelManip, optional): Channel manipulation (e.g., shuffle RGB to BGR). NO_OP (default) + indicates output tensor channels are unchanged. + stream (nvcv.cuda.Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + None + + Caution: + Restrictions to several arguments may apply. Check the C + API references of the CV-CUDA operator. + )pbdoc"); +} + +} // namespace cvcudapy diff --git a/python/mod_cvcuda/Operators.hpp b/python/mod_cvcuda/Operators.hpp index 6197e43ba..55307fcff 100644 --- a/python/mod_cvcuda/Operators.hpp +++ b/python/mod_cvcuda/Operators.hpp @@ -99,6 +99,7 @@ void ExportOpLabel(py::module &m); void ExportOpPairwiseMatcher(py::module &m); void ExportOpStack(py::module &m); void ExportOpFindHomography(py::module &m); +void ExportOpResizeCropConvertReformat(py::module &m); // Helper class that serves as generic python-side operator class. 
// OP: native operator class diff --git a/python/mod_nvcv/Cache.cpp b/python/mod_nvcv/Cache.cpp index 23053439d..29c6d8d5f 100644 --- a/python/mod_nvcv/Cache.cpp +++ b/python/mod_nvcv/Cache.cpp @@ -151,6 +151,37 @@ std::vector> Cache::fetch(const IKey &key) const return v; } +#ifndef NDEBUG +void Cache::dbgPrintCacheForKey(const IKey &key, const std::string &prefix) +{ + std::vector> v; + std::unique_lock lk(pimpl->mtx); + auto itrange = pimpl->items.equal_range(&key); + + for (auto it = itrange.first; it != itrange.second; ++it) + { + std::cerr << prefix << typeid(*(it->second)).name() << " - " << it->second.use_count() << std::endl; + } +} +#endif + +std::shared_ptr Cache::fetchOne(const IKey &key) const +{ + std::unique_lock lk(pimpl->mtx); + + auto itrange = pimpl->items.equal_range(&key); + + for (auto it = itrange.first; it != itrange.second; ++it) + { + if (!it->second->isInUse()) + { + return it->second; + } + } + + return {}; +} + void Cache::clear() { std::unique_lock lk(pimpl->mtx); diff --git a/python/mod_nvcv/Cache.hpp b/python/mod_nvcv/Cache.hpp index c933c9196..fdc962911 100644 --- a/python/mod_nvcv/Cache.hpp +++ b/python/mod_nvcv/Cache.hpp @@ -76,6 +76,12 @@ class PYBIND11_EXPORT Cache void removeAllNotInUseMatching(const IKey &key); std::vector> fetch(const IKey &key) const; + std::shared_ptr fetchOne(const IKey &key) const; + +#ifndef NDEBUG + // Make this function available only in Debug builds + void dbgPrintCacheForKey(const IKey &key, const std::string &prefix = ""); +#endif template std::vector> fetchAll() const diff --git a/python/mod_nvcv/Image.cpp b/python/mod_nvcv/Image.cpp index 703611db7..393937306 100644 --- a/python/mod_nvcv/Image.cpp +++ b/python/mod_nvcv/Image.cpp @@ -463,28 +463,7 @@ Image::Image(std::vector> bufs, const nvcv::Imag { m_wrapData.emplace(); - NVCV_ASSERT(bufs.size() >= 1); - m_wrapData->devType = bufs[0]->dlTensor().device.device_type; - - if (bufs.size() == 1) - { - m_wrapData->obj = py::cast(bufs[0]); - } - else - { - for (size_t i = 1; i < bufs.size(); ++i) - { - if (bufs[i]->dlTensor().device.device_type != bufs[0]->dlTensor().device.device_type - || bufs[i]->dlTensor().device.device_id != bufs[0]->dlTensor().device.device_id) - { - throw std::runtime_error("All buffers must belong to the same device, but some don't."); - } - } - - m_wrapData->obj = py::cast(std::move(bufs)); - } - - m_impl = nvcv::ImageWrapData(imgData); + this->setWrapData(std::move(bufs), imgData); } Image::Image(std::vector bufs, const nvcv::ImageDataStridedHost &hostData, int rowAlign) @@ -571,6 +550,52 @@ std::shared_ptr Image::WrapExternalBuffer(ExternalBuffer &buffer, nvcv::I return WrapExternalBufferVector({obj}, fmt); } +std::vector> Image::WrapExternalBufferMany(std::vector> &buffers, + nvcv::ImageFormat fmt) +{ + // This is the key of an image wrapper. + // All image wrappers have the same key. + Image::Key key; + + std::vector> items = Cache::Instance().fetch(key); + + std::vector> out; + out.reserve(buffers.size()); + + for (size_t i = 0; i < buffers.size(); ++i) + { + std::vector> spBuffers; + spBuffers.push_back(buffers[i]); + + if (!spBuffers.back()) + throw std::runtime_error("Input buffer doesn't provide cuda_array_interface or DLPack interfaces"); + + std::vector bufinfos; + bufinfos.emplace_back(spBuffers[0]->dlTensor()); + nvcv::ImageDataStridedCuda imgData = CreateNVCVImageDataCuda(bufinfos, fmt); + + // None found? 
+ if (items.empty()) + { + // Need to add wrappers into cache so that they don't get destroyed by + // the cuda stream when they're last used, and python script isn't + // holding a reference to them. If we don't do it, things might break. + std::shared_ptr img(new Image(std::move(spBuffers), imgData)); + Cache::Instance().add(*img); + out.push_back(img); + } + else + { + std::shared_ptr img = std::static_pointer_cast(items.back()); + items.pop_back(); + img->setWrapData(std::move(spBuffers), imgData); + out.push_back(img); + } + } + + return out; +} + std::shared_ptr Image::WrapExternalBufferVector(std::vector buffers, nvcv::ImageFormat fmt) { std::vector> spBuffers; @@ -594,16 +619,55 @@ std::shared_ptr Image::WrapExternalBufferVector(std::vector b // This is the key of an image wrapper. // All image wrappers have the same key. Image::Key key; - // We take this opportunity to remove from cache all wrappers that aren't - // being used. They aren't reusable anyway. - Cache::Instance().removeAllNotInUseMatching(key); - // Need to add wrappers to cache so that they don't get destroyed by - // the cuda stream when they're last used, and python script isn't - // holding a reference to them. If we don't do it, things might break. - std::shared_ptr img(new Image(std::move(spBuffers), imgData)); - Cache::Instance().add(*img); - return img; + std::shared_ptr item = Cache::Instance().fetchOne(key); + + // None found? + if (!item) + { + // Need to add wrappers into cache so that they don't get destroyed by + // the cuda stream when they're last used, and python script isn't + // holding a reference to them. If we don't do it, things might break. + std::shared_ptr img(new Image(std::move(spBuffers), imgData)); + Cache::Instance().add(*img); + return img; + } + else + { + std::shared_ptr img = std::static_pointer_cast(item); + img->setWrapData(std::move(spBuffers), imgData); + return img; + } +} + +void Image::setWrapData(std::vector> bufs, const nvcv::ImageDataStridedCuda &imgData) +{ + NVCV_ASSERT(m_wrapData); + + NVCV_ASSERT(bufs.size() >= 1); + m_wrapData->devType = bufs[0]->dlTensor().device.device_type; + + if (bufs.size() == 1) + { + m_wrapData->obj = py::cast(bufs[0]); + } + else + { + for (size_t i = 1; i < bufs.size(); ++i) + { + if (bufs[i]->dlTensor().device.device_type != bufs[0]->dlTensor().device.device_type + || bufs[i]->dlTensor().device.device_id != bufs[0]->dlTensor().device.device_id) + { + throw std::runtime_error("All buffers must belong to the same device, but some don't."); + } + } + + m_wrapData->obj = py::cast(std::move(bufs)); + } + + //We recreate the nvcv::Image wrapper (m_impl) because it's cheap. + //It's not cheap to create nvcvpy::Image as it might have allocated expensive resources (cudaEvent_t in Resource parent). 
+ m_impl = nvcv::ImageWrapData(imgData); } std::shared_ptr Image::CreateHost(py::buffer buffer, nvcv::ImageFormat fmt, int rowAlign) diff --git a/python/mod_nvcv/Image.hpp b/python/mod_nvcv/Image.hpp index eb3c7f768..4bf10a887 100644 --- a/python/mod_nvcv/Image.hpp +++ b/python/mod_nvcv/Image.hpp @@ -46,7 +46,9 @@ class Image final : public Container static std::shared_ptr CreateHost(py::buffer buffer, nvcv::ImageFormat fmt, int rowAlign); static std::shared_ptr CreateHostVector(std::vector buffer, nvcv::ImageFormat fmt, int rowAlign); - static std::shared_ptr WrapExternalBuffer(ExternalBuffer &buffer, nvcv::ImageFormat fmt); + static std::shared_ptr WrapExternalBuffer(ExternalBuffer &buffer, nvcv::ImageFormat fmt); + static std::vector> WrapExternalBufferMany( + std::vector> &buffer, nvcv::ImageFormat fmt); static std::shared_ptr WrapExternalBufferVector(std::vector buffer, nvcv::ImageFormat fmt); std::shared_ptr shared_from_this(); @@ -106,6 +108,8 @@ class Image final : public Container explicit Image(std::vector> buf, const nvcv::ImageDataStridedCuda &imgData); explicit Image(std::vector buf, const nvcv::ImageDataStridedHost &imgData, int rowalign); + void setWrapData(std::vector> buf, const nvcv::ImageDataStridedCuda &imgData); + nvcv::Image m_impl; // must come before m_key Key m_key; diff --git a/python/mod_nvcv/ImageBatch.cpp b/python/mod_nvcv/ImageBatch.cpp index 58831fe9e..c0595b22b 100644 --- a/python/mod_nvcv/ImageBatch.cpp +++ b/python/mod_nvcv/ImageBatch.cpp @@ -51,7 +51,8 @@ std::shared_ptr ImageBatchVarShape::Create(int capacity) else { // Get the first one - auto batch = std::static_pointer_cast(vcont[0]); + auto batch = std::static_pointer_cast(vcont[0]); + Image::Key key; batch->clear(); // make sure it's in pristine state return batch; } @@ -60,17 +61,22 @@ std::shared_ptr ImageBatchVarShape::Create(int capacity) std::shared_ptr ImageBatchVarShape::WrapExternalBufferVector(std::vector buffers, nvcv::ImageFormat fmt) { - auto batch = Create(buffers.size()); - for (auto &obj : buffers) + std::vector> buflist; + buflist.reserve(buffers.size()); + + for (py::object &obj : buffers) { std::shared_ptr buffer = cast_py_object_as(obj); if (!buffer) { throw std::runtime_error("Input buffer doesn't provide cuda_array_interface or DLPack interfaces"); } - auto image = Image::WrapExternalBuffer(*buffer, fmt); - batch->pushBack(*image); + buflist.push_back(buffer); } + + std::shared_ptr batch = Create(buffers.size()); + batch->pushBackMany(Image::WrapExternalBufferMany(buflist, fmt)); + return batch; } @@ -127,16 +133,24 @@ void ImageBatchVarShape::pushBack(Image &img) m_list.push_back(img.shared_from_this()); } -void ImageBatchVarShape::pushBackMany(std::vector> &imgList) +void ImageBatchVarShape::pushBackMany(const std::vector> &imgList) { - for (auto &img : imgList) + std::vector handlelist; + handlelist.reserve(imgList.size()); + for (size_t i = 0; i < imgList.size(); ++i) { - m_list.push_back(img); - if (img) - m_impl.pushBack(img->impl()); + if (imgList[i]) + { + handlelist.push_back(imgList[i]->impl().handle()); + } else - m_impl.pushBack(nvcv::Image()); + { + handlelist.push_back(nullptr); + } + m_list.push_back(imgList[i]); } + + nvcv::detail::CheckThrow(nvcvImageBatchVarShapePushImages(m_impl.handle(), handlelist.data(), handlelist.size())); } void ImageBatchVarShape::popBack(int imgCount) diff --git a/python/mod_nvcv/ImageBatch.hpp b/python/mod_nvcv/ImageBatch.hpp index 7006d4cfd..ffa58d865 100644 --- a/python/mod_nvcv/ImageBatch.hpp +++ b/python/mod_nvcv/ImageBatch.hpp @@ 
-55,7 +55,7 @@ class ImageBatchVarShape : public Container Size2D maxSize() const; void pushBack(Image &img); - void pushBackMany(std::vector> &imgList); + void pushBackMany(const std::vector> &imgList); void popBack(int imgCount); void clear(); diff --git a/python/mod_nvcv/Main.cpp b/python/mod_nvcv/Main.cpp index d02bf3896..372f02a24 100644 --- a/python/mod_nvcv/Main.cpp +++ b/python/mod_nvcv/Main.cpp @@ -48,24 +48,32 @@ PYBIND11_MODULE(nvcv, m) using namespace nvcvpy::priv; - // Core entities - Cache::Export(m); - - { - py::module_ cuda = m.def_submodule("cuda"); - Stream::Export(cuda); - } + // These will be destroyed in the reverse order here + // Since everything is ref counted the order should not matter + // but it is safer to ini them in order + // Core entities + ExportCAPI(m); + Resource::Export(m); + Container::Export(m); ExternalBuffer::Export(m); + + // Supporting objects ExportImageFormat(m); ExportDataType(m); ExportRect(m); - Resource::Export(m); - Container::Export(m); + ExportColorSpec(m); + + // Objects Tensor::Export(m); TensorBatch::Export(m); Image::Export(m); ImageBatchVarShape::Export(m); - ExportCAPI(m); - ExportColorSpec(m); + + // Cache and Streams + Cache::Export(m); + { + py::module_ cuda = m.def_submodule("cuda"); + Stream::Export(cuda); + } } diff --git a/python/mod_nvcv/Resource.cpp b/python/mod_nvcv/Resource.cpp index a8d0fe67a..5aea1d631 100644 --- a/python/mod_nvcv/Resource.cpp +++ b/python/mod_nvcv/Resource.cpp @@ -31,15 +31,6 @@ Resource::Resource() m_id = idnext++; m_event = nullptr; - try - { - util::CheckThrow(cudaEventCreateWithFlags(&m_event, cudaEventDisableTiming)); - } - catch (...) - { - cudaEventDestroy(m_event); - throw; - } } Resource::~Resource() @@ -52,6 +43,15 @@ uint64_t Resource::id() const return m_id; } +cudaEvent_t Resource::event() +{ + if (m_event == nullptr) + { + util::CheckThrow(cudaEventCreateWithFlags(&m_event, cudaEventDisableTiming)); + } + return m_event; +} + void Resource::submitSync(Stream &stream) { //Check if we have a last stream, if not set it to the current stream @@ -69,8 +69,8 @@ void Resource::submitSync(Stream &stream) // if we are on a different stream we need to wait for that stream to finish // write event on the old stream, the new stream will have to wait for it to be done - util::CheckThrow(cudaEventRecord(m_event, m_lastStream.value()->handle())); - util::CheckThrow(cudaStreamWaitEvent(stream.handle(), m_event)); + util::CheckThrow(cudaEventRecord(event(), m_lastStream.value()->handle())); + util::CheckThrow(cudaStreamWaitEvent(stream.handle(), event())); // update the last stream since we changed streams m_lastStream.reset(); diff --git a/python/mod_nvcv/Resource.hpp b/python/mod_nvcv/Resource.hpp index 010c8b33c..f66b4b5ac 100644 --- a/python/mod_nvcv/Resource.hpp +++ b/python/mod_nvcv/Resource.hpp @@ -91,6 +91,8 @@ class PYBIND11_EXPORT Resource : public virtual Object uint64_t m_id; /**< The unique identifier of the resource. */ cudaEvent_t m_event; /**< The CUDA event used for synchronization. */ std::optional> m_lastStream; /**< Cache the last stream used for this resource. 
*/ + + cudaEvent_t event(); }; } // namespace nvcvpy::priv diff --git a/python/mod_nvcv/Stream.cpp b/python/mod_nvcv/Stream.cpp index bd3c1f9dc..39afa8203 100644 --- a/python/mod_nvcv/Stream.cpp +++ b/python/mod_nvcv/Stream.cpp @@ -436,24 +436,32 @@ void Stream::Export(py::module &m) util::RegisterCleanup(m, [globalStream]() { - for (std::shared_ptr stream : Cache::Instance().fetchAll()) + try { - stream->sync(); + for (std::shared_ptr stream : Cache::Instance().fetchAll()) + { + stream->sync(); + } + globalStream->sync(); + + // There should only be 1 stream in the stack, namely the + // global stream. + auto s = StreamStack::Instance().top(); + if (s != globalStream) + { + std::cerr << "Stream stack leak detected" << std::endl; + } + + // Make sure stream stack is empty + while (auto s = StreamStack::Instance().top()) + { + StreamStack::Instance().pop(); + } } - globalStream->sync(); - - // There should only be 1 stream in the stack, namely the - // global stream. - auto s = StreamStack::Instance().top(); - if (s != globalStream) - { - std::cerr << "Stream stack leak detected" << std::endl; - } - - // Make sure stream stack is empty - while (auto s = StreamStack::Instance().top()) + catch (const std::exception &e) { - StreamStack::Instance().pop(); + //Do nothing here this can happen if someone closes the cuda context prior to exit. + std::cerr << "Warning CVCUDA cleanup may be incomplete due to: " << e.what() << "\n"; } }); } diff --git a/samples/common/python/perf_utils.py b/samples/common/python/perf_utils.py index 7c32a2bdc..1d854337f 100644 --- a/samples/common/python/perf_utils.py +++ b/samples/common/python/perf_utils.py @@ -218,9 +218,12 @@ def finalize(self): # "data" : { # ... # } - # "mean_data" : { + # "data_stats_minus_warmup" : { # ... # } + # "gpu_metrics" : { + # ... + # }, # "batch_info" : { # ... # } @@ -234,8 +237,9 @@ def finalize(self): # # The data field stores timing info of all batches keyed with raw flattened # names of the NVTX push/pop ranges. - # The mean_data stores the mean timing info for NVTX ranges across - # all the batches. + # The data_stats_minus_warmup stores various stats (e.g. mean, median) for + # NVTX ranges across all the batches. + # The gpu_metrics field stores various GPU metrics (e.g. power and utilization) # The batch_info stores the batch index and batch size of each batch. # The inside batch info is list of NVTX range names which executed inside # a batch. @@ -248,7 +252,8 @@ def finalize(self): # benchmark_dict = { "data": self.timing_info, - "mean_data": {}, + "data_stats_minus_warmup": {}, + "gpu_metrics": {}, "batch_info": self.batch_info, "inside_batch_info": self.inside_batch_info, "meta": {}, @@ -296,12 +301,13 @@ def finalize(self): return {} -def maximize_clocks(logger): +def maximize_clocks(logger, device_id=0): """ Maximizes the GPU clocks. Useful to do it before any type of performance benchmarking. + :param device_id: The GPU device ID whose clocks should be maximized. 
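+    :return: Tuple of (did_maximize_clocks, was_persistence_mode_on, current_power_limit).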
""" - logger.info("Trying to maximize the GPU clocks...") + logger.info("Trying to maximize the GPU clocks for device: %d" % device_id) gpu_available = torch.cuda.device_count() > 0 @@ -353,6 +359,7 @@ def maximize_clocks(logger): "nvidia-smi", "--query-gpu=power.limit", "--format=csv,nounits,noheader", + "-i=%d" % device_id, ], stdout=subprocess.PIPE, ) @@ -365,6 +372,7 @@ def maximize_clocks(logger): proc_ret = subprocess.run( [ "nvidia-smi", + "-i=%d" % device_id, "--query-gpu=power.max_limit", "--format=csv,nounits,noheader", ], @@ -378,6 +386,7 @@ def maximize_clocks(logger): proc_ret = subprocess.run( [ "nvidia-smi", + "-i=%d" % device_id, "--power-limit=%f" % max_power_limit, ], stdout=subprocess.PIPE, @@ -392,6 +401,7 @@ def maximize_clocks(logger): proc_ret = subprocess.run( [ "nvidia-smi", + "-i=%d" % device_id, "--query-gpu=clocks.max.graphics", "--format=csv,nounits,noheader", ], @@ -407,6 +417,7 @@ def maximize_clocks(logger): proc_ret = subprocess.run( [ "nvidia-smi", + "-i=%d" % device_id, "--lock-gpu-clocks=%d,%d" % (max_graphics_clock, max_graphics_clock), ], stdout=subprocess.PIPE, @@ -421,6 +432,7 @@ def maximize_clocks(logger): proc_ret = subprocess.run( [ "nvidia-smi", + "-i=%d" % device_id, "--query-gpu=clocks.max.memory", "--format=csv,nounits,noheader", ], @@ -436,6 +448,7 @@ def maximize_clocks(logger): proc_ret = subprocess.run( [ "nvidia-smi", + "-i=%d" % device_id, "--lock-memory-clocks=%d,%d" % (max_memory_clock, max_memory_clock), ], stdout=subprocess.PIPE, @@ -451,6 +464,7 @@ def maximize_clocks(logger): proc_ret = subprocess.run( [ "nvidia-smi", + "-i=%d" % device_id, "--applications-clocks=%d,%d" % (max_memory_clock, max_graphics_clock), ], stdout=subprocess.PIPE, @@ -465,7 +479,12 @@ def maximize_clocks(logger): # 7. Get the GPU Performance State. P0 state means the most performance. proc_ret = subprocess.run( - ["nvidia-smi", "--query-gpu=pstate", "--format=csv,nounits,noheader"], + [ + "nvidia-smi", + "-i=%d" % device_id, + "--query-gpu=pstate", + "--format=csv,nounits,noheader", + ], stdout=subprocess.PIPE, ) if proc_ret.returncode: @@ -477,22 +496,26 @@ def maximize_clocks(logger): logger.info("Current GPU performance state is %s." % gpu_perf_state) if gpu_perf_state == "P0": - logger.info("Clocks are now maximized.") + logger.info("Clocks for device %d are now maximized." % device_id) return True, was_persistence_mode_on, current_power_limit else: - logger.info("Unable to maximize all GPU clocks to reach the P0 state.") + logger.info( + "Unable to maximize GPU clocks of device %d to reach the P0 state." + % device_id + ) return False, was_persistence_mode_on, current_power_limit def reset_clocks( logger, + device_id=0, was_persistence_mode_on=False, current_power_limit=None, ): """ Resets the GPU clocks. """ - logger.info("Trying to reset the GPU clocks...") + logger.info("Trying to reset the GPU clocks for device: %d" % device_id) gpu_available = torch.cuda.device_count() > 0 @@ -501,7 +524,8 @@ def reset_clocks( # 1. Reset the memory clock. proc_ret = subprocess.run( - ["nvidia-smi", "--reset-memory-clocks"], stdout=subprocess.PIPE + ["nvidia-smi", "-i=%d" % device_id, "--reset-memory-clocks"], + stdout=subprocess.PIPE, ) if proc_ret.returncode: logger.warning("Unable to reset the memory clock back to normal.") @@ -510,7 +534,8 @@ def reset_clocks( # 2. Reset GPU clock. 
proc_ret = subprocess.run( - ["nvidia-smi", "--reset-gpu-clocks"], stdout=subprocess.PIPE + ["nvidia-smi", "-i=%d" % device_id, "--reset-gpu-clocks"], + stdout=subprocess.PIPE, ) if proc_ret.returncode: logger.warning("Unable to reset the GPU clock back to normal.") @@ -519,7 +544,8 @@ def reset_clocks( # 3. Reset application clocks. proc_ret = subprocess.run( - ["nvidia-smi", "--reset-applications-clocks"], stdout=subprocess.PIPE + ["nvidia-smi", "-i=%d" % device_id, "--reset-applications-clocks"], + stdout=subprocess.PIPE, ) if proc_ret.returncode: logger.warning("Unable to reset the application clocks back to normal.") @@ -531,6 +557,7 @@ def reset_clocks( proc_ret = subprocess.run( [ "nvidia-smi", + "-i=%d" % device_id, "--power-limit=%f" % current_power_limit, ], stdout=subprocess.PIPE, @@ -562,7 +589,12 @@ def reset_clocks( # 7. Get GPU Performance State proc_ret = subprocess.run( - ["nvidia-smi", "--query-gpu=pstate", "--format=csv,nounits,noheader"], + [ + "nvidia-smi", + "-i=%d" % device_id, + "--query-gpu=pstate", + "--format=csv,nounits,noheader", + ], stdout=subprocess.PIPE, ) if proc_ret.returncode: @@ -774,12 +806,12 @@ def summarize_runs( """ def _parse_json_for_time(json_data): - mean_all_batches = json_data["mean_all_batches"] - sample_name_key = list(mean_all_batches.keys())[0] + mean_all_procs = json_data["data_mean_all_procs"] + sample_name_key = list(mean_all_procs.keys())[0] - cpu_time_minus_warmup_per_item = mean_all_batches[sample_name_key][ - "run_sample" - ]["pipeline"]["cpu_time_minus_warmup_per_item"] + cpu_time_minus_warmup_per_item = mean_all_procs[sample_name_key]["run_sample"][ + "pipeline" + ]["cpu_time_minus_warmup_per_item"]["mean"] return cpu_time_minus_warmup_per_item diff --git a/samples/scripts/benchmark.py b/samples/scripts/benchmark.py index 938f080c8..57507f2e6 100644 --- a/samples/scripts/benchmark.py +++ b/samples/scripts/benchmark.py @@ -20,10 +20,14 @@ import os import sys import json +import time import logging import argparse import subprocess +import numpy as np +import pandas as pd import multiprocessing as mp +import matplotlib.pyplot as plt common_dir = os.path.join( os.path.dirname(os.path.dirname(os.path.abspath(__file__))), @@ -37,7 +41,7 @@ class NvtxRangeTimeInfo: """ - A class encapsulating the time information associated with an NVTX range. + A data class to hold the time information of an NVTX range. """ def __init__(self, start_ms, end_ms): @@ -58,7 +62,7 @@ def duration_ms(self): class NvtxRange: """ - A class representing an NVTX range with its timing information. + A data class representing an NVTX range with its CPU and GPU time information. """ def __init__( @@ -395,116 +399,156 @@ def calc_mean_ranges(all_range_info): return mean_range_info -class MeanDictInfo: +class NumpyValuesEncoder(json.JSONEncoder): """ - A small data structure to help track various stats over multiple dictionaries. - For example, we can use to create one dictionary that represents the sum of - many dictionaries. In that, this data structure can track the total value - (i.e the sum) and how many items were added into making that sum (i.e len). + Helps encode various Numpy data-types correctly in the JSON encoder. 
""" - def __init__(self, value): - self.value = value - self.len = 0 + def default(self, obj): + if isinstance(obj, np.integer): + return int(obj) + if isinstance(obj, np.floating): + return float(obj) + if isinstance(obj, np.ndarray): + return obj.tolist() + return super(NumpyValuesEncoder, self).default(obj) -def recurse_sum_dict(input_dict, target_dict): +def recurse_gather_dict(input_dict, target_dict): """ - Recursively sums up value of all keys of input_dict in another dictionary. - This is useful for computing the total (or the mean eventually) of all the - keys in a dictionary. - This function uses and inserts a special object `MeanDictInfo` as values of - the leaf nodes. That object is useful to track not just the sum total values - but also how many items were summed up in there. + Recursively gathers values of all keys of input_dict in another dictionary. + This is useful for computing various stats on the data such as mean, median or + std-dev of all the keys in a dictionary. + This function collects data in a list. :param input_dict: The dictionary that should be used as input. :param target_dict: The single dictionary in which all the sums should be gathered. """ + assert type(input_dict) is type(target_dict) + assert isinstance(input_dict, dict) # Loop over all the keys in the input dictionary. for key in input_dict: + if key in ["total_items", "total_items_warmup", "total_items_minus_warmup"]: + continue # We skip these key. + # Check if the value is another dictionary. - if isinstance(input_dict[key], dict): + elif isinstance(input_dict[key], dict): # Create this if our target_dict did not already have it. if key not in target_dict: target_dict[key] = {} # Recurse the same function again. - recurse_sum_dict(input_dict[key], target_dict[key]) + recurse_gather_dict(input_dict[key], target_dict[key]) - # Check if the value is a list or tuple. + # Check if the value is a list or tuple. We will store inside a list of lists. elif isinstance(input_dict[key], list) or isinstance(input_dict[key], tuple): - - # Create this if our target_dict did not already have it. Instead - # of saving just the sums, we will save both, the sum and the len - # telling us how many numbers were summed up. We use the MeanDictInfo - # object for this. + # Create this if our target_dict did not already have it. if key not in target_dict: - target_dict[key] = MeanDictInfo(value=[]) + target_dict[key] = [] for _ in range(len(input_dict[key])): - target_dict[key].value.append(0) + target_dict[key].append([]) # This creates list of lists. for i in range(len(input_dict[key])): - target_dict[key].value[i] += input_dict[key][i] - - # Increment the length. - target_dict[key].len += 1 + target_dict[key][i].append(input_dict[key][i]) - # For anything else, we assume it was a number. + # For anything else, we assume it was a number. We will store inside a list. else: if key not in target_dict: - target_dict[key] = MeanDictInfo(value=0) + target_dict[key] = [] - target_dict[key].value += input_dict[key] - target_dict[key].len += 1 + target_dict[key].append(input_dict[key]) -def recurse_divide_dict(input_dict, divide_by=None): +def recurse_calc_stats_dict( + input_dict, + compute_mean_only=False, + compute_throughput=False, + throughput_multiplier=1, +): """ - Recursively divides the value of all keys of input_dict. - The denominator can be a fixed value `divide_by` or it can be dynamically - inferred from the length attributes of the MeanDictInfo object. 
+ Recursively calculates various stats on the value of all keys of input_dict. :param input_dict: The dictionary that should be used as input. - :param divide_by: Optional value to use in the denominator. + :param compute_mean_only: A flag indicating whether only the mean should be computed or not. + Computes a lot of other stats (e.g. median, min, max...) if set to False. + :param compute_throughput: A flag indicating whether throughput should be computed or not. + Only set to True when running in parallel with all resources maximized otherwise throughput + calculation may give incorrect results. + :param throughput_multiplier: A number with which the throughput is multiplied to calculate the + total throughput. Usually set to the number of parallel processes or threads executing in parallel. """ # Loop over all the keys in the input dictionary. - for key in input_dict: + for key in list(input_dict.keys()): # Check if the value is another dictionary. if isinstance(input_dict[key], dict): - recurse_divide_dict(input_dict[key], divide_by) - - # Check if the value is a MeanDictInfo object. - elif isinstance(input_dict[key], MeanDictInfo): - if isinstance(input_dict[key].value, list) or isinstance( - input_dict[key].value, tuple - ): - # Use the user given divide by if supplied or else use the len - divide_by = divide_by if divide_by else input_dict[key].len - - for i in range(len(input_dict[key].value)): - input_dict[key].value[i] /= divide_by - input_dict[key].value[i] = round(input_dict[key].value[i], 4) - else: - divide_by = divide_by if divide_by else input_dict[key].len - - input_dict[key].value /= divide_by - input_dict[key].value = round(input_dict[key].value, 4) - - # Remove the MeanDictInfo object and store the value directly. - input_dict[key] = input_dict[key].value - - elif isinstance(input_dict[key], list) or isinstance(input_dict[key], tuple): - if divide_by is None: - raise ValueError( - "divide_by must not be None when the values of the dictionary are " - "not MeanDictInfo objects." - ) - - for i in range(len(input_dict[key].value)): - input_dict[key][i] /= divide_by - input_dict[key][i] = round(input_dict[key][i], 4) + recurse_calc_stats_dict( + input_dict[key], + compute_mean_only, + compute_throughput, + throughput_multiplier, + ) else: - input_dict[key] /= divide_by - input_dict[key] = round(input_dict[key], 4) + assert isinstance(input_dict[key], list) + + # Compute all stats. + if compute_mean_only: + stats_dict = { + "total_items": len(input_dict[key]), + "mean": round(np.mean(input_dict[key], axis=-1), 4), + } + else: + stats_dict = { + "total_items": len(input_dict[key]), + "min": round(np.min(input_dict[key], axis=-1), 4), + "max": round(np.max(input_dict[key], axis=-1), 4), + "mean": round(np.mean(input_dict[key], axis=-1), 4), + "std": round(np.std(input_dict[key], axis=-1), 4), + "median": round(np.median(input_dict[key], axis=-1), 4), + "percentile_95": round( + np.percentile(input_dict[key], 95, axis=-1), 4 + ), + } + + if compute_throughput: + throughput_unit = ( + "frames_per_second" + if "_per_item" in key + else "batches_per_second" + ) + stats_dict["throughput"] = { + "multiplier": throughput_multiplier, + "unit": throughput_unit, + # NOTE: Minimum throughput corresponds to maximum latency. + "min": round( + 1000 * throughput_multiplier / stats_dict["max"], 2 + ) + if stats_dict["max"] > 0 + else 0, + # NOTE: Maximum throughput corresponds to minimum latency. 
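+                    # For example (hypothetical numbers): a minimum per-item latency of
+                    # 2.0 ms with 8 parallel processes gives a maximum throughput of
+                    # 1000 * 8 / 2.0 = 4000 frames_per_second.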
+ "max": round( + 1000 * throughput_multiplier / stats_dict["min"], 2 + ) + if stats_dict["min"] > 0 + else 0, + "mean": round( + 1000 * throughput_multiplier / stats_dict["mean"], 2 + ) + if stats_dict["mean"] > 0 + else 0, + "median": round( + 1000 * throughput_multiplier / stats_dict["median"], 2 + ) + if stats_dict["median"] > 0 + else 0, + "percentile_95": round( + 1000 * throughput_multiplier / stats_dict["percentile_95"], + 2, + ) + if stats_dict["percentile_95"] > 0 + else 0, + } + + # Assign in-place. + input_dict[key] = stats_dict def unflatten_process_benchmark_dict(benchmark_dict, warmup_batches): @@ -556,7 +600,7 @@ def unflatten_process_benchmark_dict(benchmark_dict, warmup_batches): # } # } # - # 4. Finally, it computes what we call as the mean values of the timings from all + # 4. Finally, it computes various stats (e.g mean, median) of the timings from all # the batches. In other words, it computes how much range X would take on an # average when it is averaged across all the batches. To do this, we again use # the information present inside benchmark.json and apply basic recursion math. @@ -638,6 +682,29 @@ def unflatten_process_benchmark_dict(benchmark_dict, warmup_batches): current_dict[parts[-1]]["gpu_time"] / batch_size, 4 ) + # Pass the total_items information to all the children of this batch + # i.e. keys which were present in inside_batch_info + # unless they already had it before (i.e. very weird case where someone + # inserted a batch in a batch with different inner batch size). + def _recurse_update_children_total_items(in_dict): + for k in list(in_dict.keys()): + if isinstance(in_dict[k], dict): + _recurse_update_children_total_items(in_dict[k]) + else: + # Add total items if not already present. + if "total_items" not in in_dict: + in_dict["total_items"] = batch_size + in_dict["cpu_time_per_item"] = round( + in_dict["cpu_time"] / batch_size, 4 + ) + in_dict["gpu_time_per_item"] = round( + in_dict["gpu_time"] / batch_size, 4 + ) + + # Apply it. + if batch_size > 0: + _recurse_update_children_total_items(current_dict[parts[-1]]) + # Maintain global counts of various batch level stats # for example, counting the total items seen at this # batch level. @@ -673,7 +740,7 @@ def unflatten_process_benchmark_dict(benchmark_dict, warmup_batches): elif path in benchmark_dict["inside_batch_info"]: # We could be inside a batch. Nothing to do here. We won't have the total_items yet - # at this level + # at this level. We will update it when we come on a parent level. pass else: # We are one or more levels outside/above the batch level. @@ -725,6 +792,14 @@ def unflatten_process_benchmark_dict(benchmark_dict, warmup_batches): for batch_level_prefix in batch_dicts: batch_dict = batch_dicts[batch_level_prefix] + batch_dict["total_items_warmup"] = ( + batch_dict["total_items"] - total_items_minus_warmup[batch_level_prefix] + ) + + batch_dict["total_items_minus_warmup"] = total_items_minus_warmup[ + batch_level_prefix + ] + batch_dict["cpu_time_minus_warmup"] = round( (batch_dict["cpu_time"] - total_warmup_cpu_time[batch_level_prefix]), 4 ) @@ -747,15 +822,11 @@ def unflatten_process_benchmark_dict(benchmark_dict, warmup_batches): 4, ) - batch_dict["total_items_minus_warmup"] = total_items_minus_warmup[ - batch_level_prefix - ] - # The processing is over. So we assign the expanded version of data into the # original benchmark dictionary. benchmark_dict["data"] = unfltten_data_dict - # Finally, process the batches to find out the mean batch timings. 
+ # Finally, process the batches to calculate various stats on the batch timings. # i.e. how much did range X took on an average across all the batches. # Again, we will not use any batches that are warm-up batches in this calculation. # For this to happen, we need to rely on the batch_info keys. Those are the @@ -772,8 +843,7 @@ def unflatten_process_benchmark_dict(benchmark_dict, warmup_batches): # during all the batches. Our division logic takes care of properly # dividing with the current count anyway. # - mean_data = {} - total_batches_used = 0 + data_stats = {} for batch_range_name in benchmark_dict["batch_info"]: batch_idx, batch_size = benchmark_dict["batch_info"][batch_range_name] # Next, we find out the batch level prefix. This is the key in which @@ -801,28 +871,26 @@ def unflatten_process_benchmark_dict(benchmark_dict, warmup_batches): and (batch_idx + 1 + warmup_batches) <= benchmark_dict["meta"]["total_batches"][batch_level_prefix] ): - # Keep on updating the mean_data dictionary. This will - # create a dictionary that is union of all the dicts of the batch level. + # Keep on updating the data_stats dictionary. This will + # create a dictionary that is union of all the dictionaries of the batch level. nested_keys = batch_range_name.split("/") - target_dict = benchmark_dict["data"] + source_dict = benchmark_dict["data"] for k in nested_keys: - target_dict = target_dict[k] + source_dict = source_dict[k] - # Need to recursively update the mean_data based on - # the target_dict. We will sum the values up. - if batch_level_prefix not in mean_data: - mean_data[batch_level_prefix] = {} + # Need to recursively update the data_stats based on + # the source_dict. We will sum the values up. + if batch_level_prefix not in data_stats: + data_stats[batch_level_prefix] = {} - recurse_sum_dict(target_dict, mean_data[batch_level_prefix]) - total_batches_used += 1 + recurse_gather_dict(source_dict, target_dict=data_stats[batch_level_prefix]) - # Once all the sums are calculated, we need to divide by the length to figure + # Once all the numbers are gathered, we need to divide by the length to figure # out the mean values. - recurse_divide_dict(mean_data) - benchmark_dict["mean_data"] = mean_data + recurse_calc_stats_dict(data_stats) + benchmark_dict["data_stats_minus_warmup"] = data_stats # Remove the batch_info and inside_batch_info keys as they are no longer needed. - del benchmark_dict["batch_info"] del benchmark_dict["inside_batch_info"] @@ -862,7 +930,7 @@ def benchmark_script( # Setup the command that will launch nsys and ask it to benchmark the script # that we were interested in. - nsys_root_path = "/opt/nvidia/nsight-systems/2023.2.1/" + nsys_root_path = "/opt/nvidia/nsight-systems/2024.2.1/" nsys_binary_path = os.path.join(nsys_root_path, "bin/nsys") nsys_reports_path = os.path.join(nsys_root_path, "target-linux-x64/reports") nsys_gpu_proj_trace_report_path = os.path.join( @@ -874,7 +942,7 @@ def benchmark_script( if not os.path.isfile(nsys_binary_path): raise ValueError( - "Unable to locate nsys binary at %s. Make sure you have nsight-systems 2023.2.1 installed." + "Unable to locate nsys binary at %s. Make sure you have nsight-systems 2024.2.1 installed." % nsys_binary_path ) @@ -953,9 +1021,6 @@ def benchmark_script( all_range_info = merge_cpu_and_gpu_ranges(cpu_range_info, gpu_range_info) # Calculate averages across processes/threads mean_ranges_info = calc_mean_ranges(all_range_info) - # and save it. 
- with open(os.path.join(output_dir, "perf_report_nvtx_mean.json"), "w") as f: - f.write(json.dumps(mean_ranges_info, indent=4)) # Final step is to pull the data from the all_range_info we generated above and fill # it in the benchmark_dict. @@ -974,14 +1039,98 @@ def benchmark_script( # Write the updated benchmark dictionary. with open(benchmark_json_path, "w") as f: - f.write(json.dumps(benchmark_dict, indent=4)) + f.write(json.dumps(benchmark_dict, indent=4, cls=NumpyValuesEncoder)) + + # Delete the temporary files. + os.remove(os.path.join(output_dir, "perf_report_nvtx_pushpop_trace.json")) + os.remove(os.path.join(output_dir, "perf_report_nvtx_gpu_proj_trace.json")) + os.remove(os.path.join(output_dir, "perf_report.sqlite")) return 0, output_dir +def monitor_gpu_metrics(list_of_device_ids, terminate_event, gpu_metrics_info): + """ + Monitors various GPU metrics directly from NVIDIA-smi. These metrics are not accessible + from NSYS as of now. The monitoring stays on until an event is received. + :param list_of_device_ids: A list of string values of the GPU-ids that are being used in this + benchmark run. + :param terminate_event: An event that can mark an end of the monitoring process. + :param gpu_metrics_info: A multiprocessing share dictionary to store the results of monitoring. + """ + # Initialize the gpu_metrics_info dictionary for the first time. We will save the + # following pieces of information per GPU. + # 1) The total GPU power drawn in Watts + # 2) The GPU utilization in %. + + # We will work in a local dictionary first. Only when we are done that we would + # transfer its contents to the mp managed dictionary. Because otherwise the mp + # managed dictionary has no way of knowing when a nested key-value changes and it + # won't update/save it. + gpu_metrics_info_local = {"power.draw.watts": {}, "utilization.gpu": {}} + for device_id in list_of_device_ids: + gpu_metrics_info_local["power.draw.watts"]["GPU: %s" % device_id] = [] + gpu_metrics_info_local["utilization.gpu"]["GPU: %s" % device_id] = [] + + # Begin the monitoring loop. Continue till we are asked to stopped by the event. + while not terminate_event.is_set(): + # Use nvidia-smi to get power draw and GPU utilization numbers for all GPUs. + proc_ret = subprocess.run( + [ + "nvidia-smi", + "-i=%s" % ",".join(list_of_device_ids), + "--query-gpu=power.draw,utilization.gpu", + "--format=csv,nounits,noheader", + ], + stdout=subprocess.PIPE, + ) + if proc_ret.returncode == 0: + outputs = proc_ret.stdout.decode().strip().split("\n") + for idx, device_id in enumerate(list_of_device_ids): + power_draw, gpu_util = outputs[idx].split(",") + power_draw = float(power_draw) + gpu_util = float(gpu_util) + + gpu_metrics_info_local["power.draw.watts"][ + "GPU: %s" % device_id + ].append(power_draw) + gpu_metrics_info_local["utilization.gpu"]["GPU: %s" % device_id].append( + gpu_util + ) + else: + for device_id in list_of_device_ids: + gpu_metrics_info_local["power.draw.watts"][ + "GPU: %s" % device_id + ].append(0.0) + gpu_metrics_info_local["utilization.gpu"]["GPU: %s" % device_id].append( + 0.0 + ) + + # Sleep a bit + time.sleep(0.5) # 500 milliseconds + + # Update the mp managed dictionary + gpu_metrics_info.update(gpu_metrics_info_local) + + +def plot_gpu_metrics(gpu_metrics_info, output_dir): + """ + Plots GPU metrics as a matplotlib plot. + """ + for metric_name in gpu_metrics_info: + # Create pandas data frame. 
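+        # Each key under this metric names one GPU (e.g. "GPU: 0"), so the frame has
+        # one column per GPU and one row per sample (monitor_gpu_metrics records a
+        # sample roughly every 0.5 seconds).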
+ df = pd.DataFrame(gpu_metrics_info[metric_name]) + ax = df.plot(title=metric_name) + ax.set_xlabel("Execution time") + ax.set_ylabel(metric_name) + fig = ax.get_figure() + fig.savefig(os.path.join(output_dir, "plot.%s.jpg" % metric_name)) + plt.close(fig) + + def main(): parser = argparse.ArgumentParser( - "Performance benchmarking script for CV-CUDA samples.", + "Performance benchmarking script for CV-CUDA.", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) @@ -1072,17 +1221,38 @@ def main(): ) # Maximize the clocks. - if args.maximize_clocks: - ( - did_maximize_clocks, - was_persistence_mode_on, - current_power_limit, - ) = maximize_clocks(logger) - - # Start multiple processes, per num_processes per num_gpus. + clocks_info = [] + all_device_ids = [] + for gpu_idx in range(args.num_gpus): + device_id = args.gpu_offset_id + gpu_idx + + all_device_ids.append(str(device_id)) + if args.maximize_clocks: + ( + did_maximize_clocks, + was_persistence_mode_on, + current_power_limit, + ) = maximize_clocks(logger, device_id) + clocks_info.append((was_persistence_mode_on, current_power_limit)) + + # We will start multiple processes, per num_processes per num_gpus in Pool to run the benchmarks. pool = mp.Pool() + # Create an event to signal other processes (e.g. monitor_gpu_metrics to stop when main pool has stopped) + pool_terminate_event = mp.Event() + # Create a shared dict to retrieve the results of monitor_gpu_metrics + mp_manager = mp.Manager() + gpu_metrics_info = mp_manager.dict() + # Finally allocate a list to store the Pool's process's results. results = [] + # Begin by starting one process to keep on monitoring various GPU metrics that are not available via NSYS. + gpu_metric_monitor_proc = mp.Process( + target=monitor_gpu_metrics, + args=(all_device_ids, pool_terminate_event, gpu_metrics_info), + ) + gpu_metric_monitor_proc.start() + + # Then we start the multiprocessing Pool. for gpu_idx in range(args.num_gpus): for process_idx in range(args.num_processes): # Since each the output of each process needs to be stored in a different directory, @@ -1124,58 +1294,162 @@ def main(): pool.close() pool.join() + # Set the terminate event so other processes know that the Pool has finished. + pool_terminate_event.set() + # Wait for the gpu_metric_monitor process to finish. + gpu_metric_monitor_proc.join() + # Reset the clocks. if args.maximize_clocks: - reset_clocks( - logger, - was_persistence_mode_on, - current_power_limit, - ) + for gpu_idx in range(args.num_gpus): + device_id = args.gpu_offset_id + gpu_idx + + was_persistence_mode_on, current_power_limit = clocks_info[gpu_idx] + + reset_clocks( + logger, + device_id, + was_persistence_mode_on, + current_power_limit, + ) else: logger.warning("Clocks were not maximized during this run.") - # Now we need to calculate the average of all the perf-numbers. - # This means average across all the processes that we had launched. - # This can only be done if all processes finished without error. - # So we will check that first and if that is the case, we will - # read their benchmark.json data in a list to later calculate - # the average. - all_benchmark_data_dicts = [] - all_benchmark_mean_batch_dicts = [] + # We must create a copy of gpu_metrics_info to detach it from multiprocessing. + gpu_metrics_info = gpu_metrics_info.copy() + + # Plot the GPU metrics. 
+ plot_gpu_metrics(gpu_metrics_info, args.output_dir) + + # Now we need to : + # 1) Write the gpu_metrics_info in to the benchmark.json files stored per + # process and + # 2) Calculate various stats at the all processes level. + # e.g. If we ran 1 or more processes, there will be a benchmark_mean.json + # created in the output root folder with mean and other stats computed from + # all benchmark.json files of all the processes. + # This can only be done if all processes finished without error. + # So we will check that first and if that is the case, we will + # read their benchmark.json data in a list to later calculate various stats. + all_data_dicts = [] for r in results: # Grab the return result from the pool. proc_ret_code, proc_output_dir = r.get() if proc_ret_code: # Any non-zero return code mean the process failed. raise Exception( - "Failed to execute process: %d on gpu: %d" % (process_idx, gpu_idx) + "Process: %d on gpu: %d exited with a non-zero return code: %d" + % (process_idx, gpu_idx, proc_ret_code) ) else: # Zero return code means success. Read the benchmark.json. with open(os.path.join(proc_output_dir, "benchmark.json"), "r") as f: benchmark_dict = json.loads(f.read()) - # Append to our list of data dict and mean_batch data dict - all_benchmark_data_dicts.append(benchmark_dict["data"]) - all_benchmark_mean_batch_dicts.append(benchmark_dict["mean_data"]) + # Update this benchmark dict with GPU metrics for this GPU id. + for metric_name in gpu_metrics_info: + device_id_of_this_proc = benchmark_dict["meta"]["device"]["id"] + benchmark_dict["gpu_metrics"][metric_name] = gpu_metrics_info[ + metric_name + ]["GPU: %d" % device_id_of_this_proc] + + with open(os.path.join(proc_output_dir, "benchmark.json"), "w") as f: + f.write(json.dumps(benchmark_dict, indent=4, cls=NumpyValuesEncoder)) + + # Append to our list of data dict. + all_data_dicts.append(benchmark_dict["data"]) + + # 1) Compute mean of the data field from all processes... + data_mean_all_procs = {} + # First recursively collect all values from all the data dictionaries of all processes. + for data_dict in all_data_dicts: + recurse_gather_dict(data_dict, data_mean_all_procs) + # And then compute just the mean over this. + recurse_calc_stats_dict(data_mean_all_procs, compute_mean_only=True) + + # 2) Compute various stats of the data_stats_minus_warmup field from all processes... + # Now compute all the stats (such as mean, median etc) for all processes from all numbers. + # NOTE: We have already computed these stats per process in the benchmark.json's + # data_stats_minus_warmup field. This time, we want to do it over all the processes. Instead + # of taking mean of those numbers, we will calculate the freshly, combining all data points. + # This results in much accurate statistics. + # We will use last process's benchmark_dict to use query some important fields such as + # batch_info and total_batches etc. This assumes that all processes ran the same code. + data_stats_all_procs = {} + for batch_range_name in benchmark_dict["batch_info"]: + batch_idx, batch_size = benchmark_dict["batch_info"][batch_range_name] + # Next, we find out the batch level prefix. This is the key in which + # the batches are nested. One profiling session can have multiple levels + # at which batches may be used. + # e.g. + # program_X: + # method_A: + # batch_1 + # batch_2 + # method_B: + # batch_1 + # batch_2 + # + # We need to find mean at these two levels (i.e. method_A and method_B) + # in this case. 
+ # programA/method_A and program_A/method_B are the batch level prefix here. + # We can easily get those by using the dirname method since those are like + # the directory names in a path. + batch_level_prefix = os.path.dirname(batch_range_name) + + if ( + batch_size > 0 + and batch_idx + 1 > args.warmup_batches + and (batch_idx + 1 + args.warmup_batches) + <= benchmark_dict["meta"]["total_batches"][batch_level_prefix] + ): + # Keep on updating the data_stats dictionary. This will create a dictionary + # that is union of all the dictionaries at the batch level for all processes. + nested_keys = batch_range_name.split("/") + + for data_dict in all_data_dicts: + source_dict = data_dict - mean_all_batch_data = {} - mean_data = {} + # Go deep down the nested key path from the root. + for k in nested_keys: + source_dict = source_dict[k] - # First recursively sum up all values from all dictionaries. - for data_dict in all_benchmark_data_dicts: - recurse_sum_dict(data_dict, mean_all_batch_data) - # And then divide by the length to get the mean values. - recurse_divide_dict(mean_all_batch_data) + # Need to recursively update the data_stats_all_procs based on + # the source_dict. We will sum the values up. + if batch_level_prefix not in data_stats_all_procs: + data_stats_all_procs[batch_level_prefix] = {} - for mean_batch_dict in all_benchmark_mean_batch_dicts: - recurse_sum_dict(mean_batch_dict, mean_data) - # And then divide by the length to get the mean values. - recurse_divide_dict(mean_data) + recurse_gather_dict( + source_dict, target_dict=data_stats_all_procs[batch_level_prefix] + ) + + # Once all the data points are gathered, we need to divide by the length to figure + # out the mean values. + recurse_calc_stats_dict( + data_stats_all_procs, + compute_throughput=True, + throughput_multiplier=args.num_gpus * args.num_processes, + ) + + # 3). Compute stats of of all GPU metrics for all GPUs involved. + gpu_metrics_all_procs = {} + for metric_name in gpu_metrics_info: + gpu_metrics_all_procs[metric_name] = [] + for device_id in gpu_metrics_info[metric_name]: + # Gather all + gpu_metrics_all_procs[metric_name].extend( + gpu_metrics_info[metric_name][device_id] + ) + + # Compute stats. + recurse_calc_stats_dict( + gpu_metrics_all_procs, + ) mean_benchmark_data = { - "mean_all_batches": mean_all_batch_data, - "mean_data": mean_data, + "data_mean_all_procs": data_mean_all_procs, + "data_stats_minus_warmup_all_procs": data_stats_all_procs, + "gpu_metrics_all_procs": gpu_metrics_all_procs, "meta": {"args": {}}, } for arg in vars(args): @@ -1184,7 +1458,7 @@ def main(): # Write it in a file. mean_benchmark_json_path = os.path.join(args.output_dir, "benchmark_mean.json") with open(mean_benchmark_json_path, "w") as f: - f.write(json.dumps(mean_benchmark_data, indent=4)) + f.write(json.dumps(mean_benchmark_data, indent=4, cls=NumpyValuesEncoder)) logger.info( "Benchmarking completed successfully. 
Results saved at: %s" % mean_benchmark_json_path diff --git a/samples/scripts/benchmark_samples.sh b/samples/scripts/benchmark_samples.sh index 7b97c3f78..669db058e 100755 --- a/samples/scripts/benchmark_samples.sh +++ b/samples/scripts/benchmark_samples.sh @@ -31,7 +31,7 @@ if [ "$#" -lt 1 ]; then exit 1 fi -SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) SAMPLES_ROOT="$(dirname "$SCRIPT_DIR")" # removes the scripts dir OUTPUT_DIR="$1" USE_TRT=${2:-True} diff --git a/samples/scripts/build_samples.sh b/samples/scripts/build_samples.sh index 59b4b4986..b54d07ff3 100755 --- a/samples/scripts/build_samples.sh +++ b/samples/scripts/build_samples.sh @@ -18,10 +18,11 @@ # Builds samples # Usage: build_samples.sh [build folder] +SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" build_dir="build" mkdir -p $build_dir cd $build_dir -cmake .. && make +cmake "$SCRIPT_DIR/.." && make diff --git a/samples/scripts/install_dependencies.sh b/samples/scripts/install_dependencies.sh index cd2e6fb72..236f5753c 100755 --- a/samples/scripts/install_dependencies.sh +++ b/samples/scripts/install_dependencies.sh @@ -1,6 +1,6 @@ #!/bin/bash -e -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -99,7 +99,7 @@ chmod a+x /usr/local/bin/tao-converter # Install NVIDIA NSIGHT 2023.2.1 cd /tmp -wget https://developer.download.nvidia.com/devtools/nsight-systems/nsight-systems-2023.2.1_2023.2.1.122-1_amd64.deb +wget https://developer.download.nvidia.com/devtools/nsight-systems/nsight-systems-2024.2.1_2024.2.1.106-1_amd64.deb apt-get update && apt-get install -y \ libsm6 \ libxrender1 \ @@ -107,7 +107,7 @@ apt-get update && apt-get install -y \ libxext6 \ libx11-dev \ libxkbfile-dev \ - /tmp/nsight-systems-2023.2.1_2023.2.1.122-1_amd64.deb \ + /tmp/nsight-systems-2024.2.1_2024.2.1.106-1_amd64.deb \ && rm -rf /var/lib/apt/lists/* echo "export PATH=$PATH:/opt/tensorrt/bin" >> ~/.bashrc diff --git a/samples/scripts/run_samples.sh b/samples/scripts/run_samples.sh index 7a8fc3025..40fae3cb9 100755 --- a/samples/scripts/run_samples.sh +++ b/samples/scripts/run_samples.sh @@ -23,7 +23,7 @@ set -e export CUDA_MODULE_LOADING="LAZY" -SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" +SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" SAMPLES_DIR="$(dirname "$SCRIPT_DIR")" CLASSIFICATION_OUT_DIR=/tmp/classification SEGMENTATION_OUT_DIR="/tmp/segmentation" @@ -79,9 +79,9 @@ python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/videos # Run the classification C++ sample. Since the Python sample was already run, we can reuse the TensorRT model # and the labels file generated by it. 
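# Note: the engine and labels paths below now point into "$CLASSIFICATION_OUT_DIR/1/",
# matching where the Python classification run above writes its artifacts.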
# Batch size 1 -LD_LIBRARY_PATH=$SAMPLES_DIR/lib $SAMPLES_DIR/build/classification/cvcuda_sample_classification -e /tmp/classification/model.1.224.224.trtmodel -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -l /tmp/classification/labels.txt -b 1 +LD_LIBRARY_PATH=$SAMPLES_DIR/lib $SAMPLES_DIR/build/classification/cvcuda_sample_classification -e "$CLASSIFICATION_OUT_DIR/1/model.4.224.224.trtmodel" -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -l "$CLASSIFICATION_OUT_DIR/1/labels.txt" -b 1 # Batch size 2 -LD_LIBRARY_PATH=$SAMPLES_DIR/lib $SAMPLES_DIR/build/classification/cvcuda_sample_classification -e /tmp/classification/model.2.224.224.trtmodel -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -l /tmp/classification/labels.txt -b 2 +LD_LIBRARY_PATH=$SAMPLES_DIR/lib $SAMPLES_DIR/build/classification/cvcuda_sample_classification -e "$CLASSIFICATION_OUT_DIR/1/model.4.224.224.trtmodel" -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -l "$CLASSIFICATION_OUT_DIR/1/labels.txt" -b 2 # Run the segmentation Python sample with default settings, without any command-line args. rm -rf "$SEGMENTATION_OUT_DIR" diff --git a/src/cvcuda/CMakeLists.txt b/src/cvcuda/CMakeLists.txt index 202caf756..b2b3b1245 100644 --- a/src/cvcuda/CMakeLists.txt +++ b/src/cvcuda/CMakeLists.txt @@ -69,6 +69,7 @@ set(CV_CUDA_OP_FILES OpPairwiseMatcher.cpp OpFindHomography.cpp OpStack.cpp + OpResizeCropConvertReformat.cpp ) # filter only one that matches the patern (case insensitive), should be set on the global level diff --git a/src/cvcuda/OpResizeCropConvertReformat.cpp b/src/cvcuda/OpResizeCropConvertReformat.cpp new file mode 100644 index 000000000..2db154d6f --- /dev/null +++ b/src/cvcuda/OpResizeCropConvertReformat.cpp @@ -0,0 +1,71 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "priv/OpResizeCropConvertReformat.hpp" + +#include "priv/SymbolVersioning.hpp" + +#include +#include +#include +#include + +namespace priv = cvcuda::priv; + +CVCUDA_DEFINE_API(0, 8, NVCVStatus, cvcudaResizeCropConvertReformatCreate, (NVCVOperatorHandle * handle)) +{ + return nvcv::ProtectCall( + [&] + { + if (handle == nullptr) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Pointer to NVCVOperator handle must not be NULL"); + } + + *handle = reinterpret_cast(new priv::ResizeCropConvertReformat()); + }); +} + +CVCUDA_DEFINE_API(0, 8, NVCVStatus, cvcudaResizeCropConvertReformatSubmit, + (NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle in, NVCVTensorHandle out, + const NVCVSize2D resizeDim, const NVCVInterpolationType interpolation, const int2 cropPos, + const NVCVChannelManip manip)) +{ + return nvcv::ProtectCall( + [&] + { + nvcv::TensorWrapHandle input(in), output(out); + priv::ToDynamicRef(handle)(stream, input, output, resizeDim, interpolation, + cropPos, manip); + }); +} + +CVCUDA_DEFINE_API(0, 8, NVCVStatus, cvcudaResizeCropConvertReformatVarShapeSubmit, + (NVCVOperatorHandle handle, cudaStream_t stream, NVCVImageBatchHandle in, NVCVTensorHandle out, + const NVCVSize2D resizeDim, const NVCVInterpolationType interpolation, const int2 cropPos, + const NVCVChannelManip manip)) +{ + return nvcv::ProtectCall( + [&] + { + nvcv::ImageBatchVarShapeWrapHandle input(in); + nvcv::TensorWrapHandle output(out); + priv::ToDynamicRef(handle)(stream, input, output, resizeDim, interpolation, + cropPos, manip); + }); +} diff --git a/src/cvcuda/include/cvcuda/OpCropFlipNormalizeReformat.h b/src/cvcuda/include/cvcuda/OpCropFlipNormalizeReformat.h index 7366c0c6d..80d2ee002 100644 --- a/src/cvcuda/include/cvcuda/OpCropFlipNormalizeReformat.h +++ b/src/cvcuda/include/cvcuda/OpCropFlipNormalizeReformat.h @@ -105,7 +105,7 @@ CVCUDA_PUBLIC NVCVStatus cvcudaCropFlipNormalizeReformatCreate(NVCVOperatorHandl * 64bit Float | No * * Output: - * Data Layout: [kNHWC, kHWC, kNCHW, KCHW] + * Data Layout: [kNHWC, kNCHW] * Channels: [1, 3, 4] * * Data Type | Allowed @@ -145,10 +145,9 @@ CVCUDA_PUBLIC NVCVStatus cvcudaCropFlipNormalizeReformatCreate(NVCVOperatorHandl * * @param [in] borderValue Border value to be used for constant border mode \p NVCV_BORDER_CONSTANT. * - * @param [in] flipCode a tensor flag to specify how to flip the array; 0 means flipping - * around the x-axis and positive value (for example, 1) means flipping - * around y-axis. Negative value (for example, -1) means flipping around - * both axes. + * @param [in] flipCode a tensor flag to specify how to flip the array; 0 means flipping around the x-axis, + * 1 means flipping around the y-axis, -1 means flipping around both axes, and any other value will result + * in no flip. * * @param [in] base Tensor providing base values for normalization. * diff --git a/src/cvcuda/include/cvcuda/OpResizeCropConvertReformat.h b/src/cvcuda/include/cvcuda/OpResizeCropConvertReformat.h new file mode 100644 index 000000000..041ff8003 --- /dev/null +++ b/src/cvcuda/include/cvcuda/OpResizeCropConvertReformat.h @@ -0,0 +1,176 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file OpResizeCropConvertReformat.h + * + * @brief Defines functions that fuses resize, crop, data type conversion, channel manipulation, and layout reformat operations to optimize pipelines. + * @defgroup NVCV_C_ALGORITHM__RESIZE_CROP Resize Crop + * @{ + */ + +#ifndef CVCUDA__RESIZE_CROP_H +#define CVCUDA__RESIZE_CROP_H + +#include "Operator.h" +#include "Types.h" +#include "detail/Export.h" + +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** Constructs and an instance of the ResizeCropConvertReformat operator. + * + * @param [out] handle Where the image instance handle will be written to. + * + Must not be NULL. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Handle is null. + * @retval #NVCV_ERROR_OUT_OF_MEMORY Not enough memory to create the operator. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +CVCUDA_PUBLIC NVCVStatus cvcudaResizeCropConvertReformatCreate(NVCVOperatorHandle *handle); + +/** Executes the ResizeCropConvertReformat operation on the given cuda stream. This operation + * does not wait for completion. + * + * ResizeCropConvertReformat performs the following operations in order: + * 1) Resize either a single tensor or each image in an ImageBatchVarShape + * to a specified width and height (other dimensions are unchanged). + * 2) Crops a specified region of size width x height (determined by the + * output tensor's width & height) starting at the pixel position + * (cropPos.x, cropPos.y) out of the resized tensor. + * 3) Convert the element data type to the output tensor's data type. For + * example, convert uchar elements to float. Limited options availble. + * 4) Optional channel manipulation--i.e., re-order the channels + * of a tensor (e.g., RGB to BGR). Limited options available. + * 5) If output tensor's layout doesn't match the input's layout, reshape + * the layout to match output layout (e.g., NHWC to NCHW). Limited + * options available. + * NOTE: Since all images in an ImageBatchVarShape are resized to the + * same size, the resulting collection now fits in a single tensor. 
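+ *
+ * For example (illustrative): resizing a batch of 8 RGB HWC uchar images to
+ * 256 x 256, cropping a 224 x 224 region at position (16, 16), converting to
+ * float, and reformatting to NCHW produces a single 8 x 3 x 224 x 224 float
+ * output tensor.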
+ *
+ *  Limitations:
+ *
+ *  Input:
+ *       Data Layout:    [NVCV_TENSOR_HWC, NVCV_TENSOR_NHWC]
+ *       Channels:       [1, 3]
+ *
+ *       Data Type      | Allowed
+ *       -------------- | -------------
+ *       8bit  Unsigned | Yes
+ *       8bit  Signed   | No
+ *       16bit Unsigned | No
+ *       16bit Signed   | No
+ *       32bit Unsigned | No
+ *       32bit Signed   | No
+ *       32bit Float    | No
+ *       64bit Float    | No
+ *
+ *  Output:
+ *       Data Layout:    [NVCV_TENSOR_NHWC, NVCV_TENSOR_HWC,
+ *                        NVCV_TENSOR_NCHW, NVCV_TENSOR_CHW]
+ *       Channels:       [1, 3]
+ *
+ *       Data Type      | Allowed
+ *       -------------- | -------------
+ *       8bit  Unsigned | Yes
+ *       8bit  Signed   | No
+ *       16bit Unsigned | No
+ *       16bit Signed   | No
+ *       32bit Unsigned | No
+ *       32bit Signed   | No
+ *       32bit Float    | Yes
+ *       64bit Float    | No
+ *
+ *  Input/Output dependency
+ *
+ *       Property       | Input == Output
+ *       -------------- | -------------
+ *       Data Layout    | No (Limited)
+ *       Data Type      | No (Limited)
+ *       Number         | Yes
+ *       Channels       | Yes
+ *       Width          | No
+ *       Height         | No
+ *
+ * @param [in] handle Handle to the operator.
+ *                    + Must not be NULL.
+ * @param [in] stream Handle to a valid CUDA stream.
+ *
+ * @param [in] in Input tensor or image batch. The images in an image batch can be of different
+ *                sizes, but all images must have the same data type, channels, and layout.
+ *
+ * @param [in] resizeDim Dimensions, {width, height}, to which the input is resized before
+ *                       cropping; see \ref NVCVSize2D for more details.
+ *
+ * @param [in] interpolation Interpolation method to be used, see \ref NVCVInterpolationType for
+ *                           more details. Currently, only NVCV_INTERP_NEAREST and NVCV_INTERP_LINEAR
+ *                           are available.
+ *
+ * @param [in] cropPos Crop position, (x, y), specifying the top-left corner of the crop region.
+ *                     The crop region's width and height are specified by the output tensor's
+ *                     width & height.
+ *                     @note: The crop must fall within the resized image. If (x, y, w, h)
+ *                     represents the crop rectangle, where x & y are the cropPos coordinates
+ *                     and w & h are the output tensor's width and height, then the following
+ *                     must all be true:
+ *                        x >= 0
+ *                        y >= 0
+ *                        x + w <= resizeDim.w
+ *                        y + h <= resizeDim.h
+ *
+ * @param [in] manip Channel manipulation to be used (e.g., reshuffle RGB to BGR),
+ *                   see \ref NVCVChannelManip for more details.
+ *
+ * @param [out] out Output tensor. In addition to determining the crop width and height, the
+ *                  output tensor also specifies the data type (e.g., uchar3 or float) and
+ *                  tensor layout (NHWC or NCHW), with limitations.
+ *
+ * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range.
+ * @retval #NVCV_ERROR_INTERNAL         Internal error in the operator, invalid types passed in.
+ * @retval #NVCV_SUCCESS                Operation executed successfully.
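+ *
+ *  Example (a minimal sketch; stream and tensor creation, as well as error checking, are
+ *  assumed to be handled elsewhere, and the handle names shown are hypothetical):
+ *  @code
+ *      NVCVOperatorHandle op;
+ *      cvcudaResizeCropConvertReformatCreate(&op);
+ *
+ *      // Resize to 256x256, then crop a region whose size is taken from 'dstTensor'
+ *      // (e.g., 224x224) starting at (16, 16), reversing the channel order.
+ *      NVCVSize2D resizeDim = {256, 256};
+ *      int2       cropPos   = {16, 16};
+ *      cvcudaResizeCropConvertReformatSubmit(op, stream, srcTensor, dstTensor, resizeDim,
+ *                                            NVCV_INTERP_LINEAR, cropPos, NVCV_CHANNEL_REVERSE);
+ *  @endcode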
+ */
+/** @{ */
+CVCUDA_PUBLIC NVCVStatus cvcudaResizeCropConvertReformatSubmit(NVCVOperatorHandle handle, cudaStream_t stream,
+                                                               NVCVTensorHandle in, NVCVTensorHandle out,
+                                                               const NVCVSize2D resizeDim,
+                                                               const NVCVInterpolationType interpolation,
+                                                               const int2 cropPos, const NVCVChannelManip manip);
+
+CVCUDA_PUBLIC NVCVStatus cvcudaResizeCropConvertReformatVarShapeSubmit(NVCVOperatorHandle handle, cudaStream_t stream,
+                                                                       NVCVImageBatchHandle in, NVCVTensorHandle out,
+                                                                       const NVCVSize2D resizeDim,
+                                                                       const NVCVInterpolationType interpolation,
+                                                                       const int2 cropPos,
+                                                                       const NVCVChannelManip manip);
+/** @} */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CVCUDA__RESIZE_CROP_H */
diff --git a/src/cvcuda/include/cvcuda/OpResizeCropConvertReformat.hpp b/src/cvcuda/include/cvcuda/OpResizeCropConvertReformat.hpp
new file mode 100644
index 000000000..1e7fb143f
--- /dev/null
+++ b/src/cvcuda/include/cvcuda/OpResizeCropConvertReformat.hpp
@@ -0,0 +1,97 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file OpResizeCropConvertReformat.hpp
+ *
+ * @brief Defines the public C++ class that fuses resize, crop, data type conversion, channel manipulation, and layout reformat operations to optimize pipelines.
+ * @defgroup NVCV_CPP_ALGORITHM__RESIZE_CROP ResizeCropConvertReformat + * @{ + */ + +#ifndef CVCUDA__RESIZE_CROP_HPP +#define CVCUDA__RESIZE_CROP_HPP + +#include "IOperator.hpp" +#include "OpResizeCropConvertReformat.h" + +#include +#include +#include +#include +#include + +namespace cvcuda { + +class ResizeCropConvertReformat final : public IOperator +{ +public: + explicit ResizeCropConvertReformat(); + + ~ResizeCropConvertReformat(); + + void operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, const NVCVSize2D resizeDim, + const NVCVInterpolationType interpolation, const int2 cropPos, + const NVCVChannelManip manip = NVCV_CHANNEL_NO_OP); + + void operator()(cudaStream_t stream, const nvcv::ImageBatchVarShape &in, const nvcv::Tensor &out, + const NVCVSize2D resizeDim, const NVCVInterpolationType interpolation, const int2 cropPos, + const NVCVChannelManip manip = NVCV_CHANNEL_NO_OP); + + virtual NVCVOperatorHandle handle() const noexcept override; + +private: + NVCVOperatorHandle m_handle; +}; + +inline ResizeCropConvertReformat::ResizeCropConvertReformat() +{ + nvcv::detail::CheckThrow(cvcudaResizeCropConvertReformatCreate(&m_handle)); + assert(m_handle); +} + +inline ResizeCropConvertReformat::~ResizeCropConvertReformat() +{ + nvcvOperatorDestroy(m_handle); + m_handle = nullptr; +} + +inline void ResizeCropConvertReformat::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, + const NVCVSize2D resizeDim, const NVCVInterpolationType interpolation, + const int2 cropPos, const NVCVChannelManip manip) +{ + nvcv::detail::CheckThrow(cvcudaResizeCropConvertReformatSubmit(m_handle, stream, in.handle(), out.handle(), + resizeDim, interpolation, cropPos, manip)); +} + +inline void ResizeCropConvertReformat::operator()(cudaStream_t stream, const nvcv::ImageBatchVarShape &in, + const nvcv::Tensor &out, const NVCVSize2D resizeDim, + const NVCVInterpolationType interpolation, const int2 cropPos, + const NVCVChannelManip manip) +{ + nvcv::detail::CheckThrow(cvcudaResizeCropConvertReformatVarShapeSubmit(m_handle, stream, in.handle(), out.handle(), + resizeDim, interpolation, cropPos, manip)); +} + +inline NVCVOperatorHandle ResizeCropConvertReformat::handle() const noexcept +{ + return m_handle; +} + +} // namespace cvcuda + +#endif // CVCUDA__RESIZE_CROP_HPP diff --git a/src/cvcuda/include/cvcuda/Types.h b/src/cvcuda/include/cvcuda/Types.h index 8dc5131f9..d222b38eb 100644 --- a/src/cvcuda/include/cvcuda/Types.h +++ b/src/cvcuda/include/cvcuda/Types.h @@ -58,6 +58,16 @@ typedef enum NVCV_CLOSE = 3, } NVCVMorphologyType; +// clang-format off +// @brief Select how channel data is manipulated--both interleaved (NHWC) and planar (NCHW) +typedef enum +{ + NVCV_CHANNEL_NO_OP = 0, //!< do not manipulate channel data + NVCV_CHANNEL_REVERSE = 1, //!< reverse channel order (e.g., RGB to BGR, BGR to RGB, RGBA to ABGR, etc.) 
+} NVCVChannelManip; + +// clang-format on + // @brief Flag to choose the color conversion to be used typedef enum { diff --git a/src/cvcuda/priv/CMakeLists.txt b/src/cvcuda/priv/CMakeLists.txt index fa0e8c390..85683e409 100644 --- a/src/cvcuda/priv/CMakeLists.txt +++ b/src/cvcuda/priv/CMakeLists.txt @@ -68,6 +68,7 @@ set(CV_CUDA_PRIV_OP_FILES OpPairwiseMatcher.cu OpStack.cpp OpFindHomography.cu + OpResizeCropConvertReformat.cu ) # filter only one that matches the patern (case insensitive), should be set on the global level diff --git a/src/cvcuda/priv/OpResizeCropConvertReformat.cu b/src/cvcuda/priv/OpResizeCropConvertReformat.cu new file mode 100644 index 000000000..624624284 --- /dev/null +++ b/src/cvcuda/priv/OpResizeCropConvertReformat.cu @@ -0,0 +1,584 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "OpResizeCropConvertReformat.hpp" +#include "legacy/CvCudaLegacy.h" +#include "legacy/CvCudaLegacyHelpers.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuda = nvcv::cuda; +namespace util = nvcv::util; + +namespace cuda_op = nvcv::legacy::cuda_op; +namespace helpers = nvcv::legacy::helpers; + +namespace { + +//******************** NN = Nearest Neighbor (TensorWrap src) + +template +__global__ void resizeCrop_NN(SrcWrapper src, DstT *dst, const int src_w, const int src_h, const int dst_w, + const int dst_h, const float scale_x, const float scale_y, const int crop_x, + const int crop_y, const size_t incrN, const size_t incrH, const size_t incrW, + const size_t incrC, const uchar4 mapC) +{ + const int dst_x = blockIdx.x * blockDim.x + threadIdx.x; + const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; + + if ((dst_x < dst_w) && (dst_y < dst_h)) + { // Generic copy pixel to pixel. + const int sample = blockIdx.z; + + dst += sample * incrN + dst_y * incrH + dst_x * incrW; + + const int sx = cuda::min(cuda::round((dst_x + crop_x) * scale_x), src_w - 1); + const int sy = cuda::min(cuda::round((dst_y + crop_y) * scale_y), src_h - 1); + + SrcT v = *src.ptr(sample, sy, sx); + + // Channel manipulation, convert type, and reformat. 
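+        // The three writes below handle both interleaved and planar outputs:
+        //  - interleaved (NHWC/HWC): incrW == channels and incrC == 1, so the channels land
+        //    next to each other within the destination pixel;
+        //  - planar (NCHW/CHW): incrW == 1 and incrC == plane stride (in elements), so each
+        //    channel lands in its own plane.
+        // mapC permutes the channel order (e.g., RGB -> BGR for NVCV_CHANNEL_REVERSE) and the
+        // source value is cast to the destination type.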
+ dst[mapC.x * incrC] = (DstT)v.x; + dst[mapC.y * incrC] = (DstT)v.y; + dst[mapC.z * incrC] = (DstT)v.z; + } +} // resizeCrop_NN + +//******************** Bilinear (TensorWrap src) + +template +__global__ void resizeCrop_bilinear(SrcWrapper src, DstT *dst, const int src_w, const int src_h, const int dst_w, + const int dst_h, const float scale_x, const float scale_y, const int crop_x, + const int crop_y, const size_t incrN, const size_t incrH, const size_t incrW, + const size_t incrC, const uchar4 mapC) +{ + const int dst_x = blockIdx.x * blockDim.x + threadIdx.x; + const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; + + if (dst_x < dst_w && dst_y < dst_h) + { + const int sample = blockIdx.z; + + // Float space for weighted addition. + // Compute y coordinate. + float fy = (float)((dst_y + crop_y + 0.5f) * scale_y - 0.5f); + int sy = cuda::round(fy); + fy -= sy; + sy = cuda::max(0, cuda::min(sy, src_h - 2)); + + // Row pointers. + const SrcT *aPtr = src.ptr(sample, sy, 0); // Start of upper row. + const SrcT *bPtr = src.ptr(sample, sy + 1, 0); // Start of lower row. + + dst += sample * incrN + dst_y * incrH + dst_x * incrW; + + { // Compute source data position and weight for [x0] components. + float fx = (float)((dst_x + crop_x + 0.5f) * scale_x - 0.5f); + int sx = cuda::round(fx); + fx -= sx; + + fx *= ((sx >= 0) && (sx < src_w - 1)); + sx = cuda::max(0, cuda::min(sx, src_w - 2)); + + SrcT v = cuda::SaturateCast((1.0f - fx) * (aPtr[sx] * (1.0f - fy) + bPtr[sx] * fy) + + fx * (aPtr[sx + 1] * (1.0f - fy) + bPtr[sx + 1] * fy)); + // Channel manipulation, convert type, and reformat. + dst[mapC.x * incrC] = (DstT)v.x; + dst[mapC.y * incrC] = (DstT)v.y; + dst[mapC.z * incrC] = (DstT)v.z; + } + } +} // resizeCrop_bilinear + +//******************** NN = Nearest Neighbor (ImageBatchVarShape src) + +template +__global__ void resizeCrop_NN_varShape(SrcWrapper src, DstT *dst, const int dst_w, const int dst_h, + const float resize_w, const float resize_h, const int crop_x, const int crop_y, + const size_t incrN, const size_t incrH, const size_t incrW, const size_t incrC, + const uchar4 mapC) +{ + const int dst_x = blockIdx.x * blockDim.x + threadIdx.x; + const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; + + if (dst_x < dst_w && dst_y < dst_h) + { // Generic copy pixel to pixel. + const int sample = blockIdx.z; + const int src_w = src.width(sample); + const int src_h = src.height(sample); + + const float scale_x = static_cast(src_w) / resize_w; + const float scale_y = static_cast(src_h) / resize_h; + + dst += sample * incrN + dst_y * incrH + dst_x * incrW; + + const int sx = cuda::min(cuda::round((dst_x + crop_x) * scale_x), src_w - 1); + const int sy = cuda::min(cuda::round((dst_y + crop_y) * scale_y), src_h - 1); + + SrcT v = *src.ptr(sample, sy, sx); + + // Channel manipulation, convert type, and reformat. 
+ dst[mapC.x * incrC] = (DstT)v.x; + dst[mapC.y * incrC] = (DstT)v.y; + dst[mapC.z * incrC] = (DstT)v.z; + } +} // resizeCrop_NN_varShape + +//******************** Bilinear (ImageBatchVarShape src) + +template +__global__ void resizeCrop_bilinear_varShape(SrcWrapper src, DstT *dst, const int dst_w, const int dst_h, + const float resize_w, const float resize_h, const int crop_x, + const int crop_y, const size_t incrN, const size_t incrH, + const size_t incrW, const size_t incrC, const uchar4 mapC) +{ + const int dst_x = blockIdx.x * blockDim.x + threadIdx.x; + const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; + + if ((dst_x < dst_w) && (dst_y < dst_h)) + { + const int sample = blockIdx.z; + const int src_w = src.width(sample); + const int src_h = src.height(sample); + + // Float space for weighted addition. + float scale_x = static_cast(src_w) / resize_w; + float scale_y = static_cast(src_h) / resize_h; + + // Compute y coordinate. + float fy = (float)((dst_y + crop_y + 0.5f) * scale_y - 0.5f); + int sy = cuda::round(fy); + fy -= sy; + sy = cuda::max(0, cuda::min(sy, src_h - 2)); + + // Row pointers. + const SrcT *aPtr = src.ptr(sample, sy, 0); // Start of upper row. + const SrcT *bPtr = src.ptr(sample, sy + 1, 0); // Start of lower row. + + dst += sample * incrN + dst_y * incrH + dst_x * incrW; + + { // Cimpute source data position and weight for [x0] components. + float fx = (float)((dst_x + crop_x + 0.5f) * scale_x - 0.5f); + int sx = cuda::round(fx); + fx -= sx; + + fx *= ((sx >= 0) && (sx < src_w - 1)); + sx = cuda::max(0, cuda::min(sx, src_w - 2)); + + SrcT v = cuda::SaturateCast((1.0f - fx) * (aPtr[sx] * (1.0f - fy) + bPtr[sx] * fy) + + fx * (aPtr[sx + 1] * (1.0f - fy) + bPtr[sx + 1] * fy)); + // Channel manipulation, convert type, and reformat. + dst[mapC.x * incrC] = (DstT)v.x; + dst[mapC.y * incrC] = (DstT)v.y; + dst[mapC.z * incrC] = (DstT)v.z; + } + } +} // resizeCrop_bilinear_varShape + +#define MAP(m, i, v) ((uint8_t *)&(m))[i] = (v) + +inline uchar4 remapChannels(const NVCVChannelManip manip, int channels) +{ + uchar4 map = make_uchar4(0, 1, 2, 3); + + if (manip == NVCV_CHANNEL_REVERSE) + { + for (int c = 0; c < channels; ++c) MAP(map, c, channels - c - 1); + } + return map; +} + +#undef MAP + +template +void resizeCropConvertReformat(const nvcv::TensorDataStridedCuda &srcData, const nvcv::TensorDataStridedCuda &dstData, + const NVCVSize2D resizeDim, NVCVInterpolationType interpolation, const int2 cropPos, + const NVCVChannelManip manip, cudaStream_t stream) + +{ + using SrcBaseT = cuda::BaseType; + using DstBaseT = cuda::BaseType; + + auto srcAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(srcData); + NVCV_ASSERT(srcAccess); + + auto dstAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(dstData); + NVCV_ASSERT(dstAccess); + + const int samples = srcAccess->numSamples(); + const int channels = srcAccess->numChannels(); + const int src_w = srcAccess->numCols(); + const int src_h = srcAccess->numRows(); + const int dst_w = dstAccess->numCols(); + const int dst_h = dstAccess->numRows(); + + NVCV_ASSERT(samples == dstAccess->numSamples()); + NVCV_ASSERT(channels == dstAccess->numChannels()); + + float scale_x = (float)src_w / resizeDim.w; + float scale_y = (float)src_h / resizeDim.h; + + const int planes = dstAccess->numPlanes(); + + const uchar4 remap = remapChannels(manip, channels); + + const size_t incrC = (planes > 1 ? dstAccess->planeStride() / sizeof(DstBaseT) : 1); + const size_t incrW = channels / planes; // 1 if planar; channels if not. 
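+    // Destination increments are in elements (not bytes). For a hypothetical packed 224x224
+    // 3-channel output: NCHW float gives incrC = 224*224, incrW = 1, incrH = 224 and
+    // incrN = 3*224*224; NHWC uchar gives incrC = 1, incrW = 3, incrH = 3*224 and
+    // incrN = 3*224*224.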
+ const size_t incrH = dstAccess->rowStride() / sizeof(DstBaseT); + const size_t incrN = dstAccess->rowStride() * dst_h * dstAccess->numPlanes() / sizeof(DstBaseT); + + const int THREADS_PER_BLOCK = 256; //256? 64? + const int BLOCK_WIDTH = 16; //as in 32x4 or 32x8. 16x8 and 16x16 are also viable + + const dim3 blockSize(BLOCK_WIDTH, THREADS_PER_BLOCK / BLOCK_WIDTH, 1); + const dim3 gridSize(util::DivUp(dst_w, blockSize.x), util::DivUp(dst_h, blockSize.y), samples); + + auto src = cuda::CreateTensorWrapNHW(srcData); + + DstBaseT *dst = reinterpret_cast(dstData.basePtr()); + + //Note: resize is fundamentally a gather memory operation, with a little bit of compute + // our goals are to (a) maximize throughput, and (b) minimize occupancy for the same performance + + switch (interpolation) + { + case NVCV_INTERP_NEAREST: + resizeCrop_NN<<>>(src, dst, src_w, src_h, dst_w, dst_h, scale_x, scale_y, + cropPos.x, cropPos.y, incrN, incrH, incrW, incrC, remap); + break; + + case NVCV_INTERP_LINEAR: + resizeCrop_bilinear<<>>(src, dst, src_w, src_h, dst_w, dst_h, scale_x, scale_y, + cropPos.x, cropPos.y, incrN, incrH, incrW, incrC, + remap); + break; + + case NVCV_INTERP_CUBIC: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Interpolation not implemented: NVCV_INTERP_CUBIC"); + break; + + case NVCV_INTERP_AREA: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Interpolation not implemented: NVCV_INTERP_AREA"); + break; + + default: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid interpolation"); + break; + } //switch +} //resize + +template +void resizeCropConvertReformat(const nvcv::ImageBatchVarShapeDataStridedCuda &srcData, + const nvcv::TensorDataStridedCuda &dstData, const NVCVSize2D resizeDim, + const NVCVInterpolationType interpolation, const int2 cropPos, + const NVCVChannelManip manip, cudaStream_t stream) +{ + using SrcBaseT = cuda::BaseType; + using DstBaseT = cuda::BaseType; + + auto dstAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(dstData); + NVCV_ASSERT(dstAccess); + + const nvcv::ImageFormat srcFrmt = srcData.uniqueFormat(); + NVCV_ASSERT(srcFrmt); + + const int samples = srcData.numImages(); + const int channels = srcFrmt.numChannels(); + const int dst_w = dstAccess->numCols(); + const int dst_h = dstAccess->numRows(); + + NVCV_ASSERT(samples == dstAccess->numSamples()); + NVCV_ASSERT(channels == dstAccess->numChannels()); + + const int planes = dstAccess->numPlanes(); + const uchar4 remap = remapChannels(manip, channels); + + const size_t incrC = (planes > 1 ? dstAccess->planeStride() / sizeof(DstBaseT) : 1); + const size_t incrW = channels / planes; // 1 if planar; channels if not. + const size_t incrH = dstAccess->rowStride() / sizeof(DstBaseT); + const size_t incrN = dstAccess->rowStride() * dst_h * dstAccess->numPlanes() / sizeof(DstBaseT); + + const int THREADS_PER_BLOCK = 256; //Performance degrades above 256 and below 16 (GMEM speed limited) + const int BLOCK_WIDTH = 8; //as in 32x4 or 32x8 or 8x32. 
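+    // Each 2D thread block covers a BLOCK_WIDTH x (THREADS_PER_BLOCK / BLOCK_WIDTH) tile of the
+    // output and the grid's z dimension indexes the sample. For a hypothetical 224x224 output
+    // with 8x32 blocks, the grid below works out to 28 x 7 x numImages.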
+ + const dim3 blockSize(BLOCK_WIDTH, THREADS_PER_BLOCK / BLOCK_WIDTH, 1); + const dim3 gridSize(util::DivUp(dst_w, blockSize.x), util::DivUp(dst_h, blockSize.y), samples); + + cuda::ImageBatchVarShapeWrap src(srcData); + + DstBaseT *dst = reinterpret_cast(dstData.basePtr()); + + switch (interpolation) + { + case NVCV_INTERP_NEAREST: + resizeCrop_NN_varShape<<>>( + src, dst, dst_w, dst_h, resizeDim.w, resizeDim.h, cropPos.x, cropPos.y, incrN, incrH, incrW, incrC, remap); + break; + + case NVCV_INTERP_LINEAR: + resizeCrop_bilinear_varShape<<>>( + src, dst, dst_w, dst_h, resizeDim.w, resizeDim.h, cropPos.x, cropPos.y, incrN, incrH, incrW, incrC, remap); + break; + + case NVCV_INTERP_CUBIC: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Interpolation not implemented: NVCV_INTERP_CUBIC"); + break; + + case NVCV_INTERP_AREA: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Interpolation not implemented: NVCV_INTERP_AREA"); + break; + + default: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid interpolation"); + break; + + } //switch interpolation +} + +} // anonymous namespace + +// clang-format off +namespace cvcuda::priv { +ResizeCropConvertReformat::ResizeCropConvertReformat() { } + +// clang-format on + +void ResizeCropConvertReformat::operator()(cudaStream_t stream, const nvcv::Tensor &src, const nvcv::Tensor &dst, + const NVCVSize2D resizeDim, const NVCVInterpolationType interpolation, + const int2 cropPos, const NVCVChannelManip manip) const +{ + auto srcData = src.exportData(); + if (!srcData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input must be a cuda-accessible, pitch-linear tensor"); + } + + auto dstData = dst.exportData(); + if (!dstData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Output must be a cuda-accessible, pitch-linear tensor"); + } + + auto srcAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*srcData); + auto dstAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*dstData); + + const int samples = srcAccess->numSamples(); + const int channels = srcAccess->numChannels(); + + const int dst_w = dstAccess->numCols(); + const int dst_h = dstAccess->numRows(); + + if (samples != dstAccess->numSamples()) + { + std::string msg = "Input and output must have the same batch size (i.e., same number of images): Provided " + + std::to_string(samples) + " input and " + std::to_string(dstAccess->numSamples()) + + " output images / samples"; + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "%s", msg.c_str()); + } + + if (channels != dstAccess->numChannels()) + { + std::string msg = "Input and output must have same number of channels: Provided " + std::to_string(channels) + + " input and " + std::to_string(dstAccess->numChannels()) + " output channels"; + throw nvcv::Exception(nvcv::Status::ERROR_NOT_COMPATIBLE, "%s", msg.c_str()); + } + + if (channels != 3) + { + std::string msg = "Only three-channel input is currently supported: Provided " + std::to_string(channels) + + " input channels"; + throw nvcv::Exception(nvcv::Status::ERROR_NOT_COMPATIBLE, "%s", msg.c_str()); + } + + cuda_op::DataType srcType = helpers::GetLegacyDataType((*srcData).dtype()); + cuda_op::DataType dstType = helpers::GetLegacyDataType((*dstData).dtype()); + + if (srcType != cuda_op::kCV_8U) + { + throw nvcv::Exception(nvcv::Status::ERROR_NOT_COMPATIBLE, "Input must be of data type uchar."); + } + + if (dstType != cuda_op::kCV_8U && dstType != cuda_op::kCV_32F) + { + throw 
nvcv::Exception(nvcv::Status::ERROR_NOT_COMPATIBLE, "Output must be of data type uchar or float."); + } + + nvcv::TensorLayout srcLayout = srcData->layout(); + nvcv::TensorLayout dstLayout = dstData->layout(); + + if (!(srcLayout == NVCV_TENSOR_NHWC || srcLayout == NVCV_TENSOR_HWC)) + { + const char *layout = nvcvTensorLayoutGetName(&srcLayout.m_layout); + std::string msg = "Input tensor must have 'NHWC' or 'HWC' layout: Layout provided " + std::string(layout); + throw nvcv::Exception(nvcv::Status::ERROR_NOT_COMPATIBLE, "%s", msg.c_str()); + } + + if (!(dstLayout == NVCV_TENSOR_NHWC || dstLayout == NVCV_TENSOR_HWC || dstLayout == NVCV_TENSOR_NCHW + || dstLayout == NVCV_TENSOR_CHW)) + { + const char *layout = nvcvTensorLayoutGetName(&dstLayout.m_layout); + std::string msg + = "Output tensor must have 'NHWC', 'NCHW', 'HWC', or 'CHW' layout: Layout provided " + std::string(layout); + throw nvcv::Exception(nvcv::Status::ERROR_NOT_COMPATIBLE, "%s", msg.c_str()); + } + + if (cropPos.x < 0 || cropPos.y < 0 || cropPos.x + dst_w > resizeDim.w || cropPos.y + dst_h > resizeDim.h) + { + std::string msg = "Invalid crop region: crop region(x, y, w, h) = (" + std::to_string(cropPos.x) + ", " + + std::to_string(cropPos.y) + ", " + std::to_string(dst_w) + ", " + std::to_string(dst_h) + + ") extends beyond bounds of resized tensor"; + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "%s", msg.c_str()); + } + + if (srcType == cuda_op::kCV_8U) + { + if (dstType == cuda_op::kCV_8U) + { + resizeCropConvertReformat(*srcData, *dstData, resizeDim, interpolation, cropPos, manip, + stream); + } + else if (dstType == cuda_op::kCV_32F) + { + resizeCropConvertReformat(*srcData, *dstData, resizeDim, interpolation, cropPos, manip, + stream); + } + } +} + +void ResizeCropConvertReformat::operator()(cudaStream_t stream, const nvcv::ImageBatchVarShape &src, + const nvcv::Tensor &dst, const NVCVSize2D resizeDim, + const NVCVInterpolationType interpolation, const int2 cropPos, + const NVCVChannelManip manip) const +{ + auto srcData = src.exportData(stream); + if (!srcData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input data must be a cuda-accessible, varshape image batch"); + } + + auto dstData = dst.exportData(); + if (!dstData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Output must be a cuda-accessible, pitch-linear tensor"); + } + + const nvcv::ImageFormat srcFrmt = src.uniqueFormat(); + + if (!srcFrmt) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "All input images in a batch must have the same format (including number of channels)"); + } + + auto dstAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*dstData); + + const int samples = srcData->numImages(); + const int channels = srcFrmt.numChannels(); + + const int dst_w = dstAccess->numCols(); + const int dst_h = dstAccess->numRows(); + + if (samples != dstAccess->numSamples()) + { + std::string msg = "Input and output must have the same batch size (i.e., same number of images): Provided " + + std::to_string(samples) + " input and " + std::to_string(dstAccess->numChannels()) + + " output channels"; + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "%s", msg.c_str()); + } + + if (channels != dstAccess->numChannels()) + { + std::string msg = "Input and output must have same number of channels: Provided " + std::to_string(channels) + + " input and " + std::to_string(dstAccess->numChannels()) + " output channels"; + throw 
nvcv::Exception(nvcv::Status::ERROR_NOT_COMPATIBLE, "%s", msg.c_str()); + } + + if (channels != 3) + { + std::string msg = "Only three-channel input is currently supported: Provided " + std::to_string(channels) + + " input channels"; + throw nvcv::Exception(nvcv::Status::ERROR_NOT_COMPATIBLE, "%s", msg.c_str()); + } + + cuda_op::DataType srcType = helpers::GetLegacyDataType(srcFrmt); + cuda_op::DataType dstType = helpers::GetLegacyDataType((*dstData).dtype()); + + if (srcType != cuda_op::kCV_8U) + { + throw nvcv::Exception(nvcv::Status::ERROR_NOT_COMPATIBLE, "Input must be of data type uchar."); + } + + if (dstType != cuda_op::kCV_8U && dstType != cuda_op::kCV_32F) + { + throw nvcv::Exception(nvcv::Status::ERROR_NOT_COMPATIBLE, "Output must be of data type uchar or float."); + } + + nvcv::TensorLayout dstLayout = dstData->layout(); + + if (srcFrmt.numPlanes() > 1) + { + throw nvcv::Exception(nvcv::Status::ERROR_NOT_COMPATIBLE, "Input must be non-planar (i.e., interleaved)."); + } + + if (!(dstLayout == NVCV_TENSOR_NHWC || dstLayout == NVCV_TENSOR_HWC || dstLayout == NVCV_TENSOR_NCHW + || dstLayout == NVCV_TENSOR_CHW)) + { + const char *layout = nvcvTensorLayoutGetName(&dstLayout.m_layout); + std::string msg + = "Output tensor must have 'NHWC', 'NCHW', 'HWC', or 'CHW' layout: Layout provided " + std::string(layout); + throw nvcv::Exception(nvcv::Status::ERROR_NOT_COMPATIBLE, "%s", msg.c_str()); + } + + if (cropPos.x < 0 || cropPos.y < 0 || cropPos.x + dst_w > resizeDim.w || cropPos.y + dst_h > resizeDim.h) + { + std::string msg = "Invalid crop region: crop region(x, y, w, h) = (" + std::to_string(cropPos.x) + ", " + + std::to_string(cropPos.y) + ", " + std::to_string(dst_w) + ", " + std::to_string(dst_h) + + ") extends beyond bounds of resized tensor"; + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "%s", msg.c_str()); + } + + if (srcType == cuda_op::kCV_8U) + { + if (dstType == cuda_op::kCV_8U) + { + resizeCropConvertReformat(*srcData, *dstData, resizeDim, interpolation, cropPos, manip, + stream); + } + else if (dstType == cuda_op::kCV_32F) + { + resizeCropConvertReformat(*srcData, *dstData, resizeDim, interpolation, cropPos, manip, + stream); + } + } +} +} // namespace cvcuda::priv diff --git a/src/cvcuda/priv/OpResizeCropConvertReformat.hpp b/src/cvcuda/priv/OpResizeCropConvertReformat.hpp new file mode 100644 index 000000000..eea8d7df5 --- /dev/null +++ b/src/cvcuda/priv/OpResizeCropConvertReformat.hpp @@ -0,0 +1,54 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file OpResizeCropConvertReformat.hpp + * + * @brief Defines the private C++ class for that fuses resize, crop, data type conversion, channel manipulation, and layout reformat operations to optimize pipelines. 
+ */ + +#ifndef CVCUDA_PRIV__RESIZE_CROP_HPP +#define CVCUDA_PRIV__RESIZE_CROP_HPP + +#include "IOperator.hpp" +#include "legacy/CvCudaLegacy.h" + +#include // for NVCVInterpolationType, NVCVChannelManip, etc. +#include +#include + +#include + +namespace cvcuda::priv { + +class ResizeCropConvertReformat final : public IOperator +{ +public: + explicit ResizeCropConvertReformat(); + + void operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, const NVCVSize2D resizeDim, + const NVCVInterpolationType interpolation, const int2 cropPos, + const NVCVChannelManip manip = NVCV_CHANNEL_NO_OP) const; + + void operator()(cudaStream_t stream, const nvcv::ImageBatchVarShape &in, const nvcv::Tensor &out, + const NVCVSize2D resizeDim, const NVCVInterpolationType interpolation, const int2 cropPos, + const NVCVChannelManip manip = NVCV_CHANNEL_NO_OP) const; +}; + +} // namespace cvcuda::priv + +#endif // CVCUDA_PRIV__RESIZE_CROP_HPP diff --git a/src/cvcuda/priv/legacy/custom_crop.cu b/src/cvcuda/priv/legacy/custom_crop.cu index eea4ebfe7..3695007a3 100644 --- a/src/cvcuda/priv/legacy/custom_crop.cu +++ b/src/cvcuda/priv/legacy/custom_crop.cu @@ -123,7 +123,7 @@ ErrorCode CustomCrop::infer(const TensorDataStridedCuda &inData, const TensorDat if (start_x < 0 || start_y < 0 || end_x >= cols || end_y >= rows) { - LOG_ERROR("Invliad Roi range x " << roi.x << " y " << roi.y << " width " << roi.width << " height " + LOG_ERROR("Invalid Roi range x " << roi.x << " y " << roi.y << " width " << roi.width << " height " << roi.height); return ErrorCode::INVALID_PARAMETER; } diff --git a/src/nvcv_types/include/nvcv/cuda/TensorWrap.hpp b/src/nvcv_types/include/nvcv/cuda/TensorWrap.hpp index b925ce60a..e4afd3ca0 100644 --- a/src/nvcv_types/include/nvcv/cuda/TensorWrap.hpp +++ b/src/nvcv_types/include/nvcv/cuda/TensorWrap.hpp @@ -175,7 +175,7 @@ class TensorWrap * * @return The const array (as a pointer) containing run-time pitches in bytes. */ - __host__ __device__ const int *strides() const + const __host__ __device__ int *strides() const { return m_strides; } @@ -483,6 +483,34 @@ __host__ auto CreateTensorWrapNHWC(const TensorDataStridedCuda &tensor) static_cast(tensorAccess->rowStride()), static_cast(tensorAccess->colStride())); } +/** + * Factory function to create an NCHW tensor wrap given a tensor data. + * + * The output \ref TensorWrap is an NCHW 4D tensor allowing to access data per batch (N), per channel (C), per row (H), and per column + * (W) of the input tensor. The input tensor data must have either NCHW or CHW layout, where + * the channel C is of type \p T, e.g. T=uchar for each channel of either RGB8 or RGBA8. + * + * @sa NVCV_CPP_CUDATOOLS_TENSORWRAP + * + * @tparam T Type of the values to be accessed in the tensor wrap. + * + * @param[in] tensor Reference to the tensor that will be wrapped. + * + * @return Tensor wrap useful to access tensor data in CUDA kernels. 
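+ *
+ * Example (a minimal sketch; the tensor data object and the in-kernel access shown are
+ * illustrative assumptions):
+ * @code
+ * // Wrap an NCHW float tensor so kernels can index it per sample, channel and row.
+ * auto dstWrap = nvcv::cuda::CreateTensorWrapNCHW<float>(tensorData);
+ * // Inside a kernel, row y of channel c in sample n could then be addressed as:
+ * //     float *row = dstWrap.ptr(n, c, y); // row[x] selects a column
+ * @endcode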
+ */ +template>> +__host__ auto CreateTensorWrapNCHW(const TensorDataStridedCuda &tensor) +{ + auto tensorAccess = TensorDataAccessStridedImagePlanar::Create(tensor); + assert(tensorAccess); + assert(tensorAccess->sampleStride() <= TypeTraits::max); + assert(tensorAccess->chStride() <= TypeTraits::max); + assert(tensorAccess->rowStride() <= TypeTraits::max); + + return Tensor4DWrap(tensor.basePtr(), static_cast(tensorAccess->sampleStride()), + static_cast(tensorAccess->chStride()), static_cast(tensorAccess->rowStride())); +} + } // namespace nvcv::cuda #endif // NVCV_CUDA_TENSOR_WRAP_HPP diff --git a/tests/cvcuda/python/test_opresizecropconvertreformat.py b/tests/cvcuda/python/test_opresizecropconvertreformat.py new file mode 100644 index 000000000..0f9f2a796 --- /dev/null +++ b/tests/cvcuda/python/test_opresizecropconvertreformat.py @@ -0,0 +1,422 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest as t +import nvcv +import cvcuda +import torch + +# NOTE: The following tests for resize_crop_convert_reformat DO NOT TEST: +# 1. The correctness of the output data +# 2. Whether the format conversion actually worked correctly w.r.t. the data +# 3. Whether the channel swapping actually worked correctly w.r.t. 
the data + + +@t.mark.parametrize( + "tensor_params, resize_dim, resize_interpolation, crop_rect_params, " + "out_layout, out_dtype, manip, out_expected_shape, is_positive_test", + [ + ( + ((4, 512, 512, 3), np.uint8, "NHWC"), # Basic test + (256, 256), + cvcuda.Interp.LINEAR, + (0, 0, 224, 224), + "NCHW", + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (4, 3, 224, 224), + True, + ), + ( + ((4, 512, 512, 3), np.uint8, "NHWC"), + (256, 256), + cvcuda.Interp.NEAREST, # With NEAREST Interpolation + (0, 0, 224, 224), + "NCHW", + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (4, 3, 224, 224), + True, + ), + ( + ((4, 512, 512, 3), np.uint8, "NHWC"), + (256, 256), + cvcuda.Interp.NEAREST, + (0, 0, 224, 224), + "", # Empty output layout means keep the same as input + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (4, 224, 224, 3), + True, + ), + ( + ((4, 512, 512, 3), np.uint8, "NHWC"), + (256, 256), + cvcuda.Interp.NEAREST, + (0, 0, 224, 224), + "", # Empty output layout means keep the same layout as input + 0, # Zero means keep the same dtype as input + cvcuda.ChannelManip.REVERSE, + (4, 224, 224, 3), + True, + ), + ( + ((17, 678, 1027, 3), np.uint8, "NHWC"), # Odd sizes + (251, 256), # Odd sizes + cvcuda.Interp.LINEAR, + (0, 0, 200, 22), + "NCHW", + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (17, 3, 22, 200), + True, + ), + ( + ((17, 678, 1027, 3), np.uint8, "NHWC"), + (251, 256), + cvcuda.Interp.LINEAR, + (0, 0, 200, 22), + "NHWC", # Same output layout as the input tensor + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (17, 22, 200, 3), + True, + ), + ( + ((3, 40, 20, 3), np.uint8, "NHWC"), + (160, 160), + cvcuda.Interp.NEAREST, + (10, 20, 20, 35), + "NCHW", + nvcv.Type.U8, # Same dtype as the input tensor + cvcuda.ChannelManip.NO_OP, # No op here + (3, 3, 35, 20), + True, + ), + ( + ((512, 512, 3), np.uint8, "HWC"), # Single image case. + (256, 256), + cvcuda.Interp.LINEAR, + (0, 0, 224, 224), + "CHW", + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (3, 224, 224), + True, + ), + ( + ((3, 512, 512), np.uint8, "CHW"), # Unsupported input CHW + (256, 256), + cvcuda.Interp.LINEAR, + (0, 0, 224, 224), + "CHW", + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (3, 224, 224), + False, # Negative test + ), + ( + ((512, 1024, 3), np.uint8, "HWC"), # Large sizes + (1024, 256), # Unchanged resize width + cvcuda.Interp.LINEAR, + (0, 0, 1024, 224), # Unchanged crop width + "CHW", + nvcv.Type.F32, + cvcuda.ChannelManip.NO_OP, + (3, 224, 1024), + True, + ), + ( + ((512, 1024, 3), np.float32, "HWC"), # Unsupported input dtype + (1024, 256), + cvcuda.Interp.LINEAR, + (0, 0, 1024, 224), + "CHW", + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (3, 224, 1024), + False, # Negative test + ), + ( + ((1, 2, 3), np.uint8, "HWC"), # Very small sizes + (60, 5), + cvcuda.Interp.LINEAR, + (0, 0, 59, 5), + "CHW", + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (3, 5, 59), + True, + ), + ( + ((1, 2, 3), np.uint8, "HWC"), + (60, 5), + cvcuda.Interp.LINEAR, + (0, 0, 61, 5), # Out of range crop. 
+ "CHW", + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (3, 5, 59), + False, # Negative test + ), + ( + ((4, 512, 512, 3), np.uint8, "NHWC"), + (256, 256), + cvcuda.Interp.AREA, # With Area Interpolation + (0, 0, 224, 224), + "NCHW", + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (4, 3, 224, 224), + False, # Negative test + ), + ], +) +def test_op_resize_crop_convert_reformat( + tensor_params, + resize_dim, + resize_interpolation, + crop_rect_params, + out_layout, + out_dtype, + manip, + out_expected_shape, + is_positive_test, +): + + inputTensor = cvcuda.Tensor(*tensor_params) + out_layout = out_layout if out_layout else str(inputTensor.layout) + out_dtype = out_dtype if out_dtype else inputTensor.dtype + + try: + out1 = cvcuda.resize_crop_convert_reformat( + inputTensor, + resize_dim, + resize_interpolation, + cvcuda.RectI(*crop_rect_params), + layout=out_layout, + data_type=out_dtype, + manip=manip, + ) + except Exception as e: + if is_positive_test: + raise e + else: + # This is pass for a negative test. + pass + + if is_positive_test: + assert out1.layout == out_layout + assert out1.shape == out_expected_shape + assert out1.dtype == out_dtype + + out2 = cvcuda.Tensor(out_expected_shape, out_dtype, out_layout) + + try: + tmp = cvcuda.resize_crop_convert_reformat_into( + out2, + inputTensor, + resize_dim, + resize_interpolation, + [crop_rect_params[1], crop_rect_params[0]], + manip=manip, + ) + except Exception as e: + if is_positive_test: + raise e + else: + # This is pass for a negative test. + pass + + if is_positive_test: + assert tmp is out2 + assert out2.layout == out_layout + assert out2.shape == out_expected_shape + assert out2.dtype == out_dtype + + # Compare the two + if is_positive_test: + out1 = torch.as_tensor(out1.cuda()) + out2 = torch.as_tensor(out2.cuda()) + assert torch.equal(out1, out2) + + +@t.mark.parametrize( + "num_images, min_size, max_size, resize_dim, resize_interpolation, crop_rect_params, " + "out_layout, out_dtype, manip, out_expected_shape, is_positive_test", + [ + ( + 10, # Basic test + (50, 50), + (512, 512), + (256, 256), + cvcuda.Interp.LINEAR, + (0, 0, 224, 224), + "NCHW", + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (10, 3, 224, 224), + True, + ), + ( + 1, # Only one image + (500, 300), + (800, 700), # Bigger sizes + (400, 200), + cvcuda.Interp.LINEAR, + (0, 0, 224, 190), + "NHWC", # Same output layout as the input + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (1, 190, 224, 3), + True, + ), + ( + 50, # More images + (1, 1), # Very small image + (50, 70), + (400, 200), + cvcuda.Interp.LINEAR, + (0, 0, 224, 190), + "NHWC", + nvcv.Type.F32, + cvcuda.ChannelManip.NO_OP, # No channels swapping + (50, 190, 224, 3), + True, + ), + ( + 50, + (1, 1), + (50, 70), + (400, 200), + cvcuda.Interp.LINEAR, + (0, 0, 224, 190), + "NCHW", + nvcv.Type.U8, # Same uint8 dtype as the input + cvcuda.ChannelManip.REVERSE, + (50, 3, 190, 224), + True, + ), + ( + 50, + (1, 1), + (50, 70), + (400, 200), + cvcuda.Interp.LINEAR, + (0, 0, 224, 190), + "NCHW", + nvcv.Type.U8, + cvcuda.ChannelManip.NO_OP, # NO_OP + (50, 3, 190, 224), + True, + ), + ( + 50, + (1, 1), + (50, 70), + (400, 200), + cvcuda.Interp.LINEAR, + (0, 0, 224, 190), + "", # Same uint8 dtype as the input + 0, # Same uint8 dtype as the input + cvcuda.ChannelManip.REVERSE, + (50, 190, 224, 3), + True, + ), + ], +) +def test_op_resize_crop_convert_reformat_varshape( + num_images, + min_size, + max_size, + resize_dim, + resize_interpolation, + crop_rect_params, + out_layout, + out_dtype, + manip, + 
out_expected_shape, + is_positive_test, +): + + inputVarShape = cvcuda.ImageBatchVarShape(num_images) + out_layout = out_layout if out_layout else "NHWC" + out_dtype = out_dtype if out_dtype else nvcv.Type.U8 + + inputVarShape.pushback( + [ + cvcuda.Image( + ( + min_size[0] + (max_size[0] - min_size[0]) * i // num_images, + min_size[1] + (max_size[1] - min_size[1]) * i // num_images, + ), + cvcuda.Format.RGB8, + ) + for i in range(num_images) + ] + ) + + try: + out1 = cvcuda.resize_crop_convert_reformat( + inputVarShape, + resize_dim=resize_dim, + interp=resize_interpolation, + crop_rect=cvcuda.RectI(*crop_rect_params), + layout=out_layout, + data_type=out_dtype, + manip=manip, + ) + except Exception as e: + if is_positive_test: + raise e + else: + # This is pass for a negative test. + pass + + if is_positive_test: + assert out1.layout == out_layout + assert out1.shape == out_expected_shape + assert out1.dtype == out_dtype + + out2 = cvcuda.Tensor(out_expected_shape, out_dtype, out_layout) + + try: + tmp = cvcuda.resize_crop_convert_reformat_into( + out2, + inputVarShape, + resize_dim, + resize_interpolation, + [crop_rect_params[1], crop_rect_params[0]], + manip=manip, + ) + except Exception as e: + if is_positive_test: + raise e + else: + # This is pass for a negative test. + pass + + if is_positive_test: + assert tmp is out2 + assert out2.layout == out_layout + assert out2.shape == out_expected_shape + assert out2.dtype == out_dtype + + if is_positive_test: + # Compare the two + out1 = torch.as_tensor(out1.cuda()) + out2 = torch.as_tensor(out2.cuda()) + assert torch.equal(out1, out2) diff --git a/tests/cvcuda/system/CMakeLists.txt b/tests/cvcuda/system/CMakeLists.txt index e82b17070..0b3375c3d 100644 --- a/tests/cvcuda/system/CMakeLists.txt +++ b/tests/cvcuda/system/CMakeLists.txt @@ -31,6 +31,7 @@ endif() # system core ------------------------------------------------- add_executable(cvcuda_test_system + TestOpResizeCropConvertReformat.cpp TestOpPairwiseMatcher.cpp TestOpStack.cpp TestOpLabel.cpp diff --git a/tests/cvcuda/system/ResizeUtils.cpp b/tests/cvcuda/system/ResizeUtils.cpp index 401ed59f8..98bfa2792 100644 --- a/tests/cvcuda/system/ResizeUtils.cpp +++ b/tests/cvcuda/system/ResizeUtils.cpp @@ -28,50 +28,42 @@ namespace nvcv::test { -void Resize(std::vector &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector &hSrc, - int srcRowStride, nvcv::Size2D srcSize, nvcv::ImageFormat fmt, NVCVInterpolationType interpolation, - bool isVarshape) +template +void _Resize(std::vector &hDst, int dstStep, nvcv::Size2D dstSize, const std::vector &hSrc, int srcStep, + nvcv::Size2D srcSize, nvcv::ImageFormat fmt, NVCVInterpolationType interp, bool isVarshape) { - if (interpolation == NVCV_INTERP_NEAREST || interpolation == NVCV_INTERP_LINEAR - || interpolation == NVCV_INTERP_CUBIC) - { - ResizedCrop(hDst, dstRowStride, dstSize, hSrc, srcRowStride, srcSize, 0, 0, srcSize.h, srcSize.w, fmt, - interpolation); - return; - } - - double iScale = static_cast(srcSize.h) / dstSize.h; - double jScale = static_cast(srcSize.w) / dstSize.w; + double scaleH = static_cast(srcSize.h) / dstSize.h; + double scaleW = static_cast(srcSize.w) / dstSize.w; assert(fmt.numPlanes() == 1); - int elementsPerPixel = fmt.numChannels(); + int channels = fmt.numChannels(); - uint8_t *dstPtr = hDst.data(); - const uint8_t *srcPtr = hSrc.data(); + T *dstPtr = hDst.data(); + const T *srcPtr = hSrc.data(); - for (int di = 0; di < dstSize.h; di++) + for (int dy = 0; dy < dstSize.h; dy++) { - for (int dj = 0; dj < dstSize.w; 
dj++) + for (int dx = 0; dx < dstSize.w; dx++) { - if (interpolation == NVCV_INTERP_AREA) + if (interp == NVCV_INTERP_AREA) { - double fsx1 = dj * jScale; - double fsx2 = fsx1 + jScale; - double fsy1 = di * iScale; - double fsy2 = fsy1 + iScale; + double fsx1 = dx * scaleW; + double fsx2 = fsx1 + scaleW; + double fsy1 = dy * scaleH; + double fsy2 = fsy1 + scaleH; int sx1 = cuda::round(fsx1); int sx2 = cuda::round(fsx2); int sy1 = cuda::round(fsy1); int sy2 = cuda::round(fsy2); - for (int k = 0; k < elementsPerPixel; k++) + for (int c = 0; c < channels; c++) { double out = 0.0; - if (std::ceil(jScale) == jScale && std::ceil(iScale) == iScale) + if (std::ceil(scaleW) == scaleW && std::ceil(scaleH) == scaleH) { - double invscale = 1.f / (jScale * iScale); + double invscale = 1.f / (scaleW * scaleH); for (int dy = sy1; dy < sy2; ++dy) { @@ -79,90 +71,88 @@ void Resize(std::vector &hDst, int dstRowStride, nvcv::Size2D dstSize, { if (dy >= 0 && dy < srcSize.h && dx >= 0 && dx < srcSize.w) { - out = out + srcPtr[dy * srcRowStride + dx * elementsPerPixel + k] * invscale; + out = out + srcPtr[dy * srcStep + dx * channels + c] * invscale; } } } } else { - if (!isVarshape || (iScale >= 1.0f && jScale >= 1.0f)) + if (!isVarshape || (scaleH >= 1.0f && scaleW >= 1.0f)) { double invscale - = 1.f / (std::min(jScale, srcSize.w - fsx1) * std::min(iScale, srcSize.h - fsy1)); + = 1.f / (std::min(scaleW, srcSize.w - fsx1) * std::min(scaleH, srcSize.h - fsy1)); for (int dy = sy1; dy < sy2; ++dy) { for (int dx = sx1; dx < sx2; ++dx) if (dy >= 0 && dy < srcSize.h && dx >= 0 && dx < srcSize.w) - out = out + srcPtr[dy * srcRowStride + dx * elementsPerPixel + k] * invscale; + out = out + srcPtr[dy * srcStep + dx * channels + c] * invscale; if (sx1 > fsx1) if (dy >= 0 && dy < srcSize.h && sx1 - 1 >= 0 && sx1 - 1 < srcSize.w) out = out - + srcPtr[dy * srcRowStride + (sx1 - 1) * elementsPerPixel + k] + + srcPtr[dy * srcStep + (sx1 - 1) * channels + c] * ((sx1 - fsx1) * invscale); if (sx2 < fsx2) if (dy >= 0 && dy < srcSize.h && sx2 >= 0 && sx2 < srcSize.w) out = out - + srcPtr[dy * srcRowStride + sx2 * elementsPerPixel + k] - * ((fsx2 - sx2) * invscale); + + srcPtr[dy * srcStep + sx2 * channels + c] * ((fsx2 - sx2) * invscale); } if (sy1 > fsy1) for (int dx = sx1; dx < sx2; ++dx) if (sy1 - 1 >= 0 && sy1 - 1 < srcSize.h && dx >= 0 && dx < srcSize.w) out = out - + srcPtr[(sy1 - 1) * srcRowStride + dx * elementsPerPixel + k] + + srcPtr[(sy1 - 1) * srcStep + dx * channels + c] * ((sy1 - fsy1) * invscale); if (sy2 < fsy2) for (int dx = sx1; dx < sx2; ++dx) if (sy2 >= 0 && sy2 < srcSize.h && dx >= 0 && dx < srcSize.w) out = out - + srcPtr[sy2 * srcRowStride + dx * elementsPerPixel + k] - * ((fsy2 - sy2) * invscale); + + srcPtr[sy2 * srcStep + dx * channels + c] * ((fsy2 - sy2) * invscale); if ((sy1 > fsy1) && (sx1 > fsx1)) if (sy1 - 1 >= 0 && sy1 - 1 < srcSize.h && sx1 - 1 >= 0 && sx1 - 1 < srcSize.w) out = out - + srcPtr[(sy1 - 1) * srcRowStride + (sx1 - 1) * elementsPerPixel + k] + + srcPtr[(sy1 - 1) * srcStep + (sx1 - 1) * channels + c] * ((sy1 - fsy1) * (sx1 - fsx1) * invscale); if ((sy1 > fsy1) && (sx2 < fsx2)) if (sy1 - 1 >= 0 && sy1 - 1 < srcSize.h && sx2 >= 0 && sx2 < srcSize.w) out = out - + srcPtr[(sy1 - 1) * srcRowStride + sx2 * elementsPerPixel + k] + + srcPtr[(sy1 - 1) * srcStep + sx2 * channels + c] * ((sy1 - fsy1) * (fsx2 - sx2) * invscale); if ((sy2 < fsy2) && (sx2 < fsx2)) if (sy2 >= 0 && sy2 < srcSize.h && sx2 >= 0 && sx2 < srcSize.w) out = out - + srcPtr[sy2 * srcRowStride + sx2 * elementsPerPixel + k] + + 
srcPtr[sy2 * srcStep + sx2 * channels + c] * ((fsy2 - sy2) * (fsx2 - sx2) * invscale); if ((sy2 < fsy2) && (sx1 > fsx1)) if (sy2 >= 0 && sy2 < srcSize.h && sx1 - 1 >= 0 && sx1 - 1 < srcSize.w) out = out - + srcPtr[sy2 * srcRowStride + (sx1 - 1) * elementsPerPixel + k] + + srcPtr[sy2 * srcStep + (sx1 - 1) * channels + c] * ((fsy2 - sy2) * (sx1 - fsx1) * invscale); } else // zoom in for varshape { - double iScale_inv = 1.0 / iScale; - double jScale_inv = 1.0 / jScale; + double scaleH_inv = 1.0 / scaleH; + double scaleW_inv = 1.0 / scaleW; sy1 = cuda::round(fsy1); sx1 = cuda::round(fsx1); - float fy = (float)(float(di + 1) - float(sy1 + 1) * iScale_inv); + float fy = (float)(float(dy + 1) - float(sy1 + 1) * scaleH_inv); fy = fy <= 0 ? 0.f : fy - cuda::round(fy); float cbufy[2]; cbufy[0] = 1.f - fy; cbufy[1] = fy; - float fx = (float)(float(dj + 1) - float(sx1 + 1) * jScale_inv); + float fx = (float)(float(dx + 1) - float(sx1 + 1) * scaleW_inv); fx = fx <= 0 ? 0.f : fx - cuda::round(fx); if (sx1 < 0) @@ -181,144 +171,205 @@ void Resize(std::vector &hDst, int dstRowStride, nvcv::Size2D dstSize, float cbufx[2]; cbufx[0] = 1.f - fx; cbufx[1] = fx; - out = srcPtr[sy1 * srcRowStride + sx1 * elementsPerPixel + k] * cbufx[0] * cbufy[0] - + srcPtr[(sy1 + 1) * srcRowStride + sx1 * elementsPerPixel + k] * cbufx[0] * cbufy[1] - + srcPtr[sy1 * srcRowStride + (sx1 + 1) * elementsPerPixel + k] * cbufx[1] * cbufy[0] - + srcPtr[(sy1 + 1) * srcRowStride + (sx1 + 1) * elementsPerPixel + k] * cbufx[1] - * cbufy[1]; + out = srcPtr[sy1 * srcStep + sx1 * channels + c] * cbufx[0] * cbufy[0] + + srcPtr[(sy1 + 1) * srcStep + sx1 * channels + c] * cbufx[0] * cbufy[1] + + srcPtr[sy1 * srcStep + (sx1 + 1) * channels + c] * cbufx[1] * cbufy[0] + + srcPtr[(sy1 + 1) * srcStep + (sx1 + 1) * channels + c] * cbufx[1] * cbufy[1]; } } out = std::rint(std::abs(out)); - dstPtr[di * dstRowStride + dj * elementsPerPixel + k] = out < 0 ? 0 : (out > 255 ? 255 : out); + dstPtr[dy * dstStep + dx * channels + c] = out < MinVal ? MinVal : (out > MaxVal ? 
MaxVal : out); } } } } } -void ResizedCrop(std::vector &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector &hSrc, - int srcRowStride, nvcv::Size2D srcSize, int top, int left, int crop_rows, int crop_cols, - nvcv::ImageFormat fmt, NVCVInterpolationType interpolation) +template +void _ResizedCrop(std::vector &hDst, int dstStep, nvcv::Size2D dstSize, const std::vector &hSrc, int srcStep, + nvcv::Size2D srcSize, int top, int left, int crop_rows, int crop_cols, nvcv::ImageFormat fmt, + NVCVInterpolationType interp) { - double iScale = static_cast(crop_rows) / dstSize.h; - double jScale = static_cast(crop_cols) / dstSize.w; + double scaleH = static_cast(crop_rows) / dstSize.h; + double scaleW = static_cast(crop_cols) / dstSize.w; assert(fmt.numPlanes() == 1); - int elementsPerPixel = fmt.numChannels(); + int channels = fmt.numChannels(); - uint8_t *dstPtr = hDst.data(); - const uint8_t *srcPtr = hSrc.data(); + T *dstPtr = hDst.data(); + const T *srcPtr = hSrc.data(); - for (int di = 0; di < dstSize.h; di++) + for (int dy = 0; dy < dstSize.h; dy++) { - for (int dj = 0; dj < dstSize.w; dj++) + for (int dx = 0; dx < dstSize.w; dx++) { - if (interpolation == NVCV_INTERP_NEAREST) + if (interp == NVCV_INTERP_NEAREST) { - double fi = iScale * di + top; - double fj = jScale * dj + left; + double fy = scaleH * dy + top; + double fx = scaleW * dx + left; - int si = std::floor(fi); - int sj = std::floor(fj); + int sy = std::floor(fy); + int sx = std::floor(fx); - si = std::min(si, srcSize.h - 1); - sj = std::min(sj, srcSize.w - 1); + sy = std::min(sy, srcSize.h - 1); + sx = std::min(sx, srcSize.w - 1); - for (int k = 0; k < elementsPerPixel; k++) + for (int c = 0; c < channels; c++) { - dstPtr[di * dstRowStride + dj * elementsPerPixel + k] - = srcPtr[si * srcRowStride + sj * elementsPerPixel + k]; + dstPtr[dy * dstStep + dx * channels + c] = srcPtr[sy * srcStep + sx * channels + c]; } } - else if (interpolation == NVCV_INTERP_LINEAR) + else if (interp == NVCV_INTERP_LINEAR) { - double fi = iScale * (di + 0.5) - 0.5 + top; - double fj = jScale * (dj + 0.5) - 0.5 + left; + double fy = scaleH * (dy + 0.5) - 0.5 + top; + double fx = scaleW * (dx + 0.5) - 0.5 + left; - int si = std::floor(fi); - int sj = std::floor(fj); + int sy = std::floor(fy); + int sx = std::floor(fx); - fi -= si; - fj -= sj; + fy -= sy; + fx -= sx; - fj = (sj < 0 || sj >= srcSize.w - 1) ? 0 : fj; + fx = (sx < 0 || sx >= srcSize.w - 1) ? 0 : fx; - si = std::max(0, std::min(si, srcSize.h - 2)); - sj = std::max(0, std::min(sj, srcSize.w - 2)); + sy = std::max(0, std::min(sy, srcSize.h - 2)); + sx = std::max(0, std::min(sx, srcSize.w - 2)); - double iWeights[2] = {1 - fi, fi}; - double jWeights[2] = {1 - fj, fj}; + double wghtY[2] = {1 - fy, fy}; + double wghtX[2] = {1 - fx, fx}; - for (int k = 0; k < elementsPerPixel; k++) + for (int c = 0; c < channels; c++) { - double res = std::rint(std::abs( - srcPtr[(si + 0) * srcRowStride + (sj + 0) * elementsPerPixel + k] * iWeights[0] * jWeights[0] - + srcPtr[(si + 1) * srcRowStride + (sj + 0) * elementsPerPixel + k] * iWeights[1] * jWeights[0] - + srcPtr[(si + 0) * srcRowStride + (sj + 1) * elementsPerPixel + k] * iWeights[0] * jWeights[1] - + srcPtr[(si + 1) * srcRowStride + (sj + 1) * elementsPerPixel + k] * iWeights[1] - * jWeights[1])); - - dstPtr[di * dstRowStride + dj * elementsPerPixel + k] = res < 0 ? 0 : (res > 255 ? 
255 : res); + double res = std::rint( + std::abs(srcPtr[(sy + 0) * srcStep + (sx + 0) * channels + c] * wghtY[0] * wghtX[0] + + srcPtr[(sy + 1) * srcStep + (sx + 0) * channels + c] * wghtY[1] * wghtX[0] + + srcPtr[(sy + 0) * srcStep + (sx + 1) * channels + c] * wghtY[0] * wghtX[1] + + srcPtr[(sy + 1) * srcStep + (sx + 1) * channels + c] * wghtY[1] * wghtX[1])); + + dstPtr[dy * dstStep + dx * channels + c] = res < MinVal ? MinVal : (res > MaxVal ? MaxVal : res); } } - else if (interpolation == NVCV_INTERP_CUBIC) + else if (interp == NVCV_INTERP_CUBIC) { - double fi = iScale * (di + 0.5) - 0.5 + top; - double fj = jScale * (dj + 0.5) - 0.5 + left; + double fy = scaleH * (dy + 0.5) - 0.5 + top; + double fx = scaleW * (dx + 0.5) - 0.5 + left; - int si = std::floor(fi); - int sj = std::floor(fj); + int sy = std::floor(fy); + int sx = std::floor(fx); - fi -= si; - fj -= sj; + fy -= sy; + fx -= sx; - fj = (sj < 1 || sj >= srcSize.w - 3) ? 0 : fj; + fx = (sx < 1 || sx >= srcSize.w - 3) ? 0 : fx; - si = std::max(1, std::min(si, srcSize.h - 3)); - sj = std::max(1, std::min(sj, srcSize.w - 3)); + sy = std::max(1, std::min(sy, srcSize.h - 3)); + sx = std::max(1, std::min(sx, srcSize.w - 3)); const double A = -0.75; - double iWeights[4]; - iWeights[0] = ((A * (fi + 1) - 5 * A) * (fi + 1) + 8 * A) * (fi + 1) - 4 * A; - iWeights[1] = ((A + 2) * fi - (A + 3)) * fi * fi + 1; - iWeights[2] = ((A + 2) * (1 - fi) - (A + 3)) * (1 - fi) * (1 - fi) + 1; - iWeights[3] = 1 - iWeights[0] - iWeights[1] - iWeights[2]; - - double jWeights[4]; - jWeights[0] = ((A * (fj + 1) - 5 * A) * (fj + 1) + 8 * A) * (fj + 1) - 4 * A; - jWeights[1] = ((A + 2) * fj - (A + 3)) * fj * fj + 1; - jWeights[2] = ((A + 2) * (1 - fj) - (A + 3)) * (1 - fj) * (1 - fj) + 1; - jWeights[3] = 1 - jWeights[0] - jWeights[1] - jWeights[2]; - - for (int k = 0; k < elementsPerPixel; k++) + double wghtY[4]; + wghtY[0] = ((A * (fy + 1) - 5 * A) * (fy + 1) + 8 * A) * (fy + 1) - 4 * A; + wghtY[1] = ((A + 2) * fy - (A + 3)) * fy * fy + 1; + wghtY[2] = ((A + 2) * (1 - fy) - (A + 3)) * (1 - fy) * (1 - fy) + 1; + wghtY[3] = 1 - wghtY[0] - wghtY[1] - wghtY[2]; + + double wghtX[4]; + wghtX[0] = ((A * (fx + 1) - 5 * A) * (fx + 1) + 8 * A) * (fx + 1) - 4 * A; + wghtX[1] = ((A + 2) * fx - (A + 3)) * fx * fx + 1; + wghtX[2] = ((A + 2) * (1 - fx) - (A + 3)) * (1 - fx) * (1 - fx) + 1; + wghtX[3] = 1 - wghtX[0] - wghtX[1] - wghtX[2]; + + for (int c = 0; c < channels; c++) { - double res = std::rint(std::abs( - srcPtr[(si - 1) * srcRowStride + (sj - 1) * elementsPerPixel + k] * jWeights[0] * iWeights[0] - + srcPtr[(si + 0) * srcRowStride + (sj - 1) * elementsPerPixel + k] * jWeights[0] * iWeights[1] - + srcPtr[(si + 1) * srcRowStride + (sj - 1) * elementsPerPixel + k] * jWeights[0] * iWeights[2] - + srcPtr[(si + 2) * srcRowStride + (sj - 1) * elementsPerPixel + k] * jWeights[0] * iWeights[3] - + srcPtr[(si - 1) * srcRowStride + (sj + 0) * elementsPerPixel + k] * jWeights[1] * iWeights[0] - + srcPtr[(si + 0) * srcRowStride + (sj + 0) * elementsPerPixel + k] * jWeights[1] * iWeights[1] - + srcPtr[(si + 1) * srcRowStride + (sj + 0) * elementsPerPixel + k] * jWeights[1] * iWeights[2] - + srcPtr[(si + 2) * srcRowStride + (sj + 0) * elementsPerPixel + k] * jWeights[1] * iWeights[3] - + srcPtr[(si - 1) * srcRowStride + (sj + 1) * elementsPerPixel + k] * jWeights[2] * iWeights[0] - + srcPtr[(si + 0) * srcRowStride + (sj + 1) * elementsPerPixel + k] * jWeights[2] * iWeights[1] - + srcPtr[(si + 1) * srcRowStride + (sj + 1) * elementsPerPixel + k] * jWeights[2] * iWeights[2] - + 
srcPtr[(si + 2) * srcRowStride + (sj + 1) * elementsPerPixel + k] * jWeights[2] * iWeights[3] - + srcPtr[(si - 1) * srcRowStride + (sj + 2) * elementsPerPixel + k] * jWeights[3] * iWeights[0] - + srcPtr[(si + 0) * srcRowStride + (sj + 2) * elementsPerPixel + k] * jWeights[3] * iWeights[1] - + srcPtr[(si + 1) * srcRowStride + (sj + 2) * elementsPerPixel + k] * jWeights[3] * iWeights[2] - + srcPtr[(si + 2) * srcRowStride + (sj + 2) * elementsPerPixel + k] * jWeights[3] - * iWeights[3])); - - dstPtr[di * dstRowStride + dj * elementsPerPixel + k] = res < 0 ? 0 : (res > 255 ? 255 : res); + double res = std::rint( + std::abs(srcPtr[(sy - 1) * srcStep + (sx - 1) * channels + c] * wghtX[0] * wghtY[0] + + srcPtr[(sy + 0) * srcStep + (sx - 1) * channels + c] * wghtX[0] * wghtY[1] + + srcPtr[(sy + 1) * srcStep + (sx - 1) * channels + c] * wghtX[0] * wghtY[2] + + srcPtr[(sy + 2) * srcStep + (sx - 1) * channels + c] * wghtX[0] * wghtY[3] + + srcPtr[(sy - 1) * srcStep + (sx + 0) * channels + c] * wghtX[1] * wghtY[0] + + srcPtr[(sy + 0) * srcStep + (sx + 0) * channels + c] * wghtX[1] * wghtY[1] + + srcPtr[(sy + 1) * srcStep + (sx + 0) * channels + c] * wghtX[1] * wghtY[2] + + srcPtr[(sy + 2) * srcStep + (sx + 0) * channels + c] * wghtX[1] * wghtY[3] + + srcPtr[(sy - 1) * srcStep + (sx + 1) * channels + c] * wghtX[2] * wghtY[0] + + srcPtr[(sy + 0) * srcStep + (sx + 1) * channels + c] * wghtX[2] * wghtY[1] + + srcPtr[(sy + 1) * srcStep + (sx + 1) * channels + c] * wghtX[2] * wghtY[2] + + srcPtr[(sy + 2) * srcStep + (sx + 1) * channels + c] * wghtX[2] * wghtY[3] + + srcPtr[(sy - 1) * srcStep + (sx + 2) * channels + c] * wghtX[3] * wghtY[0] + + srcPtr[(sy + 0) * srcStep + (sx + 2) * channels + c] * wghtX[3] * wghtY[1] + + srcPtr[(sy + 1) * srcStep + (sx + 2) * channels + c] * wghtX[3] * wghtY[2] + + srcPtr[(sy + 2) * srcStep + (sx + 2) * channels + c] * wghtX[3] * wghtY[3])); + + dstPtr[dy * dstStep + dx * channels + c] = res < MinVal ? MinVal : (res > MaxVal ? 
MaxVal : res);
                 }
             }
         }
     }
 }
+void Resize(std::vector<uint8_t> &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector<uint8_t> &hSrc,
+            int srcRowStride, nvcv::Size2D srcSize, nvcv::ImageFormat fmt, NVCVInterpolationType interp,
+            bool isVarShape)
+{
+    int dstStep = dstRowStride / sizeof(uint8_t);
+    int srcStep = srcRowStride / sizeof(uint8_t);
+
+    if (interp == NVCV_INTERP_NEAREST || interp == NVCV_INTERP_LINEAR || interp == NVCV_INTERP_CUBIC)
+    {
+        _ResizedCrop(hDst, dstStep, dstSize, hSrc, srcStep, srcSize, 0, 0, srcSize.h, srcSize.w, fmt,
+                     interp);
+    }
+    else if (interp == NVCV_INTERP_AREA)
+    {
+        _Resize(hDst, dstStep, dstSize, hSrc, srcStep, srcSize, fmt, interp, isVarShape);
+    }
+}
+
+void ResizedCrop(std::vector<uint8_t> &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector<uint8_t> &hSrc,
+                 int srcRowStride, nvcv::Size2D srcSize, int top, int left, int crop_rows, int crop_cols,
+                 nvcv::ImageFormat fmt, NVCVInterpolationType interp)
+{
+    int dstStep = dstRowStride / sizeof(uint8_t);
+    int srcStep = srcRowStride / sizeof(uint8_t);
+
+    if (interp == NVCV_INTERP_NEAREST || interp == NVCV_INTERP_LINEAR || interp == NVCV_INTERP_CUBIC)
+    {
+        _ResizedCrop(hDst, dstStep, dstSize, hSrc, srcStep, srcSize, top, left, crop_rows, crop_cols,
+                     fmt, interp);
+    }
+}
+
+void Resize(std::vector<float> &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector<float> &hSrc,
+            int srcRowStride, nvcv::Size2D srcSize, nvcv::ImageFormat fmt, NVCVInterpolationType interp,
+            bool isVarShape)
+{
+    int dstStep = dstRowStride / sizeof(float);
+    int srcStep = srcRowStride / sizeof(float);
+
+    if (interp == NVCV_INTERP_NEAREST || interp == NVCV_INTERP_LINEAR || interp == NVCV_INTERP_CUBIC)
+    {
+        _ResizedCrop(hDst, dstStep, dstSize, hSrc, srcStep, srcSize, 0, 0, srcSize.h, srcSize.w,
+                     fmt, interp);
+    }
+    else if (interp == NVCV_INTERP_AREA)
+    {
+        _Resize(hDst, dstStep, dstSize, hSrc, srcStep, srcSize, fmt, interp, isVarShape);
+    }
+}
+
+void ResizedCrop(std::vector<float> &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector<float> &hSrc,
+                 int srcRowStride, nvcv::Size2D srcSize, int top, int left, int crop_rows, int crop_cols,
+                 nvcv::ImageFormat fmt, NVCVInterpolationType interp)
+{
+    int dstStep = dstRowStride / sizeof(float);
+    int srcStep = srcRowStride / sizeof(float);
+
+    if (interp == NVCV_INTERP_NEAREST || interp == NVCV_INTERP_LINEAR || interp == NVCV_INTERP_CUBIC)
+    {
+        _ResizedCrop(hDst, dstStep, dstSize, hSrc, srcStep, srcSize, top, left, crop_rows,
+                     crop_cols, fmt, interp);
+    }
+}
+
 } // namespace nvcv::test
diff --git a/tests/cvcuda/system/ResizeUtils.hpp b/tests/cvcuda/system/ResizeUtils.hpp
index d8c27f7ca..ad1ffd482 100644
--- a/tests/cvcuda/system/ResizeUtils.hpp
+++ b/tests/cvcuda/system/ResizeUtils.hpp
@@ -31,13 +31,21 @@ namespace nvcv::test {
 // support NVCV_INTERP_NEAREST/NVCV_INTERP_LINEAR/NVCV_INTERP_CUBIC/NVCV_INTERP_AREA
 void Resize(std::vector<uint8_t> &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector<uint8_t> &hSrc,
             int srcRowStride, nvcv::Size2D srcSize, nvcv::ImageFormat fmt, NVCVInterpolationType interpolation,
-            bool isVarshape);
+            bool isVarShape);
+
+void Resize(std::vector<float> &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector<float> &hSrc,
+            int srcRowStride, nvcv::Size2D srcSize, nvcv::ImageFormat fmt, NVCVInterpolationType interpolation,
+            bool isVarShape);
 
 // only support NVCV_INTERP_NEAREST/NVCV_INTERP_LINEAR/NVCV_INTERP_CUBIC
 void ResizedCrop(std::vector<uint8_t> &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector<uint8_t> &hSrc,
                  int srcRowStride, nvcv::Size2D srcSize, int top, int left, int crop_rows,
int crop_cols, nvcv::ImageFormat fmt, NVCVInterpolationType interpolation); +void ResizedCrop(std::vector &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector &hSrc, + int srcRowStride, nvcv::Size2D srcSize, int top, int left, int crop_rows, int crop_cols, + nvcv::ImageFormat fmt, NVCVInterpolationType interpolation); + } // namespace nvcv::test #endif // NVCV_TEST_COMMON_FLIP_UTILS_HPP diff --git a/tests/cvcuda/system/TestOpBilateralFilter.cpp b/tests/cvcuda/system/TestOpBilateralFilter.cpp index e16668c08..10ef192b2 100644 --- a/tests/cvcuda/system/TestOpBilateralFilter.cpp +++ b/tests/cvcuda/system/TestOpBilateralFilter.cpp @@ -29,6 +29,8 @@ #include #include +#define NVCV_IMAGE_FORMAT_2U8 NVCV_DETAIL_MAKE_NONCOLOR_FMT1(PL, UNSIGNED, XY00, ASSOCIATED, X8_Y8) + namespace gt = ::testing; namespace test = nvcv::test; @@ -37,17 +39,21 @@ static uint32_t saturate_cast(float n) return static_cast(std::min(255.0f, std::round(n))); } -static bool CompareImages(uint8_t *pTest, uint8_t *pGold, size_t columns, size_t rows, size_t rowStride, float delta) +static bool CompareImages(uint8_t *pTest, uint8_t *pGold, size_t columns, size_t rows, size_t rowStride, + size_t channels, float delta) { for (size_t j = 0; j < rows; j++) { for (size_t k = 0; k < columns; k++) { - size_t offset = j * rowStride + k; - float diff = std::abs(static_cast(pTest[offset]) - static_cast(pGold[offset])); - if (diff > delta) + for (size_t c = 0; c < channels; ++c) { - return false; + size_t offset = j * rowStride + k * channels + c; + float diff = std::abs(static_cast(pTest[offset]) - static_cast(pGold[offset])); + if (diff > delta) + { + return false; + } } } } @@ -55,13 +61,13 @@ static bool CompareImages(uint8_t *pTest, uint8_t *pGold, size_t columns, size_t } static bool CompareTensors(std::vector &vTest, std::vector &vGold, size_t columns, size_t rows, - size_t batch, size_t rowStride, size_t sampleStride, float delta) + size_t batch, size_t rowStride, size_t channels, size_t sampleStride, float delta) { for (size_t i = 0; i < batch; i++) { uint8_t *pTest = vTest.data() + i * sampleStride; uint8_t *pGold = vGold.data() + i * sampleStride; - if (!CompareImages(pTest, pGold, columns, rows, rowStride, delta)) + if (!CompareImages(pTest, pGold, columns, rows, rowStride, channels, delta)) return false; } return true; @@ -69,27 +75,31 @@ static bool CompareTensors(std::vector &vTest, std::vector &vG static bool CompareVarShapes(std::vector> &vTest, std::vector> &vGold, std::vector &vColumns, std::vector &vRows, std::vector &vRowStride, - float delta) + std::vector &vChannels, float delta) { for (size_t i = 0; i < vTest.size(); i++) { - if (!CompareImages(vTest[i].data(), vGold[i].data(), vColumns[i], vRows[i], vRowStride[i], delta)) + if (!CompareImages(vTest[i].data(), vGold[i].data(), vColumns[i], vRows[i], vRowStride[i], vChannels[i], delta)) return false; } return true; } -static void CPUBilateralFilter(uint8_t *pIn, uint8_t *pOut, int columns, int rows, int rowStride, int radius, - float colorCoefficient, float spaceCoefficient) +static void CPUBilateralFilter(uint8_t *pIn, uint8_t *pOut, int columns, int rows, int rowStride, int channels, + int radius, float colorCoefficient, float spaceCoefficient) { float radiusSquared = radius * radius; for (int j = 0; j < rows; j++) { for (int k = 0; k < columns; k++) { - float numerator = 0.0f; - float denominator = 0.0f; - float center = static_cast(pIn[j * rowStride + k]); + std::vector numerators(channels, 0.0f); + float denominator = 0.0f; + std::vector 
centers{static_cast(pIn[j * rowStride + k * channels]), + static_cast(pIn[j * rowStride + k * channels + 1]), + static_cast(pIn[j * rowStride + k * channels + 2]), + static_cast(pIn[j * rowStride + k * channels + 3])}; + for (int y = j - radius; y <= j + radius; y++) { for (int x = k - radius; x <= k + radius; x++) @@ -97,26 +107,44 @@ static void CPUBilateralFilter(uint8_t *pIn, uint8_t *pOut, int columns, int row float distanceSquared = (k - x) * (k - x) + (j - y) * (j - y); if (distanceSquared <= radiusSquared) { - float pixel = ((x >= 0) && (x < columns) && (y >= 0) && (y < rows)) - ? static_cast(pIn[y * rowStride + x]) - : 0.0f; - float e_space = distanceSquared * spaceCoefficient; - float one_norm_size = std::abs(pixel - center); - float e_color = one_norm_size * one_norm_size * colorCoefficient; - float weight = std::exp(e_space + e_color); + std::vector pixels; + for (auto c = 0; c < channels; ++c) + { + float pixel = ((x >= 0) && (x < columns) && (y >= 0) && (y < rows)) + ? static_cast(pIn[y * rowStride + x * channels + c]) + : 0.0f; + pixels.emplace_back(pixel); + } + float e_space = distanceSquared * spaceCoefficient; + float e_color = 0.0f; + + for (auto c = 0; c < channels; ++c) + { + e_color += std::abs(pixels[c] - centers[c]); + } + e_color = e_color * e_color * colorCoefficient; + + float weight = std::exp(e_space + e_color); denominator += weight; - numerator += weight * pixel; + for (auto c = 0; c < channels; ++c) + { + numerators[c] += weight * pixels[c]; + } } } } - pOut[j * rowStride + k] = saturate_cast(numerator / denominator); + + for (auto c = 0; c < channels; ++c) + { + pOut[j * rowStride + k * channels + c] = saturate_cast(numerators[c] / denominator); + } } } } static void CPUBilateralFilterTensor(std::vector &vIn, std::vector &vOut, int columns, int rows, - int batch, int rowStride, int sampleStride, int diameter, float sigmaColor, - float sigmaSpace) + int batch, int rowStride, int channels, int sampleStride, int diameter, + float sigmaColor, float sigmaSpace) { int radius = diameter / 2; float spaceCoefficient = -1 / (2 * sigmaSpace * sigmaSpace); @@ -125,21 +153,22 @@ static void CPUBilateralFilterTensor(std::vector &vIn, std::vector> &vIn, std::vector> &vOut, std::vector &vColumns, std::vector &vRows, - std::vector &vRowStride, std::vector &vDiameter, - std::vector &vSigmaColor, std::vector &vSigmaSpace) + std::vector &vRowStride, std::vector &vChannels, + std::vector &vDiameter, std::vector &vSigmaColor, + std::vector &vSigmaSpace) { for (size_t i = 0; i < vIn.size(); i++) { int radius = vDiameter[i] / 2; float spaceCoefficient = -1 / (2 * vSigmaSpace[i] * vSigmaSpace[i]); float colorCoefficient = -1 / (2 * vSigmaColor[i] * vSigmaColor[i]); - CPUBilateralFilter(vIn[i].data(), vOut[i].data(), vColumns[i], vRows[i], vRowStride[i], radius, + CPUBilateralFilter(vIn[i].data(), vOut[i].data(), vColumns[i], vRows[i], vRowStride[i], vChannels[i], radius, colorCoefficient, spaceCoefficient); } } @@ -171,177 +200,189 @@ TEST_P(OpBilateralFilter, BilateralFilter_packed) { cudaStream_t stream; EXPECT_EQ(cudaSuccess, cudaStreamCreate(&stream)); - int width = GetParamValue<0>(); - int height = GetParamValue<1>(); - int d = GetParamValue<2>(); - float sigmaColor = GetParamValue<3>(); - float sigmaSpace = GetParamValue<4>(); - int numberOfImages = GetParamValue<5>(); - - nvcv::Tensor imgOut = nvcv::util::CreateTensor(numberOfImages, width, height, nvcv::FMT_U8); - nvcv::Tensor imgIn = nvcv::util::CreateTensor(numberOfImages, width, height, nvcv::FMT_U8); + int width = 
GetParamValue<0>(); + int height = GetParamValue<1>(); + int d = GetParamValue<2>(); + float sigmaColor = GetParamValue<3>(); + float sigmaSpace = GetParamValue<4>(); + int numberOfImages = GetParamValue<5>(); + std::vector fmts{nvcv::FMT_U8, nvcv::ImageFormat{NVCV_IMAGE_FORMAT_2U8}, nvcv::FMT_RGB8, + nvcv::FMT_RGBA8}; + for (nvcv::ImageFormat fmt : fmts) + { + nvcv::Tensor imgOut = nvcv::util::CreateTensor(numberOfImages, width, height, fmt); + nvcv::Tensor imgIn = nvcv::util::CreateTensor(numberOfImages, width, height, fmt); + const int channels = fmt.numChannels(); - auto inData = imgIn.exportData(); - auto outData = imgOut.exportData(); + auto inData = imgIn.exportData(); + auto outData = imgOut.exportData(); - ASSERT_NE(nullptr, inData); - ASSERT_NE(nullptr, outData); + ASSERT_NE(nullptr, inData); + ASSERT_NE(nullptr, outData); - auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*inData); - ASSERT_TRUE(inAccess); + auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*inData); + ASSERT_TRUE(inAccess); - auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*outData); - ASSERT_TRUE(outAccess); + auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*outData); + ASSERT_TRUE(outAccess); - int inSampleStride = inAccess->numRows() * inAccess->rowStride(); - int outSampleStride = outAccess->numRows() * outAccess->rowStride(); + int inSampleStride = inAccess->numRows() * inAccess->rowStride(); + int outSampleStride = outAccess->numRows() * outAccess->rowStride(); - int inBufSize = inSampleStride * inAccess->numSamples(); - int outBufSize = outSampleStride * outAccess->numSamples(); + int inBufSize = inSampleStride * inAccess->numSamples(); + int outBufSize = outSampleStride * outAccess->numSamples(); - std::vector vIn(inBufSize); - std::vector vOut(outBufSize); + std::vector vIn(inBufSize); + std::vector vOut(outBufSize); - std::vector inGold(inBufSize, 0); - std::vector outGold(outBufSize, 0); - for (int i = 0; i < inBufSize; i++) inGold[i] = i % 113; // Use prime number to prevent weird tiling patterns + std::vector inGold(inBufSize, 0); + std::vector outGold(outBufSize, 0); + for (int i = 0; i < inBufSize; i++) inGold[i] = i % 113; // Use prime number to prevent weird tiling patterns - EXPECT_EQ(cudaSuccess, cudaMemcpy(inData->basePtr(), inGold.data(), inBufSize, cudaMemcpyHostToDevice)); - CPUBilateralFilterTensor(inGold, outGold, inAccess->numCols(), inAccess->numRows(), inAccess->numSamples(), - inAccess->rowStride(), inSampleStride, d, sigmaColor, sigmaSpace); + EXPECT_EQ(cudaSuccess, cudaMemcpy(inData->basePtr(), inGold.data(), inBufSize, cudaMemcpyHostToDevice)); + CPUBilateralFilterTensor(inGold, outGold, inAccess->numCols(), inAccess->numRows(), inAccess->numSamples(), + inAccess->rowStride(), channels, inSampleStride, d, sigmaColor, sigmaSpace); - // run operator - cvcuda::BilateralFilter bilateralFilterOp; + // run operator + cvcuda::BilateralFilter bilateralFilterOp; - EXPECT_NO_THROW(bilateralFilterOp(stream, imgIn, imgOut, d, sigmaColor, sigmaSpace, NVCV_BORDER_CONSTANT)); + EXPECT_NO_THROW(bilateralFilterOp(stream, imgIn, imgOut, d, sigmaColor, sigmaSpace, NVCV_BORDER_CONSTANT)); - // check cdata - std::vector outTest(outBufSize); + // check cdata + std::vector outTest(outBufSize); - EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); - EXPECT_EQ(cudaSuccess, cudaMemcpy(outTest.data(), outData->basePtr(), outBufSize, cudaMemcpyDeviceToHost)); + EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + 
EXPECT_EQ(cudaSuccess, cudaMemcpy(outTest.data(), outData->basePtr(), outBufSize, cudaMemcpyDeviceToHost)); + ASSERT_TRUE(CompareTensors(outTest, outGold, inAccess->numCols(), inAccess->numRows(), inAccess->numSamples(), + inAccess->rowStride(), channels, inSampleStride, 0.9f)); + } EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); - ASSERT_TRUE(CompareTensors(outTest, outGold, inAccess->numCols(), inAccess->numRows(), inAccess->numSamples(), - inAccess->rowStride(), inSampleStride, 0.9f)); } TEST_P(OpBilateralFilter, BilateralFilter_VarShape) { cudaStream_t stream; EXPECT_EQ(cudaSuccess, cudaStreamCreate(&stream)); - int width = GetParamValue<0>(); - int height = GetParamValue<1>(); - int diameter = GetParamValue<2>(); - float sigmaColor = GetParamValue<3>(); - float sigmaSpace = GetParamValue<4>(); - int numberOfImages = GetParamValue<5>(); - nvcv::ImageFormat format{NVCV_IMAGE_FORMAT_U8}; - - // Create input varshape - std::default_random_engine rng; - std::uniform_int_distribution udistWidth(width * 0.8, width * 1.1); - std::uniform_int_distribution udistHeight(height * 0.8, height * 1.1); - - std::vector imgSrc; - - std::vector> srcVec(numberOfImages); - std::vector srcVecRowStride(numberOfImages); - std::vector srcVecRows(numberOfImages); - std::vector srcVecColumns(numberOfImages); - std::vector> goldVec(numberOfImages); - std::vector> dstVec(numberOfImages); - for (int i = 0; i < numberOfImages; ++i) + int width = GetParamValue<0>(); + int height = GetParamValue<1>(); + int diameter = GetParamValue<2>(); + float sigmaColor = GetParamValue<3>(); + float sigmaSpace = GetParamValue<4>(); + int numberOfImages = GetParamValue<5>(); + std::vector fmts{nvcv::FMT_U8, nvcv::ImageFormat{NVCV_IMAGE_FORMAT_2U8}, nvcv::FMT_RGB8, + nvcv::FMT_RGBA8}; + for (nvcv::ImageFormat fmt : fmts) { - imgSrc.emplace_back(nvcv::Size2D{udistWidth(rng), udistHeight(rng)}, format); - int srcRowStride = imgSrc[i].size().w * format.planePixelStrideBytes(0); - srcVecRowStride[i] = srcRowStride; - srcVecRows[i] = imgSrc[i].size().h; - srcVecColumns[i] = imgSrc[i].size().w; - std::uniform_int_distribution udist(0, 255); - - srcVec[i].resize(imgSrc[i].size().h * srcRowStride); - goldVec[i].resize(imgSrc[i].size().h * srcRowStride); - dstVec[i].resize(imgSrc[i].size().h * srcRowStride); - std::generate(srcVec[i].begin(), srcVec[i].end(), [&]() { return udist(rng); }); - std::generate(goldVec[i].begin(), goldVec[i].end(), [&]() { return 0; }); - std::generate(dstVec[i].begin(), dstVec[i].end(), [&]() { return 0; }); - auto imgData = imgSrc[i].exportData(); - ASSERT_NE(imgData, nvcv::NullOpt); - - // Copy input data to the GPU - ASSERT_EQ(cudaSuccess, - cudaMemcpy2DAsync(imgData->plane(0).basePtr, imgData->plane(0).rowStride, srcVec[i].data(), - srcRowStride, srcRowStride, imgSrc[i].size().h, cudaMemcpyHostToDevice, stream)); - } + // Create input varshape + std::default_random_engine rng; + std::uniform_int_distribution udistWidth(width * 0.8, width * 1.1); + std::uniform_int_distribution udistHeight(height * 0.8, height * 1.1); + + std::vector imgSrc; + + std::vector> srcVec(numberOfImages); + std::vector srcVecRowStride(numberOfImages); + std::vector srcVecRows(numberOfImages); + std::vector srcVecColumns(numberOfImages); + std::vector channelsVec(numberOfImages); + std::vector> goldVec(numberOfImages); + std::vector> dstVec(numberOfImages); + for (int i = 0; i < numberOfImages; ++i) + { + imgSrc.emplace_back(nvcv::Size2D{udistWidth(rng), udistHeight(rng)}, fmt); + int srcRowStride = imgSrc[i].size().w * 
fmt.planePixelStrideBytes(0); + srcVecRowStride[i] = srcRowStride; + srcVecRows[i] = imgSrc[i].size().h; + srcVecColumns[i] = imgSrc[i].size().w; + channelsVec[i] = fmt.numChannels(); + std::uniform_int_distribution udist(0, 255); + + srcVec[i].resize(imgSrc[i].size().h * srcRowStride); + goldVec[i].resize(imgSrc[i].size().h * srcRowStride); + dstVec[i].resize(imgSrc[i].size().h * srcRowStride); + std::generate(srcVec[i].begin(), srcVec[i].end(), [&]() { return udist(rng); }); + std::generate(goldVec[i].begin(), goldVec[i].end(), [&]() { return 0; }); + std::generate(dstVec[i].begin(), dstVec[i].end(), [&]() { return 0; }); + auto imgData = imgSrc[i].exportData(); + ASSERT_NE(imgData, nvcv::NullOpt); + + // Copy input data to the GPU + ASSERT_EQ(cudaSuccess, cudaMemcpy2DAsync(imgData->plane(0).basePtr, imgData->plane(0).rowStride, + srcVec[i].data(), srcRowStride, srcRowStride, imgSrc[i].size().h, + cudaMemcpyHostToDevice, stream)); + } - nvcv::ImageBatchVarShape batchSrc(numberOfImages); - batchSrc.pushBack(imgSrc.begin(), imgSrc.end()); + nvcv::ImageBatchVarShape batchSrc(numberOfImages); + batchSrc.pushBack(imgSrc.begin(), imgSrc.end()); - // Create output varshape - std::vector imgDst; - for (int i = 0; i < numberOfImages; ++i) - { - imgDst.emplace_back(imgSrc[i].size(), imgSrc[i].format()); - } - nvcv::ImageBatchVarShape batchDst(numberOfImages); - batchDst.pushBack(imgDst.begin(), imgDst.end()); + // Create output varshape + std::vector imgDst; + for (int i = 0; i < numberOfImages; ++i) + { + imgDst.emplace_back(imgSrc[i].size(), imgSrc[i].format()); + } + nvcv::ImageBatchVarShape batchDst(numberOfImages); + batchDst.pushBack(imgDst.begin(), imgDst.end()); - // Create diameter tensor - std::vector vDiameter(numberOfImages, diameter); - nvcv::Tensor diameterTensor({{numberOfImages}, "N"}, nvcv::TYPE_S32); - { - auto dev = diameterTensor.exportData(); - ASSERT_NE(dev, nullptr); + // Create diameter tensor + std::vector vDiameter(numberOfImages, diameter); + nvcv::Tensor diameterTensor({{numberOfImages}, "N"}, nvcv::TYPE_S32); + { + auto dev = diameterTensor.exportData(); + ASSERT_NE(dev, nullptr); - ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(dev->basePtr(), vDiameter.data(), vDiameter.size() * sizeof(int), - cudaMemcpyHostToDevice, stream)); - } + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(dev->basePtr(), vDiameter.data(), vDiameter.size() * sizeof(int), + cudaMemcpyHostToDevice, stream)); + } - // Create sigmaColor tensor - std::vector vSigmaColor(numberOfImages, sigmaColor); - nvcv::Tensor sigmaColorTensor({{numberOfImages}, "N"}, nvcv::TYPE_F32); - { - auto dev = sigmaColorTensor.exportData(); - ASSERT_NE(dev, nullptr); + // Create sigmaColor tensor + std::vector vSigmaColor(numberOfImages, sigmaColor); + nvcv::Tensor sigmaColorTensor({{numberOfImages}, "N"}, nvcv::TYPE_F32); + { + auto dev = sigmaColorTensor.exportData(); + ASSERT_NE(dev, nullptr); - ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(dev->basePtr(), vSigmaColor.data(), vSigmaColor.size() * sizeof(float), - cudaMemcpyHostToDevice, stream)); - } + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(dev->basePtr(), vSigmaColor.data(), + vSigmaColor.size() * sizeof(float), cudaMemcpyHostToDevice, stream)); + } - // Create sigmaSpace tensor - std::vector vSigmaSpace(numberOfImages, sigmaSpace); - nvcv::Tensor sigmaSpaceTensor({{numberOfImages}, "N"}, nvcv::TYPE_F32); - { - auto dev = sigmaSpaceTensor.exportData(); - ASSERT_NE(dev, nullptr); + // Create sigmaSpace tensor + std::vector vSigmaSpace(numberOfImages, sigmaSpace); + nvcv::Tensor 
sigmaSpaceTensor({{numberOfImages}, "N"}, nvcv::TYPE_F32); + { + auto dev = sigmaSpaceTensor.exportData(); + ASSERT_NE(dev, nullptr); - ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(dev->basePtr(), vSigmaSpace.data(), vSigmaSpace.size() * sizeof(float), - cudaMemcpyHostToDevice, stream)); - } + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(dev->basePtr(), vSigmaSpace.data(), + vSigmaSpace.size() * sizeof(float), cudaMemcpyHostToDevice, stream)); + } - // Create gold data - CPUBilateralFilterVarShape(srcVec, goldVec, srcVecColumns, srcVecRows, srcVecRowStride, vDiameter, vSigmaColor, - vSigmaSpace); + // Create gold data + CPUBilateralFilterVarShape(srcVec, goldVec, srcVecColumns, srcVecRows, srcVecRowStride, channelsVec, vDiameter, + vSigmaColor, vSigmaSpace); - // Run operator - cvcuda::BilateralFilter bilateralFilterOp; - EXPECT_NO_THROW(bilateralFilterOp(stream, batchSrc, batchDst, diameterTensor, sigmaColorTensor, sigmaSpaceTensor, - NVCV_BORDER_CONSTANT)); + // Run operator + cvcuda::BilateralFilter bilateralFilterOp; + EXPECT_NO_THROW(bilateralFilterOp(stream, batchSrc, batchDst, diameterTensor, sigmaColorTensor, + sigmaSpaceTensor, NVCV_BORDER_CONSTANT)); - // Retrieve data from GPU - for (int i = 0; i < numberOfImages; i++) - { - auto imgData = imgDst[i].exportData(); - ASSERT_NE(imgData, nvcv::NullOpt); + // Retrieve data from GPU + for (int i = 0; i < numberOfImages; i++) + { + auto imgData = imgDst[i].exportData(); + ASSERT_NE(imgData, nvcv::NullOpt); - // Copy input data to the GPU - ASSERT_EQ(cudaSuccess, cudaMemcpy2DAsync(dstVec[i].data(), srcVecRowStride[i], imgData->plane(0).basePtr, - imgData->plane(0).rowStride, srcVecRowStride[i], imgDst[i].size().h, - cudaMemcpyDeviceToHost, stream)); + // Copy input data to the GPU + ASSERT_EQ(cudaSuccess, cudaMemcpy2DAsync(dstVec[i].data(), srcVecRowStride[i], imgData->plane(0).basePtr, + imgData->plane(0).rowStride, srcVecRowStride[i], + imgDst[i].size().h, cudaMemcpyDeviceToHost, stream)); + } + EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + + // Compare data + ASSERT_TRUE(CompareVarShapes(dstVec, goldVec, srcVecColumns, srcVecRows, srcVecRowStride, channelsVec, 0.9f)); } - EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); - - // Compare data - ASSERT_TRUE(CompareVarShapes(dstVec, goldVec, srcVecColumns, srcVecRows, srcVecRowStride, 0.9f)); } + +#undef NVCV_IMAGE_FORMAT_2U8 diff --git a/tests/cvcuda/system/TestOpCvtColor.cpp b/tests/cvcuda/system/TestOpCvtColor.cpp index bb6bca005..833cfa92a 100644 --- a/tests/cvcuda/system/TestOpCvtColor.cpp +++ b/tests/cvcuda/system/TestOpCvtColor.cpp @@ -171,7 +171,7 @@ test::ValueList()}; nvcv::ImageFormat dstFormat{GetParamValue<4>()}; + // Waive the formats that have subsampled planes + if (srcFormat.chromaSubsampling() != nvcv::ChromaSubsampling::CSS_444 + || dstFormat.chromaSubsampling() != nvcv::ChromaSubsampling::CSS_444) + { + GTEST_SKIP() << "Waived the formats that have subsampled planes for OpCvtColor varshape test"; + } + NVCVDataType nvcvDataType; ASSERT_EQ(NVCV_SUCCESS, nvcvImageFormatGetPlaneDataType(srcFormat, 0, &nvcvDataType)); diff --git a/tests/cvcuda/system/TestOpErase.cpp b/tests/cvcuda/system/TestOpErase.cpp index 905a6f0d7..a7126dcbf 100644 --- a/tests/cvcuda/system/TestOpErase.cpp +++ b/tests/cvcuda/system/TestOpErase.cpp @@ -27,11 +27,19 @@ #include -NVCV_TEST_SUITE_P(OpErase, nvcv::test::ValueList{{1}, {2}}); +NVCV_TEST_SUITE_P(OpErase, nvcv::test::ValueList{ + {1, false}, + {2, false}, + {1, true}, + {2, true} +}); 
TEST_P(OpErase, correct_output) { - int N = GetParam(); + int N = GetParamValue<0>(); + bool random = GetParamValue<1>(); + int max_num_erasing_area = 2; + unsigned int seed = 0; cudaStream_t stream; EXPECT_EQ(cudaSuccess, cudaStreamCreate(&stream)); @@ -115,9 +123,6 @@ TEST_P(OpErase, correct_output) cudaMemcpyHostToDevice, stream)); // Call operator - unsigned int seed = 0; - bool random = false; - int max_num_erasing_area = 2; cvcuda::Erase eraseOp(max_num_erasing_area); EXPECT_NO_THROW(eraseOp(stream, imgIn, imgOut, anchor, erasing, values, imgIdx, random, seed)); @@ -126,17 +131,21 @@ TEST_P(OpErase, correct_output) std::vector test(outBufferSize, 0xA); //Check data + if (!random) + { + EXPECT_EQ(cudaSuccess, cudaMemcpy(test.data(), outData->basePtr(), outBufferSize, cudaMemcpyDeviceToHost)); + + EXPECT_EQ(test[0], 1); + EXPECT_EQ(test[9], 1); + EXPECT_EQ(test[10], 0); + EXPECT_EQ(test[9 * 640], 1); + EXPECT_EQ(test[9 * 640 + 9], 1); + EXPECT_EQ(test[9 * 640 + 10], 0); + EXPECT_EQ(test[10 * 640], 0); + EXPECT_EQ(test[10 * 640 + 10], 1); + } EXPECT_EQ(cudaSuccess, cudaMemcpy(test.data(), outData->basePtr(), outBufferSize, cudaMemcpyDeviceToHost)); - EXPECT_EQ(test[0], 1); - EXPECT_EQ(test[9], 1); - EXPECT_EQ(test[10], 0); - EXPECT_EQ(test[9 * 640], 1); - EXPECT_EQ(test[9 * 640 + 9], 1); - EXPECT_EQ(test[9 * 640 + 10], 0); - EXPECT_EQ(test[10 * 640], 0); - EXPECT_EQ(test[10 * 640 + 10], 1); - EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); } @@ -232,17 +241,21 @@ TEST(OpErase, OpErase_Varshape) std::vector test(dstHeight * dstRowStride, 0xFF); // Copy output data to Host - ASSERT_EQ(cudaSuccess, cudaMemcpy2D(test.data(), dstRowStride, dstData->plane(0).basePtr, - dstData->plane(0).rowStride, dstRowStride, dstHeight, cudaMemcpyDeviceToHost)); - - EXPECT_EQ(test[0], 1); - EXPECT_EQ(test[9], 1); - EXPECT_EQ(test[10], 0); - EXPECT_EQ(test[9 * 640], 1); - EXPECT_EQ(test[9 * 640 + 9], 1); - EXPECT_EQ(test[9 * 640 + 10], 0); - EXPECT_EQ(test[10 * 640], 0); - EXPECT_EQ(test[10 * 640 + 10], 1); + if (!random) + { + ASSERT_EQ(cudaSuccess, + cudaMemcpy2D(test.data(), dstRowStride, dstData->plane(0).basePtr, dstData->plane(0).rowStride, + dstRowStride, dstHeight, cudaMemcpyDeviceToHost)); + + EXPECT_EQ(test[0], 1); + EXPECT_EQ(test[9], 1); + EXPECT_EQ(test[10], 0); + EXPECT_EQ(test[9 * 640], 1); + EXPECT_EQ(test[9 * 640 + 9], 1); + EXPECT_EQ(test[9 * 640 + 10], 0); + EXPECT_EQ(test[10 * 640], 0); + EXPECT_EQ(test[10 * 640 + 10], 1); + } EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); } diff --git a/tests/cvcuda/system/TestOpJointBilateralFilter.cpp b/tests/cvcuda/system/TestOpJointBilateralFilter.cpp index 44b0a4eea..2be0529bf 100644 --- a/tests/cvcuda/system/TestOpJointBilateralFilter.cpp +++ b/tests/cvcuda/system/TestOpJointBilateralFilter.cpp @@ -29,6 +29,8 @@ #include #include +#define NVCV_IMAGE_FORMAT_2U8 NVCV_DETAIL_MAKE_NONCOLOR_FMT1(PL, UNSIGNED, XY00, ASSOCIATED, X8_Y8) + namespace gt = ::testing; namespace test = nvcv::test; @@ -37,21 +39,26 @@ static uint32_t saturate_cast(float n) return static_cast(std::min(255.0f, std::round(n))); } -static bool CompareImages(uint8_t *pTest, uint8_t *pGold, size_t columns, size_t rows, size_t rowStride, float delta) +static bool CompareImages(uint8_t *pTest, uint8_t *pGold, size_t columns, size_t rows, size_t rowStride, + size_t channels, float delta) { for (size_t j = 0; j < rows; j++) { for (size_t k = 0; k < columns; k++) { - size_t offset = j * rowStride + k; - float diff = std::abs(static_cast(pTest[offset]) - static_cast(pGold[offset])); 
- if (diff > delta) + for (size_t c = 0; c < channels; ++c) { - std::cout << " o = " << offset << " j = " << j << " k = " << k << " rowS = " << rowStride << std::endl; - std::cout << " test = " << static_cast(pTest[offset]) - << " gold = " << static_cast(pGold[offset]) << std::endl; + size_t offset = j * rowStride + k * channels + c; + float diff = std::abs(static_cast(pTest[offset]) - static_cast(pGold[offset])); + if (diff > delta) + { + std::cout << " o = " << offset << " j = " << j << " k = " << k << " rowS = " << rowStride + << std::endl; + std::cout << " test = " << static_cast(pTest[offset]) + << " gold = " << static_cast(pGold[offset]) << std::endl; - return false; + return false; + } } } } @@ -59,13 +66,13 @@ static bool CompareImages(uint8_t *pTest, uint8_t *pGold, size_t columns, size_t } static bool CompareTensors(std::vector &vTest, std::vector &vGold, size_t columns, size_t rows, - size_t batch, size_t rowStride, size_t sampleStride, float delta) + size_t batch, size_t rowStride, size_t channels, size_t sampleStride, float delta) { for (size_t i = 0; i < batch; i++) { uint8_t *pTest = vTest.data() + i * sampleStride; uint8_t *pGold = vGold.data() + i * sampleStride; - if (!CompareImages(pTest, pGold, columns, rows, rowStride, delta)) + if (!CompareImages(pTest, pGold, columns, rows, rowStride, channels, delta)) return false; } return true; @@ -73,11 +80,11 @@ static bool CompareTensors(std::vector &vTest, std::vector &vG static bool CompareVarShapes(std::vector> &vTest, std::vector> &vGold, std::vector &vColumns, std::vector &vRows, std::vector &vRowStride, - float delta) + std::vector &vChannels, float delta) { for (size_t i = 0; i < vTest.size(); i++) { - if (!CompareImages(vTest[i].data(), vGold[i].data(), vColumns[i], vRows[i], vRowStride[i], delta)) + if (!CompareImages(vTest[i].data(), vGold[i].data(), vColumns[i], vRows[i], vRowStride[i], vChannels[i], delta)) { return false; } @@ -86,16 +93,21 @@ static bool CompareVarShapes(std::vector> &vTest, std::vect } static void CPUJointBilateralFilter(uint8_t *pIn, uint8_t *pInColor, uint8_t *pOut, int columns, int rows, - int rowStride, int radius, float colorCoefficient, float spaceCoefficient) + int rowStride, int channels, int radius, float colorCoefficient, + float spaceCoefficient) { float radiusSquared = radius * radius; for (int j = 0; j < rows; j++) { for (int k = 0; k < columns; k++) { - float numerator = 0; - float denominator = 0; - float centerColor = static_cast(pInColor[j * rowStride + k]); + std::vector numerators(channels, 0.0f); + float denominator = 0; + std::vector centerColors{static_cast(pInColor[j * rowStride + k * channels]), + static_cast(pInColor[j * rowStride + k * channels + 1]), + static_cast(pInColor[j * rowStride + k * channels + 2]), + static_cast(pInColor[j * rowStride + k * channels + 3])}; + for (int y = j - radius; y <= j + radius; y++) { for (int x = k - radius; x <= k + radius; x++) @@ -103,30 +115,50 @@ static void CPUJointBilateralFilter(uint8_t *pIn, uint8_t *pInColor, uint8_t *pO float distanceSquared = (k - x) * (k - x) + (j - y) * (j - y); if (distanceSquared <= radiusSquared) { - float pixel = ((x >= 0) && (x < columns) && (y >= 0) && (y < rows)) - ? static_cast(pIn[y * rowStride + x]) - : 0.0f; - float pixelColor = ((x >= 0) && (x < columns) && (y >= 0) && (y < rows)) - ? 
static_cast(pInColor[y * rowStride + x]) - : 0.0f; - float e_space = distanceSquared * spaceCoefficient; - float one_norm_size = std::abs(pixelColor - centerColor); - float e_color = one_norm_size * one_norm_size * colorCoefficient; - float weight = std::exp(e_space + e_color); + std::vector pixels; + std::vector pixelColors; + for (auto c = 0; c < channels; ++c) + { + float pixel = ((x >= 0) && (x < columns) && (y >= 0) && (y < rows)) + ? static_cast(pIn[y * rowStride + x * channels + c]) + : 0.0f; + float pixelColor = ((x >= 0) && (x < columns) && (y >= 0) && (y < rows)) + ? static_cast(pInColor[y * rowStride + x * channels + c]) + : 0.0f; + pixels.emplace_back(pixel); + pixelColors.emplace_back(pixelColor); + } + + float e_space = distanceSquared * spaceCoefficient; + float e_color = 0.0f; + for (auto c = 0; c < channels; ++c) + { + e_color += std::abs(pixelColors[c] - centerColors[c]); + } + e_color = e_color * e_color * colorCoefficient; + + float weight = std::exp(e_space + e_color); denominator += weight; - numerator += weight * pixel; + for (auto c = 0; c < channels; ++c) + { + numerators[c] += weight * pixels[c]; + } } } } - denominator = (denominator != 0) ? denominator : 1.0f; - pOut[j * rowStride + k] = saturate_cast(numerator / denominator); + denominator = (denominator != 0) ? denominator : 1.0f; + for (auto c = 0; c < channels; ++c) + { + pOut[j * rowStride + k * channels + c] = saturate_cast(numerators[c] / denominator); + } } } } static void CPUJointBilateralFilterTensor(std::vector &vIn, std::vector &vInColor, std::vector &vOut, int columns, int rows, int batch, int rowStride, - int sampleStride, int diameter, float sigmaColor, float sigmaSpace) + int channels, int sampleStride, int diameter, float sigmaColor, + float sigmaSpace) { int radius = diameter / 2; float spaceCoefficient = -1 / (2 * sigmaSpace * sigmaSpace); @@ -136,7 +168,7 @@ static void CPUJointBilateralFilterTensor(std::vector &vIn, std::vector uint8_t *pIn = vIn.data() + i * sampleStride; uint8_t *pInColor = vInColor.data() + i * sampleStride; uint8_t *pOut = vOut.data() + i * sampleStride; - CPUJointBilateralFilter(pIn, pInColor, pOut, columns, rows, rowStride, radius, colorCoefficient, + CPUJointBilateralFilter(pIn, pInColor, pOut, columns, rows, rowStride, channels, radius, colorCoefficient, spaceCoefficient); } } @@ -145,8 +177,8 @@ static void CPUJointBilateralFilterVarShape(std::vector> &v std::vector> &vInColor, std::vector> &vOut, std::vector &vColumns, std::vector &vRows, std::vector &vRowStride, - std::vector &vDiameter, std::vector &vSigmaColor, - std::vector &vSigmaSpace) + std::vector &vChannels, std::vector &vDiameter, + std::vector &vSigmaColor, std::vector &vSigmaSpace) { for (size_t i = 0; i < vIn.size(); i++) { @@ -154,7 +186,7 @@ static void CPUJointBilateralFilterVarShape(std::vector> &v float spaceCoefficient = -1 / (2 * vSigmaSpace[i] * vSigmaSpace[i]); float colorCoefficient = -1 / (2 * vSigmaColor[i] * vSigmaColor[i]); CPUJointBilateralFilter(vIn[i].data(), vInColor[i].data(), vOut[i].data(), vColumns[i], vRows[i], vRowStride[i], - radius, colorCoefficient, spaceCoefficient); + vChannels[i], radius, colorCoefficient, spaceCoefficient); } } @@ -185,208 +217,222 @@ TEST_P(OpJointBilateralFilter, JointBilateralFilter_packed) { cudaStream_t stream; EXPECT_EQ(cudaSuccess, cudaStreamCreate(&stream)); - int width = GetParamValue<0>(); - int height = GetParamValue<1>(); - int d = GetParamValue<2>(); - float sigmaColor = GetParamValue<3>(); - float sigmaSpace = GetParamValue<4>(); - int 
numberOfImages = GetParamValue<5>(); - - nvcv::Tensor imgOut = nvcv::util::CreateTensor(numberOfImages, width, height, nvcv::FMT_U8); - nvcv::Tensor imgIn = nvcv::util::CreateTensor(numberOfImages, width, height, nvcv::FMT_U8); - nvcv::Tensor imgInColor = nvcv::util::CreateTensor(numberOfImages, width, height, nvcv::FMT_U8); - - auto inData = imgIn.exportData(); - auto inColorData = imgInColor.exportData(); - auto outData = imgOut.exportData(); - - ASSERT_NE(nullptr, inData); - ASSERT_NE(nullptr, inColorData); - ASSERT_NE(nullptr, outData); - - auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*inData); - ASSERT_TRUE(inAccess); - - auto inColorAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*inColorData); - ASSERT_TRUE(inColorAccess); - - auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*outData); - ASSERT_TRUE(outAccess); - - int inSampleStride = inAccess->numRows() * inAccess->rowStride(); - int inColorSampleStride = inColorAccess->numRows() * inColorAccess->rowStride(); - int outSampleStride = outAccess->numRows() * outAccess->rowStride(); - - int inBufSize = inSampleStride * inAccess->numSamples(); - int inColorBufSize = inColorSampleStride * inColorAccess->numSamples(); - int outBufSize = outSampleStride * outAccess->numSamples(); - - std::vector vIn(inBufSize); - std::vector vInColor(inColorBufSize); - std::vector vOut(outBufSize); - - std::vector inGold(inBufSize, 0); - std::vector inColorGold(inColorBufSize, 0); - std::vector outGold(outBufSize, 0); - for (int i = 0; i < inBufSize; i++) inGold[i] = i % 113; // Use prime number to prevent weird tiling patterns - for (int i = 0; i < inColorBufSize; i++) - inColorGold[i] = i % 109; // Use prime number to prevent weird tiling patterns - EXPECT_EQ(cudaSuccess, cudaMemcpy(inData->basePtr(), inGold.data(), inBufSize, cudaMemcpyHostToDevice)); - EXPECT_EQ(cudaSuccess, - cudaMemcpy(inColorData->basePtr(), inColorGold.data(), inColorBufSize, cudaMemcpyHostToDevice)); - CPUJointBilateralFilterTensor(inGold, inColorGold, outGold, inAccess->numCols(), inAccess->numRows(), - inAccess->numSamples(), inAccess->rowStride(), inSampleStride, d, sigmaColor, - sigmaSpace); - - // run operator - cvcuda::JointBilateralFilter jointBilateralFilterOp; - - EXPECT_NO_THROW( - jointBilateralFilterOp(stream, imgIn, imgInColor, imgOut, d, sigmaColor, sigmaSpace, NVCV_BORDER_CONSTANT)); - - // check cdata - std::vector outTest(outBufSize); - - EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); - EXPECT_EQ(cudaSuccess, cudaMemcpy(outTest.data(), outData->basePtr(), outBufSize, cudaMemcpyDeviceToHost)); + int width = GetParamValue<0>(); + int height = GetParamValue<1>(); + int d = GetParamValue<2>(); + float sigmaColor = GetParamValue<3>(); + float sigmaSpace = GetParamValue<4>(); + int numberOfImages = GetParamValue<5>(); + std::vector fmts{nvcv::FMT_U8, nvcv::ImageFormat{NVCV_IMAGE_FORMAT_2U8}, nvcv::FMT_RGB8, + nvcv::FMT_RGBA8}; + + for (nvcv::ImageFormat fmt : fmts) + { + nvcv::Tensor imgOut = nvcv::util::CreateTensor(numberOfImages, width, height, fmt); + nvcv::Tensor imgIn = nvcv::util::CreateTensor(numberOfImages, width, height, fmt); + nvcv::Tensor imgInColor = nvcv::util::CreateTensor(numberOfImages, width, height, fmt); + const int channels = fmt.numChannels(); + + auto inData = imgIn.exportData(); + auto inColorData = imgInColor.exportData(); + auto outData = imgOut.exportData(); + + ASSERT_NE(nullptr, inData); + ASSERT_NE(nullptr, inColorData); + ASSERT_NE(nullptr, outData); + + auto inAccess = 
nvcv::TensorDataAccessStridedImagePlanar::Create(*inData); + ASSERT_TRUE(inAccess); + + auto inColorAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*inColorData); + ASSERT_TRUE(inColorAccess); + + auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*outData); + ASSERT_TRUE(outAccess); + + int inSampleStride = inAccess->numRows() * inAccess->rowStride(); + int inColorSampleStride = inColorAccess->numRows() * inColorAccess->rowStride(); + int outSampleStride = outAccess->numRows() * outAccess->rowStride(); + + int inBufSize = inSampleStride * inAccess->numSamples(); + int inColorBufSize = inColorSampleStride * inColorAccess->numSamples(); + int outBufSize = outSampleStride * outAccess->numSamples(); + + std::vector vIn(inBufSize); + std::vector vInColor(inColorBufSize); + std::vector vOut(outBufSize); + + std::vector inGold(inBufSize, 0); + std::vector inColorGold(inColorBufSize, 0); + std::vector outGold(outBufSize, 0); + for (int i = 0; i < inBufSize; i++) inGold[i] = i % 113; // Use prime number to prevent weird tiling patterns + for (int i = 0; i < inColorBufSize; i++) + inColorGold[i] = i % 109; // Use prime number to prevent weird tiling patterns + EXPECT_EQ(cudaSuccess, cudaMemcpy(inData->basePtr(), inGold.data(), inBufSize, cudaMemcpyHostToDevice)); + EXPECT_EQ(cudaSuccess, + cudaMemcpy(inColorData->basePtr(), inColorGold.data(), inColorBufSize, cudaMemcpyHostToDevice)); + CPUJointBilateralFilterTensor(inGold, inColorGold, outGold, inAccess->numCols(), inAccess->numRows(), + inAccess->numSamples(), inAccess->rowStride(), channels, inSampleStride, d, + sigmaColor, sigmaSpace); + + // run operator + cvcuda::JointBilateralFilter jointBilateralFilterOp; + + EXPECT_NO_THROW( + jointBilateralFilterOp(stream, imgIn, imgInColor, imgOut, d, sigmaColor, sigmaSpace, NVCV_BORDER_CONSTANT)); + + // check cdata + std::vector outTest(outBufSize); + + EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + EXPECT_EQ(cudaSuccess, cudaMemcpy(outTest.data(), outData->basePtr(), outBufSize, cudaMemcpyDeviceToHost)); + ASSERT_TRUE(CompareTensors(outTest, outGold, inAccess->numCols(), inAccess->numRows(), inAccess->numSamples(), + inAccess->rowStride(), channels, inSampleStride, 0.9f)); + } EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); - ASSERT_TRUE(CompareTensors(outTest, outGold, inAccess->numCols(), inAccess->numRows(), inAccess->numSamples(), - inAccess->rowStride(), inSampleStride, 0.9f)); } TEST_P(OpJointBilateralFilter, JointBilateralFilter_VarShape) { cudaStream_t stream; EXPECT_EQ(cudaSuccess, cudaStreamCreate(&stream)); - int width = GetParamValue<0>(); - int height = GetParamValue<1>(); - int diameter = GetParamValue<2>(); - float sigmaColor = GetParamValue<3>(); - float sigmaSpace = GetParamValue<4>(); - int numberOfImages = GetParamValue<5>(); - nvcv::ImageFormat format{NVCV_IMAGE_FORMAT_U8}; - - // Create input varshape - std::default_random_engine rng; - std::uniform_int_distribution udistWidth(width * 0.8, width * 1.1); - std::uniform_int_distribution udistHeight(height * 0.8, height * 1.1); - - std::vector imgSrc; - std::vector imgSrcColor; - - std::vector> srcVec(numberOfImages); - std::vector> srcColorVec(numberOfImages); - std::vector srcVecRowStride(numberOfImages); - std::vector srcVecRows(numberOfImages); - std::vector srcVecColumns(numberOfImages); - std::vector> goldVec(numberOfImages); - std::vector> dstVec(numberOfImages); - for (int i = 0; i < numberOfImages; ++i) + int width = GetParamValue<0>(); + int height = GetParamValue<1>(); + int diameter = 
GetParamValue<2>(); + float sigmaColor = GetParamValue<3>(); + float sigmaSpace = GetParamValue<4>(); + int numberOfImages = GetParamValue<5>(); + std::vector fmts{nvcv::FMT_U8, nvcv::ImageFormat{NVCV_IMAGE_FORMAT_2U8}, nvcv::FMT_RGB8, + nvcv::FMT_RGBA8}; + + for (nvcv::ImageFormat fmt : fmts) { - int w = udistWidth(rng); - int h = udistHeight(rng); - nvcv::Size2D sz(w, h); - imgSrc.emplace_back(sz, format); - imgSrcColor.emplace_back(sz, format); - int srcRowStride = imgSrc[i].size().w * format.planePixelStrideBytes(0); - srcVecRowStride[i] = srcRowStride; - srcVecRows[i] = imgSrc[i].size().h; - srcVecColumns[i] = imgSrc[i].size().w; - std::uniform_int_distribution udist(0, 255); - - srcVec[i].resize(imgSrc[i].size().h * srcRowStride); - srcColorVec[i].resize(imgSrcColor[i].size().h * srcRowStride); - goldVec[i].resize(imgSrc[i].size().h * srcRowStride); - dstVec[i].resize(imgSrc[i].size().h * srcRowStride); - std::generate(srcVec[i].begin(), srcVec[i].end(), [&]() { return udist(rng); }); - std::generate(srcColorVec[i].begin(), srcColorVec[i].end(), [&]() { return udist(rng); }); - std::generate(goldVec[i].begin(), goldVec[i].end(), [&]() { return 0; }); - std::generate(dstVec[i].begin(), dstVec[i].end(), [&]() { return 0; }); - auto imgData = imgSrc[i].exportData(); - ASSERT_NE(imgData, nvcv::NullOpt); - auto imgColorData = imgSrcColor[i].exportData(); - ASSERT_NE(imgColorData, nvcv::NullOpt); - - // Copy input data to the GPU - ASSERT_EQ(cudaSuccess, - cudaMemcpy2DAsync(imgData->plane(0).basePtr, imgData->plane(0).rowStride, srcVec[i].data(), - srcRowStride, srcRowStride, imgSrc[i].size().h, cudaMemcpyHostToDevice, stream)); - - ASSERT_EQ(cudaSuccess, cudaMemcpy2DAsync(imgColorData->plane(0).basePtr, imgColorData->plane(0).rowStride, - srcColorVec[i].data(), srcRowStride, srcRowStride, - imgSrcColor[i].size().h, cudaMemcpyHostToDevice, stream)); - } + // Create input varshape + std::default_random_engine rng; + std::uniform_int_distribution udistWidth(width * 0.8, width * 1.1); + std::uniform_int_distribution udistHeight(height * 0.8, height * 1.1); + + std::vector imgSrc; + std::vector imgSrcColor; + + std::vector> srcVec(numberOfImages); + std::vector> srcColorVec(numberOfImages); + std::vector srcVecRowStride(numberOfImages); + std::vector srcVecRows(numberOfImages); + std::vector srcVecColumns(numberOfImages); + std::vector channelsVec(numberOfImages); + std::vector> goldVec(numberOfImages); + std::vector> dstVec(numberOfImages); + for (int i = 0; i < numberOfImages; ++i) + { + int w = udistWidth(rng); + int h = udistHeight(rng); + nvcv::Size2D sz(w, h); + imgSrc.emplace_back(sz, fmt); + imgSrcColor.emplace_back(sz, fmt); + int srcRowStride = imgSrc[i].size().w * fmt.planePixelStrideBytes(0); + srcVecRowStride[i] = srcRowStride; + srcVecRows[i] = imgSrc[i].size().h; + srcVecColumns[i] = imgSrc[i].size().w; + channelsVec[i] = fmt.numChannels(); + std::uniform_int_distribution udist(0, 255); + + srcVec[i].resize(imgSrc[i].size().h * srcRowStride); + srcColorVec[i].resize(imgSrcColor[i].size().h * srcRowStride); + goldVec[i].resize(imgSrc[i].size().h * srcRowStride); + dstVec[i].resize(imgSrc[i].size().h * srcRowStride); + std::generate(srcVec[i].begin(), srcVec[i].end(), [&]() { return udist(rng); }); + std::generate(srcColorVec[i].begin(), srcColorVec[i].end(), [&]() { return udist(rng); }); + std::generate(goldVec[i].begin(), goldVec[i].end(), [&]() { return 0; }); + std::generate(dstVec[i].begin(), dstVec[i].end(), [&]() { return 0; }); + auto imgData = imgSrc[i].exportData(); + 
ASSERT_NE(imgData, nvcv::NullOpt); + auto imgColorData = imgSrcColor[i].exportData(); + ASSERT_NE(imgColorData, nvcv::NullOpt); + + // Copy input data to the GPU + ASSERT_EQ(cudaSuccess, cudaMemcpy2DAsync(imgData->plane(0).basePtr, imgData->plane(0).rowStride, + srcVec[i].data(), srcRowStride, srcRowStride, imgSrc[i].size().h, + cudaMemcpyHostToDevice, stream)); + + ASSERT_EQ(cudaSuccess, cudaMemcpy2DAsync(imgColorData->plane(0).basePtr, imgColorData->plane(0).rowStride, + srcColorVec[i].data(), srcRowStride, srcRowStride, + imgSrcColor[i].size().h, cudaMemcpyHostToDevice, stream)); + } - nvcv::ImageBatchVarShape batchSrc(numberOfImages); - batchSrc.pushBack(imgSrc.begin(), imgSrc.end()); - nvcv::ImageBatchVarShape batchSrcColor(numberOfImages); - batchSrcColor.pushBack(imgSrcColor.begin(), imgSrcColor.end()); + nvcv::ImageBatchVarShape batchSrc(numberOfImages); + batchSrc.pushBack(imgSrc.begin(), imgSrc.end()); + nvcv::ImageBatchVarShape batchSrcColor(numberOfImages); + batchSrcColor.pushBack(imgSrcColor.begin(), imgSrcColor.end()); - // Create output varshape - std::vector imgDst; - for (int i = 0; i < numberOfImages; ++i) - { - imgDst.emplace_back(imgSrc[i].size(), imgSrc[i].format()); - } - nvcv::ImageBatchVarShape batchDst(numberOfImages); - batchDst.pushBack(imgDst.begin(), imgDst.end()); + // Create output varshape + std::vector imgDst; + for (int i = 0; i < numberOfImages; ++i) + { + imgDst.emplace_back(imgSrc[i].size(), imgSrc[i].format()); + } + nvcv::ImageBatchVarShape batchDst(numberOfImages); + batchDst.pushBack(imgDst.begin(), imgDst.end()); - // Create diameter tensor - std::vector vDiameter(numberOfImages, diameter); - nvcv::Tensor diameterTensor({{numberOfImages}, "N"}, nvcv::TYPE_S32); - { - auto dev = diameterTensor.exportData(); - ASSERT_NE(dev, nullptr); + // Create diameter tensor + std::vector vDiameter(numberOfImages, diameter); + nvcv::Tensor diameterTensor({{numberOfImages}, "N"}, nvcv::TYPE_S32); + { + auto dev = diameterTensor.exportData(); + ASSERT_NE(dev, nullptr); - ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(dev->basePtr(), vDiameter.data(), vDiameter.size() * sizeof(int), - cudaMemcpyHostToDevice, stream)); - } + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(dev->basePtr(), vDiameter.data(), vDiameter.size() * sizeof(int), + cudaMemcpyHostToDevice, stream)); + } - // Create sigmaColor tensor - std::vector vSigmaColor(numberOfImages, sigmaColor); - nvcv::Tensor sigmaColorTensor({{numberOfImages}, "N"}, nvcv::TYPE_F32); - { - auto dev = sigmaColorTensor.exportData(); - ASSERT_NE(dev, nullptr); + // Create sigmaColor tensor + std::vector vSigmaColor(numberOfImages, sigmaColor); + nvcv::Tensor sigmaColorTensor({{numberOfImages}, "N"}, nvcv::TYPE_F32); + { + auto dev = sigmaColorTensor.exportData(); + ASSERT_NE(dev, nullptr); - ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(dev->basePtr(), vSigmaColor.data(), vSigmaColor.size() * sizeof(float), - cudaMemcpyHostToDevice, stream)); - } + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(dev->basePtr(), vSigmaColor.data(), + vSigmaColor.size() * sizeof(float), cudaMemcpyHostToDevice, stream)); + } - // Create sigmaSpace tensor - std::vector vSigmaSpace(numberOfImages, sigmaSpace); - nvcv::Tensor sigmaSpaceTensor({{numberOfImages}, "N"}, nvcv::TYPE_F32); - { - auto dev = sigmaSpaceTensor.exportData(); - ASSERT_NE(dev, nullptr); + // Create sigmaSpace tensor + std::vector vSigmaSpace(numberOfImages, sigmaSpace); + nvcv::Tensor sigmaSpaceTensor({{numberOfImages}, "N"}, nvcv::TYPE_F32); + { + auto dev = sigmaSpaceTensor.exportData(); + 
ASSERT_NE(dev, nullptr); - ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(dev->basePtr(), vSigmaSpace.data(), vSigmaSpace.size() * sizeof(float), - cudaMemcpyHostToDevice, stream)); - } + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(dev->basePtr(), vSigmaSpace.data(), + vSigmaSpace.size() * sizeof(float), cudaMemcpyHostToDevice, stream)); + } - // Create gold data - CPUJointBilateralFilterVarShape(srcVec, srcColorVec, goldVec, srcVecColumns, srcVecRows, srcVecRowStride, vDiameter, - vSigmaColor, vSigmaSpace); + // Create gold data + CPUJointBilateralFilterVarShape(srcVec, srcColorVec, goldVec, srcVecColumns, srcVecRows, srcVecRowStride, + channelsVec, vDiameter, vSigmaColor, vSigmaSpace); - // Run operator - cvcuda::JointBilateralFilter jointBilateralFilterOp; - EXPECT_NO_THROW(jointBilateralFilterOp(stream, batchSrc, batchSrcColor, batchDst, diameterTensor, sigmaColorTensor, - sigmaSpaceTensor, NVCV_BORDER_CONSTANT)); + // Run operator + cvcuda::JointBilateralFilter jointBilateralFilterOp; + EXPECT_NO_THROW(jointBilateralFilterOp(stream, batchSrc, batchSrcColor, batchDst, diameterTensor, + sigmaColorTensor, sigmaSpaceTensor, NVCV_BORDER_CONSTANT)); - // Retrieve data from GPU - for (int i = 0; i < numberOfImages; i++) - { - auto imgData = imgDst[i].exportData(); - ASSERT_NE(imgData, nvcv::NullOpt); + // Retrieve data from GPU + for (int i = 0; i < numberOfImages; i++) + { + auto imgData = imgDst[i].exportData(); + ASSERT_NE(imgData, nvcv::NullOpt); - // Copy input data to the GPU - ASSERT_EQ(cudaSuccess, cudaMemcpy2DAsync(dstVec[i].data(), srcVecRowStride[i], imgData->plane(0).basePtr, - imgData->plane(0).rowStride, srcVecRowStride[i], imgDst[i].size().h, - cudaMemcpyDeviceToHost, stream)); + // Copy input data to the GPU + ASSERT_EQ(cudaSuccess, cudaMemcpy2DAsync(dstVec[i].data(), srcVecRowStride[i], imgData->plane(0).basePtr, + imgData->plane(0).rowStride, srcVecRowStride[i], + imgDst[i].size().h, cudaMemcpyDeviceToHost, stream)); + } + EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + + // Compare data + ASSERT_TRUE(CompareVarShapes(dstVec, goldVec, srcVecColumns, srcVecRows, srcVecRowStride, channelsVec, 1.0f)); } - EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); - - // Compare data - ASSERT_TRUE(CompareVarShapes(dstVec, goldVec, srcVecColumns, srcVecRows, srcVecRowStride, 1.0f)); } + +#undef NVCV_IMAGE_FORMAT_2U8 diff --git a/tests/cvcuda/system/TestOpPillowResize.cpp b/tests/cvcuda/system/TestOpPillowResize.cpp index 4f276dba6..4cc2c46f2 100644 --- a/tests/cvcuda/system/TestOpPillowResize.cpp +++ b/tests/cvcuda/system/TestOpPillowResize.cpp @@ -1091,11 +1091,14 @@ void StartVarShapeTest(int srcWidthBase, int srcHeightBase, int dstWidthBase, in std::uniform_int_distribution rndDstWidth(dstWidthBase * 0.8, dstWidthBase * 1.1); std::uniform_int_distribution rndDstHeight(dstHeightBase * 0.8, dstHeightBase * 1.1); - std::vector imgSrc, imgDst; + std::vector imgSrc, imgDst; + std::vector srcSizes, dstSizes; for (int i = 0; i < numberOfImages; ++i) { imgSrc.emplace_back(nvcv::Size2D{rndSrcWidth(randEng), rndSrcHeight(randEng)}, fmt); imgDst.emplace_back(nvcv::Size2D{rndDstWidth(randEng), rndDstHeight(randEng)}, fmt); + srcSizes.emplace_back(imgSrc.back().size()); + dstSizes.emplace_back(imgDst.back().size()); } nvcv::ImageBatchVarShape batchSrc(numberOfImages); @@ -1142,14 +1145,11 @@ void StartVarShapeTest(int srcWidthBase, int srcHeightBase, int dstWidthBase, in srcHeight, cudaMemcpyHostToDevice)); } - nvcv::Size2D maxSrcSize = 
batchSrc.maxSize(); - nvcv::Size2D maxDstSize = batchDst.maxSize(); - // Generate test result cvcuda::PillowResize pillowResizeOp; cvcuda::UniqueWorkspace ws = cvcuda::AllocateWorkspace( - pillowResizeOp.getWorkspaceRequirements(numberOfImages, maxSrcSize, maxDstSize, fmt)); + pillowResizeOp.getWorkspaceRequirements(numberOfImages, srcSizes.data(), dstSizes.data(), fmt)); EXPECT_NO_THROW(pillowResizeOp(stream, ws.get(), batchSrc, batchDst, interpolation)); // Get test data back @@ -1218,3 +1218,18 @@ TEST_P(OpPillowResize, varshape_correct_output) else if (nvcv::FMT_RGBf32 == fmt || nvcv::FMT_RGBAf32 == fmt) StartVarShapeTest(srcWidth, srcHeight, dstWidth, dstHeight, interpolation, numberOfImages, fmt); } + +TEST(OpPillowResize, invalidGetWorkSpace) +{ + NVCVOperatorHandle pillowResizeHandle; + ASSERT_EQ(NVCV_SUCCESS, cvcudaPillowResizeCreate(&pillowResizeHandle)); + NVCVSize2D inputSizesWH[1] = { + {224, 224} + }; + NVCVSize2D outputSizesWH[1] = { + {112, 112} + }; + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + cvcudaPillowResizeVarShapeGetWorkspaceRequirements(pillowResizeHandle, 1, inputSizesWH, outputSizesWH, + NVCV_IMAGE_FORMAT_U8, nullptr)); +} diff --git a/tests/cvcuda/system/TestOpResize.cpp b/tests/cvcuda/system/TestOpResize.cpp index 26140de44..eef44e285 100644 --- a/tests/cvcuda/system/TestOpResize.cpp +++ b/tests/cvcuda/system/TestOpResize.cpp @@ -37,20 +37,19 @@ namespace test = nvcv::test; namespace t = ::testing; // clang-format off - NVCV_TEST_SUITE_P(OpResize, test::ValueList { // srcWidth, srcHeight, dstWidth, dstHeight, interpolation, numberImages - { 42, 48, 23, 24, NVCV_INTERP_NEAREST, 1}, - { 113, 12, 12, 36, NVCV_INTERP_NEAREST, 1}, - { 421, 148, 223, 124, NVCV_INTERP_NEAREST, 2}, - { 313, 212, 412, 336, NVCV_INTERP_NEAREST, 3}, - { 42, 40, 21, 20, NVCV_INTERP_LINEAR, 1}, - { 21, 21, 42, 42, NVCV_INTERP_LINEAR, 1}, - { 420, 420, 210, 210, NVCV_INTERP_LINEAR, 4}, - { 210, 210, 420, 420, NVCV_INTERP_LINEAR, 5}, - { 42, 40, 21, 20, NVCV_INTERP_CUBIC, 1}, - { 21, 21, 42, 42, NVCV_INTERP_CUBIC, 6}, + { 42, 48, 23, 24, NVCV_INTERP_NEAREST, 1}, + { 113, 12, 12, 36, NVCV_INTERP_NEAREST, 1}, + { 421, 148, 223, 124, NVCV_INTERP_NEAREST, 2}, + { 313, 212, 412, 336, NVCV_INTERP_NEAREST, 3}, + { 42, 40, 21, 20, NVCV_INTERP_LINEAR, 1}, + { 21, 21, 42, 42, NVCV_INTERP_LINEAR, 1}, + { 420, 420, 210, 210, NVCV_INTERP_LINEAR, 4}, + { 210, 210, 420, 420, NVCV_INTERP_LINEAR, 5}, + { 42, 40, 21, 20, NVCV_INTERP_CUBIC, 1}, + { 21, 21, 42, 42, NVCV_INTERP_CUBIC, 6}, { 420, 420, 420, 420, NVCV_INTERP_CUBIC, 2}, { 420, 420, 420, 420, NVCV_INTERP_CUBIC, 1}, { 420, 420, 40, 42, NVCV_INTERP_CUBIC, 1}, @@ -59,9 +58,17 @@ NVCV_TEST_SUITE_P(OpResize, test::ValueList mae(testVec.size()); + for (size_t i = 0; i < mae.size(); ++i) + { + mae[i] = abs(static_cast(goldVec[i]) - static_cast(testVec[i])); + } + + int maeThreshold = 1; + + EXPECT_THAT(mae, t::Each(t::Le(maeThreshold))); } } diff --git a/tests/cvcuda/system/TestOpResizeCropConvertReformat.cpp b/tests/cvcuda/system/TestOpResizeCropConvertReformat.cpp new file mode 100644 index 000000000..81208d38a --- /dev/null +++ b/tests/cvcuda/system/TestOpResizeCropConvertReformat.cpp @@ -0,0 +1,491 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Definitions.hpp" +#include "ResizeUtils.hpp" + +#include +#include +// #include +#include +#include +#include +#include +#include +#include +#include // for NVCVInterpolationType, NVCVChannelManip, etc. +#include +#include +#include +#include + +#include +#include +#include + +namespace gt = ::testing; +namespace test = nvcv::test; +namespace cuda = nvcv::cuda; +namespace ttype = test::type; + +static std::default_random_engine randEng(std::random_device{}()); + +template +using uniform_dist + = std::conditional_t, std::uniform_int_distribution, std::uniform_real_distribution>; + +inline NVCVChannelManip ChannelManip(nvcv::ImageFormat srcFormat, nvcv::ImageFormat dstFormat) +{ + const int srcChannels = srcFormat.numChannels(); + const nvcv::Swizzle srcSwizzle = srcFormat.swizzle(); + const nvcv::Swizzle dstSwizzle = dstFormat.swizzle(); + + NVCVChannelManip manip = NVCV_CHANNEL_NO_OP; + + if (srcChannels > 2 && srcSwizzle != dstSwizzle) + { + int srcSwap = static_cast(srcSwizzle), dstSwap = static_cast(dstSwizzle); + bool srcRGB = (srcSwap == NVCV_SWIZZLE_XYZ0 || srcSwap == NVCV_SWIZZLE_XYZW || srcSwap == NVCV_SWIZZLE_XYZ1), + srcBGR = (srcSwap == NVCV_SWIZZLE_ZYX0 || srcSwap == NVCV_SWIZZLE_ZYXW || srcSwap == NVCV_SWIZZLE_ZYX1); + bool dstRGB = (dstSwap == NVCV_SWIZZLE_XYZ0 || dstSwap == NVCV_SWIZZLE_XYZW || dstSwap == NVCV_SWIZZLE_XYZ1), + dstBGR = (dstSwap == NVCV_SWIZZLE_ZYX0 || dstSwap == NVCV_SWIZZLE_ZYXW || dstSwap == NVCV_SWIZZLE_ZYX1); + bool swapRB = ((srcRGB && dstBGR) || (srcBGR && dstRGB)); + + if (swapRB && srcChannels == 3) + { + manip = NVCV_CHANNEL_REVERSE; + } + } + return manip; +} + +template +void CropConvert(DstT *dst, const nvcv::Size2D dstSize, const nvcv::ImageFormat dstFormat, const SrcT *src, + const nvcv::Size2D srcSize, const nvcv::ImageFormat srcFormat, const int numImages, const int2 cropPos, + const NVCVChannelManip manip, const double scale = 1.0, const double offst = 0.0) +{ + int srcPlanes = srcFormat.numPlanes(); + int dstPlanes = dstFormat.numPlanes(); + int srcChannels = srcFormat.numChannels(); + int dstChannels = dstFormat.numChannels(); + + size_t srcIncrX = srcChannels / srcPlanes; // 1 if planar; srcChannels if not. + size_t dstIncrX = dstChannels / dstPlanes; // 1 if planar; dstChannels if not. + size_t srcIncrY = srcIncrX * srcSize.w; + size_t dstIncrY = dstIncrX * dstSize.w; + size_t srcIncrC = (srcPlanes > 1 ? srcSize.w * srcSize.h : 1); + size_t dstIncrC = (dstPlanes > 1 ? dstSize.w * dstSize.h : 1); + size_t srcIncrN = srcSize.w * srcSize.h * srcChannels; + size_t dstIncrN = dstSize.w * dstSize.h * dstChannels; + size_t srcOffst = cropPos.y * srcIncrY + cropPos.x * srcIncrX; + + int channelMap[4] = {0, 1, 2, 3}; + + int channels = (srcChannels < dstChannels ? 
srcChannels : dstChannels); + + if (manip == NVCV_CHANNEL_REVERSE) + { + for (int c = 0; c < channels; ++c) channelMap[c] = channels - c - 1; + } + + for (int i = 0; i < numImages; i++) + { + const SrcT *srcBase = src + i * srcIncrN + srcOffst; + DstT *dstBase = dst + i * dstIncrN; + + for (int y = 0; y < dstSize.h; y++) + { + const SrcT *srcRow = srcBase + y * srcIncrY; + DstT *dstRow = dstBase + y * dstIncrY; + + for (int x = 0; x < dstSize.w; x++) + { + const SrcT *srcPtr = srcRow + x * srcIncrX; + DstT *dstPtr = dstRow + x * dstIncrX; + + for (int c = 0; c < channels; c++) + { + dstPtr[channelMap[c] * dstIncrC] = static_cast(srcPtr[c * srcIncrC] * scale + offst); + } + } + } + } +} + +template +void CropConvert(std::vector &dst, const nvcv::Size2D dstSize, const nvcv::ImageFormat dstFormat, + const std::vector src, const nvcv::Size2D srcSize, const nvcv::ImageFormat srcFormat, + const int numImages, const int2 cropPos, const NVCVChannelManip manip, const double scale = 1.0, + const double offst = 0.0) +{ + CropConvert(dst.data(), dstSize, dstFormat, src.data(), srcSize, srcFormat, numImages, cropPos, manip, scale, + offst); +} + +template +void CropConvert(DstT *dst, const nvcv::Size2D dstSize, const nvcv::ImageFormat dstFormat, const std::vector src, + const nvcv::Size2D srcSize, const nvcv::ImageFormat srcFormat, const int numImages, const int2 cropPos, + const NVCVChannelManip manip, const double scale = 1.0, const double offst = 0.0) +{ + CropConvert(dst, dstSize, dstFormat, src.data(), srcSize, srcFormat, numImages, cropPos, manip, scale, offst); +} + +template +void fillVec(std::vector &vec, const nvcv::Size2D size, const nvcv::ImageFormat frmt, size_t offst = 0) +{ + int planes = frmt.numPlanes(); + int channels = frmt.numChannels(); + size_t incrX = channels / planes; // 1 if planar; dstChannels if not. + size_t incrY = incrX * size.w; + size_t incrC = (planes > 1 ? size.w * size.h : 1); + + for (int y = 0; y < size.h; y++) + { + size_t yIncr = offst + y * incrY; + + for (int x = 0; x < size.w; x++) + { + size_t xIncr = yIncr + x * incrX; + + for (int c = 0; c < channels; c++) + { + vec[xIncr + c * incrC] = static_cast((x + y + c) & 255); + } + } + } +} + +#define _SHAPE(w, h, n) (int3{w, h, n}) + +#define _TEST_ROW(SrcShape, ResizeDim, Interp, DstSize, CropPos, SrcFrmt, DstFrmt, SrcType, DstType) \ + ttype::Types, ttype::Value, ttype::Value, ttype::Value, \ + ttype::Value, ttype::Value, ttype::Value, SrcType, DstType> + +// clang-format off + +NVCV_TYPED_TEST_SUITE( + OpResizeCropConvertReformat, ttype::Types< + // Test cases: RGB (interleaved) -> BGR (planar); linear interpolation; float and uchar output. 
+ _TEST_ROW(_SHAPE( 8, 8, 1), int2( 8, 8), NVCV_INTERP_LINEAR, int2( 6, 6), int2( 1, 1), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 0 + _TEST_ROW(_SHAPE( 8, 8, 1), int2( 16, 16), NVCV_INTERP_LINEAR, int2( 12, 12), int2( 2, 2), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 1 + _TEST_ROW(_SHAPE( 42, 48, 1), int2( 23, 24), NVCV_INTERP_LINEAR, int2( 15, 13), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 2 + _TEST_ROW(_SHAPE( 42, 40, 3), int2( 21, 20), NVCV_INTERP_LINEAR, int2( 17, 13), int2( 1, 1), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 3 + _TEST_ROW(_SHAPE( 21, 21, 5), int2( 42, 42), NVCV_INTERP_LINEAR, int2( 32, 32), int2( 10, 10), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 4 + _TEST_ROW(_SHAPE( 113, 12, 7), int2( 12, 36), NVCV_INTERP_LINEAR, int2( 7, 13), int2( 3, 11), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 5 + _TEST_ROW(_SHAPE( 17, 151, 7), int2( 48, 16), NVCV_INTERP_LINEAR, int2( 32, 16), int2( 4, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 6 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 7 + _TEST_ROW(_SHAPE(1080, 1920, 13), int2(800, 600), NVCV_INTERP_LINEAR, int2( 640, 480), int2(101, 64), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 8 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 9 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8p, uchar3, uint8_t), // 10 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8p, uchar3, uint8_t), // 11 + + // Test cases: RGB (interleaved) -> RGB (planar); linear interpolation; float and uchar output. + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float), // 12 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float), // 13 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t), // 14 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t), // 15 + + // Test cases: BGR (interleaved) -> RGB (planar); linear interpolation; float and uchar output. 
+ _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float), // 16 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float), // 17 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t), // 18 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t), // 19 + + // Test cases: BGR (interleaved) -> BGR (planar); linear interpolation; float and uchar output. + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 20 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 21 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGR8p, uchar3, uint8_t), // 22 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGR8p, uchar3, uint8_t), // 23 + + // Test cases: RGB (interleaved) -> BGR (interleaved); linear interpolation; float and uchar output. + _TEST_ROW(_SHAPE( 8, 8, 1), int2( 8, 8), NVCV_INTERP_LINEAR, int2( 6, 6), int2( 1, 1), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32, uchar3, float3), // 24 + _TEST_ROW(_SHAPE( 8, 8, 1), int2( 16, 16), NVCV_INTERP_LINEAR, int2( 12, 12), int2( 2, 2), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32, uchar3, float3), // 25 + _TEST_ROW(_SHAPE( 113, 12, 7), int2( 12, 36), NVCV_INTERP_LINEAR, int2( 7, 13), int2( 3, 11), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32, uchar3, float3), // 26 + _TEST_ROW(_SHAPE( 17, 151, 7), int2( 48, 16), NVCV_INTERP_LINEAR, int2( 32, 16), int2( 4, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32, uchar3, float3), // 27 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32, uchar3, float3), // 28 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32, uchar3, float3), // 29 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8, uchar3, uchar3), // 30 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8, uchar3, uchar3), // 31 + + // Test cases: RGB (interleaved) -> RGB (interleaved); linear interpolation; float and uchar output. 
+ _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBf32, uchar3, float3), // 32 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBf32, uchar3, float3), // 33 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGB8, uchar3, uchar3), // 34 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGB8, uchar3, uchar3), // 35 + + // Test cases: BGR (interleaved) -> RGB (interleaved); linear interpolation; float and uchar output. + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBf32, uchar3, float3), // 36 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBf32, uchar3, float3), // 37 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGB8, uchar3, uchar3), // 38 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGB8, uchar3, uchar3), // 39 + + // Test cases: RGB (interleaved) -> BGR (planar); nearest-neighbor interpolation; float and uchar output. + _TEST_ROW(_SHAPE( 8, 8, 1), int2( 8, 8), NVCV_INTERP_NEAREST, int2( 6, 6), int2( 1, 1), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 40 + _TEST_ROW(_SHAPE( 8, 8, 5), int2( 16, 16), NVCV_INTERP_NEAREST, int2( 12, 12), int2( 2, 2), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 41 + _TEST_ROW(_SHAPE( 42, 48, 1), int2( 23, 24), NVCV_INTERP_NEAREST, int2( 15, 13), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 42 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_NEAREST, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 43 + _TEST_ROW(_SHAPE(1080, 1920, 13), int2(800, 600), NVCV_INTERP_NEAREST, int2( 640, 480), int2(101, 64), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 44 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_NEAREST, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 45 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_NEAREST, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8p, uchar3, uint8_t), // 46 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_NEAREST, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8p, uchar3, uint8_t), // 47 + + // Test cases: BGR (interleaved) -> RGB (planar); nearest-neighbor interpolation; float and uchar output. 
+ _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_NEAREST, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float), // 48 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_NEAREST, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float), // 49 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_NEAREST, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t), // 50 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_NEAREST, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t) // 51 + +>); +#undef _TEST_ROW + +// clang-format on + +TYPED_TEST(OpResizeCropConvertReformat, tensor_correct_output) +{ + int3 srcShape = ttype::GetValue; + int2 resize = ttype::GetValue; + + NVCVInterpolationType interp = ttype::GetValue; + + int2 cropDim = ttype::GetValue; + int2 cropPos = ttype::GetValue; + + nvcv::ImageFormat srcFormat{ttype::GetValue}; + nvcv::ImageFormat dstFormat{ttype::GetValue}; + + using SrcVT = typename ttype::GetType; + using DstVT = typename ttype::GetType; + using SrcBT = typename cuda::BaseType; + using DstBT = typename cuda::BaseType; + + int srcW = srcShape.x; + int srcH = srcShape.y; + int dstW = cropDim.x; + int dstH = cropDim.y; + int tmpW = resize.x; + int tmpH = resize.y; + + int numImages = srcShape.z; + int srcChannels = srcFormat.numChannels(); + int dstChannels = dstFormat.numChannels(); + int srcPlanes = srcFormat.numPlanes(); + int dstPlanes = dstFormat.numPlanes(); + int srcPixElems = srcChannels / srcPlanes; + int dstPixElems = dstChannels / dstPlanes; + + ASSERT_LE(srcChannels, 4); + ASSERT_EQ(srcChannels, dstChannels); + + NVCVSize2D resizeDim{resize.x, resize.y}; + + NVCVChannelManip manip = ChannelManip(srcFormat, dstFormat); + + // Create input and output tensors. + nvcv::Tensor srcTensor = nvcv::util::CreateTensor(numImages, srcW, srcH, srcFormat); + nvcv::Tensor dstTensor = nvcv::util::CreateTensor(numImages, dstW, dstH, dstFormat); + + auto src = srcTensor.exportData(); + auto dst = dstTensor.exportData(); + + ASSERT_NE(src, nullptr); + ASSERT_NE(dst, nullptr); + + auto srcAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*src); + ASSERT_TRUE(srcAccess); + + auto dstAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*dst); + ASSERT_TRUE(dstAccess); + + int srcRowElems = srcPixElems * srcW; + int tmpRowElems = srcPixElems * tmpW; + int dstRowElems = dstPixElems * dstW; + + size_t srcElems = (size_t)srcRowElems * (size_t)srcH * (size_t)srcPlanes * (size_t)numImages; + size_t tmpElems = (size_t)tmpRowElems * (size_t)tmpH * (size_t)srcPlanes * (size_t)numImages; + size_t dstElems = (size_t)dstRowElems * (size_t)dstH * (size_t)dstPlanes * (size_t)numImages; + + nvcv::Size2D srcSize{srcW, srcH}; + nvcv::Size2D tmpSize{tmpW, tmpH}; + nvcv::Size2D dstSize{dstW, dstH}; + + size_t srcPitch = srcW * sizeof(SrcVT); + size_t tmpPitch = tmpW * sizeof(SrcVT); + size_t dstPitch = dstW * sizeof(DstVT); + + std::vector srcVec(srcElems); + std::vector tmpVec(tmpElems); + std::vector refVec(dstElems); + + // Populate source tensor. + fillVec(srcVec, srcSize, srcFormat); + + // Generate "gold" result for image and place in reference vector. 
+ test::Resize(tmpVec, tmpPitch, tmpSize, srcVec, srcPitch, srcSize, srcFormat, interp, false); + CropConvert(refVec, dstSize, dstFormat, tmpVec, tmpSize, srcFormat, numImages, cropPos, manip); + + // Copy source tensor to device. + ASSERT_EQ(cudaSuccess, cudaMemcpy2D(src->basePtr(), srcAccess->rowStride(), srcVec.data(), srcPitch, srcPitch, + srcH * srcPlanes, cudaMemcpyHostToDevice)); + + // Run fused ResizeCropConvertReformat operator. + cudaStream_t stream; + ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); + + cvcuda::ResizeCropConvertReformat resizeCrop; + EXPECT_NO_THROW(resizeCrop(stream, srcTensor, dstTensor, resizeDim, interp, cropPos, manip)); + + ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); + + // Copy destination tensor back to host. + std::vector dstVec(dstElems); + ASSERT_EQ(cudaSuccess, cudaMemcpy2D(dstVec.data(), dstPitch, dst->basePtr(), dstAccess->rowStride(), dstPitch, + dstH * dstPlanes, cudaMemcpyDeviceToHost)); + + // Compare "gold" reference to computed output. + VEC_EXPECT_NEAR(refVec, dstVec, 1); +} + +TYPED_TEST(OpResizeCropConvertReformat, varshape_correct_output) +{ + int3 srcShape = ttype::GetValue; + int2 resize = ttype::GetValue; + + NVCVInterpolationType interp = ttype::GetValue; + + int2 cropDim = ttype::GetValue; + int2 cropPos = ttype::GetValue; + + nvcv::ImageFormat srcFormat{ttype::GetValue}; + nvcv::ImageFormat dstFormat{ttype::GetValue}; + + using SrcVT = typename ttype::GetType; + using DstVT = typename ttype::GetType; + using SrcBT = typename cuda::BaseType; + using DstBT = typename cuda::BaseType; + + int srcW = srcShape.x; + int srcH = srcShape.y; + int dstW = cropDim.x; + int dstH = cropDim.y; + int tmpW = resize.x; + int tmpH = resize.y; + + int numImages = srcShape.z; + int srcChannels = srcFormat.numChannels(); + int dstChannels = dstFormat.numChannels(); + int srcPlanes = srcFormat.numPlanes(); + int dstPlanes = dstFormat.numPlanes(); + int srcPixElems = srcChannels / srcPlanes; + int dstPixElems = dstChannels / dstPlanes; + + ASSERT_LE(srcChannels, 4); + ASSERT_EQ(srcChannels, dstChannels); + + NVCVSize2D resizeDim{resize.x, resize.y}; + + NVCVChannelManip manip = ChannelManip(srcFormat, dstFormat); + + std::vector srcImg; + + uniform_dist randVal(std::is_integral_v ? cuda::TypeTraits::min : SrcBT{0}, + std::is_integral_v ? cuda::TypeTraits::max : SrcBT{1}); + + std::uniform_int_distribution randW(srcW * 0.8, srcW * 1.2); + std::uniform_int_distribution randH(srcH * 0.8, srcH * 1.2); + + int tmpRowElems = srcPixElems * tmpW; + int dstRowElems = dstPixElems * dstW; + + size_t tmpElems = (size_t)tmpRowElems * (size_t)tmpH * (size_t)srcPlanes; + size_t refIncr = (size_t)dstRowElems * (size_t)dstH * (size_t)dstPlanes; + size_t dstElems = refIncr * (size_t)numImages; + + nvcv::Size2D tmpSize{tmpW, tmpH}; + nvcv::Size2D dstSize{dstW, dstH}; + + std::vector tmpVec(tmpElems); + std::vector refVec(dstElems); + + size_t tmpPitch = tmpW * sizeof(SrcVT); + size_t dstPitch = dstW * sizeof(DstVT); + + for (int i = 0; i < numImages; ++i) + { + int imgW = (interp ? randW(randEng) : srcW); + int imgH = (interp ? 
randH(randEng) : srcH); + + srcImg.emplace_back(nvcv::Size2D{imgW, imgH}, srcFormat); + + auto srcData = srcImg[i].exportData(); + ASSERT_TRUE(srcData); + + int imgRowElems = srcPixElems * imgW; + + size_t imgPitch = imgW * sizeof(SrcVT); + size_t imgElems = (size_t)imgRowElems * (size_t)imgH * (size_t)srcPlanes; + + nvcv::Size2D imgSize{imgW, imgH}; + + std::vector imgVec(imgElems); + + // Populate image tensor . + fillVec(imgVec, imgSize, srcFormat); + + // Generate "gold" result for image and place in reference image plane. + DstBT *refPlane = refVec.data() + i * refIncr; + + test::Resize(tmpVec, tmpPitch, tmpSize, imgVec, imgPitch, imgSize, srcFormat, interp, true); + CropConvert(refPlane, dstSize, dstFormat, tmpVec, tmpSize, srcFormat, 1, cropPos, manip); + + // Copy source tensor to device. + ASSERT_EQ(cudaSuccess, cudaMemcpy2D(srcData->plane(0).basePtr, srcData->plane(0).rowStride, imgVec.data(), + imgPitch, imgPitch, imgH * srcPlanes, cudaMemcpyHostToDevice)); + } + + nvcv::ImageBatchVarShape src(numImages); + + src.pushBack(srcImg.begin(), srcImg.end()); + + // Create output tensor. + nvcv::Tensor dstTensor = nvcv::util::CreateTensor(numImages, dstW, dstH, dstFormat); + + auto dst = dstTensor.exportData(); + + ASSERT_NE(dst, nullptr); + + auto dstAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*dst); + ASSERT_TRUE(dstAccess); + + // Run fused ResizeCropConvertReformat operator. + cudaStream_t stream; + ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); + + cvcuda::ResizeCropConvertReformat resizeCrop; + EXPECT_NO_THROW(resizeCrop(stream, src, dstTensor, resizeDim, interp, cropPos, manip)); + + ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); + + // Copy destination tensor back to host. + std::vector dstVec(dstElems); + ASSERT_EQ(cudaSuccess, cudaMemcpy2D(dstVec.data(), dstPitch, dst->basePtr(), dstAccess->rowStride(), dstPitch, + dstH * dstPlanes * numImages, cudaMemcpyDeviceToHost)); + + // Compare "gold" reference to computed output. + VEC_EXPECT_NEAR(refVec, dstVec, 1); +} diff --git a/tests/nvcv_types/python/test_image.py b/tests/nvcv_types/python/test_image.py index 4615ca0de..b9e5efae6 100644 --- a/tests/nvcv_types/python/test_image.py +++ b/tests/nvcv_types/python/test_image.py @@ -13,11 +13,28 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import torch import numpy as np -import pytest as t import nvcv import nvcv_util as util +import pytest as t +import torch + +import cvcuda + + +def test_image_is_cached(): + created_ids = set() + + pt_img = torch.rand((1, 1), dtype=torch.float32, device="cuda") + img = cvcuda.as_image(pt_img) + created_ids.add(img.id) + del img # delete img such that only cache has a reference to it + + for i in range(50): + pt_img = torch.rand((1 + i, i + 2), dtype=torch.float32, device="cuda") + img = cvcuda.as_image(pt_img) + assert img.id in created_ids + del img def test_image_creation_works(): @@ -110,10 +127,7 @@ def test_wrap_host_buffer_infer_imgformat_multiple_planes(buffers, format): assert img.format == format img = nvcv.as_image( - [ - torch.zeros(size=buf[0], dtype=buf[2], device="cuda").cuda() - for buf in buffers - ] + [torch.zeros(size=buf[0], dtype=buf[2], device="cuda") for buf in buffers] ) assert img.width == 8 assert img.height == 6 @@ -128,10 +142,7 @@ def test_wrap_host_buffer_explicit_format2(buffers, format): assert img.format == format img = nvcv.as_image( - [ - torch.zeros(size=buf[0], dtype=buf[2], device="cuda").cuda() - for buf in buffers - ], + [torch.zeros(size=buf[0], dtype=buf[2], device="cuda") for buf in buffers], format, ) assert img.width == 8 @@ -447,7 +458,7 @@ def test_image_is_kept_alive_by_cuda_array_interface(): del img2 # remove img2 from cache, but not img1, as it's being - # held by iface + # held by iface1 nvcv.clear_cache() # now img1 is free for reuse @@ -455,3 +466,21 @@ def test_image_is_kept_alive_by_cuda_array_interface(): img3 = nvcv.Image((640, 480), nvcv.Format.U8) assert img3.cuda().__cuda_array_interface__["data"][0] == data_buffer1 + + +def test_image_wrapper_nodeletion(): + """ + Check if image wrappers deletes memory that's not ours. + """ + # run twice, first run is without cache re-usage, second is with cache re-usage + for i in range(2): + np_img = np.random.rand(1 + i, 2 + i).astype(np.float32) + pt_img = torch.from_numpy(np_img).cuda() + + nv_img = cvcuda.as_image(pt_img) + del nv_img + + try: + assert (pt_img.cpu().numpy() == np_img).all() + except RuntimeError: + assert False, "Invalid memory" diff --git a/tests/nvcv_types/python/test_imgbatchvarshape.py b/tests/nvcv_types/python/test_imgbatchvarshape.py index 0caaedb38..cb0af3306 100644 --- a/tests/nvcv_types/python/test_imgbatchvarshape.py +++ b/tests/nvcv_types/python/test_imgbatchvarshape.py @@ -13,10 +13,44 @@ # See the License for the specific language governing permissions and # limitations under the License. 
 
-import nvcv
-import pytest as t
 import numpy as np
+import nvcv
 import nvcv_util as util
+import pytest as t
+import torch
+
+import cvcuda
+
+
+def test_imgbatchvarshape_are_cached():
+    created_ids = set()
+
+    # Create first VarShape
+    pt_imgs = []
+    for n in range(2):
+        pt_img = torch.rand((1 + n, 2 + n), dtype=torch.float32, device="cuda")
+        pt_imgs.append(pt_img)
+
+    batch = cvcuda.as_images(pt_imgs)
+
+    for img in batch:
+        created_ids.add(img.id)
+        del img
+    del batch
+
+    # Create more VarShapes that reuse the cache
+    for i in range(50):
+        pt_imgs = []
+        for _ in range(2):
+            pt_img = torch.rand((1 + i, 2 + i), dtype=torch.float32, device="cuda")
+            pt_imgs.append(pt_img)
+
+        batch = cvcuda.as_images(pt_imgs)
+
+        for img in batch:
+            assert img.id in created_ids
+            del img
+        del batch
 
 
 def test_imgbatchvarshape_creation_works():
@@ -151,3 +185,22 @@ def test_wrap_buffer_list(base_shape, dt, format):
         assert images[i].width == sh[1]
         assert images[i].height == sh[0]
         assert images[i].format == format
+
+
+def test_imgbatchvarshape_wrapper_nodeletion():
+    """
+    Check that the imgbatchvarshape wrapper does not delete memory it does not own.
+    """
+
+    # run twice: first run is without cache re-usage, second is with cache re-usage
+    for i in range(2):
+        np_img = np.random.rand(1 + i, 2 + i).astype(np.float32)
+        pt_img = torch.from_numpy(np_img).cuda()
+
+        batch = cvcuda.as_images([pt_img])
+        del batch
+
+        try:
+            assert (pt_img.cpu().numpy() == np_img).all()
+        except RuntimeError:
+            assert False, "Invalid memory"
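# Illustrative sketch (not part of the patch): the behaviour the new Python tests
# above exercise, assuming cvcuda and a CUDA-enabled torch build are available.
# cvcuda.as_image()/as_images() create zero-copy wrappers around externally owned
# device memory; deleting the wrappers must leave the torch tensors untouched, and
# wrapper objects may be served from an internal cache (observable via img.id).

import numpy as np
import torch
import cvcuda


def wrap_and_release():
    # torch owns the device memory; cvcuda only wraps it.
    pt_img = torch.rand((480, 640), dtype=torch.float32, device="cuda")

    nv_img = cvcuda.as_image(pt_img)    # zero-copy wrapper around the torch tensor
    wrapper_id = nv_img.id              # cached wrappers can be handed out again later
    del nv_img                          # only the cache still references the wrapper

    batch = cvcuda.as_images([pt_img])  # same idea for a variable-shape batch
    del batch

    # The underlying tensor is still valid and readable after the wrappers are gone.
    assert pt_img.cpu().numpy().shape == (480, 640)
    return wrapper_id


if __name__ == "__main__":
    wrap_and_release()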