diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 185ef70bd..ae8f44bda 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -48,7 +48,7 @@ jobs: run: | sudo apt update -y && sudo apt install -y --no-install-recommends \ python3 python3-pip python3-dev python3-distutils doxygen && sudo rm -rf /var/lib/apt/lists/* \ - && python3 -m pip install sphinx-rtd-theme sphinx breathe exhale recommonmark graphviz \ + && python3 -m pip install sphinx-rtd-theme sphinx breathe recommonmark graphviz \ && python3 -m pip install numpy==1.24.1 patchelf==0.17.2.1 - if: matrix.language == 'c-cpp' @@ -92,7 +92,7 @@ jobs: - if: matrix.language == 'c-cpp' && github.event_name == 'push' name: Build Docs and Clean up Sphinx Build Directory run: | - ./ci/build.sh debug build "-DBUILD_SAMPLES=OFF -DBUILD_TESTS=OFF -DBUILD_DOCS=ON -DBUILD_PYTHON=ON" $* + ./ci/build.sh debug build "-DBUILD_SAMPLES=OFF -DBUILD_TESTS=OFF -DBUILD_DOCS=ON -DBUILD_PYTHON=ON -DPYTHON_VERSIONS=3.8" $* find build/docs/sphinx -name '*.doctree' -delete find build/docs/sphinx -name '*.map' -delete find build/docs/sphinx -name '*.pickle' -delete diff --git a/.gitignore b/.gitignore index 4b0a6a14c..528b9a558 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ # ---------------- /build/ /build-*/ +/build_*/ /install/ /cvcuda-installer*/ @@ -47,7 +48,8 @@ ipython_config.py # Documentation # ------------- -_exhale_api +_c_cpp_api +_cvcuda_api # Samples # ------------- diff --git a/CMakeLists.txt b/CMakeLists.txt index 841801962..198a070f4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,7 +23,7 @@ endif() project(cvcuda LANGUAGES C CXX - VERSION 0.8.0 + VERSION 0.9.0 DESCRIPTION "CUDA-accelerated Computer Vision algorithms" ) diff --git a/README.md b/README.md index 5a9af904d..fe954e9b7 100644 --- a/README.md +++ b/README.md @@ -18,13 +18,13 @@ [![License](https://img.shields.io/badge/License-Apache_2.0-yellogreen.svg)](https://opensource.org/licenses/Apache-2.0) -![Version](https://img.shields.io/badge/Version-v0.8.0--beta-blue) +![Version](https://img.shields.io/badge/Version-v0.9.0--beta-blue) ![Platform](https://img.shields.io/badge/Platform-linux--64_%7C_win--64_wsl2%7C_aarch64-gray) [![CUDA](https://img.shields.io/badge/CUDA-v11.7-%2376B900?logo=nvidia)](https://developer.nvidia.com/cuda-toolkit-archive) [![GCC](https://img.shields.io/badge/GCC-v11.0-yellow)](https://gcc.gnu.org/gcc-11/changes.html) -[![Python](https://img.shields.io/badge/python-v3.7_%7c_v3.8_%7c_v3.9_%7c_v3.10%7c_v3.11-blue?logo=python)](https://www.python.org/) +[![Python](https://img.shields.io/badge/python-v3.8_%7c_v3.9_%7c_v3.10%7c_v3.11-blue?logo=python)](https://www.python.org/) [![CMake](https://img.shields.io/badge/CMake-v3.20-%23008FBA?logo=cmake)](https://cmake.org/) CV-CUDA is an open-source project that enables building efficient cloud-scale @@ -53,13 +53,15 @@ To get a local copy up and running follow these steps. \** full build, including test module
\*** [samples][CV-CUDA Samples] require driver r535 or later to run and are only officially supported with CUDA 12. -### Known limitations +### Known limitations and issues - For GCC versions lower than 11.0, C++17 support needs to be enabled when compiling CV-CUDA. - The C++ test module cannot build with gcc<11 (requires specific C++-20 features). With gcc-9 or gcc-10, please build with option `-DBUILD_TESTS=0` - [CV-CUDA Samples] require driver r535 or later to run and are only officially supported with CUDA 12. - Only one CUDA version (CUDA 11.x or CUDA 12.x) of CV-CUDA packages (Debian packages, tarballs, Python Wheels) can be installed at a time. Please uninstall all packages from a given CUDA version before installing packages from a different version. -- Documentation built with older toolchains (doxygen, sphinx, breathe, exhale) may be incomplete. We recommend using Ubuntu 22.04 or later. +- Documentation built on Ubuntu 20.04 needs an up-to-date version of sphinx (`pip install --upgrade sphinx`) as well as explicitly parsing the system's default python version ` ./ci/build_docs path/to/build -DPYTHON_VERSIONS=""`. +- Python bindings installed via Debian packages and Python tests fail with Numpy 2.0. We recommend using an older version of Numpy (e.g. 1.26) until we have implemented a fix. +- The Resize and RandomResizedCrop operators incorrectly interpolate pixel values near the boundary of an image or tensor when using linear and cubic interpolation. This will be fixed in an upcoming release. ### Installation @@ -87,12 +89,12 @@ Please note that the Python wheels are standalone, they include both the C++/CUD Install C++/CUDA libraries (cvcuda-lib*) and development headers (cvcuda-dev*) using `apt`: ```shell -apt install -y ./cvcuda-lib----linux.deb ./cvcuda-dev----linux.deb +sudo apt install -y ./cvcuda-lib----linux.deb ./cvcuda-dev----linux.deb ``` Install Python bindings (cvcuda-python*) using `apt`: ```shell -apt install -y ./cvcuda-python----linux.deb +sudo apt install -y ./cvcuda-python----linux.deb ``` where `` is the desired CUDA version, `` is the desired Python version and `` is the desired architecture. @@ -122,7 +124,7 @@ Install the dependencies needed to setup up the repository: On Ubuntu >= 20.04, install the following packages using `apt`: ```shell -apt install -y git git-lfs +sudo apt install -y git git-lfs ``` Clone the repository @@ -145,19 +147,20 @@ Install the dependencies required to build CV-CUDA: - python3-dev: for python bindings - libssl-dev: needed by the testsuite (MD5 hashing utilities) - CUDA toolkit +- patchelf On Ubuntu >= 20.04, install the following packages using `apt`: ```shell -apt install -y g++-11 cmake ninja-build python3-dev libssl-dev +sudo apt install -y g++-11 cmake ninja-build python3-dev libssl-dev patchelf ``` Any version of the 11.x or 12.x CUDA toolkit should work. CV-CUDA was tested with 11.7 and 12.2, these versions are thus recommended. ```shell -apt install -y cuda-11-7 +sudo apt install -y cuda-11-7 # or -apt install -y cuda-12-2 +sudo apt install -y cuda-12-2 ``` Build the project: @@ -175,18 +178,18 @@ ci/build.sh [release|debug] [output build tree path] [-DBUILD_TESTS=1|0] [-DPYTH #### 3. Build Documentation -Known limitation: documentation built with older toolchains (doxygen, sphinx, breathe, exhale) may be incomplete. We recommend using Ubuntu 22.04 or later. 
+Known limitation: Documentation built on Ubuntu 20.04 needs an up-to-date version of sphinx (`pip install --upgrade sphinx`) as well as explicitly parsing the system's default python version ` ./ci/build_docs path/to/build -DPYTHON_VERSIONS=""`. Install the dependencies required to build the documentation: - doxygen: parse header files for reference documentation - python3, python3-pip: to install some python packages needed -- sphinx, breathe, exhale, recommonmark, graphiviz: to render the documentation +- sphinx, breathe, recommonmark, graphiviz: to render the documentation - sphinx-rtd-theme: documentation theme used On Ubuntu, install the following packages using `apt` and `pip`: ```shell -apt install -y doxygen graphviz python3 python3-pip sphinx -python3 -m pip install breathe exhale recommonmark graphviz sphinx-rtd-theme +sudo apt install -y doxygen graphviz python3 python3-pip sphinx +python3 -m pip install breathe recommonmark graphviz sphinx-rtd-theme ``` Build the documentation: @@ -204,11 +207,12 @@ For instructions on how to build samples from source and run them, see the [Samp Install the dependencies required for running the tests: - python3, python3-pip: to run python bindings tests - torch: dependencies needed by python bindings tests +- numpy: known limitation: Python tests fail with numpy 2.0. We recommend using an older version (eg 1.26) until we have implemented a fix. On Ubuntu >= 20.04, install the following packages using `apt` and `pip`: ```shell -apt install -y python3 python3-pip -python3 -m pip install pytest torch +sudo apt install -y python3 python3-pip +python3 -m pip install pytest torch numpy==1.26 ``` The tests are in `/bin`. You can run the script below to run all tests at once. Here's an example when build tree is created in `build-rel`: diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index 5ad6c8902..5ad4f5973 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -38,39 +38,44 @@ file(MAKE_DIRECTORY ${DOXYGEN_OUTPUT_DIR}) list(GET PYTHON_VERSIONS -1 VER) add_custom_command(OUTPUT ${DOXYGEN_INDEX_FILE} - COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYFILE_OUT} - MAIN_DEPENDENCY ${DOXYFILE_OUT} ${DOXYFILE_IN} - COMMENT "Generating doxygen xml" - DEPENDS cvcuda_python${VER}) + COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYFILE_OUT} + MAIN_DEPENDENCY ${DOXYFILE_OUT} ${DOXYFILE_IN} + COMMENT "Generating doxygen xml" + DEPENDS cvcuda_python${VER}) add_custom_target(cvcuda_doxygen ALL DEPENDS ${DOXYGEN_INDEX_FILE}) set(SPHINX_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/sphinx) -set(EXHALE_SOURCE ${SPHINX_SOURCE}/_exhale_api) set(SPHINX_BUILD ${CMAKE_CURRENT_BINARY_DIR}/sphinx) set(SPHINX_INDEX_FILE ${SPHINX_BUILD}/index.html) -set(SPHINX_GROUP_INDEX_FILE ${SPHINX_BUILD}/groupindex.html) +set(C_CPP_API_RST ${SPHINX_SOURCE}/_c_cpp_api) +set(PY_CVCUDA_API_RST ${SPHINX_SOURCE}/_python_api/_cvcuda_api) + +# Start from clean directory for rst files, otherwise build could be affected due to old files +file(REMOVE_RECURSE ${C_CPP_API_RST}/*) +file(REMOVE_RECURSE ${PY_CVCUDA_API_RST}/*) # Generate rst files for groups from doxygen index.xml -add_custom_target(cvcuda_groups ALL - COMMAND python3 ${SPHINX_SOURCE}/generate_groups.py ${EXHALE_SOURCE} ${DOXYGEN_OUTPUT_DIR}/xml - DEPENDS ${DOXYGEN_INDEX_FILE}) +add_custom_target(cvcuda_groups ALL python3 ${SPHINX_SOURCE}/generate_groups.py ${C_CPP_API_RST} ${DOXYGEN_OUTPUT_DIR}/xml + DEPENDS cvcuda_doxygen) + +# Generate rst files for python documentation +add_custom_target(cvcuda_python_docs ALL python3 
${SPHINX_SOURCE}/gen_py_doc_rsts.py ${PY_CVCUDA_API_RST} ${CMAKE_SOURCE_DIR} + DEPENDS cvcuda_python${VER}) # Generate Sphinx add_custom_command(OUTPUT ${SPHINX_INDEX_FILE} - COMMAND - ${CMAKE_COMMAND} -E env "SPHINX_PYTHON_SRC=${CMAKE_BINARY_DIR}/lib/python" "DOXYGEN_STRIP_PATH=${CMAKE_CURRENT_SOURCE_DIR}/../src" - ${SPHINX_EXECUTABLE} -j auto -b html - # Tell Breathe where to find the Doxygen output - -Dbreathe_projects.cvcuda=${DOXYGEN_OUTPUT_DIR}/xml - ${SPHINX_SOURCE} ${SPHINX_BUILD} - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS - ${SPHINX_SOURCE}/index.rst - ${DOXYGEN_INDEX_FILE} - cvcuda_groups - MAIN_DEPENDENCY ${SPHINX_SOURCE}/conf.py - COMMENT "Generating documentation with Sphinx") + COMMAND ${CMAKE_COMMAND} -E env "SPHINX_PYTHON_SRC=${CMAKE_BINARY_DIR}/lib/python" + ${SPHINX_EXECUTABLE} -j auto -b html + # Tell Breathe where to find the Doxygen's xml output. Needed to have c/cpp documentation. + -Dbreathe_projects.cvcuda=${DOXYGEN_OUTPUT_DIR}/xml + ${SPHINX_SOURCE} ${SPHINX_BUILD} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${SPHINX_SOURCE}/index.rst + cvcuda_doxygen + cvcuda_groups + MAIN_DEPENDENCY ${SPHINX_SOURCE}/conf.py + COMMENT "Generating documentation with Sphinx") add_custom_target(cvcuda_sphinx ALL DEPENDS ${SPHINX_INDEX_FILE}) diff --git a/docs/Doxyfile.in b/docs/Doxyfile.in index 471474de2..5c9873453 100644 --- a/docs/Doxyfile.in +++ b/docs/Doxyfile.in @@ -748,14 +748,14 @@ QUIET = NO # Tip: Turn warnings on while writing the documentation. # The default value is: YES. -WARNINGS = YES +WARNINGS = NO # If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate # warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag # will automatically be disabled. # The default value is: YES. -WARN_IF_UNDOCUMENTED = YES +WARN_IF_UNDOCUMENTED = NO # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for # potential errors in the documentation, such as not documenting some parameters @@ -763,7 +763,7 @@ WARN_IF_UNDOCUMENTED = YES # markup commands wrongly. # The default value is: YES. -WARN_IF_DOC_ERROR = YES +WARN_IF_DOC_ERROR = NO # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return @@ -2419,7 +2419,7 @@ PLANTUML_INCLUDE_PATH = # Minimum value: 0, maximum value: 10000, default value: 50. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_GRAPH_MAX_NODES = 50 +DOT_GRAPH_MAX_NODES = 128 # The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs # generated by dot. A depth value of 3 means that only nodes reachable from the diff --git a/docs/sphinx/_python_api/nvcv/cache.rst b/docs/sphinx/_python_api/nvcv/cache.rst new file mode 100644 index 000000000..ee48d9df9 --- /dev/null +++ b/docs/sphinx/_python_api/nvcv/cache.rst @@ -0,0 +1,22 @@ +.. + # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ # See the License for the specific language governing permissions and + # limitations under the License. + +Cache +===== + +.. automodule:: nvcv + :noindex: + :members: cache_size, clear_cache diff --git a/docs/sphinx/_python_api/nvcv/colorspec.rst b/docs/sphinx/_python_api/nvcv/colorspec.rst new file mode 100644 index 000000000..7344dac50 --- /dev/null +++ b/docs/sphinx/_python_api/nvcv/colorspec.rst @@ -0,0 +1,22 @@ +.. + # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +Color Models +============ + +.. automodule:: nvcv + :noindex: + :members: ColorSpec diff --git a/docs/sphinx/_python_api/nvcv/format.rst b/docs/sphinx/_python_api/nvcv/format.rst new file mode 100644 index 000000000..d51f8fd21 --- /dev/null +++ b/docs/sphinx/_python_api/nvcv/format.rst @@ -0,0 +1,22 @@ +.. + # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +Image Formats +============= + +.. automodule:: nvcv + :noindex: + :members: Format diff --git a/docs/sphinx/_python_api/nvcv/image.rst b/docs/sphinx/_python_api/nvcv/image.rst new file mode 100644 index 000000000..cd5f3dfa7 --- /dev/null +++ b/docs/sphinx/_python_api/nvcv/image.rst @@ -0,0 +1,22 @@ +.. + # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +Image +===== + +.. automodule:: nvcv + :noindex: + :members: Image, as_image diff --git a/docs/sphinx/_python_api/nvcv/imagebatch.rst b/docs/sphinx/_python_api/nvcv/imagebatch.rst new file mode 100644 index 000000000..17054ce96 --- /dev/null +++ b/docs/sphinx/_python_api/nvcv/imagebatch.rst @@ -0,0 +1,22 @@ +.. 
+ # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +ImageBatchVarShape +================== + +.. automodule:: nvcv + :noindex: + :members: ImageBatchVarShape, as_images diff --git a/docs/sphinx/_python_api/nvcv/recti.rst b/docs/sphinx/_python_api/nvcv/recti.rst new file mode 100644 index 000000000..e170207e0 --- /dev/null +++ b/docs/sphinx/_python_api/nvcv/recti.rst @@ -0,0 +1,22 @@ +.. + # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +RectI +===== + +.. automodule:: nvcv + :noindex: + :members: RectI diff --git a/docs/sphinx/_python_api/nvcv/tensor.rst b/docs/sphinx/_python_api/nvcv/tensor.rst new file mode 100644 index 000000000..fdd3f4915 --- /dev/null +++ b/docs/sphinx/_python_api/nvcv/tensor.rst @@ -0,0 +1,22 @@ +.. + # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +Tensor +====== + +.. automodule:: nvcv + :noindex: + :members: Tensor, as_tensor, reshape diff --git a/docs/sphinx/_python_api/nvcv/tensorbatch.rst b/docs/sphinx/_python_api/nvcv/tensorbatch.rst new file mode 100644 index 000000000..636442dc5 --- /dev/null +++ b/docs/sphinx/_python_api/nvcv/tensorbatch.rst @@ -0,0 +1,22 @@ +.. + # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. 
+ # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +TensorBatch +=========== + +.. automodule:: nvcv + :noindex: + :members: TensorBatch, as_tensors diff --git a/docs/sphinx/_python_api/template.rst b/docs/sphinx/_python_api/template.rst new file mode 100644 index 000000000..5ca35a3bc --- /dev/null +++ b/docs/sphinx/_python_api/template.rst @@ -0,0 +1,22 @@ +.. + # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +@OperatorName@ +@=@ + +.. automodule:: @Module@ + :noindex: + :members: @MemberFunctions@ diff --git a/docs/sphinx/conf.py b/docs/sphinx/conf.py index f7fac8e63..9ab82fd6a 100644 --- a/docs/sphinx/conf.py +++ b/docs/sphinx/conf.py @@ -26,12 +26,13 @@ # documentation root, use os.path.abspath to make it absolute, like shown here. # - -# -- Project information ----------------------------------------------------- import os -import sphinx_rtd_theme import sys +import sphinx_rtd_theme + +# -- Project information ----------------------------------------------------- + project = "CV-CUDA" copyright = "2022-2024, NVIDIA." author = "NVIDIA" @@ -41,7 +42,6 @@ # set python docstring source path lib_path = os.getenv("SPHINX_PYTHON_SRC", default=".") sys.path.insert(0, os.path.abspath(lib_path)) -doxygen_strip_path = os.getenv("DOXYGEN_STRIP_PATH", default=".") # -- General configuration --------------------------------------------------- @@ -58,13 +58,18 @@ # This pattern also affects html_static_path and html_extra_path. exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "docs/manuals/py/**"] -# source_parsers = { '.md': 'recommonmark.parser.CommonMarkParser',} - extensions = ["recommonmark"] - source_suffix = {".rst": "restructuredtext", ".md": "markdown"} +# Tell sphinx what the primary language being documented is. +primary_domain = "cpp" + +# Tell sphinx what the pygments highlight language should be. +highlight_language = "cpp" + +autodoc_inherit_docstrings = False + # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for @@ -91,7 +96,6 @@ "titles_only": False, } - # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". @@ -114,19 +118,6 @@ def setup(app): # The name of the Pygments (syntax highlighting) style to use. 
pygments_style = "sphinx" -# -- Options for breathe -------------------------------------------------- - -# Enable the breathe extension -extensions.append("breathe") -extensions.append("exhale") -extensions.append("sphinx.ext.autodoc") -extensions.append("sphinx.ext.viewcode") -extensions.append("sphinx.ext.napoleon") - -# Set up the default project for breathe extension -breathe_default_project = "cvcuda" - - # -- Options for sphinx_rtd_theme ----------------------------------------- # Enable the sphinx_rtd_theme extension @@ -135,63 +126,22 @@ def setup(app): # Enable the sphinx.ext.todo extension extensions.append("sphinx.ext.todo") -# -- Extension configuration ------------------------------------------------- +# -- Extensions -------------------------------------------------- -doxygen_config = """ -EXCLUDE_PATTERNS = *.md *.txt -ENABLE_PREPROCESSING = YES -WARN_IF_UNDOCUMENTED = NO -USE_M -""" - -doxygen_conf_extra = """ -INLINE_SIMPLE_STRUCTS = YES -TYPEDEF_HIDES_STRUCT = YES -EXPAND_ONLY_PREDEF = YES -""" - -doxygen_predefined = [ - "NVCV_PUBLIC=", - "NVCV_API_VERSION_IS(x,y)=0", - "NVCV_API_VERSION_AT_LEAST(x,y)=1", - "NVCV_API_VERSION_AT_MOST(x,y)=0", -] +# Enable extensions +extensions.append("breathe") +extensions.append("sphinx.ext.autodoc") +extensions.append("sphinx.ext.viewcode") +extensions.append("sphinx.ext.napoleon") -doxygen_input_config = """ -""" - -doxygen_config = ( - doxygen_input_config - + """ -EXCLUDE_PATTERNS = *.md *.txt -ENABLE_PREPROCESSING = YES -WARN_IF_UNDOCUMENTED = NO -USE_M -""" -) - -# Setup the exhale extension -exhale_args = { - # These arguments are required - "containmentFolder": "_exhale_api", - "rootFileName": "cvcuda_api.rst", - # Heavily encouraged optional argument (see docs) - "rootFileTitle": "Library API", - # Suggested optional arguments - "createTreeView": True, - # TIP: if using the sphinx-bootstrap-theme, you need - # "treeViewIsBootstrap": True, - "exhaleExecutesDoxygen": False, - "fullToctreeMaxDepth": 1, - "minifyTreeView": False, - "contentsDirectives": False, - "doxygenStripFromPath": doxygen_strip_path, +# -- Extension configuration ------------------------------------------------- +# Set up the default project for breathe extension +breathe_default_project = "cvcuda" +breathe_doxygen_config_options = { + "QUIET": "NO", + "WARNINGS": "NO", + "WARN_IF_UNDOCUMENTED": "NO", + "WARN_IF_DOC_ERROR": "NO", + "WARN_NO_PARAMDOC": "NO", + "WARN_AS_ERROR": "NO", } - -# Tell sphinx what the primary language being documented is. -primary_domain = "cpp" - -# Tell sphinx what the pygments highlight language should be. -highlight_language = "cpp" - -autodoc_inherit_docstrings = False diff --git a/docs/sphinx/gen_py_doc_rsts.py b/docs/sphinx/gen_py_doc_rsts.py new file mode 100644 index 000000000..b48eccba3 --- /dev/null +++ b/docs/sphinx/gen_py_doc_rsts.py @@ -0,0 +1,229 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +import sys +from typing import List, Tuple + + +def exports_enum(s: str) -> bool: + return s.lstrip().startswith("py::enum_<") + + +def get_name_of_enum(s: str) -> str: + """Name of enum is first string in line""" + return re.findall('"([^"]*)"', s)[0] + + +def exports_class(s: str) -> bool: + return s.lstrip().startswith("py::class_<") + + +def get_name_of_class_if_documented(s: str) -> Tuple[bool, str]: + """ + If a class has only one strings in line, it has no documentation to be exported. + If it has more than one string, it has doc and first string is the title of the class + """ + found_strings = re.findall('"([^"]*)"', s) # get all strings + if len(found_strings) > 1: + return True, found_strings[0] + else: + return False, "" + + +def exports_def(s: str) -> bool: + return s.lstrip().startswith("m.def(") + + +def get_name_of_def(s: str) -> str: + """Name of def is first string in line""" + return re.findall('"([^"]*)"', s)[0] + + +def has_exports(file_path, export_calls): + for call in export_calls: + if call in open(file_path).read(): + export_calls.remove(call) + return True + return False + + +def create_rst_text(template_file: str, name: str, module: str, members: str) -> str: + with open(template_file, "r") as f: + rst_text = f.read() + rst_text = rst_text.replace("@OperatorName@", name) + rst_text = rst_text.replace("@=@", "=" * len(name)) + rst_text = rst_text.replace("@Module@", module) + rst_text = rst_text.replace("@MemberFunctions@", members) + return rst_text + + +def create_cvcuda_operator_rst_files( + cvcuda_path: str, outdir: str, python_cvcuda_root: str, export_calls: List[str] +) -> None: + # search for template rst file + template_rst_file_path = os.path.join( + cvcuda_path, "docs", "sphinx", "_python_api", "template.rst" + ) + if not os.path.isfile(template_rst_file_path): + raise FileNotFoundError(f"File {template_rst_file_path} not found") + + # iterate through all files + for i in sorted(os.listdir(python_cvcuda_root)): + op_file_path = os.path.join(python_cvcuda_root, i) + # Only work on .cpp files that export operators + if ( + os.path.isfile(op_file_path) + and i.endswith(".cpp") + and i != "Main.cpp" + and has_exports(op_file_path, export_calls) + ): + + # Get operator name form .cpp file: remove prefix "Op" and file type + operator_name = os.path.splitext(i)[0] + operator_name = operator_name[len("Op") :] # noqa: E203 + + # Look for functions to add to documentation + # search for all lines that start with "m.def(" (stripping leading white spaces) + # then pick first string of that line, this is the name of the python function to be exported + exports = set() + with open(op_file_path, "r") as fp: + for line in fp: + if exports_def(line): + exports.add(get_name_of_def(line)) + if len(exports) == 0: + raise RuntimeError(f"No exports found in file {op_file_path}") + exports_str = ", ".join(exports) + + # Create text to put into rst file - starting from a template + rst_text = create_rst_text( + template_rst_file_path, operator_name, "cvcuda", exports_str + ) + + # Write rst file: outdir/_op_.rst + outfile = os.path.join(outdir, f"_op_{operator_name.lower()}.rst") + with open(outfile, "w") as f: + f.write(rst_text) + return + + +def create_cvcuda_non_operator_rst_files( + cvcuda_path: str, outdir: str, python_cvcuda_root: str, export_calls: List[str] +) -> None: + # search for template rst file + template_rst_file_path = 
os.path.join( + cvcuda_path, "docs", "sphinx", "_python_api", "template.rst" + ) + if not os.path.isfile(template_rst_file_path): + raise FileNotFoundError(f"File {template_rst_file_path} not found") + + for i in sorted(os.listdir(python_cvcuda_root)): + nonop_file_path = os.path.join(python_cvcuda_root, i) + # Only work on .cpp files that something different than operators + if ( + os.path.isfile(nonop_file_path) + and i.endswith(".cpp") + and i != "Main.cpp" + and has_exports(nonop_file_path, export_calls) + ): + # Look for functions to add to documentation + # Search for all lines that start with "py::enum_<" or "py::class_<" + with open(nonop_file_path, "r") as fp: + for line in fp: + if exports_enum(line): + export = get_name_of_enum(line) + elif exports_class(line): + has_doc, name = get_name_of_class_if_documented(line) + if has_doc: + export = name + else: + continue + else: + continue + + # Create text to put into rst file - starting from a template + rst_text = create_rst_text( + template_rst_file_path, export, "cvcuda", export + ) + + # Write rst file: outdir/_aux_.rst + outfile = os.path.join(outdir, f"_aux_{export.lower()}.rst") + with open(outfile, "w") as f: + f.write(rst_text) + return + + +def export_found(s: str) -> bool: + return s.lstrip().startswith("Export") + + +def get_export_fun_name(s: str) -> str: + return s.lstrip().split("(", 1)[0] + + +def exporting_nonops(s: str) -> bool: + """Everything after that command exports auxiliary operator entities + (non-operators)""" + return s.lstrip().startswith("// doctag: Non-Operators") + + +def exporting_ops(s: str) -> bool: + """Everything after that command exports operators""" + return s.lstrip().startswith("// doctag: Operators") + + +def get_exported_cvcuda(path_to_main: str): + export_nonop = [] # list for non operators + export_op = [] # list for operators + exports = None + with open(path_to_main, "r") as fp: + for line in fp: + if export_found(line): + # remove everything after first "(" + name = get_export_fun_name(line) + try: + exports.append(name) + except AttributeError: + print( + "No comment '// doctag: Non-Operators' or '// doctag: Operators' was found in " + f"{path_to_main} prior to 'Export*(m);'-routines." + ) + sys.exit() + elif exporting_nonops(line): + exports = export_nonop + elif exporting_ops(line): + exports = export_op + assert len(export_nonop) > 0 and len(export_op) > 0 + return export_nonop, export_op + + +def generate_py_doc_rsts_cvcuda(cvcuda_path: str, outdir: str): + python_cvcuda_root = os.path.join(cvcuda_path, "python", "mod_cvcuda") + export_nonop, export_op = get_exported_cvcuda( + os.path.join(python_cvcuda_root, "Main.cpp") + ) + create_cvcuda_operator_rst_files(cvcuda_path, outdir, python_cvcuda_root, export_op) + create_cvcuda_non_operator_rst_files( + cvcuda_path, outdir, python_cvcuda_root, export_nonop + ) + return + + +if __name__ == "__main__": + outdir = sys.argv[1] # path/to/cvcuda/docs/sphinx/_python_api/_cvcuda_api + cvcuda_path = sys.argv[2] # path/to/cvcuda + os.makedirs(outdir, exist_ok=True) + generate_py_doc_rsts_cvcuda(cvcuda_path, outdir) diff --git a/docs/sphinx/generate_groups.py b/docs/sphinx/generate_groups.py index b269c0492..8ab19ba2b 100644 --- a/docs/sphinx/generate_groups.py +++ b/docs/sphinx/generate_groups.py @@ -13,15 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from pathlib import Path -import xml.etree.ElementTree as ET import os import sys +import xml.etree.ElementTree as ET +from pathlib import Path outdir = Path(sys.argv[1]) -if not os.path.exists(outdir): - os.makedirs(outdir) +os.makedirs(outdir, exist_ok=True) xmlRoot = sys.argv[2] diff --git a/docs/sphinx/index.rst b/docs/sphinx/index.rst index ceeb3265e..3817f1bfd 100644 --- a/docs/sphinx/index.rst +++ b/docs/sphinx/index.rst @@ -17,7 +17,7 @@ .. _cvcuda_doc_system: CV-CUDA -============================ +======= NVIDIA CV-CUDA™ is an open-source project for building cloud-scale `Artificial Intelligence (AI) imaging and Computer Vision (CV) `_ applications. It uses graphics processing unit (GPU) acceleration to help developers build highly efficient pre- and post-processing pipelines. It can improve throughput by more than 10x while lowering cloud computing costs. @@ -36,7 +36,7 @@ CV-CUDA includes: CV-CUDA Pre- and Post-Processing Operators ------------------- +------------------------------------------ CV-CUDA offers a comprehensive collection of Computer Vision and Image Processing operators, listed below. @@ -47,20 +47,20 @@ CV-CUDA offers a comprehensive collection of Computer Vision and Image Processin Where Are the Release Notes? ------------------- +---------------------------- CV-CUDA release notes can be found `here `_. Where Can I Get Help? ------------------- +--------------------- An awesome product requires excellent support. File requests for enhancements and bug reports `here `_. What Other Computer Vision Products Does NVIDIA Offer? ------------------- +------------------------------------------------------ NVIDIA offers a number of products for accelerating computer vision and image processing applications. In addition to CV-CUDA, some of the others include: @@ -73,7 +73,7 @@ If you want to learn more about what computer vision solutions are available, re Notice --------------------- +------ The information provided in this specification is believed to be accurate and reliable as of the date provided. However, NVIDIA Corporation (“NVIDIA”) does not give any representations or warranties, expressed or implied, as to the accuracy or completeness of such information. NVIDIA shall have no liability for the consequences or use of such information or for any infringement of patents or other rights of third parties that may result from its use. This publication supersedes and replaces all other specifications for the product that may have been previously supplied. NVIDIA reserves the right to make corrections, modifications, enhancements, improvements, and other changes to this specification, at any time and/or to discontinue any product or service without notice. Customer should obtain the latest relevant specification before placing orders and should verify that such information is current and complete. @@ -90,13 +90,13 @@ ALL NVIDIA DESIGN SPECIFICATIONS, REFERENCE BOARDS, FILES, DRAWINGS, DIAGNOSTICS Trademarks --------------------- +---------- NVIDIA, the NVIDIA logo, NVIDIA CV-CUDA, and NVIDIA TensorRT are trademarks and/or registered trademarks of NVIDIA Corporation in the U.S. and other countries. Other company and product names may be trademarks of the respective companies with which they are associated. Copyright --------------------- +--------- © 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. @@ -117,13 +117,13 @@ Copyright C Modules C++ Modules Python Modules - Index <_exhale_api/cvcuda_api> .. 
toctree:: :caption: Release Notes :maxdepth: 1 :hidden: + v0.9.0-beta v0.8.0-beta v0.7.0-beta v0.6.0-beta diff --git a/docs/sphinx/installation.rst b/docs/sphinx/installation.rst index f1969c8ed..c6566a662 100644 --- a/docs/sphinx/installation.rst +++ b/docs/sphinx/installation.rst @@ -38,38 +38,38 @@ You can download the CV-CUDA tar, deb or wheel packages from `the asset section Unzip the cvcuda runtime package: :: - tar -xvf cvcuda-lib-x.x.x-cuda11-x86_64-linux.tar.xz + tar -xvf cvcuda-lib----linux.tar.xz Unzip the cvcuda developer package: :: - tar -xvf cvcuda-dev-x.x.x-cuda11-x86_64-linux.tar.xz + tar -xvf cvcuda-dev----linux.tar.xz Unzip the cvcuda python package: :: - tar -xvf cvcuda-python3.*-x.x.x-cuda11-x86_64-linux.tar.xz + tar -xvf cvcuda-python----linux.tar.xz [Optional] Unzip the tests. :: - tar -xvf cvcuda-tests-cuda11-x86_64-linux.tar.xz + tar -xvf cvcuda-tests----linux.tar.xz * Debian Installation Install the runtime library. :: - dpkg -i cvcuda-lib-x.x.x-cuda11-x86_64-linux.deb + sudo apt install -y ./cvcuda-lib----linux.deb Install the developer library. :: - dpkg -i cvcuda-dev-x.x.x-cuda11-x86_64-linux.deb + sudo apt install -y ./cvcuda-dev----linux.deb Install the python bindings :: - dpkg -i cvcuda-python3.*-x.x.x-cuda11-x86_64-linux.deb + sudo apt install -y ./cvcuda-python----linux.deb [Optional] Install the tests. :: - sudo dpkg -i cvcuda-tests-x.x.x-cuda11-x86_64-linux.deb + sudo apt install -y ./cvcuda-tests----linux.deb * Python Wheel File Installation diff --git a/docs/sphinx/modules/c_algos.rst b/docs/sphinx/modules/c_algos.rst index 7cb81e190..81392c4ef 100644 --- a/docs/sphinx/modules/c_algos.rst +++ b/docs/sphinx/modules/c_algos.rst @@ -22,4 +22,4 @@ CV-CUDA Algorithms .. toctree:: :glob: - ../_exhale_api/group__NVCV__C__ALGORITHM__* + ../_c_cpp_api/group__NVCV__C__ALGORITHM__* diff --git a/docs/sphinx/modules/c_core.rst b/docs/sphinx/modules/c_core.rst index 09b8e483b..d74af3281 100644 --- a/docs/sphinx/modules/c_core.rst +++ b/docs/sphinx/modules/c_core.rst @@ -22,4 +22,4 @@ Core components and related functions. .. toctree:: :glob: - ../_exhale_api/group__NVCV__C__CORE__* + ../_c_cpp_api/group__NVCV__C__CORE__* diff --git a/docs/sphinx/modules/c_status.rst b/docs/sphinx/modules/c_status.rst index 4f06a241d..0696ccab5 100644 --- a/docs/sphinx/modules/c_status.rst +++ b/docs/sphinx/modules/c_status.rst @@ -20,4 +20,4 @@ Status .. toctree:: :glob: - ../_exhale_api/group__NVCV__C__API_STATUS* + ../_c_cpp_api/group__NVCV__C__API_STATUS* diff --git a/docs/sphinx/modules/c_utils.rst b/docs/sphinx/modules/c_utils.rst index 3a1bee3dd..07d51cefb 100644 --- a/docs/sphinx/modules/c_utils.rst +++ b/docs/sphinx/modules/c_utils.rst @@ -22,4 +22,4 @@ Utility components for CV-CUDA. .. toctree:: :glob: - ../_exhale_api/group__NVCV__C__UTIL_* + ../_c_cpp_api/group__NVCV__C__UTIL_* diff --git a/docs/sphinx/modules/cpp_algos.rst b/docs/sphinx/modules/cpp_algos.rst index 04aa9a7c0..83255b27d 100644 --- a/docs/sphinx/modules/cpp_algos.rst +++ b/docs/sphinx/modules/cpp_algos.rst @@ -22,4 +22,4 @@ CV-CUDA Algorithms .. toctree:: :glob: - ../_exhale_api/group__NVCV__C__ALGORITHM__* + ../_c_cpp_api/group__NVCV__C__ALGORITHM__* diff --git a/docs/sphinx/modules/cpp_core.rst b/docs/sphinx/modules/cpp_core.rst index f716bd0d5..23ea8ce5b 100644 --- a/docs/sphinx/modules/cpp_core.rst +++ b/docs/sphinx/modules/cpp_core.rst @@ -22,4 +22,4 @@ Core components and related functions. .. 
toctree:: :glob: - ../_exhale_api/group__NVCV__CPP__CORE_* + ../_c_cpp_api/group__NVCV__CPP__CORE_* diff --git a/docs/sphinx/modules/cpp_cudatools.rst b/docs/sphinx/modules/cpp_cudatools.rst index 7fadac27a..0d08d11b6 100644 --- a/docs/sphinx/modules/cpp_cudatools.rst +++ b/docs/sphinx/modules/cpp_cudatools.rst @@ -22,4 +22,4 @@ CUDA Tools .. toctree:: :glob: - ../_exhale_api/group__NVCV__CPP__CUDATOOLS_* + ../_c_cpp_api/group__NVCV__CPP__CUDATOOLS_* diff --git a/docs/sphinx/modules/cpp_modules.rst b/docs/sphinx/modules/cpp_modules.rst index 9662c2516..7df402929 100644 --- a/docs/sphinx/modules/cpp_modules.rst +++ b/docs/sphinx/modules/cpp_modules.rst @@ -15,7 +15,7 @@ # limitations under the License. C++ API -===== +======= .. toctree:: diff --git a/docs/sphinx/modules/cpp_utils.rst b/docs/sphinx/modules/cpp_utils.rst index 29d10dddf..fff1ffde8 100644 --- a/docs/sphinx/modules/cpp_utils.rst +++ b/docs/sphinx/modules/cpp_utils.rst @@ -22,4 +22,4 @@ Utility components for CV-CUDA. .. toctree:: :glob: - ../_exhale_api/group__NVCV__CPP__UTIL_* + ../_c_cpp_api/group__NVCV__CPP__UTIL_* diff --git a/docs/sphinx/modules/python_algos.rst b/docs/sphinx/modules/python_algos.rst new file mode 100644 index 000000000..50983eacb --- /dev/null +++ b/docs/sphinx/modules/python_algos.rst @@ -0,0 +1,26 @@ +.. + # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +ALGORITHMS +========== + +Algorithms for the NVIDIA® CV-CUDA library. + +.. toctree:: + :glob: + + ../_python_api/_cvcuda_api/_op_* + ../_python_api/_cvcuda_api/_aux_* diff --git a/docs/sphinx/modules/python_core.rst b/docs/sphinx/modules/python_core.rst new file mode 100644 index 000000000..98279fe88 --- /dev/null +++ b/docs/sphinx/modules/python_core.rst @@ -0,0 +1,31 @@ +.. + # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +Core +==== + +Core components and related functions for the NVIDIA® NVCV library. + +.. 
toctree:: + + Cache <../_python_api/nvcv/cache> + Color Models <../_python_api/nvcv/colorspec> + Image Formats <../_python_api/nvcv/format> + Rect <../_python_api/nvcv/recti> + Image <../_python_api/nvcv/image> + ImageBatchVarShape <../_python_api/nvcv/imagebatch> + Tensor <../_python_api/nvcv/tensor> + TensorBatchVarShape <../_python_api/nvcv/tensorbatch> diff --git a/docs/sphinx/modules/python_modules.rst b/docs/sphinx/modules/python_modules.rst index 4179ac66c..bf20edd91 100644 --- a/docs/sphinx/modules/python_modules.rst +++ b/docs/sphinx/modules/python_modules.rst @@ -18,9 +18,8 @@ Python API ========== .. toctree:: - :caption: Python API - :maxdepth: 2 - :hidden: + :caption: Python API + :maxdepth: 3 -.. automodule:: cvcuda - :members: + Core + Algorithms diff --git a/docs/sphinx/prerequisites.rst b/docs/sphinx/prerequisites.rst index 0aa105faa..540bef62e 100644 --- a/docs/sphinx/prerequisites.rst +++ b/docs/sphinx/prerequisites.rst @@ -35,5 +35,5 @@ Python Samples' Dependencies: * Torchvision * torchnvjpeg (https://github.com/itsliupeng/torchnvjpeg) -Refer to the :ref:`Installation` docs for the sample installation guide using *.deb or .tar installers. +Refer to the :ref:`Installation` docs for the sample installation guide using \*.deb or .tar installers. Refer to the sample README for instructions to compile samples from the source. diff --git a/docs/sphinx/relnotes/v0.1.0-prealpha.rst b/docs/sphinx/relnotes/v0.1.0-prealpha.rst index ddb876c0e..c7b75d22e 100644 --- a/docs/sphinx/relnotes/v0.1.0-prealpha.rst +++ b/docs/sphinx/relnotes/v0.1.0-prealpha.rst @@ -17,7 +17,7 @@ .. _v0.1.0-prealpha: v0.1.0-preAlpha -======== +=============== CV-CUDA-0.1.0 is the first release of CV-CUDA. This release is for evaluation purposes only. diff --git a/docs/sphinx/relnotes/v0.2.0-alpha.rst b/docs/sphinx/relnotes/v0.2.0-alpha.rst index b6ef95ade..d9bef9512 100644 --- a/docs/sphinx/relnotes/v0.2.0-alpha.rst +++ b/docs/sphinx/relnotes/v0.2.0-alpha.rst @@ -17,7 +17,7 @@ .. _v0.2.0-alpha: v0.2.0-alpha -======== +============ CV-CUDA 0.2.0 is the first open-source release of the project. @@ -61,7 +61,7 @@ Refer to documentation of the sample applications for dependencies. Known Issues/Limitations ------------- +------------------------ * Performance optimization of variable shape versions of the operators will be addressed in the next release. * Improvements to APIs of some operators are expected in the next release. * Morphology operator - performance will be optimized in the next release diff --git a/docs/sphinx/relnotes/v0.2.1-alpha.rst b/docs/sphinx/relnotes/v0.2.1-alpha.rst index 65d5b8861..4455e9237 100644 --- a/docs/sphinx/relnotes/v0.2.1-alpha.rst +++ b/docs/sphinx/relnotes/v0.2.1-alpha.rst @@ -17,7 +17,7 @@ .. _v0.2.1-alpha: v0.2.1-alpha -======= +============ General ------- diff --git a/docs/sphinx/relnotes/v0.3.0-beta.rst b/docs/sphinx/relnotes/v0.3.0-beta.rst index 6473da539..5c7d784c9 100644 --- a/docs/sphinx/relnotes/v0.3.0-beta.rst +++ b/docs/sphinx/relnotes/v0.3.0-beta.rst @@ -17,7 +17,7 @@ .. _v0.3.0-beta: v0.3.0-beta -==== +=========== CV-CUDA 0.3.0 is the next open-source release of the project. @@ -58,7 +58,7 @@ Refer to documentation of the sample applications for dependencies. 
Known Issues/Limitations ------------- +------------------------ * Open compilation issue with CUDA Toolkit 11.2 + GCC 10.3 diff --git a/docs/sphinx/relnotes/v0.3.1-beta.rst b/docs/sphinx/relnotes/v0.3.1-beta.rst index c04a3d1f7..24058d6d7 100644 --- a/docs/sphinx/relnotes/v0.3.1-beta.rst +++ b/docs/sphinx/relnotes/v0.3.1-beta.rst @@ -17,7 +17,7 @@ .. _v0.3.1-beta: 0.3.1-beta -====== +========== The v0.3.1 release provides several bug fixes along with documentation updates and performance improvements. diff --git a/docs/sphinx/relnotes/v0.4.0-beta.rst b/docs/sphinx/relnotes/v0.4.0-beta.rst index 0f38b3138..1a0f4eed1 100644 --- a/docs/sphinx/relnotes/v0.4.0-beta.rst +++ b/docs/sphinx/relnotes/v0.4.0-beta.rst @@ -17,7 +17,7 @@ .. _v0.4.0-beta: v0.4.0-beta -====== +=========== CV-CUDA 0.4.0 is a major release of the library providing multiple new operators, Jetson Orin support, and updated API documentation. diff --git a/docs/sphinx/relnotes/v0.5.0-beta.rst b/docs/sphinx/relnotes/v0.5.0-beta.rst index a15c1b98b..57abe2f20 100644 --- a/docs/sphinx/relnotes/v0.5.0-beta.rst +++ b/docs/sphinx/relnotes/v0.5.0-beta.rst @@ -17,7 +17,7 @@ .. _v0.5.0-beta: v0.5.0-beta -====== +=========== CV-CUDA 0.5.0 is a comprehensive update introducing new security, compliance, and performance enhancements, alongside bug fixes and new features. diff --git a/docs/sphinx/relnotes/v0.6.0-beta.rst b/docs/sphinx/relnotes/v0.6.0-beta.rst index c199fb2bd..93731f881 100644 --- a/docs/sphinx/relnotes/v0.6.0-beta.rst +++ b/docs/sphinx/relnotes/v0.6.0-beta.rst @@ -17,7 +17,7 @@ .. _v0.6.0-beta: v0.6.0-beta -====== +=========== CV-CUDA 0.6.0 is a comprehensive update introducing new packaging and documentation enhancements, along with bug fixes and new features. diff --git a/docs/sphinx/relnotes/v0.7.0-beta.rst b/docs/sphinx/relnotes/v0.7.0-beta.rst index 196d236b4..4a155fad2 100644 --- a/docs/sphinx/relnotes/v0.7.0-beta.rst +++ b/docs/sphinx/relnotes/v0.7.0-beta.rst @@ -17,7 +17,7 @@ .. _v0.7.0-beta: v0.7.0-beta -====== +=========== CV-CUDA v0.7.0 introduces performance and support enhancements, along with bug fixes and new features. diff --git a/docs/sphinx/relnotes/v0.8.0-beta.rst b/docs/sphinx/relnotes/v0.8.0-beta.rst index 59e97fab6..4a5c3fb57 100644 --- a/docs/sphinx/relnotes/v0.8.0-beta.rst +++ b/docs/sphinx/relnotes/v0.8.0-beta.rst @@ -17,7 +17,7 @@ .. _v0.8.0-beta: v0.8.0-beta -====== +=========== Release Highlights ------------------ diff --git a/docs/sphinx/relnotes/v0.9.0-beta.rst b/docs/sphinx/relnotes/v0.9.0-beta.rst new file mode 100644 index 000000000..dbc18e95c --- /dev/null +++ b/docs/sphinx/relnotes/v0.9.0-beta.rst @@ -0,0 +1,68 @@ +.. + # SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +.. 
_v0.9.0-beta: + +v0.9.0-beta +=========== + +Release Highlights +------------------ + +CV-CUDA v0.9.0 includes the following changes: + +* **New Features**: + + * Improved Resize performance (up to 4x for u8 inputs, up to 3x for RGB8) + * Improved performance of cubic interpolation, eg in Rotate, WarpAffine and WarpPerspective (up to 2x faster) + * Added optional scaling to ResizeCropConvertReformat fused operator + * Improved structure of Python documentation and optimized its generation (>5min to <30s) by removing the Exhale index + * Added 64bit stride support to various operators + + * limited to 32bit strides to avoid performance regressions: AdaptiveThreshold, AdvCvtColor, AverageBlur, BilateralFilter, BrightnessContrast, ColorTwist, BoxBlur, CenterCrop, ConvertTo, CopyMakeBorder, CustomCrop, GaussianNoise, Gaussian, Flip, HistogramEq, JointBilateralFilter, Laplacian, Morphology, Normalize, RandomResizedCrop, Reformat, Remap, Resize, Rotate, SIFT, WarpAffine, WarpPerspective + +* **Bug Fixes**: + + * Added exception handling on CApi in Python: now forward C/C++exceptions to Python + * Fixed coordinate rounding bug in Resize operator with nearest neighbor interpolation + +Compatibility and Known Limitations +----------------------------------- + + * Documentation built on Ubuntu 20.04 needs an up-to-date version of sphinx (`pip install --upgrade sphinx`) as well as explicitly parsing the system's default python version `./ci/build_docs path/to/build -DPYTHON_VERSIONS=""`. + * Python bindings installed via Debian packages and Python tests fail with Numpy 2.0. We recommend using an older version of Numpy (e.g. 1.26) until we have implemented a fix. + * The Resize and RandomResizedCrop operators incorrectly interpolate pixel values near the boundary of an image or tensor when using linear and cubic interpolation. This will be fixed in an upcoming release. + + +See main README on `CV-CUDA GitHub `_. + +License +------- + +CV-CUDA is licensed under the `Apache 2.0 `_ license. + +Resources +--------- + +1. `CV-CUDA GitHub `_ +2. `CV-CUDA Increasing Throughput and Reducing Costs for AI-Based Computer Vision with CV-CUDA `_ +3. `NVIDIA Announces Microsoft, Tencent, Baidu Adopting CV-CUDA for Computer Vision AI `_ +4. `CV-CUDA helps Tencent Cloud audio and video PaaS platform achieve full-process GPU acceleration for video enhancement AI `_ + +Acknowledgements +---------------- + +CV-CUDA is developed jointly by NVIDIA and the ByteDance Machine Learning team. diff --git a/docs/sphinx/samples/cpp_samples/cropresize.rst b/docs/sphinx/samples/cpp_samples/cropresize.rst index 671733dae..0e10f8d88 100644 --- a/docs/sphinx/samples/cpp_samples/cropresize.rst +++ b/docs/sphinx/samples/cpp_samples/cropresize.rst @@ -62,7 +62,7 @@ The Tensor Buffer is then wrapped to create a Tensor Object for which we will ca We will use NvJpeg library to decode the images into the required color format and create a buffer on the device. -.. literalinclude:: ../../../samples/cropandresize/Main.cpp +.. 
literalinclude:: ../../../../samples/cropandresize/Main.cpp :language: cpp :start-after: Image Loading :end-before: The input buffer is now ready to be used diff --git a/docs/sphinx/samples/python_samples/classification/classification_pytorch.rst b/docs/sphinx/samples/python_samples/classification/classification_pytorch.rst index 1e4128b49..941c3b1e3 100644 --- a/docs/sphinx/samples/python_samples/classification/classification_pytorch.rst +++ b/docs/sphinx/samples/python_samples/classification/classification_pytorch.rst @@ -17,7 +17,7 @@ .. _classification_pytorch: Classification Inference Using PyTorch -==================== +====================================== The classification sample in CVCUDA uses the ``ResNet50`` deep learning model from the ``torchvision`` library. Since the model does not come with the softmax layer at the end, we are going to add one. The following code snippet shows how the model is setup for inference use case with PyTorch. diff --git a/docs/sphinx/samples/python_samples/classification/classification_tensorrt.rst b/docs/sphinx/samples/python_samples/classification/classification_tensorrt.rst index fda595278..afcb4e428 100644 --- a/docs/sphinx/samples/python_samples/classification/classification_tensorrt.rst +++ b/docs/sphinx/samples/python_samples/classification/classification_tensorrt.rst @@ -17,7 +17,7 @@ .. _classification_tensorrt: Classification Inference Using TensorRT -==================== +======================================= The classification sample in CVCUDA uses the ``ResNet50`` deep learning model from the ``torchvision`` library. Since the model does not come with the softmax layer at the end, we are going to add one. The following code snippet shows how the model is setup for inference use case with TensorRT. diff --git a/docs/sphinx/samples/python_samples/classification/postprocessor_cvcuda.rst b/docs/sphinx/samples/python_samples/classification/postprocessor_cvcuda.rst index 0d136430b..96abd4efc 100644 --- a/docs/sphinx/samples/python_samples/classification/postprocessor_cvcuda.rst +++ b/docs/sphinx/samples/python_samples/classification/postprocessor_cvcuda.rst @@ -14,10 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -.. _preprocessor_cvcuda: +.. _postprocessor_cvcuda_classification: Classification Post-processing Pipeline -==================== +======================================= The classification post processing pipeline is a relatively lightweight one with sorting being the only operation happening in it. diff --git a/docs/sphinx/samples/python_samples/classification/preprocessor_cvcuda.rst b/docs/sphinx/samples/python_samples/classification/preprocessor_cvcuda.rst index 480b25cc5..cb75a09f7 100644 --- a/docs/sphinx/samples/python_samples/classification/preprocessor_cvcuda.rst +++ b/docs/sphinx/samples/python_samples/classification/preprocessor_cvcuda.rst @@ -14,10 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -.. _preprocessor_cvcuda: +.. _preprocessor_cvcuda_classification: Classification Pre-processing Pipeline using CVCUDA -==================== +=================================================== CVCUDA helps accelerate the pre-processing pipeline of the classification sample tremendously. Easy interoperability with PyTorch tensors also makes it easy to integrate with PyTorch and other data loaders that supports the tensor layout. 
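The PyTorch interoperability described in the preprocessing docs above rests on both libraries sharing GPU memory via `__cuda_array_interface__`. Below is a minimal illustrative sketch of that hand-off, not the sample's actual code; the `nvcv.as_tensor`, `cvcuda.resize` and `cvcuda.reformat` names follow the Python API documented in this change, and their exact signatures are assumptions that may differ between releases.

```python
# Illustrative sketch only (not the sample's code): zero-copy hand-off of a
# PyTorch CUDA tensor to CV-CUDA and back. Operator names and signatures
# (as_tensor, resize, reformat) are assumed from the Python API documented
# above and may differ between releases.
import torch
import nvcv
import cvcuda

# A batch of HWC uint8 frames already resident on the GPU (NHWC layout).
frames = torch.randint(0, 256, (4, 480, 640, 3), dtype=torch.uint8, device="cuda")

# Wrap the PyTorch storage as an NVCV tensor without copying.
src = nvcv.as_tensor(frames, "NHWC")

# Run a CV-CUDA operator on it, e.g. resize to the network input size.
resized = cvcuda.resize(src, (4, 224, 224, 3), cvcuda.Interp.LINEAR)

# Reformat to NCHW for a typical PyTorch/TensorRT model input.
nchw = cvcuda.reformat(resized, "NCHW")

# Hand the result back to PyTorch, again without copying.
out = torch.as_tensor(nchw.cuda(), device="cuda")
print(out.shape)  # expected: torch.Size([4, 3, 224, 224])
```

Because both wrappers reuse the same device allocation, no host round-trip occurs anywhere in this chain, which is what makes the pre-processing pipeline cheap to insert in front of an existing PyTorch data loader.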
diff --git a/docs/sphinx/samples/python_samples/commons/imagebatchdecoder_nvcodec.rst b/docs/sphinx/samples/python_samples/commons/imagebatchdecoder_nvcodec.rst index edb14806e..0ac56c1bb 100644 --- a/docs/sphinx/samples/python_samples/commons/imagebatchdecoder_nvcodec.rst +++ b/docs/sphinx/samples/python_samples/commons/imagebatchdecoder_nvcodec.rst @@ -17,7 +17,7 @@ .. _imagebatchdecoder_nvcodec: Image Decoding using nvImageCodec -==================== +================================= The image batch decoder is responsible for parsing the input expression, reading and decoding image data. The actual decoding is done in batches using the library `nvImageCodec `_. Although used in the semantic segmentation sample, this image decoder is generic enough to be used in other applications. The code associated with this class can be found in the ``samples/common/python/nvcodec_utils.py`` file. diff --git a/docs/sphinx/samples/python_samples/commons/imagebatchencoder_nvcodec.rst b/docs/sphinx/samples/python_samples/commons/imagebatchencoder_nvcodec.rst index 3cdb507d7..09261c529 100644 --- a/docs/sphinx/samples/python_samples/commons/imagebatchencoder_nvcodec.rst +++ b/docs/sphinx/samples/python_samples/commons/imagebatchencoder_nvcodec.rst @@ -17,7 +17,7 @@ .. _imagebatchencoder_nvcodec: Image Encoding using nvImageCodec -==================== +================================= The image batch encoder is responsible for saving image tensors to the disk as JPG images. The actual encoding is done in batches using the `nvImageCodec `_ library. The image encoder is generic enough to be across the sample applications. The code associated with this class can be found in the ``samples/common/python/nvcodec_utils.py`` file. diff --git a/docs/sphinx/samples/python_samples/commons/videobatchdecoder_nvcodec.rst b/docs/sphinx/samples/python_samples/commons/videobatchdecoder_nvcodec.rst index 9219f0aef..1e088ed26 100644 --- a/docs/sphinx/samples/python_samples/commons/videobatchdecoder_nvcodec.rst +++ b/docs/sphinx/samples/python_samples/commons/videobatchdecoder_nvcodec.rst @@ -17,7 +17,7 @@ .. _videobatchdecoder_pyvideocodec: Video Decoding using pyNvVideoCodec -==================== +=================================== The video batch decoder is responsible for reading an MP4 video as tensors. The actual decoding is done per frame using NVIDIA's PyNvVideoCodec API. The video decoder is generic enough to be used across the sample applications. The code associated with this class can be found in the ``samples/common/python/nvcodec_utils.py`` file. diff --git a/docs/sphinx/samples/python_samples/commons/videobatchencoder_nvcodec.rst b/docs/sphinx/samples/python_samples/commons/videobatchencoder_nvcodec.rst index 96a75bf2b..4312e5360 100644 --- a/docs/sphinx/samples/python_samples/commons/videobatchencoder_nvcodec.rst +++ b/docs/sphinx/samples/python_samples/commons/videobatchencoder_nvcodec.rst @@ -17,7 +17,7 @@ .. _videobatchencoder_pyvideocodec: Video Encoding using VpyNvVideoCodecPF -==================== +====================================== The video batch encoder is responsible for writing tensors as an MP4 video. The actual encoding is done in batches using NVIDIA's pyNvVideoCodec. The video encoder is generic enough to be used across the sample applications. The code associated with this class can be found in the ``samples/common/python/nvcodec_utils.py`` file. 
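For orientation, the decoding path documented in these pages boils down to roughly the following sketch; it assumes the `nvidia.nvimgcodec` Python package is installed and that `Decoder.read` accepts a list of file paths (both are assumptions made here, not statements from the patch):

```python
from nvidia import nvimgcodec
import nvcv

decoder = nvimgcodec.Decoder()

# Decode a small batch of JPEG files; decoding happens on the GPU.
decoded = decoder.read(["image_0.jpg", "image_1.jpg"])

# Decoded images expose __cuda_array_interface__, so they can be wrapped
# zero-copy as HWC CV-CUDA tensors for the pre-processing pipeline.
tensors = [nvcv.as_tensor(img, "HWC") for img in decoded]
```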
@@ -35,7 +35,7 @@ The first class acts as a wrapper on the second class which allows us to: VideoBatchEncoderVPF ------------------- +-------------------- To get started, here is how the class is initialized in its ``__init__`` method. The encoder instance and CVCUDA color conversion tensors both are allocated when needed upon the first use. diff --git a/docs/sphinx/samples/python_samples/object_detection/objectdetection_tensorflow.rst b/docs/sphinx/samples/python_samples/object_detection/objectdetection_tensorflow.rst index 07223f3d3..129aebe5f 100644 --- a/docs/sphinx/samples/python_samples/object_detection/objectdetection_tensorflow.rst +++ b/docs/sphinx/samples/python_samples/object_detection/objectdetection_tensorflow.rst @@ -17,7 +17,7 @@ .. _objectdetection_tensorflow: Object Detection Inference Using TensorFlow -========================================== +=========================================== The object detection sample in CVCUDA uses the `Peoplenet Model `_ from NGC. The HDF5 model file is downloaded from NGC. We use appropriate GPU device with Keras to load the model. diff --git a/docs/sphinx/samples/python_samples/object_detection/postprocessor_cvcuda.rst b/docs/sphinx/samples/python_samples/object_detection/postprocessor_cvcuda.rst index 6491b09e6..59c20333a 100644 --- a/docs/sphinx/samples/python_samples/object_detection/postprocessor_cvcuda.rst +++ b/docs/sphinx/samples/python_samples/object_detection/postprocessor_cvcuda.rst @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -.. _preprocessor_cvcuda: +.. _postprocessor_cvcuda_object_detection: Object Detection Post-processing Pipeline using CVCUDA ====================================================== diff --git a/docs/sphinx/samples/python_samples/object_detection/preprocessor_cvcuda.rst b/docs/sphinx/samples/python_samples/object_detection/preprocessor_cvcuda.rst index 39863109f..7f39501af 100644 --- a/docs/sphinx/samples/python_samples/object_detection/preprocessor_cvcuda.rst +++ b/docs/sphinx/samples/python_samples/object_detection/preprocessor_cvcuda.rst @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -.. _preprocessor_cvcuda: +.. _preprocessor_cvcuda_object_detection: Object Detection Pre-processing Pipeline using CVCUDA ===================================================== diff --git a/docs/sphinx/samples/python_samples/segmentation.rst b/docs/sphinx/samples/python_samples/segmentation.rst index 53cf5b2eb..23a86893f 100644 --- a/docs/sphinx/samples/python_samples/segmentation.rst +++ b/docs/sphinx/samples/python_samples/segmentation.rst @@ -17,7 +17,7 @@ .. _segmentation: Semantic Segmentation -==================== +===================== In this example, we use CVCUDA to accelerate the pre and post processing pipelines in the deep learning inference use case involving a semantic segmentation model. The deep learning model can utilize either PyTorch or TensorRT to run the inference. The pre-processing pipeline converts the input into the format required by the input layer of the model whereas the post processing pipeline converts the output produced by the model into a visualization-friendly frame. We use the FCN ResNet101 model (from torchvision) to generate the predictions. This sample can work on a single image or a folder full of images or on a single video. All images have to be in the JPEG format and with the same dimensions unless run under the batch size of one. 
Video has to be in mp4 format with a fixed frame rate. We use the torchnvjpeg library to read the images and NVIDIA's Video Processing Framework (VPF) to read/write videos. diff --git a/docs/sphinx/samples/python_samples/segmentation/postprocessor_cvcuda.rst b/docs/sphinx/samples/python_samples/segmentation/postprocessor_cvcuda.rst index aa29e10c3..02e54d352 100644 --- a/docs/sphinx/samples/python_samples/segmentation/postprocessor_cvcuda.rst +++ b/docs/sphinx/samples/python_samples/segmentation/postprocessor_cvcuda.rst @@ -14,10 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -.. _preprocessor_cvcuda: +.. _postprocessor_cvcuda_segmentation: Semantic Segmentation Post-processing Pipeline using CVCUDA -==================== +=========================================================== CVCUDA helps accelerate the post-processing pipeline of the semantic segmentation sample tremendously. Easy interoperability with PyTorch tensors also makes it easy to integrate with PyTorch and other data loaders that supports the tensor layout. diff --git a/docs/sphinx/samples/python_samples/segmentation/preprocessor_cvcuda.rst b/docs/sphinx/samples/python_samples/segmentation/preprocessor_cvcuda.rst index 4a1c4412b..cc0649879 100644 --- a/docs/sphinx/samples/python_samples/segmentation/preprocessor_cvcuda.rst +++ b/docs/sphinx/samples/python_samples/segmentation/preprocessor_cvcuda.rst @@ -14,10 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -.. _preprocessor_cvcuda: +.. _preprocessor_cvcuda_segmentation: Semantic Segmentation Pre-processing Pipeline using CVCUDA -==================== +========================================================== CVCUDA helps accelerate the pre-processing pipeline of the semantic segmentation sample tremendously. Easy interoperability with PyTorch tensors also makes it easy to integrate with PyTorch and other data loaders that supports the tensor layout. diff --git a/docs/sphinx/samples/python_samples/segmentation/segmentation_pytorch.rst b/docs/sphinx/samples/python_samples/segmentation/segmentation_pytorch.rst index 34b6d5fdd..acf9de781 100644 --- a/docs/sphinx/samples/python_samples/segmentation/segmentation_pytorch.rst +++ b/docs/sphinx/samples/python_samples/segmentation/segmentation_pytorch.rst @@ -17,7 +17,7 @@ .. _segmentation_pytorch: Semantic Segmentation Inference Using PyTorch -==================== +============================================= The semantic segmentation sample in CVCUDA uses the ``fcn_resnet101`` deep learning model from the ``torchvision`` library. Since the model does not come with the softmax layer at the end, we are going to add one. The following code snippet shows how the model is setup for inference use case with PyTorch. diff --git a/docs/sphinx/samples/python_samples/segmentation/segmentation_tensorrt.rst b/docs/sphinx/samples/python_samples/segmentation/segmentation_tensorrt.rst index 03f995f81..211ff89ef 100644 --- a/docs/sphinx/samples/python_samples/segmentation/segmentation_tensorrt.rst +++ b/docs/sphinx/samples/python_samples/segmentation/segmentation_tensorrt.rst @@ -17,7 +17,7 @@ .. _segmentation_tensorrt: Semantic Segmentation Inference Using TensorRT -==================== +============================================== The semantic segmentation sample in CVCUDA uses the ``fcn_resnet101`` deep learning model from the ``torchvision`` library. 
Since the model does not come with the softmax layer at the end, we are going to add one. The following code snippet shows how the model is setup for inference use case with TensorRT. diff --git a/docs/sphinx/samples/python_samples/segmentation_triton.rst b/docs/sphinx/samples/python_samples/segmentation_triton.rst index f03e0dcc2..549edd597 100644 --- a/docs/sphinx/samples/python_samples/segmentation_triton.rst +++ b/docs/sphinx/samples/python_samples/segmentation_triton.rst @@ -28,12 +28,15 @@ Refer to the Segmentation sample documentation to understand the details of the Terminologies ------------- * Triton Server + Manages and deploys model at scale. Refer the Triton documentation to review all the features Triton has to offer. * Triton model repository + Triton model represents a inference workload that needs to be deployed. The triton server loads the model repository when started. * Triton Client + Triton client libraries facilitates communication with Triton using Python or C++ API. In this example we will demonstrate how to to the Python API to communicate with Triton using GRPC requests. @@ -63,22 +66,26 @@ Tutorial The model repository paths needs to conform to a layout specified below: - / - / - / - - config.pbtxt + .. code-block:: bash + + / + / + / + + config.pbtxt For the segmentation sample, we will create a model.py which creates a TritonPythonModel that runs the preprocess, inference and post process workloads. We will copy the necessary files and modules from the segmentation sample for preprocess, inference and postprocess stages and create the following folder structure: - triton_models/ - fcn_resnet101/ - 1/ - model.py - config.pbtxt + .. code-block:: bash + + triton_models/ + fcn_resnet101/ + 1/ + model.py + config.pbtxt Each model in the model repository must include a model configuration that provides the required and optional information about the model. Typically, this configuration is provided in a config.pbtxt diff --git a/python/mod_cvcuda/CMakeLists.txt b/python/mod_cvcuda/CMakeLists.txt index b533a105d..2ab990d14 100644 --- a/python/mod_cvcuda/CMakeLists.txt +++ b/python/mod_cvcuda/CMakeLists.txt @@ -88,6 +88,7 @@ nvcv_python_add_module( OpInpaint.cpp CvtColorUtil.cpp OpFindHomography.cpp + NormType.cpp ) target_link_libraries(cvcuda_module_python diff --git a/python/mod_cvcuda/Main.cpp b/python/mod_cvcuda/Main.cpp index f5c26574f..780773ba1 100644 --- a/python/mod_cvcuda/Main.cpp +++ b/python/mod_cvcuda/Main.cpp @@ -38,13 +38,6 @@ namespace py = pybind11; PYBIND11_MODULE(cvcuda, m) { - m.doc() = R"pbdoc( - CV-CUDA Python API reference - ======================== - - This is the Python API reference for the NVIDIA® CV-CUDA library. 
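The next hunks extend the ResizeCropConvertReformat bindings with optional `scale`, `offset`, and `srcCast` arguments (each resized and cropped value `v` becomes `v * scale + offset`). A minimal usage sketch of the extended Python signature, assuming an NHWC uint8 input and that `nvcv.RectI` takes x, y, width, height positionally:

```python
import numpy as np
import nvcv
import cvcuda

# Placeholder input batch: 4 NHWC uint8 images (contents left uninitialized here).
src = nvcv.Tensor((4, 480, 640, 3), np.uint8, "NHWC")

out = cvcuda.resize_crop_convert_reformat(
    src,
    (256, 256),                     # resize_dim as (width, height)
    cvcuda.Interp.LINEAR,
    nvcv.RectI(16, 16, 224, 224),   # crop_rect: x, y, width, height
    layout="NCHW",                  # reformat to planar layout
    data_type=nvcv.Type.F32,        # convert while writing the output
    scale=1.0 / 255.0,              # new: applied as v * scale + offset
    offset=0.0,
    srcCast=False,                  # new: keep interpolation results un-cast
)
```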
- )pbdoc"; - m.attr("__version__") = CVCUDA_VERSION_STRING; // Import all public names from nvcv @@ -76,23 +69,25 @@ PYBIND11_MODULE(cvcuda, m) using namespace cvcudapy; + // doctag: Non-Operators // Operators' auxiliary entities - ExportInterpolationType(m); - ExportChannelManipType(m); + ExportAdaptiveThresholdType(m); ExportBorderType(m); - ExportMorphologyType(m); - ExportColorConversionCode(m); - ExportRemapMapValueType(m); ExportBoxBlur(m); + ExportChannelManipType(m); ExportOSD(m); - ExportThresholdType(m); - ExportAdaptiveThresholdType(m); - ExportSIFTFlagType(m); + ExportColorConversionCode(m); ExportConnectivityType(m); + ExportInterpolationType(m); ExportLabelType(m); - ExportNormType(m); ExportPairwiseMatcherType(m); + ExportMorphologyType(m); + ExportNormType(m); + ExportRemapMapValueType(m); + ExportSIFTFlagType(m); + ExportThresholdType(m); + // doctag: Operators // CV-CUDA Operators ExportOpResizeCropConvertReformat(m); ExportOpPairwiseMatcher(m); diff --git a/python/mod_cvcuda/OpAdaptiveThreshold.cpp b/python/mod_cvcuda/OpAdaptiveThreshold.cpp index 30801fb5f..503820171 100644 --- a/python/mod_cvcuda/OpAdaptiveThreshold.cpp +++ b/python/mod_cvcuda/OpAdaptiveThreshold.cpp @@ -141,7 +141,7 @@ void ExportOpAdaptiveThreshold(py::module &m) cvcuda.adaptivethreshold_into(dst: nvcv.Tensor, src: nvcv.Tensor, max_value: double, adaptive_method: NVCVAdaptiveThresholdType = < NVCV_ADAPTIVE_TH RESH_MEAN_C >, threshold_type: NVCVThresholdType = < NVCV_THRESH_BINARY >, block_size: int, c: double, stream: Optional[nvcv.cuda.Stream] = None) - Executes the adaptive threshold operation on the given cuda stream. + Executes the adaptive threshold operation on the given cuda stream. See also: Refer to the CV-CUDA C API reference for the Composite operator @@ -169,6 +169,7 @@ RESH_MEAN_C >, threshold_type: NVCVThresholdType = < NVCV_THRESH_BINARY >, block "adaptive_method"_a = NVCV_ADAPTIVE_THRESH_MEAN_C, "threshold_type"_a = NVCV_THRESH_BINARY, "max_block_size"_a, "block_size"_a, "c"_a, py::kw_only(), "stream"_a = nullptr, R"pbdoc( cvcuda.adaptivethreshold(src: nvcv.ImageBatchVarShape, max_value: nvcv.Tensor, adaptive_method: NVCVAdaptiveThresholdType = < NVCV_ADAPTIVE_THRESH_MEAN_C >, threshold_type: NVCVThresholdType = < NVCV_THRESH_BINARY > , block_size: int, c: double, stream: Optional[nvcv.cuda.Stream] = None) -> nvcv.ImageBatchVarShape + Executes the adaptive threshold operation on the given cuda stream. See also: diff --git a/python/mod_cvcuda/OpAdvCvtColor.cpp b/python/mod_cvcuda/OpAdvCvtColor.cpp index c9a4eff2e..d2b3cc04d 100644 --- a/python/mod_cvcuda/OpAdvCvtColor.cpp +++ b/python/mod_cvcuda/OpAdvCvtColor.cpp @@ -103,8 +103,8 @@ Tensor AdvCvtColor(Tensor &input, NVCVColorConversionCode code, NVCVColorSpec sp void ExportOpAdvCvtColor(py::module &m) { using namespace pybind11::literals; - - m.def("advcvtcolor", &AdvCvtColor, "src"_a, "code"_a, "spec"_a, py::kw_only(), "stream"_a = nullptr, R"pbdoc( + m.def("advcvtcolor", &AdvCvtColor, "src"_a, "code"_a, "spec"_a, py::kw_only(), "stream"_a = nullptr, + R"pbdoc( Executes the Adv Cvt Color operation on the given cuda stream. 
diff --git a/python/mod_cvcuda/OpResizeCropConvertReformat.cpp b/python/mod_cvcuda/OpResizeCropConvertReformat.cpp index a99c5072f..3dcb22f34 100644 --- a/python/mod_cvcuda/OpResizeCropConvertReformat.cpp +++ b/python/mod_cvcuda/OpResizeCropConvertReformat.cpp @@ -32,7 +32,8 @@ namespace cvcudapy { namespace { Tensor ResizeCropConvertReformatInto(Tensor &dst, Tensor &src, const std::tuple resizeDim, NVCVInterpolationType interp, const std::tuple cropPos, - const NVCVChannelManip manip, std::optional pstream) + const NVCVChannelManip manip, const float scale, const float offset, + const bool srcCast, std::optional pstream) { if (!pstream) { @@ -49,14 +50,15 @@ Tensor ResizeCropConvertReformatInto(Tensor &dst, Tensor &src, const std::tuple< nvcv::Size2D size_wh{std::get<0>(resizeDim), std::get<1>(resizeDim)}; int2 crop_xy{std::get<0>(cropPos), std::get<1>(cropPos)}; - resize->submit(pstream->cudaHandle(), src, dst, size_wh, interp, crop_xy, manip); + resize->submit(pstream->cudaHandle(), src, dst, size_wh, interp, crop_xy, manip, scale, offset, srcCast); return std::move(dst); } Tensor ResizeCropConvertReformat(Tensor &src, const std::tuple resizeDim, NVCVInterpolationType interp, const NVCVRectI cropRect, const char *layout, nvcv::DataType dataType, - const NVCVChannelManip manip, std::optional pstream) + const NVCVChannelManip manip, const float scale, const float offset, + const bool srcCast, std::optional pstream) { nvcv::TensorLayout srcLayout = src.layout(); @@ -98,12 +100,13 @@ Tensor ResizeCropConvertReformat(Tensor &src, const std::tuple resizeD const std::tuple cropPos = std::make_tuple((int)cropRect.x, (int)cropRect.y); - return ResizeCropConvertReformatInto(dst, src, resizeDim, interp, cropPos, manip, pstream); + return ResizeCropConvertReformatInto(dst, src, resizeDim, interp, cropPos, manip, scale, offset, srcCast, pstream); } Tensor ResizeCropConvertReformatVarShapeInto(Tensor &dst, ImageBatchVarShape &src, const std::tuple resizeDim, NVCVInterpolationType interp, const std::tuple cropPos, - const NVCVChannelManip manip, std::optional pstream) + const NVCVChannelManip manip, const float scale, const float offset, + const bool srcCast, std::optional pstream) { if (!pstream) { @@ -120,15 +123,15 @@ Tensor ResizeCropConvertReformatVarShapeInto(Tensor &dst, ImageBatchVarShape &sr nvcv::Size2D size_wh(std::get<0>(resizeDim), std::get<1>(resizeDim)); int2 crop_xy{std::get<0>(cropPos), std::get<1>(cropPos)}; - resize->submit(pstream->cudaHandle(), src, dst, size_wh, interp, crop_xy, manip); + resize->submit(pstream->cudaHandle(), src, dst, size_wh, interp, crop_xy, manip, scale, offset, srcCast); return std::move(dst); } Tensor ResizeCropConvertReformatVarShape(ImageBatchVarShape &src, const std::tuple resizeDim, NVCVInterpolationType interp, const NVCVRectI cropRect, const char *layout, - nvcv::DataType dataType, const NVCVChannelManip manip, - std::optional pstream) + nvcv::DataType dataType, const NVCVChannelManip manip, const float scale, + const float offset, const bool srcCast, std::optional pstream) { const nvcv::ImageFormat srcFrmt = src.uniqueFormat(); if (!srcFrmt) @@ -188,7 +191,8 @@ Tensor ResizeCropConvertReformatVarShape(ImageBatchVarShape &src, const std::tup const std::tuple cropPos = std::make_tuple((int)cropRect.x, (int)cropRect.y); - return ResizeCropConvertReformatVarShapeInto(dst, src, resizeDim, interp, cropPos, manip, pstream); + return ResizeCropConvertReformatVarShapeInto(dst, src, resizeDim, interp, cropPos, manip, scale, offset, srcCast, + pstream); } } // 
namespace @@ -202,7 +206,8 @@ void ExportOpResizeCropConvertReformat(py::module &m) m.def("resize_crop_convert_reformat", &ResizeCropConvertReformat, "src"_a, "resize_dim"_a, "interp"_a, "crop_rect"_a, py::kw_only(), "layout"_a = "", "data_type"_a = NVCV_DATA_TYPE_NONE, - "manip"_a = NVCV_CHANNEL_NO_OP, "stream"_a = nullptr, R"pbdoc( + "manip"_a = NVCV_CHANNEL_NO_OP, "scale"_a = 1.0, "offset"_a = 0.0, "srcCast"_a = true, "stream"_a = nullptr, + R"pbdoc( cvcuda.resize_crop_convert_reformat(src: nvcv.Tensor, resize_dim: tuple[int,int], @@ -212,6 +217,9 @@ void ExportOpResizeCropConvertReformat(py::module &m) layout: str = "", data_type: nvcv.Type = 0, manip: cvcuda.ChannelManip = cvcuda.ChannelManip.NO_OP, + scale: float = 1.0, + offset: float = 0.0, + srcCast: bool = True, stream: Optional[nvcv.cuda.Stream] = None) -> nvcv.Tensor Executes the ResizeCropConvertReformat operation on the given cuda stream. @@ -233,6 +241,15 @@ void ExportOpResizeCropConvertReformat(py::module &m) indicates output tensor data type copies input. manip(cvcuda.ChannelManip, optional): Channel manipulation (e.g., shuffle RGB to BGR). NO_OP (default) indicates output tensor channels are unchanged. + scale(float, optional): Scale (i.e., multiply) the output values by this amount. 1.0 (default) results + in no scaling of the output values. + offset(float, optional): Offset (i.e., add to) the output values by this amount. This is applied after + scaling. Let v be a resized and cropped value, then v * scale + offset is final + output value. 0.0 (default) results in no offset being added to the output. + srcCast(bool, optional): Boolean indicating whether or not the resize interpolation results are re-cast + back to the input (or source) data type. Refer to the C API reference for more + information. True (default) re-cast resize interpolation results back to the + source data type. stream (nvcv.cuda.Stream, optional): CUDA Stream on which to perform the operation. Returns: @@ -244,7 +261,8 @@ void ExportOpResizeCropConvertReformat(py::module &m) )pbdoc"); m.def("resize_crop_convert_reformat_into", &ResizeCropConvertReformatInto, "dst"_a, "src"_a, "resize_dim"_a, - "interp"_a, "cropPos"_a, py::kw_only(), "manip"_a = NVCV_CHANNEL_NO_OP, "stream"_a = nullptr, R"pbdoc( + "interp"_a, "cropPos"_a, py::kw_only(), "manip"_a = NVCV_CHANNEL_NO_OP, "scale"_a = 1.0, "offset"_a = 0.0, + "srcCast"_a = true, "stream"_a = nullptr, R"pbdoc( cvcuda.resize_crop_convert_reformat_into(dst: nvcv.Tensor, src: nvcv.Tensor, @@ -253,6 +271,9 @@ void ExportOpResizeCropConvertReformat(py::module &m) cropPos: tuple[int,int], *, manip: cvcuda.ChannelManip = cvcuda.ChannelManip.NO_OP, + scale: float = 1.0, + offset: float = 0.0, + srcCast: bool = True, stream: Optional[nvcv.cuda.Stream] = None) Executes the ResizeCropConvertReformat operation on the given cuda stream. @@ -274,6 +295,15 @@ void ExportOpResizeCropConvertReformat(py::module &m) output tensor's width & height. manip(cvcuda.ChannelManip, optional): Channel manipulation (e.g., shuffle RGB to BGR). NO_OP (default) indicates output tensor channels are unchanged. + scale(float, optional): Scale (i.e., multiply) the output values by this amount. 1.0 (default) results + in no scaling of the output values. + offset(float, optional): Offset (i.e., add to) the output values by this amount. This is applied after + scaling. Let v be a resized and cropped value, then v * scale + offset is final + output value. 0.0 (default) results in no offset being added to the output. 
+ srcCast(bool, optional): Boolean indicating whether or not the resize interpolation results are re-cast + back to the input (or source) data type. Refer to the C API reference for more + information. True (default) re-cast resize interpolation results back to the + source data type. stream (nvcv.cuda.Stream, optional): CUDA Stream on which to perform the operation. Returns: @@ -286,7 +316,8 @@ void ExportOpResizeCropConvertReformat(py::module &m) m.def("resize_crop_convert_reformat", &ResizeCropConvertReformatVarShape, "src"_a, "resize_dim"_a, "interp"_a, "crop_rect"_a, py::kw_only(), "layout"_a = "", "data_type"_a = NVCV_DATA_TYPE_NONE, - "manip"_a = NVCV_CHANNEL_NO_OP, "stream"_a = nullptr, R"pbdoc( + "manip"_a = NVCV_CHANNEL_NO_OP, "scale"_a = 1.0, "offset"_a = 0.0, "srcCast"_a = true, "stream"_a = nullptr, + R"pbdoc( cvcuda.resizeCropConvertReformat(src: nvcv.ImageBatchVarShape, resize_dim: tuple[int,int], @@ -296,6 +327,9 @@ void ExportOpResizeCropConvertReformat(py::module &m) layout: str = "", data_type: nvcv.Type = 0, manip: cvcuda.ChannelManip = cvcuda.ChannelManip.NO_OP, + scale: float = 1.0, + offset: float = 0.0, + srcCast: bool = True, stream: Optional[nvcv.cuda.Stream] = None) -> nvcv.Tensor Executes the ResizeCropConvertReformat operation on the given cuda stream. @@ -318,6 +352,15 @@ void ExportOpResizeCropConvertReformat(py::module &m) indicates output tensor data type copies input. manip(cvcuda.ChannelManip, optional): Channel manipulation (e.g., shuffle RGB to BGR). NO_OP (default) indicates output tensor channels are unchanged. + scale(float, optional): Scale (i.e., multiply) the output values by this amount. 1.0 (default) results + in no scaling of the output values. + offset(float, optional): Offset (i.e., add to) the output values by this amount. This is applied after + scaling. Let v be a resized and cropped value, then v * scale + offset is final + output value. 0.0 (default) results in no offset being added to the output. + srcCast(bool, optional): Boolean indicating whether or not the resize interpolation results are re-cast + back to the input (or source) data type. Refer to the C API reference for more + information. True (default) re-cast resize interpolation results back to the + source data type. stream (nvcv.cuda.Stream, optional): CUDA Stream on which to perform the operation. Returns: @@ -329,7 +372,8 @@ void ExportOpResizeCropConvertReformat(py::module &m) )pbdoc"); m.def("resize_crop_convert_reformat_into", &ResizeCropConvertReformatVarShapeInto, "dst"_a, "src"_a, "resize_dim"_a, - "interp"_a, "cropPos"_a, py::kw_only(), "manip"_a = NVCV_CHANNEL_NO_OP, "stream"_a = nullptr, R"pbdoc( + "interp"_a, "cropPos"_a, py::kw_only(), "manip"_a = NVCV_CHANNEL_NO_OP, "scale"_a = 1.0, "offset"_a = 0.0, + "srcCast"_a = true, "stream"_a = nullptr, R"pbdoc( cvcuda.resize_crop_convert_reformat_into(dst: nvcv.Tensor, src: nvcv.ImageBatchVarShape, @@ -338,6 +382,9 @@ void ExportOpResizeCropConvertReformat(py::module &m) cropPos: tuple[int,int], *, manip: cvcuda.ChannelManip = cvcuda.ChannelManip.NO_OP, + scale: float = 1.0, + offset: float = 0.0, + srcCast: bool = True, stream: Optional[nvcv.cuda.Stream] = None) Executes the ResizeCropConvertReformat operation on the given cuda stream. @@ -360,6 +407,15 @@ void ExportOpResizeCropConvertReformat(py::module &m) the output tensor's width & height. manip(cvcuda.ChannelManip, optional): Channel manipulation (e.g., shuffle RGB to BGR). NO_OP (default) indicates output tensor channels are unchanged. 
+ scale(float, optional): Scale (i.e., multiply) the output values by this amount. 1.0 (default) results + in no scaling of the output values. + offset(float, optional): Offset (i.e., add to) the output values by this amount. This is applied after + scaling. Let v be a resized and cropped value, then v * scale + offset is final + output value. 0.0 (default) results in no offset being added to the output. + srcCast(bool, optional): Boolean indicating whether or not the resize interpolation results are re-cast + back to the input (or source) data type. Refer to the C API reference for more + information. True (default) re-cast resize interpolation results back to the + source data type. stream (nvcv.cuda.Stream, optional): CUDA Stream on which to perform the operation. Returns: diff --git a/python/mod_cvcuda/OsdElement.cpp b/python/mod_cvcuda/OsdElement.cpp index e47730fef..d1b00de37 100644 --- a/python/mod_cvcuda/OsdElement.cpp +++ b/python/mod_cvcuda/OsdElement.cpp @@ -97,7 +97,7 @@ void ExportBoxBlur(py::module &m) using namespace py::literals; using namespace cvcuda::priv; - py::class_(m, "BlurBoxI") + py::class_(m, "BlurBoxI", "BlurBoxI") .def(py::init( [](py::tuple box, int kernelSize) { @@ -121,7 +121,7 @@ void ExportOSD(py::module &m) using namespace py::literals; using namespace cvcuda::priv; - py::class_(m, "BndBoxI") + py::class_(m, "BndBoxI", "BndBoxI") .def(py::init( [](py::tuple box, int thickness, py::tuple borderColor, py::tuple fillColor) { @@ -168,7 +168,7 @@ void ExportOSD(py::module &m) }), "box"_a, "thickness"_a, "segArray"_a, "segThreshold"_a, "borderColor"_a, "segColor"_a); - py::class_(m, "Point") + py::class_(m, "Point", "Point") .def(py::init( [](py::tuple centerPos, int32_t radius, py::tuple color) { @@ -183,7 +183,7 @@ void ExportOSD(py::module &m) .def_readonly("radius", &NVCVPoint::radius, "Point size.") .def_readonly("color", &NVCVPoint::color, "Point color."); - py::class_(m, "Line") + py::class_(m, "Line", "Line") .def(py::init( [](py::tuple pos0, py::tuple pos1, int32_t thickness, py::tuple color, bool interpolation) { @@ -218,7 +218,7 @@ void ExportOSD(py::module &m) }), "points"_a, "thickness"_a, "isClosed"_a, "borderColor"_a, "fillColor"_a, py::arg("interpolation") = true); - py::class_(m, "RotatedBox") + py::class_(m, "RotatedBox", "RotatedBox") .def(py::init( [](py::tuple centerPos, int32_t width, int32_t height, float yaw, int32_t thickness, py::tuple borderColor, py::tuple bgColor, bool interpolation) @@ -245,7 +245,7 @@ void ExportOSD(py::module &m) .def_readonly("bgColor", &NVCVRotatedBox::bgColor, "Circle filled color.") .def_readonly("interpolation", &NVCVRotatedBox::interpolation, "Default: false."); - py::class_(m, "Circle") + py::class_(m, "Circle", "Circle") .def(py::init( [](py::tuple centerPos, int32_t radius, int32_t thickness, py::tuple borderColor, py::tuple bgColor) { @@ -264,7 +264,7 @@ void ExportOSD(py::module &m) .def_readonly("borderColor", &NVCVCircle::borderColor, "Circle border color.") .def_readonly("bgColor", &NVCVCircle::bgColor, "Circle filled color."); - py::class_(m, "Arrow") + py::class_(m, "Arrow", "Arrow") .def(py::init( [](py::tuple pos0, py::tuple pos1, int32_t arrowSize, int32_t thickness, py::tuple color, bool interpolation) diff --git a/python/mod_nvcv/Array.cpp b/python/mod_nvcv/Array.cpp index 5a39e91ef..710ad7bec 100644 --- a/python/mod_nvcv/Array.cpp +++ b/python/mod_nvcv/Array.cpp @@ -322,7 +322,7 @@ void Array::Export(py::module &m) using ResizeArrayLengthPtr = std::shared_ptr (*)(Array &, int64_t DataType); 
using ResizeArrayShapePtr = std::shared_ptr (*)(Array &, Shape); - py::class_, Container>(m, "Array") + py::class_, Container>(m, "Array", "Array") .def(py::init(static_cast(&Array::Create)), "length"_a, "dtype"_a, "Create a Array object with the given length and data type.") .def(py::init(static_cast(&Array::Create)), "shape"_a, "dtype"_a, diff --git a/python/mod_nvcv/CAPI.cpp b/python/mod_nvcv/CAPI.cpp index b31fc27ec..f573704ab 100644 --- a/python/mod_nvcv/CAPI.cpp +++ b/python/mod_nvcv/CAPI.cpp @@ -36,6 +36,19 @@ namespace nvcvpy::priv { namespace { +// We need to catch any exceptions and set the appropriate PyError prior to crossing any C API boundry +#define CATCH_RETURN_DEFAULT(return_value, error_message) \ + catch (const std::exception &e) \ + { \ + PyErr_SetString(PyExc_ValueError, (std::string(error_message) + ": " + e.what()).c_str()); \ + return return_value; \ + } \ + catch (...) \ + { \ + PyErr_SetString(PyExc_ValueError, error_message); \ + return return_value; \ + } + template std::shared_ptr ToSharedObj(PyObject *obj) { @@ -50,34 +63,58 @@ T ToObj(PyObject *obj) extern "C" PyObject *ImplDataType_ToPython(NVCVDataType p) { - py::object obj = py::cast(nvcv::DataType(p)); - return obj.ptr(); + try + { + py::object obj = py::cast(nvcv::DataType(p)); + return obj.ptr(); + } + CATCH_RETURN_DEFAULT(nullptr, "Casting PyObject from NVCVDataType failed") } extern "C" NVCVDataType ImplDataType_FromPython(PyObject *obj) { - return ToObj(obj); + try + { + return ToObj(obj); + } + CATCH_RETURN_DEFAULT(0, "Casting nvcv::DataType from PyObject failed") } extern "C" PyObject *ImplImageFormat_ToPython(NVCVImageFormat p) { - py::object obj = py::cast(nvcv::ImageFormat(p)); - return obj.ptr(); + try + { + py::object obj = py::cast(nvcv::ImageFormat(p)); + return obj.ptr(); + } + CATCH_RETURN_DEFAULT(nullptr, "Casting PyObject from NVCVImageFormat failed") } extern "C" NVCVImageFormat ImplImageFormat_FromPython(PyObject *obj) { - return ToObj(obj); + try + { + return ToObj(obj); + } + CATCH_RETURN_DEFAULT(0, "Casting nvcv::ImageFormat from PyObject failed") } extern "C" NVCVTensorHandle ImplTensor_GetHandle(PyObject *obj) { - return ToSharedObj(obj)->impl().handle(); + try + { + return ToSharedObj(obj)->impl().handle(); + } + CATCH_RETURN_DEFAULT(0, "Getting Tensor handle from PyObject failed") } extern "C" NVCVArrayHandle ImplArray_GetHandle(PyObject *obj) { - return ToSharedObj(obj)->impl().handle(); + try + { + return ToSharedObj(obj)->impl().handle(); + } + CATCH_RETURN_DEFAULT(0, "Getting Array handle from PyObject failed") } LockMode ToLockMode(PyObject *_mode) @@ -107,180 +144,273 @@ LockMode ToLockMode(PyObject *_mode) extern "C" void ImplResource_SubmitSync(PyObject *res, PyObject *stream) { - ToSharedObj(res)->submitSync(*ToSharedObj(stream)); + try + { + ToSharedObj(res)->submitSync(*ToSharedObj(stream)); + } + CATCH_RETURN_DEFAULT(, "Submit sync failed") } extern "C" void ImplStream_HoldResources(PyObject *stream, PyObject *resourceList) { - py::list resList = ToObj(resourceList); + try + { + py::list resList = ToObj(resourceList); - LockResources resVector; + LockResources resVector; - for (py::handle h : resList) - { - py::tuple t = h.cast(); - if (t.size() != 2) + for (py::handle h : resList) { - throw std::runtime_error("ResourcePerMode tuple must have two elements"); - } + py::tuple t = h.cast(); + if (t.size() != 2) + { + throw std::runtime_error("ResourcePerMode tuple must have two elements"); + } - auto lockMode = ToLockMode(t[0].ptr()); - auto res = 
ToSharedObj(t[1].ptr()); + auto lockMode = ToLockMode(t[0].ptr()); + auto res = ToSharedObj(t[1].ptr()); - resVector.emplace(lockMode, res); - } + resVector.emplace(lockMode, res); + } - ToSharedObj(stream)->holdResources(std::move(resVector)); + ToSharedObj(stream)->holdResources(std::move(resVector)); + } + CATCH_RETURN_DEFAULT(, "Hold resources failed") } extern "C" PyObject *ImplStream_GetCurrent() { - return py::cast(Stream::Current().shared_from_this()).ptr(); + try + { + return py::cast(Stream::Current().shared_from_this()).ptr(); + } + CATCH_RETURN_DEFAULT(nullptr, "Get current stream failed") } extern "C" cudaStream_t ImplStream_GetCudaHandle(PyObject *stream) { - return ToSharedObj(stream)->handle(); + try + { + return ToSharedObj(stream)->handle(); + } + CATCH_RETURN_DEFAULT(0, "Get cuda handle failed") } extern "C" PyObject *ImplTensor_Create(int32_t ndim, const int64_t *shape, NVCVDataType dtype, NVCVTensorLayout layout, int32_t rowalign) { - std::optional cxxLayout; - if (layout != NVCV_TENSOR_NONE) + try { - cxxLayout = nvcv::TensorLayout(layout); - } - - std::shared_ptr tensor = Tensor::Create(CreateShape(nvcv::TensorShape(shape, ndim, layout)), - nvcv::DataType{dtype}, std::move(layout), rowalign); + std::optional cxxLayout; + if (layout != NVCV_TENSOR_NONE) + { + cxxLayout = nvcv::TensorLayout(layout); + } - return py::cast(std::move(tensor)).release().ptr(); + std::shared_ptr tensor = Tensor::Create(CreateShape(nvcv::TensorShape(shape, ndim, layout)), + nvcv::DataType{dtype}, std::move(layout), rowalign); + return py::cast(std::move(tensor)).release().ptr(); + } + CATCH_RETURN_DEFAULT(nullptr, "Tensor create failed") } extern "C" PyObject *ImplArray_Create(int64_t length, NVCVDataType dtype) { - std::shared_ptr array = Array::Create(length, nvcv::DataType{dtype}); + try + { + std::shared_ptr array = Array::Create(length, nvcv::DataType{dtype}); - return py::cast(std::move(array)).release().ptr(); + return py::cast(std::move(array)).release().ptr(); + } + CATCH_RETURN_DEFAULT(nullptr, "Array create failed") } extern "C" PyObject *ImplImageBatchVarShape_Create(int32_t capacity) { - std::shared_ptr varshape = ImageBatchVarShape::Create(capacity); - return py::cast(std::move(varshape)).release().ptr(); + try + { + std::shared_ptr varshape = ImageBatchVarShape::Create(capacity); + return py::cast(std::move(varshape)).release().ptr(); + } + CATCH_RETURN_DEFAULT(nullptr, "ImageBatchVarShape create failed") } extern "C" NVCVImageBatchHandle ImplImageBatchVarShape_GetHandle(PyObject *varshape) { - return ToSharedObj(varshape)->impl().handle(); + try + { + return ToSharedObj(varshape)->impl().handle(); + } + CATCH_RETURN_DEFAULT(0, "ImageBatchVarShape get handle failed") } extern "C" PyObject *ImplTensor_CreateForImageBatch(int32_t numImages, int32_t width, int32_t height, NVCVImageFormat fmt, int32_t rowalign) { - std::shared_ptr tensor - = Tensor::CreateForImageBatch(numImages, {width, height}, nvcv::ImageFormat(fmt), rowalign); - return py::cast(std::move(tensor)).release().ptr(); + try + { + std::shared_ptr tensor + = Tensor::CreateForImageBatch(numImages, {width, height}, nvcv::ImageFormat(fmt), rowalign); + return py::cast(std::move(tensor)).release().ptr(); + } + CATCH_RETURN_DEFAULT(nullptr, "Tensor for ImageBatch create failed") } extern "C" void ImplImageBatchVarShape_PushBack(PyObject *varshape, PyObject *image) { - auto pimage = ToSharedObj(image); - return ToSharedObj(varshape)->pushBack(*pimage); + try + { + auto pimage = ToSharedObj(image); + return 
ToSharedObj(varshape)->pushBack(*pimage); + } + CATCH_RETURN_DEFAULT(, "ImageBatchVarShape push back failed") } extern "C" void ImplImageBatchVarShape_PopBack(PyObject *varshape, int32_t cnt) { - return ToSharedObj(varshape)->popBack(cnt); + try + { + return ToSharedObj(varshape)->popBack(cnt); + } + CATCH_RETURN_DEFAULT(, "ImageBatchVarShape pop back failed") } extern "C" void ImplImageBatchVarShape_Clear(PyObject *varshape) { - return ToSharedObj(varshape)->clear(); + try + { + return ToSharedObj(varshape)->clear(); + } + CATCH_RETURN_DEFAULT(, "ImageBatchVarShape clear failed") } extern "C" PyObject *ImplTensorBatch_Create(int32_t capacity) { - std::shared_ptr tensorBatch = TensorBatch::Create(capacity); - return py::cast(std::move(tensorBatch)).release().ptr(); + try + { + std::shared_ptr tensorBatch = TensorBatch::Create(capacity); + return py::cast(std::move(tensorBatch)).release().ptr(); + } + CATCH_RETURN_DEFAULT(nullptr, "TensorBatch create failed") } extern "C" NVCVTensorBatchHandle ImplTensorBatch_GetHandle(PyObject *tensorBatch) { - return ToSharedObj(tensorBatch)->impl().handle(); + try + { + return ToSharedObj(tensorBatch)->impl().handle(); + } + CATCH_RETURN_DEFAULT(0, "TensorBatch get handle failed") } extern "C" void ImplTensorBatch_PushBack(PyObject *tensorBatch, PyObject *tensor) { - auto ptensor = ToSharedObj(tensor); - ToSharedObj(tensorBatch)->pushBack(*ptensor); + try + { + auto ptensor = ToSharedObj(tensor); + ToSharedObj(tensorBatch)->pushBack(*ptensor); + } + CATCH_RETURN_DEFAULT(, "TensorBatch push back failed") } extern "C" void ImplTensorBatch_PopBack(PyObject *tensorBatch, uint32_t cnt) { - ToSharedObj(tensorBatch)->popBack(cnt); + try + { + ToSharedObj(tensorBatch)->popBack(cnt); + } + CATCH_RETURN_DEFAULT(, "TensorBatch pop back failed") } extern "C" void ImplTensorBatch_Clear(PyObject *tensorBatch) { - ToSharedObj(tensorBatch)->clear(); + try + { + ToSharedObj(tensorBatch)->clear(); + } + CATCH_RETURN_DEFAULT(, "TensorBatch clear failed") } extern "C" void ImplCache_Add(ICacheItem *extItem) { - auto item = std::make_shared(extItem->shared_from_this()); - Cache::Instance().add(*item); + try + { + auto item = std::make_shared(extItem->shared_from_this()); + Cache::Instance().add(*item); + } + CATCH_RETURN_DEFAULT(, "Cache add item failed") } extern "C" ICacheItem **ImplCache_Fetch(const IKey *pkey) { - NVCV_ASSERT(pkey != nullptr); + try + { + NVCV_ASSERT(pkey != nullptr); - std::vector> vcont = Cache::Instance().fetch(*pkey); + std::vector> vcont = Cache::Instance().fetch(*pkey); - std::unique_ptr out(new ICacheItem *[vcont.size() + 1]); - for (size_t i = 0; i < vcont.size(); ++i) - { - ExternalCacheItem *extItem = dynamic_cast(vcont[i].get()); - NVCV_ASSERT(extItem != nullptr); + std::unique_ptr out(new ICacheItem *[vcont.size() + 1]); + for (size_t i = 0; i < vcont.size(); ++i) + { + ExternalCacheItem *extItem = dynamic_cast(vcont[i].get()); + NVCV_ASSERT(extItem != nullptr); - out[i] = extItem->obj.get(); - } - out[vcont.size()] = nullptr; // end of list + out[i] = extItem->obj.get(); + } + out[vcont.size()] = nullptr; // end of list - return out.release(); + return out.release(); + } + CATCH_RETURN_DEFAULT(nullptr, "Cache add fetch failed") } extern "C" PyObject *ImplImage_Create(int32_t width, int32_t height, NVCVImageFormat fmt, int32_t rowAlign) { - std::shared_ptr img = Image::Create({width, height}, nvcv::ImageFormat{fmt}, rowAlign); - return py::cast(std::move(img)).release().ptr(); + try + { + std::shared_ptr img = Image::Create({width, height}, 
nvcv::ImageFormat{fmt}, rowAlign); + return py::cast(std::move(img)).release().ptr(); + } + CATCH_RETURN_DEFAULT(nullptr, "Image create failed") } extern "C" NVCVImageHandle ImplImage_GetHandle(PyObject *img) { - return ToSharedObj(img)->impl().handle(); + try + { + return ToSharedObj(img)->impl().handle(); + } + CATCH_RETURN_DEFAULT(0, "Image get handle failed") } extern "C" PyObject *ImplContainer_Create(nvcvpy::Container *pcont) { - NVCV_ASSERT(pcont != nullptr); - auto cont = std::make_shared(*pcont); + try + { + NVCV_ASSERT(pcont != nullptr); + auto cont = std::make_shared(*pcont); - py::object ocont = py::cast(cont); - return ocont.release().ptr(); + py::object ocont = py::cast(cont); + return ocont.release().ptr(); + } + CATCH_RETURN_DEFAULT(nullptr, "Container create failed") } extern "C" void ImplCache_RemoveAllNotInUseMatching(const IKey *pkey) { - NVCV_ASSERT(pkey != nullptr); + try + { + NVCV_ASSERT(pkey != nullptr); - Cache::Instance().removeAllNotInUseMatching(*pkey); + Cache::Instance().removeAllNotInUseMatching(*pkey); + } + CATCH_RETURN_DEFAULT(, "Cache cleanup failed when removing all not in use matching") } } // namespace +// Note these functions will set a PyError if an exception is thrown, this must be then checked by calling +// CheckCAPIError() before returning to Python. void ExportCAPI(py::module &m) { static CAPI capi = { diff --git a/python/mod_nvcv/DataType.cpp b/python/mod_nvcv/DataType.cpp index ac1c42908..10baef79e 100644 --- a/python/mod_nvcv/DataType.cpp +++ b/python/mod_nvcv/DataType.cpp @@ -221,7 +221,8 @@ py::dtype SelectDType(std::tuple, const nvcv::DataType &dtype) } } // namespace - // + +// std::optional ToNVCVDataType(const py::dtype &dt) { diff --git a/python/mod_nvcv/Image.cpp b/python/mod_nvcv/Image.cpp index 393937306..15ffe6730 100644 --- a/python/mod_nvcv/Image.cpp +++ b/python/mod_nvcv/Image.cpp @@ -1069,7 +1069,7 @@ void Image::Export(py::module &m) { using namespace py::literals; - py::class_, Container>(m, "Image") + py::class_, Container>(m, "Image", "Image") .def(py::init(&Image::Create), "size"_a, "format"_a, "rowalign"_a = 0, "Constructor that takes a size, format and optional row align of the image") .def(py::init(&Image::CreateHost), "buffer"_a, "format"_a = nvcv::FMT_NONE, "rowalign"_a = 0, diff --git a/python/mod_nvcv/ImageBatch.cpp b/python/mod_nvcv/ImageBatch.cpp index c0595b22b..38c838093 100644 --- a/python/mod_nvcv/ImageBatch.cpp +++ b/python/mod_nvcv/ImageBatch.cpp @@ -189,7 +189,8 @@ void ImageBatchVarShape::Export(py::module &m) { using namespace py::literals; - py::class_, Container>(m, "ImageBatchVarShape") + py::class_, Container>(m, "ImageBatchVarShape", + "Batch of Images.") .def(py::init(&ImageBatchVarShape::Create), "capacity"_a, "Create a new ImageBatchVarShape object with the specified capacity.") .def_property_readonly("uniqueformat", &ImageBatchVarShape::uniqueFormat, diff --git a/python/mod_nvcv/Main.cpp b/python/mod_nvcv/Main.cpp index 372f02a24..18d0a4485 100644 --- a/python/mod_nvcv/Main.cpp +++ b/python/mod_nvcv/Main.cpp @@ -37,13 +37,6 @@ namespace py = pybind11; PYBIND11_MODULE(nvcv, m) { - m.doc() = R"pbdoc( - NVCV Python API reference - ======================== - - This is the Python API reference for the NVIDIA® NVCV library. 
- )pbdoc"; - m.attr("__version__") = NVCV_VERSION_STRING; using namespace nvcvpy::priv; @@ -59,10 +52,10 @@ PYBIND11_MODULE(nvcv, m) ExternalBuffer::Export(m); // Supporting objects + ExportColorSpec(m); ExportImageFormat(m); ExportDataType(m); ExportRect(m); - ExportColorSpec(m); // Objects Tensor::Export(m); diff --git a/python/mod_nvcv/Rect.cpp b/python/mod_nvcv/Rect.cpp index 3d38f64e5..01e6bc003 100644 --- a/python/mod_nvcv/Rect.cpp +++ b/python/mod_nvcv/Rect.cpp @@ -31,7 +31,7 @@ void ExportRect(py::module &m) { using namespace py::literals; - py::class_(m, "RectI") + py::class_(m, "RectI", "RectI") .def(py::init([]() { return NVCVRectI{}; }), "Default constructor") .def(py::init( [](int x, int y, int w, int h) diff --git a/python/mod_nvcv/Tensor.cpp b/python/mod_nvcv/Tensor.cpp index ea21d08af..99d01cc57 100644 --- a/python/mod_nvcv/Tensor.cpp +++ b/python/mod_nvcv/Tensor.cpp @@ -390,7 +390,7 @@ void Tensor::Export(py::module &m) py::implicitly_convertible(); - py::class_, Container>(m, "Tensor") + py::class_, Container>(m, "Tensor", "Tensor") .def(py::init(&Tensor::CreateForImageBatch), "nimages"_a, "imgsize"_a, "format"_a, "rowalign"_a = 0, "Create a Tensor object for an ImageBatch.") .def(py::init(&Tensor::Create), "shape"_a, "dtype"_a, "layout"_a = std::nullopt, "rowalign"_a = 0, diff --git a/python/mod_nvcv/TensorBatch.cpp b/python/mod_nvcv/TensorBatch.cpp index 99d514970..2e838c735 100644 --- a/python/mod_nvcv/TensorBatch.cpp +++ b/python/mod_nvcv/TensorBatch.cpp @@ -229,8 +229,7 @@ void TensorBatch::Export(py::module &m) "The capacity of the container must be specified upfront in the batch initialization.\n" "The tensors in the batch may differ in shapes but they must have " "a uniform dimensionality, data type and layout.") - .def(py::init(&TensorBatch::Create), - "capacity"_a + .def(py::init(&TensorBatch::Create), "capacity"_a, "Create a new TensorBatch object with the specified capacity.") .def_property_readonly("layout", &TensorBatch::layout, "Layout of the tensors in the tensor batch." 
diff --git a/python/mod_nvcv/include/nvcv/python/Array.hpp b/python/mod_nvcv/include/nvcv/python/Array.hpp index 5d3d2bcfe..e80a6a5e8 100644 --- a/python/mod_nvcv/include/nvcv/python/Array.hpp +++ b/python/mod_nvcv/include/nvcv/python/Array.hpp @@ -23,6 +23,7 @@ #include "Resource.hpp" #include "Shape.hpp" +#include #include #include #include @@ -41,9 +42,9 @@ class Array static Array Create(int64_t length, nvcv::DataType dtype) { PyObject *oarray = capi().Array_Create(length, dtype); - + CheckCAPIError(); + NVCV_ASSERT(oarray == nullptr); py::object pyarray = py::reinterpret_steal(oarray); - return Array(pyarray); } @@ -59,7 +60,7 @@ class Array explicit Array(py::object obj) : Resource(obj) - , nvcv::Array(FromHandle(capi().Array_GetHandle(this->ptr()), true)) + , nvcv::Array(FromHandle(CheckCAPIError(capi().Array_GetHandle(this->ptr())), true)) { } }; diff --git a/python/mod_nvcv/include/nvcv/python/CAPI.hpp b/python/mod_nvcv/include/nvcv/python/CAPI.hpp index 664ed87b5..612d6583f 100644 --- a/python/mod_nvcv/include/nvcv/python/CAPI.hpp +++ b/python/mod_nvcv/include/nvcv/python/CAPI.hpp @@ -98,6 +98,26 @@ inline const CAPI &capi() return *capi; } +/* Check for an error inside the CAPI, since exceptions cannot cross the C api + * boundary, this must be called to make sure en exception was not converted to + * a PyErr + */ +inline void CheckCAPIError() +{ + if (PyErr_Occurred()) + { + // Propagate the exception to Python + throw pybind11::error_already_set(); + } +}; + +template +decltype(auto) CheckCAPIError(T &&arg) +{ + CheckCAPIError(); + return std::forward(arg); +} + } // namespace nvcvpy #endif // NVCV_PYTHON_CAPI_HPP diff --git a/python/mod_nvcv/include/nvcv/python/Cache.hpp b/python/mod_nvcv/include/nvcv/python/Cache.hpp index 66aec85c2..8ef8017e2 100644 --- a/python/mod_nvcv/include/nvcv/python/Cache.hpp +++ b/python/mod_nvcv/include/nvcv/python/Cache.hpp @@ -72,6 +72,7 @@ class Cache static void add(ICacheItem &item) { capi().Cache_Add(&item); + CheckCAPIError(); } static std::vector> fetch(const IKey &key) @@ -80,6 +81,7 @@ class Cache { capi().Cache_Fetch(&key) }; + CheckCAPIError(); std::vector> out; for (int i = 0; list[i]; ++i) @@ -92,6 +94,7 @@ class Cache static void removeAllNotInUseMatching(const IKey &key) { capi().Cache_RemoveAllNotInUseMatching(&key); + CheckCAPIError(); } }; diff --git a/python/mod_nvcv/include/nvcv/python/Container.hpp b/python/mod_nvcv/include/nvcv/python/Container.hpp index fc0eb0a98..b774420d1 100644 --- a/python/mod_nvcv/include/nvcv/python/Container.hpp +++ b/python/mod_nvcv/include/nvcv/python/Container.hpp @@ -22,6 +22,7 @@ #include "Cache.hpp" #include "Resource.hpp" +#include #include namespace nvcvpy { @@ -39,8 +40,12 @@ class Container } explicit Container() - : Resource(py::reinterpret_steal(capi().Container_Create(this))) { + PyObject *raw_obj = capi().Container_Create(this); + CheckCAPIError(); + NVCV_ASSERT(raw_obj != nullptr); + py::object temp = py::reinterpret_steal(raw_obj); + new (static_cast(this)) Resource(temp); } }; diff --git a/python/mod_nvcv/include/nvcv/python/DataType.hpp b/python/mod_nvcv/include/nvcv/python/DataType.hpp index 2f79674f8..c7d6bca62 100644 --- a/python/mod_nvcv/include/nvcv/python/DataType.hpp +++ b/python/mod_nvcv/include/nvcv/python/DataType.hpp @@ -34,13 +34,16 @@ struct type_caster bool load(handle src, bool) { NVCVDataType p = cvpy::capi().DataType_FromPython(src.ptr()); - value = nvcv::DataType(p); + cvpy::CheckCAPIError(); + value = nvcv::DataType(p); return true; } static handle 
cast(nvcv::DataType type, return_value_policy /* policy */, handle /*parent */) { - return cvpy::capi().DataType_ToPython(static_cast(type)); + handle out = cvpy::capi().DataType_ToPython(static_cast(type)); + cvpy::CheckCAPIError(); + return out; } }; diff --git a/python/mod_nvcv/include/nvcv/python/Image.hpp b/python/mod_nvcv/include/nvcv/python/Image.hpp index b56463e0a..58b15dc30 100644 --- a/python/mod_nvcv/include/nvcv/python/Image.hpp +++ b/python/mod_nvcv/include/nvcv/python/Image.hpp @@ -21,6 +21,7 @@ #include "CAPI.hpp" #include "Resource.hpp" +#include #include #include #include @@ -48,7 +49,8 @@ class Image static Image Create(nvcv::Size2D size, nvcv::ImageFormat fmt, int rowAlign = 0) { PyObject *oimg = capi().Image_Create(size.w, size.h, static_cast(fmt), rowAlign); - + CheckCAPIError(); + NVCV_ASSERT(oimg != nullptr); py::object pyimg = py::reinterpret_steal(oimg); return Image(pyimg); @@ -62,7 +64,7 @@ class Image explicit Image(py::object obj) : Resource(obj) - , nvcv::Image(FromHandle(capi().Image_GetHandle(this->ptr()), true)) + , nvcv::Image(FromHandle(CheckCAPIError(capi().Image_GetHandle(this->ptr())), true)) { } }; diff --git a/python/mod_nvcv/include/nvcv/python/ImageBatchVarShape.hpp b/python/mod_nvcv/include/nvcv/python/ImageBatchVarShape.hpp index 86f8ec06d..24bf0d2f0 100644 --- a/python/mod_nvcv/include/nvcv/python/ImageBatchVarShape.hpp +++ b/python/mod_nvcv/include/nvcv/python/ImageBatchVarShape.hpp @@ -21,6 +21,7 @@ #include "CAPI.hpp" #include "Resource.hpp" +#include #include #include @@ -38,7 +39,8 @@ class ImageBatchVarShape static ImageBatchVarShape Create(int capacity) { PyObject *ovarshape = capi().ImageBatchVarShape_Create(capacity); - + CheckCAPIError(); + NVCV_ASSERT(ovarshape != nullptr); py::object pyvarshape = py::reinterpret_steal(ovarshape); return ImageBatchVarShape(pyvarshape); @@ -55,16 +57,19 @@ class ImageBatchVarShape void pushBack(Image img) { capi().ImageBatchVarShape_PushBack(this->ptr(), img.ptr()); + CheckCAPIError(); } void popBack(int cnt) { capi().ImageBatchVarShape_PopBack(this->ptr(), cnt); + CheckCAPIError(); } void clear() { capi().ImageBatchVarShape_Clear(this->ptr()); + CheckCAPIError(); } // By default we use the varshape interface. 
@@ -81,7 +86,7 @@ class ImageBatchVarShape explicit ImageBatchVarShape(py::object obj) : Resource(obj) - , nvcv::ImageBatchVarShape(FromHandle(capi().ImageBatchVarShape_GetHandle(this->ptr()), true)) + , nvcv::ImageBatchVarShape(FromHandle(CheckCAPIError(capi().ImageBatchVarShape_GetHandle(this->ptr())), true)) { } }; diff --git a/python/mod_nvcv/include/nvcv/python/ImageFormat.hpp b/python/mod_nvcv/include/nvcv/python/ImageFormat.hpp index 43ff55b37..366f5a011 100644 --- a/python/mod_nvcv/include/nvcv/python/ImageFormat.hpp +++ b/python/mod_nvcv/include/nvcv/python/ImageFormat.hpp @@ -34,13 +34,16 @@ struct type_caster bool load(handle src, bool) { NVCVImageFormat p = cvpy::capi().ImageFormat_FromPython(src.ptr()); - value = nvcv::ImageFormat(p); + cvpy::CheckCAPIError(); + value = nvcv::ImageFormat(p); return true; } static handle cast(nvcv::ImageFormat type, return_value_policy /* policy */, handle /*parent */) { - return cvpy::capi().ImageFormat_ToPython(static_cast(type)); + handle out = cvpy::capi().ImageFormat_ToPython(static_cast(type)); + cvpy::CheckCAPIError(); + return out; } }; diff --git a/python/mod_nvcv/include/nvcv/python/ResourceGuard.hpp b/python/mod_nvcv/include/nvcv/python/ResourceGuard.hpp index 5ad2bae5d..0be46ff66 100644 --- a/python/mod_nvcv/include/nvcv/python/ResourceGuard.hpp +++ b/python/mod_nvcv/include/nvcv/python/ResourceGuard.hpp @@ -63,6 +63,7 @@ class ResourceGuard { py::object pyRes = r.get(); capi().Resource_SubmitSync(pyRes.ptr(), m_pyStream.ptr()); + CheckCAPIError(); m_resourcesPerLockMode.append(std::make_pair(pyLockMode, std::move(pyRes))); } @@ -72,6 +73,7 @@ class ResourceGuard void commit() { capi().Stream_HoldResources(m_pyStream.ptr(), m_resourcesPerLockMode.ptr()); + CheckCAPIError(); } private: diff --git a/python/mod_nvcv/include/nvcv/python/Stream.hpp b/python/mod_nvcv/include/nvcv/python/Stream.hpp index d55ca6f90..38ef51700 100644 --- a/python/mod_nvcv/include/nvcv/python/Stream.hpp +++ b/python/mod_nvcv/include/nvcv/python/Stream.hpp @@ -20,6 +20,7 @@ #include "CAPI.hpp" +#include #include namespace nvcvpy { @@ -33,12 +34,17 @@ class Stream : public py::object static Stream Current() { - return Stream(py::reinterpret_borrow(capi().Stream_GetCurrent())); + py::object temp = py::reinterpret_borrow(capi().Stream_GetCurrent()); + CheckCAPIError(); + NVCV_ASSERT(temp.is_none() == false); + return Stream(temp); } cudaStream_t cudaHandle() const { - return capi().Stream_GetCudaHandle(this->ptr()); + cudaStream_t out = capi().Stream_GetCudaHandle(this->ptr()); + CheckCAPIError(); + return out; } private: diff --git a/python/mod_nvcv/include/nvcv/python/Tensor.hpp b/python/mod_nvcv/include/nvcv/python/Tensor.hpp index 63fff4830..e3ba66053 100644 --- a/python/mod_nvcv/include/nvcv/python/Tensor.hpp +++ b/python/mod_nvcv/include/nvcv/python/Tensor.hpp @@ -23,6 +23,7 @@ #include "Resource.hpp" #include "Shape.hpp" +#include #include #include #include @@ -43,7 +44,8 @@ class Tensor { PyObject *otensor = capi().Tensor_Create(tshape.size(), &tshape[0], static_cast(dtype), static_cast(tshape.layout()), rowalign); - + CheckCAPIError(); + NVCV_ASSERT(otensor != nullptr); py::object pytensor = py::reinterpret_steal(otensor); return Tensor(pytensor); @@ -59,7 +61,8 @@ class Tensor { PyObject *otensor = capi().Tensor_CreateForImageBatch(numImages, size.w, size.h, static_cast(fmt), rowalign); - + CheckCAPIError(); + NVCV_ASSERT(otensor != nullptr); py::object pytensor = py::reinterpret_steal(otensor); return Tensor(pytensor); @@ -72,7 +75,7 @@ class Tensor 
explicit Tensor(py::object obj) : Resource(obj) - , nvcv::Tensor(FromHandle(capi().Tensor_GetHandle(this->ptr()), true)) + , nvcv::Tensor(FromHandle(CheckCAPIError(capi().Tensor_GetHandle(this->ptr())), true)) { } }; diff --git a/python/mod_nvcv/include/nvcv/python/TensorBatch.hpp b/python/mod_nvcv/include/nvcv/python/TensorBatch.hpp index b13e184f7..c6bf604b4 100644 --- a/python/mod_nvcv/include/nvcv/python/TensorBatch.hpp +++ b/python/mod_nvcv/include/nvcv/python/TensorBatch.hpp @@ -21,6 +21,7 @@ #include "CAPI.hpp" #include "Resource.hpp" +#include #include #include @@ -38,7 +39,8 @@ class TensorBatch static TensorBatch Create(int capacity) { PyObject *tensorBatch = capi().TensorBatch_Create(capacity); - + CheckCAPIError(); + NVCV_ASSERT(tensorBatch != nullptr); py::object pytensorBatch = py::reinterpret_steal(tensorBatch); return TensorBatch(pytensorBatch); @@ -47,16 +49,19 @@ class TensorBatch void pushBack(Tensor tensor) { capi().TensorBatch_PushBack(this->ptr(), tensor.ptr()); + CheckCAPIError(); } void popBack(int cnt) { capi().TensorBatch_PopBack(this->ptr(), cnt); + CheckCAPIError(); } void clear() { capi().TensorBatch_Clear(this->ptr()); + CheckCAPIError(); } using nvcv::TensorBatch::operator[]; @@ -70,7 +75,7 @@ class TensorBatch explicit TensorBatch(py::object obj) : Resource(obj) - , nvcv::TensorBatch(FromHandle(capi().TensorBatch_GetHandle(this->ptr()), true)) + , nvcv::TensorBatch(FromHandle(CheckCAPIError(capi().TensorBatch_GetHandle(this->ptr())), true)) { } }; diff --git a/python/setup.py.in b/python/setup.py.in index c22e9d0ff..b88d9bc74 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -66,7 +66,7 @@ setup( "": ["*.so", "cvcuda.libs/*.*"] }, # Includes the binding .so + core .so files include_package_data=True, - install_requires=["numpy>=1.23.5"], + install_requires=["numpy>=1.23.5,<2.0.0"], python_requires="==${PYTHON_VERSION}.*", zip_safe=False, cmdclass={ diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 806192fe1..d3767378a 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -35,6 +35,8 @@ set(PYSAMPLES classification object_detection label) +# Append extra (proprietary) samples + foreach(sample ${CPPSAMPLES}) add_subdirectory(${sample}) endforeach() diff --git a/samples/common/python/nvcodec_utils.py b/samples/common/python/nvcodec_utils.py index 420e15fe2..36f5c4de7 100644 --- a/samples/common/python/nvcodec_utils.py +++ b/samples/common/python/nvcodec_utils.py @@ -614,12 +614,12 @@ def __call__(self, batch): assert isinstance(batch.data, torch.Tensor) - image_tensors_nchw = batch.data + image_tensors_nhwc = batch.data # Create an empty list to store filenames filenames = [] - chwtensor_list = [] + hwctensor_list = [] # Iterate through each image to prepare the filenames - for img_idx in range(image_tensors_nchw.shape[0]): + for img_idx in range(image_tensors_nhwc.shape[0]): img_name = os.path.splitext(os.path.basename(batch.fileinfo[img_idx]))[0] results_path = os.path.join(self.output_path, f"out_{img_name}.jpg") self.logger.info(f"Preparing to save the image to: {results_path}") @@ -627,10 +627,10 @@ def __call__(self, batch): filenames.append(results_path) # Add the image tensor CAI to a CAI list from an NCHW tensor # (this was a stacked tensor if N images) - chwtensor_list.append(image_tensors_nchw[img_idx].cuda()) + hwctensor_list.append(image_tensors_nhwc[img_idx].cuda()) # Pass the image tensors and filenames to the encoder. - self.encoder.write(filenames, chwtensor_list) + self.encoder.write(filenames, hwctensor_list) self.cvcuda_perf.pop_range() # docs_tag: end_call_imagebatchencoder_nvimagecodec diff --git a/samples/scripts/run_samples.sh b/samples/scripts/run_samples.sh index 40fae3cb9..f7f17b5b2 100755 --- a/samples/scripts/run_samples.sh +++ b/samples/scripts/run_samples.sh @@ -28,13 +28,13 @@ SAMPLES_DIR="$(dirname "$SCRIPT_DIR")" CLASSIFICATION_OUT_DIR=/tmp/classification SEGMENTATION_OUT_DIR="/tmp/segmentation" DETECTION_OUT_DIR="/tmp/object_detection" -DISTANCE_LABEL_OUT_DIR="/tmp/distance_label" +LABEL_OUT_DIR="/tmp/label" echo "SAMPLES_DIR: $SAMPLES_DIR" echo "CLASSIFICATION_OUT_DIR: $CLASSIFICATION_OUT_DIR" echo "SEGMENTATION_OUT_DIR: $SEGMENTATION_OUT_DIR" echo "DETECTION_OUT_DIR: $DETECTION_OUT_DIR" -echo "DISTANCE_LABEL_OUT_DIR: $DISTANCE_LABEL_OUT_DIR" +echo "LABEL_OUT_DIR: $LABEL_OUT_DIR" create_output_dir() { local base_dir=$1 @@ -128,11 +128,11 @@ python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/imag DETECTION_RUN_DIR=$(create_output_dir "$DETECTION_OUT_DIR") python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4 -b 4 -bk tensorflow -o "$DETECTION_RUN_DIR" -# Run the distance label Python sample with default settings, without any command-line args. -rm -rf "$DISTANCE_LABEL_OUT_DIR" -mkdir "$DISTANCE_LABEL_OUT_DIR" -DISTANCE_LABEL_RUN_DIR=$(create_output_dir "$DISTANCE_LABEL_OUT_DIR") -python3 $SAMPLES_DIR/label/python/main.py -o "$DISTANCE_LABEL_RUN_DIR" +# Run the label Python sample with default settings, without any command-line args. 
+rm -rf "$LABEL_OUT_DIR" +mkdir "$LABEL_OUT_DIR" +LABEL_RUN_DIR=$(create_output_dir "$LABEL_OUT_DIR") +python3 $SAMPLES_DIR/label/python/main.py -o "$LABEL_RUN_DIR" # Run it with batch size 1 on a single image -DISTANCE_LABEL_RUN_DIR=$(create_output_dir "$DISTANCE_LABEL_OUT_DIR") -python3 $SAMPLES_DIR/label/python/main.py -i $SAMPLES_DIR/assets/images/peoplenet.jpg -b 1 -o "$DISTANCE_LABEL_RUN_DIR" +LABEL_RUN_DIR=$(create_output_dir "$LABEL_OUT_DIR") +python3 $SAMPLES_DIR/label/python/main.py -i $SAMPLES_DIR/assets/images/peoplenet.jpg -b 1 -o "$LABEL_RUN_DIR" diff --git a/src/cvcuda/OpResizeCropConvertReformat.cpp b/src/cvcuda/OpResizeCropConvertReformat.cpp index 2db154d6f..4da7dd2cc 100644 --- a/src/cvcuda/OpResizeCropConvertReformat.cpp +++ b/src/cvcuda/OpResizeCropConvertReformat.cpp @@ -44,21 +44,21 @@ CVCUDA_DEFINE_API(0, 8, NVCVStatus, cvcudaResizeCropConvertReformatCreate, (NVCV CVCUDA_DEFINE_API(0, 8, NVCVStatus, cvcudaResizeCropConvertReformatSubmit, (NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle in, NVCVTensorHandle out, const NVCVSize2D resizeDim, const NVCVInterpolationType interpolation, const int2 cropPos, - const NVCVChannelManip manip)) + const NVCVChannelManip manip, const float scale, const float offset)) { return nvcv::ProtectCall( [&] { nvcv::TensorWrapHandle input(in), output(out); priv::ToDynamicRef(handle)(stream, input, output, resizeDim, interpolation, - cropPos, manip); + cropPos, manip, scale, offset); }); } CVCUDA_DEFINE_API(0, 8, NVCVStatus, cvcudaResizeCropConvertReformatVarShapeSubmit, (NVCVOperatorHandle handle, cudaStream_t stream, NVCVImageBatchHandle in, NVCVTensorHandle out, const NVCVSize2D resizeDim, const NVCVInterpolationType interpolation, const int2 cropPos, - const NVCVChannelManip manip)) + const NVCVChannelManip manip, const float scale, const float offset)) { return nvcv::ProtectCall( [&] @@ -66,6 +66,6 @@ CVCUDA_DEFINE_API(0, 8, NVCVStatus, cvcudaResizeCropConvertReformatVarShapeSubmi nvcv::ImageBatchVarShapeWrapHandle input(in); nvcv::TensorWrapHandle output(out); priv::ToDynamicRef(handle)(stream, input, output, resizeDim, interpolation, - cropPos, manip); + cropPos, manip, scale, offset); }); } diff --git a/src/cvcuda/include/cvcuda/OpResizeCropConvertReformat.h b/src/cvcuda/include/cvcuda/OpResizeCropConvertReformat.h index 041ff8003..ca27f038f 100644 --- a/src/cvcuda/include/cvcuda/OpResizeCropConvertReformat.h +++ b/src/cvcuda/include/cvcuda/OpResizeCropConvertReformat.h @@ -19,7 +19,7 @@ * @file OpResizeCropConvertReformat.h * * @brief Defines functions that fuses resize, crop, data type conversion, channel manipulation, and layout reformat operations to optimize pipelines. - * @defgroup NVCV_C_ALGORITHM__RESIZE_CROP Resize Crop + * @defgroup NVCV_C_ALGORITHM__RESIZE_CROP Resize Crop Convert * @{ */ @@ -53,30 +53,58 @@ extern "C" */ CVCUDA_PUBLIC NVCVStatus cvcudaResizeCropConvertReformatCreate(NVCVOperatorHandle *handle); -/** Executes the ResizeCropConvertReformat operation on the given cuda stream. This operation - * does not wait for completion. +/** Executes the fused ResizeCropConvertReformat operation on the given cuda + * stream. This operation does not wait for completion. * - * ResizeCropConvertReformat performs the following operations in order: - * 1) Resize either a single tensor or each image in an ImageBatchVarShape + * ResizeCropConvertReformat is a fused operator that performs the following + * operations in order: + * + * 1. 
Resize either a single tensor or each image in an ImageBatchVarShape * to a specified width and height (other dimensions are unchanged). - * 2) Crops a specified region of size width x height (determined by the - * output tensor's width & height) starting at the pixel position - * (cropPos.x, cropPos.y) out of the resized tensor. - * 3) Convert the element data type to the output tensor's data type. For - * example, convert uchar elements to float. Limited options availble. - * 4) Optional channel manipulation--i.e., re-order the channels + * This step is identical to the stand-alone Resize operation with the + * exception of optionally not type-casting the interpolation results + * back to the input data type (see the srcCast parameter for details). + * + * 2. Crops a specified region out of the resized tensor. + * + * 3. Apply a scale and offset to the output result (after resizing and + * cropping). This can be used to normalize to a new range of values. + * For example, if the input is unsigned 8-bit values and the output is + * floating point, setting scale = 1.0/127.5 and offset = -1.0 will + * convert the 8-bit input values (ranging from 0 to 255) to floating + * point output values between -1.0 and 1.0. + * + * 4. Optional channel manipulation--i.e., re-order the channels * of a tensor (e.g., RGB to BGR). Limited options available. - * 5) If output tensor's layout doesn't match the input's layout, reshape + * + * 5. Convert the element data type to the output tensor's data type. For + * example, convert uchar elements to float. Limited options available. + * + * 6. If output tensor's layout doesn't match the input's layout, reshape * the layout to match output layout (e.g., NHWC to NCHW). Limited * options available. - * NOTE: Since all images in an ImageBatchVarShape are resized to the - * same size, the resulting collection now fits in a single tensor. + * + * NOTES: + * + Since all images in an ImageBatchVarShape are resized to the same size, + * the resulting collection now fits in a single tensor. + * + Except for nearest-neighbor interpolation (NVCV_INTERP_NEAREST), + * interpolation (e.g., NVCV_INTERP_LINEAR, NVCV_INTERP_CUBIC, and + * NVCV_INTERP_AREA) computes resized pixel values using floating point + * math. However, the stand-alone resize operation (i.e., running the + * standard Resize operator independently) converts interpolated pixel + * values back to the source data type since its input and output types + * must be the same. As an option, this fused operator can either cast + * the resized pixel values back to the source type (to match results + * from running the steps independently), or leave them in the + * interpolated floating-point space to avoid quantization issues that + * occur from casting back to an integer source type (e.g., uchar). See + * the srcCast parameter for details.
* * Limitations: * - * Input: STILL NEED TO FILL THIS IN - * Data Layout: [NVCV_TENSOR_HWC, NVCV_TENSOR_NHWC] - * Channels: [1, 3] + * Input: + * + Data Layout: [NVCV_TENSOR_HWC, NVCV_TENSOR_NHWC] + * + Channels: [1, 3] * * Data Type | Allowed * -------------- | ------------- @@ -90,9 +118,9 @@ CVCUDA_PUBLIC NVCVStatus cvcudaResizeCropConvertReformatCreate(NVCVOperatorHandl * 64bit Float | No * * Output: - * Data Layout: [NVCV_TENSOR_NHWC, NVCV_TENSOR_HWC, - * NVCV_TENSOR_NCHW, NVCV_TENSOR_CHW] - * Channels: [1, 3] + * + Data Layout: [NVCV_TENSOR_NHWC, NVCV_TENSOR_HWC, + * NVCV_TENSOR_NCHW, NVCV_TENSOR_CHW] + * + Channels: [1, 3] * * Data Type | Allowed * -------------- | ------------- @@ -105,7 +133,7 @@ CVCUDA_PUBLIC NVCVStatus cvcudaResizeCropConvertReformatCreate(NVCVOperatorHandl * 32bit Float | Yes * 64bit Float | No * - * Input/Output dependency + * Input/Output dependency: * * Property | Input == Output * -------------- | ------------- @@ -116,41 +144,75 @@ CVCUDA_PUBLIC NVCVStatus cvcudaResizeCropConvertReformatCreate(NVCVOperatorHandl * Width | No * Height | No * - * @param [in] handle Handle to the operator. - * + Must not be NULL. - * @param [in] stream Handle to a valid CUDA stream. - * - * @param [in] in Input tensor or image batch. The images in an image batch can be of different - * sizes, but all images must have the same data type, channels, and layout. - * - * @param [in] resizeDim Dimensions, {width, height}, to resize the tensor method to be used, - * see \ref NVCVSize2D for more details. - * - * @param [in] interpolation Interpolation method to be used, see \ref NVCVInterpolationType for - * more details. Currently, only NVCV_INTERP_NEAREST and NVCV_INTERP_LINEAR - * are available. + * @param [in] handle Handle to the operator. Must not be NULL. * - * @param [in] cropPos Crop position, (x, y), specifying the top-left corner of the crop region. - * The crop region's width and height is specified by the output tensor's - * width & height. - * @note: The crop must fall within the resized image. Let (x, y, w, h) - * represent the crop rectangle, where x & y are the cropPos coordinates - * and w & h are the output tensor's width and height, then the following - * must all be true: - * x >= 0 - * y >= 0 - * x + w <= resizeDim.w - * y + h <= resizeDim.h - * - * - * @param [in] manip Channel manipulation to be used (e.g., reshuffle RGB to BGR), - * see \ref NVCVChannelManip for more details. - * - * @param [out] out Output tensor. In addition to the output tensor determining the crop width - * and height, the output tensor also specifies the data type (e.g., uchar3 or - * float) and tensor layout (NHWC or NCHW), with limitations. + * @param [in] stream Handle to a valid CUDA stream. * - * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. + * @param [in] in Input tensor or image batch. The images in an image batch can + * be of different sizes, but all images must have the same data + * type, channels, and layout. + * + * @param [in] resizeDim Dimensions, {width, height}, that tensor or image + * batch images are resized to prior to cropping, see + * \ref NVCVSize2D for more details. + * + * @param [in] interpolation Interpolation method to be used, (see \ref + * NVCVInterpolationType). Currently, only + * NVCV_INTERP_NEAREST and NVCV_INTERP_LINEAR are + * available. + * + * @param [in] cropPos Crop position, (x, y), specifying the top-left corner of + * the crop region. 
The crop region's width and height is + * specified by the output tensor's width & height. The crop + * must fall within the resized image. Let (x, y, w, h) + * represent the crop rectangle, where x & y are the cropPos + * coordinates and w and h are the output tensor's width and + * height, respectively, then it must be true that: + * + x >= 0, + * + y >= 0, + * + x + w <= resizeDim.w, and + * + y + h <= resizeDim.h. + * + * @param [in] manip Channel manipulation to be used--e.g., reshuffle RGB to + * BGR (see \ref NVCVChannelManip). + * + * @param [in] scale Scale (i.e., multiply) the resized and cropped output + * values by this amount. 1.0 results in no scaling of the + * output values. + * + * @param [in] offset Offset (i.e., add to) the output values by this amount. + * This is applied after scaling--if v is a resized and + * cropped value, then scale * v + offset is the final output + * value. 0.0 results in no offset being added to the output. + * + * @param [in] srcCast Boolean value indicating whether or not the interpolation + * results during the resize are re-cast back to the input + * (or source) data type. Most interpolation methods (e.g., + * NVCV_INTERP_LINEAR) compute resized pixel values using + * floating point math. This parameter determines if the + * interpolation result is cast to the source data type + * before computing the remaining steps in this operator: + * + true: the interpolation result is cast back to the + * source type prior to computing the remaining steps -- + * as if calling the stand-alone Resize operator (since + * its input and output types must be the same). Note: + * this option can produce quantized outputs (e.g., the + * input source type is uchar3), even if the destination + * data type is floating point. + * + false: the interpolation result is NOT cast back to + * the source type. Rather, the floating-point + * interpolation results are directly passed on to the + * remaining steps in the fused operator. + * + Note: in either case (true or false) the final (fused) + * result is still cast to the destination data type + * before writing values into the output tensor. + * + * @param [out] out Output tensor. In addition to the output tensor determining + * the crop width and height, the output tensor also specifies + * the data type (e.g., uchar3 or float) and tensor layout + * (NHWC or NCHW), with limitations. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is invalid or outside valid range. * @retval #NVCV_ERROR_INTERNAL Internal error in the operator, invalid types passed in. * @retval #NVCV_SUCCESS Operation executed successfully. 
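Putting the parameters above together, a minimal C++ call through the cvcuda::ResizeCropConvertReformat wrapper (declared later in this patch) could look as follows. The tensor shapes, batch size, and the 256-to-224 resize/crop choice are illustrative assumptions; the argument order and semantics follow the documentation above.

```cpp
#include <cuda_runtime.h>
#include <cvcuda/OpResizeCropConvertReformat.hpp>
#include <nvcv/Tensor.hpp>

// Resize a U8 NHWC batch to 256x256, crop a 224x224 window, normalize to
// [-1, 1], and emit a float NCHW tensor, all in one fused call.
void PreprocessBatch(cudaStream_t stream, const nvcv::Tensor &srcU8NHWC)
{
    // The output tensor defines the crop size (224x224), layout (NCHW), and
    // data type (float32); batch size 16 is assumed to match the input.
    nvcv::Tensor dst(nvcv::TensorShape{{16, 3, 224, 224}, nvcv::TENSOR_NCHW}, nvcv::TYPE_F32);

    NVCVSize2D resizeDim{256, 256}; // step 1: each image is resized to 256x256
    int2       cropPos{16, 16};     // valid crop: 16 + 224 <= 256 in both x and y

    cvcuda::ResizeCropConvertReformat fusedOp;
    fusedOp(stream, srcU8NHWC, dst, resizeDim, NVCV_INTERP_LINEAR, cropPos,
            NVCV_CHANNEL_NO_OP,   // no channel reshuffle in this sketch
            1.0f / 127.5f, -1.0f, // scale and offset: map [0, 255] onto [-1, 1]
            false);               // srcCast=false: keep float interpolation results
}
```

With srcCast set to true, the same call would instead reproduce the quantized results of running the resize step as a stand-alone operator before cropping and converting.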
*/ @@ -159,14 +221,15 @@ CVCUDA_PUBLIC NVCVStatus cvcudaResizeCropConvertReformatSubmit(NVCVOperatorHandl NVCVTensorHandle in, NVCVTensorHandle out, const NVCVSize2D resizeDim, const NVCVInterpolationType interpolation, - const int2 cropPos, const NVCVChannelManip manip); + const int2 cropPos, const NVCVChannelManip manip, + const float scale, const float offset, bool srcCast); CVCUDA_PUBLIC NVCVStatus cvcudaResizeCropConvertReformatVarShapeSubmit(NVCVOperatorHandle handle, cudaStream_t stream, NVCVImageBatchHandle in, NVCVTensorHandle out, const NVCVSize2D resizeDim, const NVCVInterpolationType interpolation, - const int2 cropPos, - const NVCVChannelManip manip); + const int2 cropPos, const NVCVChannelManip manip, + const float scale, float offset, bool srcCast); /** @} */ #ifdef __cplusplus diff --git a/src/cvcuda/include/cvcuda/OpResizeCropConvertReformat.hpp b/src/cvcuda/include/cvcuda/OpResizeCropConvertReformat.hpp index 1e7fb143f..cbb13ee78 100644 --- a/src/cvcuda/include/cvcuda/OpResizeCropConvertReformat.hpp +++ b/src/cvcuda/include/cvcuda/OpResizeCropConvertReformat.hpp @@ -46,11 +46,13 @@ class ResizeCropConvertReformat final : public IOperator void operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, const NVCVSize2D resizeDim, const NVCVInterpolationType interpolation, const int2 cropPos, - const NVCVChannelManip manip = NVCV_CHANNEL_NO_OP); + const NVCVChannelManip manip = NVCV_CHANNEL_NO_OP, const float scale = 1, const float offset = 0, + const bool srcCast = true); void operator()(cudaStream_t stream, const nvcv::ImageBatchVarShape &in, const nvcv::Tensor &out, const NVCVSize2D resizeDim, const NVCVInterpolationType interpolation, const int2 cropPos, - const NVCVChannelManip manip = NVCV_CHANNEL_NO_OP); + const NVCVChannelManip manip = NVCV_CHANNEL_NO_OP, const float scale = 1, const float offset = 0, + const bool srcCast = true); virtual NVCVOperatorHandle handle() const noexcept override; @@ -72,19 +74,21 @@ inline ResizeCropConvertReformat::~ResizeCropConvertReformat() inline void ResizeCropConvertReformat::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, const NVCVSize2D resizeDim, const NVCVInterpolationType interpolation, - const int2 cropPos, const NVCVChannelManip manip) + const int2 cropPos, const NVCVChannelManip manip, const float scale, + const float offset, const bool srcCast) { - nvcv::detail::CheckThrow(cvcudaResizeCropConvertReformatSubmit(m_handle, stream, in.handle(), out.handle(), - resizeDim, interpolation, cropPos, manip)); + nvcv::detail::CheckThrow(cvcudaResizeCropConvertReformatSubmit( + m_handle, stream, in.handle(), out.handle(), resizeDim, interpolation, cropPos, manip, scale, offset, srcCast)); } inline void ResizeCropConvertReformat::operator()(cudaStream_t stream, const nvcv::ImageBatchVarShape &in, const nvcv::Tensor &out, const NVCVSize2D resizeDim, const NVCVInterpolationType interpolation, const int2 cropPos, - const NVCVChannelManip manip) + const NVCVChannelManip manip, const float scale, const float offset, + const bool srcCast) { - nvcv::detail::CheckThrow(cvcudaResizeCropConvertReformatVarShapeSubmit(m_handle, stream, in.handle(), out.handle(), - resizeDim, interpolation, cropPos, manip)); + nvcv::detail::CheckThrow(cvcudaResizeCropConvertReformatVarShapeSubmit( + m_handle, stream, in.handle(), out.handle(), resizeDim, interpolation, cropPos, manip, scale, offset, srcCast)); } inline NVCVOperatorHandle ResizeCropConvertReformat::handle() const noexcept diff --git 
a/src/cvcuda/priv/CMakeLists.txt b/src/cvcuda/priv/CMakeLists.txt index 85683e409..488669f70 100644 --- a/src/cvcuda/priv/CMakeLists.txt +++ b/src/cvcuda/priv/CMakeLists.txt @@ -35,6 +35,7 @@ set(CV_CUDA_PRIV_OP_FILES OpNonMaximumSuppression.cu OpReformat.cpp OpResize.cpp + OpResize.cu OpCustomCrop.cpp OpNormalize.cpp OpPadAndStack.cpp diff --git a/src/cvcuda/priv/OpAdvCvtColor.cu b/src/cvcuda/priv/OpAdvCvtColor.cu index be97e77df..cac368e71 100644 --- a/src/cvcuda/priv/OpAdvCvtColor.cu +++ b/src/cvcuda/priv/OpAdvCvtColor.cu @@ -537,10 +537,20 @@ void AdvCvtColor::Yuv2Bgr(cudaStream_t stream, const nvcv::TensorDataStridedCuda { case legacy::kCV_8U: { - auto srcWrap = cuda::CreateTensorWrapNHWC(in); - auto dstWrap = cuda::CreateTensorWrapNHWC(out); - const YUV2RGBConstants &cooef = getYUV2RGBCooef(spec); - yuv_to_bgr_char_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx, cooef); + const YUV2RGBConstants &cooef = getYUV2RGBCooef(spec); + auto outMaxStride = outAccess->sampleStride() * outAccess->numSamples(); + auto inMaxStride = inAccess->sampleStride() * inAccess->numSamples(); + if (std::max(outMaxStride, inMaxStride) <= cuda::TypeTraits::max) + { + auto srcWrap = cuda::CreateTensorWrapNHWC(in); + auto dstWrap = cuda::CreateTensorWrapNHWC(out); + yuv_to_bgr_char_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx, cooef); + } + else + { + throw nvcv::Exception(nvcv::Status::ERROR_OVERFLOW, "Input or output size exceeds %d. Tensor is too large.", + cuda::TypeTraits::max); + } checkKernelErrors(); } break; @@ -574,10 +584,20 @@ void AdvCvtColor::Bgr2Yuv(cudaStream_t stream, const nvcv::TensorDataStridedCuda { case legacy::kCV_8U: { - auto srcWrap = cuda::CreateTensorWrapNHWC(in); - auto dstWrap = cuda::CreateTensorWrapNHWC(out); - const RGB2YUVConstants &cooef = getRGB2YUVCooef(spec); - bgr_to_yuv_char_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx, cooef); + const RGB2YUVConstants &cooef = getRGB2YUVCooef(spec); + auto outMaxStride = outAccess->sampleStride() * outAccess->numSamples(); + auto inMaxStride = inAccess->sampleStride() * inAccess->numSamples(); + if (std::max(outMaxStride, inMaxStride) <= cuda::TypeTraits::max) + { + auto srcWrap = cuda::CreateTensorWrapNHWC(in); + auto dstWrap = cuda::CreateTensorWrapNHWC(out); + bgr_to_yuv_char_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx, cooef); + } + else + { + throw nvcv::Exception(nvcv::Status::ERROR_OVERFLOW, "Input or output size exceeds %d. Tensor is too large.", + cuda::TypeTraits::max); + } checkKernelErrors(); } break; @@ -636,11 +656,21 @@ void AdvCvtColor::NvYuv2Bgr(cudaStream_t stream, const nvcv::TensorDataStridedCu { case legacy::kCV_8U: { - auto srcWrap = cuda::CreateTensorWrapNHWC(in); - auto dstWrap = cuda::CreateTensorWrapNHWC(out); - const YUV2RGBConstants &cooef = getYUV2RGBCooef(spec); - yuv420sp_to_bgr_char_nhwc<<>>(srcWrap, dstWrap, dstSize, dcn, bidx, uidx, - cooef); + const YUV2RGBConstants &cooef = getYUV2RGBCooef(spec); + auto outMaxStride = outAccess->sampleStride() * outAccess->numSamples(); + auto inMaxStride = inAccess->sampleStride() * inAccess->numSamples(); + if (std::max(outMaxStride, inMaxStride) <= cuda::TypeTraits::max) + { + auto srcWrap = cuda::CreateTensorWrapNHWC(in); + auto dstWrap = cuda::CreateTensorWrapNHWC(out); + yuv420sp_to_bgr_char_nhwc<<>>(srcWrap, dstWrap, dstSize, dcn, bidx, uidx, + cooef); + } + else + { + throw nvcv::Exception(nvcv::Status::ERROR_OVERFLOW, "Input or output size exceeds %d. 
Tensor is too large.", + cuda::TypeTraits::max); + } checkKernelErrors(); } break; @@ -694,11 +724,21 @@ void AdvCvtColor::Bgr2NvYuv(cudaStream_t stream, const nvcv::TensorDataStridedCu { case legacy::kCV_8U: { - auto srcWrap = cuda::CreateTensorWrapNHWC(in); - auto dstWrap = cuda::CreateTensorWrapNHWC(out); - const RGB2YUVConstants &cooef = getRGB2YUVCooef(spec); - bgr_to_yuv420sp_char_nhwc<<>>(srcWrap, dstWrap, srcSize, inputShape.C, bidx, - uidx, cooef); + const RGB2YUVConstants &cooef = getRGB2YUVCooef(spec); + auto outMaxStride = outAccess->sampleStride() * outAccess->numSamples(); + auto inMaxStride = inAccess->sampleStride() * inAccess->numSamples(); + if (std::max(outMaxStride, inMaxStride) <= cuda::TypeTraits::max) + { + auto srcWrap = cuda::CreateTensorWrapNHWC(in); + auto dstWrap = cuda::CreateTensorWrapNHWC(out); + bgr_to_yuv420sp_char_nhwc<<>>(srcWrap, dstWrap, srcSize, inputShape.C, bidx, + uidx, cooef); + } + else + { + throw nvcv::Exception(nvcv::Status::ERROR_OVERFLOW, "Input or output size exceeds %d. Tensor is too large.", + cuda::TypeTraits::max); + } checkKernelErrors(); } break; diff --git a/src/cvcuda/priv/OpBrightnessContrast.cu b/src/cvcuda/priv/OpBrightnessContrast.cu index 2e55c3dfd..735febf7c 100644 --- a/src/cvcuda/priv/OpBrightnessContrast.cu +++ b/src/cvcuda/priv/OpBrightnessContrast.cu @@ -52,6 +52,9 @@ inline constexpr bool RequiresDouble = std::is_integral_v && sizeof(T) >= 4; template using GetArgType = std::conditional_t || RequiresDouble, double, float>; +template +using ArgWrapper = cuda::Tensor1DWrap; + template struct SampleArgs { @@ -64,15 +67,15 @@ struct SampleArgs template struct BatchArgsWrap { - int brightnessLen, contrastLen, brightnessShiftLen, contrastCenterLen; - const cuda::Tensor1DWrap brightness; - const cuda::Tensor1DWrap contrast; - const cuda::Tensor1DWrap brightnessShift; - const cuda::Tensor1DWrap contrastCenter; + int brightnessLen, contrastLen, brightnessShiftLen, contrastCenterLen; + const ArgWrapper brightness; + const ArgWrapper contrast; + const ArgWrapper brightnessShift; + const ArgWrapper contrastCenter; }; template -inline __device__ BT GetArg(const cuda::Tensor1DWrap &tensorArg, int argLen, int sampleIdx, BT defaultVal) +inline __device__ BT GetArg(const ArgWrapper &tensorArg, int argLen, int sampleIdx, BT defaultVal) { if (argLen == 0) { @@ -198,23 +201,32 @@ inline void RunBrightnessContrast(cudaStream_t stream, const SrcData &srcData, c auto srcAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(srcData); int2 size = cuda::StaticCast(long2{srcAccess->numCols(), srcAccess->numRows()}); dim3 grid(util::DivUp(size.x, block.x), util::DivUp(size.y, block.y), srcAccess->numSamples()); + auto dstAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(dstData); + + int64_t inMaxStride = srcAccess->sampleStride() * srcAccess->numSamples(); + int64_t outMaxStride = dstAccess->sampleStride() * dstAccess->numSamples(); + if (std::max(inMaxStride, outMaxStride) > cuda::TypeTraits::max) + { + throw nvcv::Exception(nvcv::Status::ERROR_OVERFLOW, "Input or output size exceeds %d. 
Tensor is too large.", + cuda::TypeTraits::max); + } + using StrideType = int32_t; if constexpr (!isPlanar) { - auto src = cuda::CreateTensorWrapNHW(srcData); - auto dst = cuda::CreateTensorWrapNHW(dstData); + auto src = cuda::CreateTensorWrapNHW(srcData); + auto dst = cuda::CreateTensorWrapNHW(dstData); BrightnessContrast<<>>(src, dst, batchArgs, size, 1); } else { - auto dstAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(dstData); - auto src = cuda::Tensor4DWrap( + auto src = cuda::Tensor4DWrap( srcData.basePtr(), static_cast(srcAccess->sampleStride()), static_cast(srcAccess->planeStride()), static_cast(srcAccess->rowStride())); - auto dst = cuda::Tensor4DWrap(dstData.basePtr(), static_cast(dstAccess->sampleStride()), - static_cast(dstAccess->planeStride()), - static_cast(dstAccess->rowStride())); - int numPlanes = srcAccess->numPlanes(); + auto dst = cuda::Tensor4DWrap( + dstData.basePtr(), static_cast(dstAccess->sampleStride()), + static_cast(dstAccess->planeStride()), static_cast(dstAccess->rowStride())); + int numPlanes = srcAccess->numPlanes(); BrightnessContrast<<>>(src, dst, batchArgs, size, numPlanes); } NVCV_CHECK_THROW(cudaGetLastError()); @@ -419,12 +431,10 @@ inline BatchArgsWrap GetBatchArgsWrap(nvcv::Optional{} : cuda::Tensor1DWrap(*brightnessData), - contrastLen == 0 ? cuda::Tensor1DWrap{} : cuda::Tensor1DWrap(*contrastData), - brightnessShiftLen == 0 ? cuda::Tensor1DWrap{} - : cuda::Tensor1DWrap(*brightnessShiftData), - constrastCenterLen == 0 ? cuda::Tensor1DWrap{} - : cuda::Tensor1DWrap(*contrastCenterData)}; + brightnessLen == 0 ? ArgWrapper{} : ArgWrapper(*brightnessData), + contrastLen == 0 ? ArgWrapper{} : ArgWrapper(*contrastData), + brightnessShiftLen == 0 ? ArgWrapper{} : ArgWrapper(*brightnessShiftData), + constrastCenterLen == 0 ? ArgWrapper{} : ArgWrapper(*contrastCenterData)}; } inline auto validateSrcDstVarBatch(int &numSamples, int &numInterleavedChannels, int &numPlanes, diff --git a/src/cvcuda/priv/OpColorTwist.cu b/src/cvcuda/priv/OpColorTwist.cu index 1354def1f..4fd6f3995 100644 --- a/src/cvcuda/priv/OpColorTwist.cu +++ b/src/cvcuda/priv/OpColorTwist.cu @@ -167,13 +167,26 @@ inline void RunColorTwist(cudaStream_t stream, const SrcData &srcData, const Dst dim3 block(32, 4, 1); if constexpr (std::is_same_v) { - auto srcAccess = nvcv::TensorDataAccessStridedImage::Create(srcData); - int2 size = cuda::StaticCast(long2{srcAccess->numCols(), srcAccess->numRows()}); - dim3 grid(util::DivUp(size.x, block.x), util::DivUp(size.y, block.y), srcAccess->numSamples()); - - auto src = cuda::CreateTensorWrapNHW(srcData); - auto dst = cuda::CreateTensorWrapNHW(dstData); - ColorTwist<<>>(src, dst, size, param); + auto inAccess = nvcv::TensorDataAccessStridedImage::Create(srcData); + NVCV_ASSERT(inAccess); + auto outAccess = nvcv::TensorDataAccessStridedImage::Create(dstData); + NVCV_ASSERT(outAccess); + int2 size = cuda::StaticCast(long2{inAccess->numCols(), inAccess->numRows()}); + dim3 grid(util::DivUp(size.x, block.x), util::DivUp(size.y, block.y), inAccess->numSamples()); + + int64_t inMaxStride = inAccess->sampleStride() * inAccess->numSamples(); + int64_t outMaxStride = outAccess->sampleStride() * outAccess->numSamples(); + if (std::max(inMaxStride, outMaxStride) <= cuda::TypeTraits::max) + { + auto src = cuda::CreateTensorWrapNHW(srcData); + auto dst = cuda::CreateTensorWrapNHW(dstData); + ColorTwist<<>>(src, dst, size, param); + } + else + { + throw nvcv::Exception(nvcv::Status::ERROR_OVERFLOW, "Input or output size exceeds %d. 
Tensor is too large.", + cuda::TypeTraits::max); + } NVCV_CHECK_THROW(cudaGetLastError()); } else diff --git a/src/cvcuda/priv/OpCropFlipNormalizeReformat.cu b/src/cvcuda/priv/OpCropFlipNormalizeReformat.cu index 8c9426b76..7d38cf156 100644 --- a/src/cvcuda/priv/OpCropFlipNormalizeReformat.cu +++ b/src/cvcuda/priv/OpCropFlipNormalizeReformat.cu @@ -59,11 +59,11 @@ __device__ float get_scale_value(TensorWrapper data, int ch, int numChannels, fl } } -template -__device__ void transfer_data(cuda::BorderVarShapeWrap srcWrap, cuda::Tensor4DWrap dstWrap, - int2 src_idx, int2 dst_idx, int batchidx, int ch, TensorWrapper baseWrap, - TensorWrapper scaleWrap, float global_scale, float global_shift, float epsilon, - uint32_t flags, int base_channels, int scale_channels, bool dst_planar) +template +__device__ void transfer_data(cuda::BorderVarShapeWrap srcWrap, DstWrapper dstWrap, int2 src_idx, + int2 dst_idx, int batchidx, int ch, TensorWrapper baseWrap, TensorWrapper scaleWrap, + float global_scale, float global_shift, float epsilon, uint32_t flags, int base_channels, + int scale_channels, bool dst_planar) { if (dst_planar) { @@ -71,7 +71,7 @@ __device__ void transfer_data(cuda::BorderVarShapeWrap srcWrap, cud { float base = get_base_value(baseWrap, c, base_channels); float scale = get_scale_value(scaleWrap, c, scale_channels, epsilon, flags); - dstWrap[(int4){dst_idx.x, dst_idx.y, c, batchidx}] = cuda::SaturateCast( + dstWrap[(int4){dst_idx.x, dst_idx.y, c, batchidx}] = cuda::SaturateCast( (srcWrap[(int4){src_idx.x, src_idx.y, c, batchidx}] - base) * scale * global_scale + global_shift); } } @@ -81,17 +81,17 @@ __device__ void transfer_data(cuda::BorderVarShapeWrap srcWrap, cud { float base = get_base_value(baseWrap, c, base_channels); float scale = get_scale_value(scaleWrap, c, scale_channels, epsilon, flags); - dstWrap[(int4){c, dst_idx.x, dst_idx.y, batchidx}] = cuda::SaturateCast( + dstWrap[(int4){c, dst_idx.x, dst_idx.y, batchidx}] = cuda::SaturateCast( (srcWrap[(int4){src_idx.x, src_idx.y, c, batchidx}] - base) * scale * global_scale + global_shift); } } } -template -__device__ void transfer_data(cuda::BorderVarShapeWrapNHWC srcWrap, cuda::Tensor4DWrap dstWrap, - int2 src_idx, int2 dst_idx, int batchidx, int ch, TensorWrapper baseWrap, - TensorWrapper scaleWrap, float global_scale, float global_shift, float epsilon, - uint32_t flags, int base_channels, int scale_channels, bool dst_planar) +template +__device__ void transfer_data(cuda::BorderVarShapeWrapNHWC srcWrap, DstWrapper dstWrap, int2 src_idx, + int2 dst_idx, int batchidx, int ch, TensorWrapper baseWrap, TensorWrapper scaleWrap, + float global_scale, float global_shift, float epsilon, uint32_t flags, int base_channels, + int scale_channels, bool dst_planar) { if (dst_planar) { @@ -99,7 +99,7 @@ __device__ void transfer_data(cuda::BorderVarShapeWrapNHWC srcWrap, { float base = get_base_value(baseWrap, c, base_channels); float scale = get_scale_value(scaleWrap, c, scale_channels, epsilon, flags); - dstWrap[(int4){dst_idx.x, dst_idx.y, c, batchidx}] = cuda::SaturateCast( + dstWrap[(int4){dst_idx.x, dst_idx.y, c, batchidx}] = cuda::SaturateCast( (srcWrap[(int4){batchidx, src_idx.y, src_idx.x, c}] - base) * scale * global_scale + global_shift); } } @@ -109,27 +109,27 @@ __device__ void transfer_data(cuda::BorderVarShapeWrapNHWC srcWrap, { float base = get_base_value(baseWrap, c, base_channels); float scale = get_scale_value(scaleWrap, c, scale_channels, epsilon, flags); - dstWrap[(int4){c, dst_idx.x, dst_idx.y, batchidx}] = 
cuda::SaturateCast( + dstWrap[(int4){c, dst_idx.x, dst_idx.y, batchidx}] = cuda::SaturateCast( (srcWrap[(int4){batchidx, src_idx.y, src_idx.x, c}] - base) * scale * global_scale + global_shift); } } } -template -__device__ void set_data(cuda::Tensor4DWrap dstWrap, int2 dst_idx, int batchidx, int ch, T2 val, bool dst_planar) +template +__device__ void set_data(DstWrapper dstWrap, int2 dst_idx, int batchidx, int ch, T val, bool dst_planar) { if (dst_planar) { for (int c = 0; c < ch; c++) { - dstWrap[(int4){dst_idx.x, dst_idx.y, c, batchidx}] = cuda::StaticCast(val); + dstWrap[(int4){dst_idx.x, dst_idx.y, c, batchidx}] = cuda::StaticCast(val); } } else { for (int c = 0; c < ch; c++) { - dstWrap[(int4){c, dst_idx.x, dst_idx.y, batchidx}] = cuda::StaticCast(val); + dstWrap[(int4){c, dst_idx.x, dst_idx.y, batchidx}] = cuda::StaticCast(val); } } } @@ -185,20 +185,15 @@ __global__ void slice_flip_normalize(SrcWrapper srcWrap, DstWrapper dstWrap, Fli global_scale, global_shift, epsilon, flags, base_ch, scale_ch, dst_planar); } -template -void RunCropFlipNormalizeReformat(cudaStream_t stream, const nvcv::ImageBatchVarShapeDataStridedCuda &srcData, - const nvcv::TensorDataStridedCuda &dstData, - const nvcv::TensorDataStridedCuda &flipCodeData, - const nvcv::TensorDataStridedCuda &baseData, - const nvcv::TensorDataStridedCuda &scaleData, const float borderValue, - const nvcv::TensorDataStridedCuda &cropRect, float global_scale, float shift, - float epsilon, uint32_t flags, int channel) +template +void RunCropFlipNormalizeReformatS(cudaStream_t stream, const nvcv::ImageBatchVarShapeDataStridedCuda &srcData, + const nvcv::TensorDataStridedCuda &dstData, + const nvcv::TensorDataStridedCuda &flipCodeData, + const nvcv::TensorDataStridedCuda &baseData, + const nvcv::TensorDataStridedCuda &scaleData, const float borderValue, + const nvcv::TensorDataStridedCuda &cropRect, float global_scale, float shift, + float epsilon, uint32_t flags, int channel, int3 out_size, dim3 block, dim3 grid) { - int num_channels = srcData.uniqueFormat().numChannels(); - auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(dstData); - NVCV_ASSERT(outAccess); - const int3 out_size = {outAccess->numCols(), outAccess->numRows(), num_channels}; - nvcv::TensorLayout dst_layout = dstData.layout(); if (!(dst_layout == nvcv::TENSOR_NHWC || dst_layout == nvcv::TENSOR_NCHW)) { @@ -208,28 +203,22 @@ void RunCropFlipNormalizeReformat(cudaStream_t stream, const nvcv::ImageBatchVar bool src_planar = srcData.uniqueFormat().numPlanes() > 1; bool dst_planar = dst_layout == nvcv::TENSOR_NCHW; - nvcv::Size2D maxSize = {outAccess->numCols(), outAccess->numRows()}; - int32_t batchSize = srcData.numImages(); - dim3 block(32, 32, 1); - dim3 grid(std::ceil(maxSize.w / static_cast(block.x)), std::ceil(maxSize.h / static_cast(block.y)), - batchSize); - auto baseAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(baseData); auto scaleAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(scaleData); int scale_channels = scaleAccess->numChannels(); int base_channels = baseAccess->numChannels(); - auto cropRectWrap = cuda::CreateTensorWrapNHWC(cropRect); - cuda::Tensor1DWrap flipCodeWrap(flipCodeData); - auto baseWrap = cuda::CreateTensorWrapNHWC(baseData); - auto scaleWrap = cuda::CreateTensorWrapNHWC(scaleData); + auto cropRectWrap = cuda::CreateTensorWrapNHWC(cropRect); + cuda::Tensor1DWrap flipCodeWrap(flipCodeData); + auto baseWrap = cuda::CreateTensorWrapNHWC(baseData); + auto scaleWrap = cuda::CreateTensorWrapNHWC(scaleData); if 
(src_planar && dst_planar) { cuda::ImageBatchVarShapeWrap srcWrap(srcData); // planar cuda::BorderVarShapeWrap srcBorderWrap(srcWrap, static_cast(borderValue)); - cuda::Tensor4DWrap dstWrap(dstData); // planar + cuda::Tensor4DWrap dstWrap(dstData); // planar slice_flip_normalize<<>>(srcBorderWrap, dstWrap, flipCodeWrap, baseWrap, scaleWrap, cropRectWrap, global_scale, shift, epsilon, flags, channel, base_channels, scale_channels, out_size, dst_planar); @@ -238,7 +227,7 @@ void RunCropFlipNormalizeReformat(cudaStream_t stream, const nvcv::ImageBatchVar { cuda::ImageBatchVarShapeWrap srcWrap(srcData); // planar cuda::BorderVarShapeWrap srcBorderWrap(srcWrap, static_cast(borderValue)); - auto dstWrap = cuda::CreateTensorWrapNHWC(dstData); // interleaved + auto dstWrap = cuda::CreateTensorWrapNHWC(dstData); // interleaved slice_flip_normalize<<>>(srcBorderWrap, dstWrap, flipCodeWrap, baseWrap, scaleWrap, cropRectWrap, global_scale, shift, epsilon, flags, channel, base_channels, scale_channels, out_size, dst_planar); @@ -247,7 +236,7 @@ void RunCropFlipNormalizeReformat(cudaStream_t stream, const nvcv::ImageBatchVar { cuda::ImageBatchVarShapeWrapNHWC srcWrap(srcData, channel); // interleaved cuda::BorderVarShapeWrapNHWC srcBorderWrap(srcWrap, static_cast(borderValue)); - cuda::Tensor4DWrap dstWrap(dstData); // planar + cuda::Tensor4DWrap dstWrap(dstData); // planar slice_flip_normalize<<>>(srcBorderWrap, dstWrap, flipCodeWrap, baseWrap, scaleWrap, cropRectWrap, global_scale, shift, epsilon, flags, channel, base_channels, scale_channels, out_size, dst_planar); @@ -256,13 +245,47 @@ void RunCropFlipNormalizeReformat(cudaStream_t stream, const nvcv::ImageBatchVar { cuda::ImageBatchVarShapeWrapNHWC srcWrap(srcData, channel); // interleaved cuda::BorderVarShapeWrapNHWC srcBorderWrap(srcWrap, static_cast(borderValue)); - auto dstWrap = cuda::CreateTensorWrapNHWC(dstData); // interleaved + auto dstWrap = cuda::CreateTensorWrapNHWC(dstData); // interleaved slice_flip_normalize<<>>(srcBorderWrap, dstWrap, flipCodeWrap, baseWrap, scaleWrap, cropRectWrap, global_scale, shift, epsilon, flags, channel, base_channels, scale_channels, out_size, dst_planar); } } +template +void RunCropFlipNormalizeReformat(cudaStream_t stream, const nvcv::ImageBatchVarShapeDataStridedCuda &srcData, + const nvcv::TensorDataStridedCuda &dstData, + const nvcv::TensorDataStridedCuda &flipCodeData, + const nvcv::TensorDataStridedCuda &baseData, + const nvcv::TensorDataStridedCuda &scaleData, const float borderValue, + const nvcv::TensorDataStridedCuda &cropRect, float global_scale, float shift, + float epsilon, uint32_t flags, int channel) +{ + int num_channels = srcData.uniqueFormat().numChannels(); + auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(dstData); + NVCV_ASSERT(outAccess); + const int3 out_size = {outAccess->numCols(), outAccess->numRows(), num_channels}; + + nvcv::Size2D maxSize = {outAccess->numCols(), outAccess->numRows()}; + int32_t batchSize = srcData.numImages(); + dim3 block(32, 32, 1); + dim3 grid(std::ceil(maxSize.w / static_cast(block.x)), std::ceil(maxSize.h / static_cast(block.y)), + batchSize); + + auto outMaxStride = outAccess->sampleStride() * outAccess->numSamples(); + if (outMaxStride <= cuda::TypeTraits::max) + { + RunCropFlipNormalizeReformatS(stream, srcData, dstData, flipCodeData, baseData, + scaleData, borderValue, cropRect, global_scale, shift, + epsilon, flags, channel, out_size, block, grid); + } + else + { + throw nvcv::Exception(nvcv::Status::ERROR_OVERFLOW, "Output size 
exceeds %d. Tensor is too large.", + cuda::TypeTraits::max); + } +} + template void RunCropFlipNormalizeReformat(cudaStream_t stream, const nvcv::ImageBatchVarShapeDataStridedCuda &srcData, const nvcv::TensorDataStridedCuda &dstData, diff --git a/src/cvcuda/priv/OpHQResize.cu b/src/cvcuda/priv/OpHQResize.cu index e7b924d81..86c069f51 100644 --- a/src/cvcuda/priv/OpHQResize.cu +++ b/src/cvcuda/priv/OpHQResize.cu @@ -1618,6 +1618,21 @@ inline int TensorNumChannels(const nvcv::Tensor &tensor) return shape[channelAxis]; } +inline int64_t TensorByteSize(const nvcv::Tensor &tensor) +{ + auto data = tensor.exportData().cast(); + assert(data); + return data->stride(0) * data->shape(0); +} + +inline int64_t ImageByteSize(const nvcv::Image &image) +{ + auto data = image.exportData(); + assert(data); + auto plane = data->plane(0); // only single-plane images are supported + return plane.rowStride * plane.height; +} + inline int SampleNumChannels(const nvcv::TensorBatch &src, const nvcv::TensorBatch &dst, int sampleIdx) { const auto &srcSample = src[sampleIdx]; @@ -1957,6 +1972,10 @@ public: kIntermediateAlignment); } + auto inMaxStride = shape::TensorByteSize(src); + auto outMaxStride = shape::TensorByteSize(dst); + bool wideStride = std::max(inMaxStride, outMaxStride) > cuda::TypeTraits::max; + RunTypedSwitch( srcDtype, dstDtype, numChannels, [&](auto dummySrcVal, auto intermediateVal, auto dummyDstVal, auto numChannelsVal) @@ -1970,8 +1989,16 @@ public: static_assert(cuda::NumElements == cuda::NumElements); auto &[srcAccess, dstAccess, numSamples, numChannels, srcDtype, dstDtype] = tensorAccess; - RunPasses(sampleDesc, *dstAccess, *srcAccess, intermediate, - numSamples, ws, stream); + if (wideStride) + { + RunPasses( + sampleDesc, *dstAccess, *srcAccess, intermediate, numSamples, ws, stream); + } + else + { + RunPasses( + sampleDesc, *dstAccess, *srcAccess, intermediate, numSamples, ws, stream); + } }); } @@ -1999,6 +2026,9 @@ public: SampleDescT *sampleDescsCpu = allocator.getPinned(numSamples); SampleDescT *sampleDescsGpu = allocator.getCuda(numSamples); size_t intermediateSizes[kNumTmpBuffers]{}; + + int64_t inMaxStride = 0; + int64_t outMaxStride = 0; for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) { const VecI srcShape = shape::SampleShape(src, sampleIdx); @@ -2007,12 +2037,16 @@ public: int numChannels; if constexpr (std::is_same_v) { - numChannels = uniqueNumChannels; + numChannels = uniqueNumChannels; + inMaxStride = std::max(inMaxStride, shape::ImageByteSize(src[sampleIdx])); + outMaxStride = std::max(outMaxStride, shape::ImageByteSize(dst[sampleIdx])); } - else if constexpr (!std::is_same_v) + else { static_assert(std::is_same_v); - numChannels = shape::SampleNumChannels(src, dst, sampleIdx); + numChannels = shape::SampleNumChannels(src, dst, sampleIdx); + inMaxStride = std::max(inMaxStride, shape::TensorByteSize(src[sampleIdx])); + outMaxStride = std::max(outMaxStride, shape::TensorByteSize(dst[sampleIdx])); } SampleDescT &sampleDesc = sampleDescsCpu[sampleIdx]; SetupSampleDesc(sampleDesc, srcShape, dstShape, numChannels, sampleRoi, minFilter, magFilter); @@ -2021,6 +2055,8 @@ public: intermediateSizes[t] += GetPassOutputVolume(sampleDesc, t); } } + bool wideStride = std::max(inMaxStride, outMaxStride) > cuda::TypeTraits::max; + NVCV_CHECK_THROW(cudaMemcpyAsync(sampleDescsGpu, sampleDescsCpu, numSamples * sizeof(SampleDescT), cudaMemcpyHostToDevice, stream)); @@ -2029,7 +2065,8 @@ public: IntermediateBaseT *intermediate[kNumTmpBuffers]; for (int t = 0; t < kNumTmpBuffers; 
t++) { - intermediateMeta[t] = batch_wrapper::dynamic::AllocateDynamicBatchWrapMeta(allocator, numSamples); + intermediateMeta[t] + = batch_wrapper::dynamic::AllocateDynamicBatchWrapMeta(allocator, numSamples, wideStride); } // allocate space for intermediate data for (int t = 0; t < kNumTmpBuffers; t++) @@ -2037,17 +2074,16 @@ public: intermediate[t] = allocator.getCuda(intermediateSizes[t], kIntermediateAlignment); } - RunTyped(sampleDescsCpu, sampleDescsGpu, src, dst, intermediate, intermediateMeta, numSamples, srcDtype, - dstDtype, uniqueNumChannels, ws, stream); + Run(sampleDescsCpu, sampleDescsGpu, src, dst, intermediate, intermediateMeta, numSamples, srcDtype, dstDtype, + uniqueNumChannels, wideStride, ws, stream); } private: - void RunTyped(const SampleDescT *sampleDescsCpu, const SampleDescT *sampleDescsGpu, - const nvcv::ImageBatchVarShape &src, const nvcv::ImageBatchVarShape &dst, - IntermediateBaseT *intermediate[kNumTmpBuffers], - const DynamicBatchWrapMeta intermediateMeta[kNumTmpBuffers], int numSamples, - const nvcv::DataType srcDtype, const nvcv::DataType dstDtype, int uniqueNumChannels, - const cvcuda::Workspace &ws, cudaStream_t stream) const + void Run(const SampleDescT *sampleDescsCpu, const SampleDescT *sampleDescsGpu, const nvcv::ImageBatchVarShape &src, + const nvcv::ImageBatchVarShape &dst, IntermediateBaseT *intermediate[kNumTmpBuffers], + const DynamicBatchWrapMeta intermediateMeta[kNumTmpBuffers], int numSamples, const nvcv::DataType srcDtype, + const nvcv::DataType dstDtype, int uniqueNumChannels, bool wideStride, const cvcuda::Workspace &ws, + cudaStream_t stream) const { static_assert(kSpatialNDim == 2, "ImageBatchVarShape does not support 3D spatial resampling"); @@ -2083,18 +2119,27 @@ private: static_assert(numStaticChannels == cuda::NumElements); static_assert(cuda::NumElements == cuda::NumElements); static_assert(cuda::NumElements == cuda::NumElements); - RunPasses(sampleDescsCpu, sampleDescsGpu, *dstData, - *srcData, intermediate, intermediateMeta, - numSamples, ws, stream); + if (wideStride) + { + RunPasses( + sampleDescsCpu, sampleDescsGpu, *dstData, *srcData, intermediate, intermediateMeta, + numSamples, ws, stream); + } + else + { + RunPasses( + sampleDescsCpu, sampleDescsGpu, *dstData, *srcData, intermediate, intermediateMeta, + numSamples, ws, stream); + } } }); } - void RunTyped(const SampleDescT *sampleDescsCpu, const SampleDescT *sampleDescsGpu, const nvcv::TensorBatch &src, - const nvcv::TensorBatch &dst, IntermediateBaseT *intermediate[kNumTmpBuffers], - const DynamicBatchWrapMeta intermediateMeta[kNumTmpBuffers], int numSamples, - const nvcv::DataType srcDtype, const nvcv::DataType dstDtype, int uniqueNumChannels, - const cvcuda::Workspace &ws, cudaStream_t stream) const + void Run(const SampleDescT *sampleDescsCpu, const SampleDescT *sampleDescsGpu, const nvcv::TensorBatch &src, + const nvcv::TensorBatch &dst, IntermediateBaseT *intermediate[kNumTmpBuffers], + const DynamicBatchWrapMeta intermediateMeta[kNumTmpBuffers], int numSamples, const nvcv::DataType srcDtype, + const nvcv::DataType dstDtype, int uniqueNumChannels, bool wideStride, const cvcuda::Workspace &ws, + cudaStream_t stream) const { // Other cointainer allow exporting data with const qualifiers const auto srcData @@ -2140,13 +2185,23 @@ private: static_assert(cuda::NumElements == cuda::NumElements); static_assert(cuda::NumElements == cuda::NumElements); - RunPasses(sampleDescsCpu, sampleDescsGpu, *dstData, - *srcData, intermediate, intermediateMeta, - numSamples, ws, stream); 
+ if (wideStride) + { + RunPasses( + sampleDescsCpu, sampleDescsGpu, *dstData, *srcData, intermediate, intermediateMeta, numSamples, + ws, stream); + } + else + { + RunPasses( + sampleDescsCpu, sampleDescsGpu, *dstData, *srcData, intermediate, intermediateMeta, numSamples, + ws, stream); + } }); } - template + template std::enable_if_t RunPasses(const SampleDescT &sampleDesc, const nvcv::TensorDataAccessStridedImagePlanar &dstAccess, const nvcv::TensorDataAccessStridedImagePlanar &srcAccess, @@ -2157,16 +2212,17 @@ private: constexpr bool kHasDynamicChannels = kNumStaticChannels == -1; // sample extent, spatial extents, optional dynamic channel extent constexpr int kWrapNDim = 1 + kSpatialNDim + kHasDynamicChannels; - using OutWrap = cuda::TensorNDWrap; - using InWrap = cuda::TensorNDWrap; - using InterWrap = cuda::TensorNDWrap; + using OutWrap = cuda::TensorNDWrap; + using InWrap = cuda::TensorNDWrap; + using InterWrap = cuda::TensorNDWrap; static_assert(std::is_trivially_copyable_v); static_assert(std::is_trivially_copyable_v); static_assert(std::is_trivially_copyable_v); - const OutWrap outWrap = batch_wrapper::tensor::WrapTensor(dstAccess); - const InWrap inWrap = batch_wrapper::tensor::WrapTensor( + const OutWrap outWrap + = batch_wrapper::tensor::WrapTensor(dstAccess); + const InWrap inWrap = batch_wrapper::tensor::WrapTensor( srcAccess, sampleDesc.inRoiOffset); - const InterWrap interWrap = batch_wrapper::tensor::CreateDenseWrap( + const InterWrap interWrap = batch_wrapper::tensor::CreateDenseWrap( intermediate[0], sampleDesc.channels, sampleDesc.shapes[1]); RunPass(sampleDesc, interWrap, inWrap, numSamples, stream); RunPass(sampleDesc, outWrap, interWrap, numSamples, stream); @@ -2176,7 +2232,8 @@ private: } } - template + template std::enable_if_t RunPasses(const SampleDescT &sampleDesc, const nvcv::TensorDataAccessStridedImagePlanar &dstAccess, const nvcv::TensorDataAccessStridedImagePlanar &srcAccess, @@ -2187,19 +2244,22 @@ private: constexpr bool kHasDynamicChannels = kNumStaticChannels == -1; // sample extent, spatial extents, optional dynamic channel extent constexpr int kWrapNDim = 1 + kSpatialNDim + kHasDynamicChannels; - using OutWrap = cuda::TensorNDWrap; - using InWrap = cuda::TensorNDWrap; - using InterWrap = cuda::TensorNDWrap; + using OutWrap = cuda::TensorNDWrap; + using InWrap = cuda::TensorNDWrap; + using InterWrap = cuda::TensorNDWrap; static_assert(std::is_trivially_copyable_v); static_assert(std::is_trivially_copyable_v); static_assert(std::is_trivially_copyable_v); - const OutWrap outWrap = batch_wrapper::tensor::WrapTensor(dstAccess); - const InWrap inWrap = batch_wrapper::tensor::WrapTensor( + const OutWrap outWrap + = batch_wrapper::tensor::WrapTensor(dstAccess); + const InWrap inWrap = batch_wrapper::tensor::WrapTensor( srcAccess, sampleDesc.inRoiOffset); - const InterWrap interWrap0 = batch_wrapper::tensor::CreateDenseWrap( - intermediate[0], sampleDesc.channels, sampleDesc.shapes[1]); - const InterWrap interWrap1 = batch_wrapper::tensor::CreateDenseWrap( - intermediate[1], sampleDesc.channels, sampleDesc.shapes[2]); + const InterWrap interWrap0 + = batch_wrapper::tensor::CreateDenseWrap( + intermediate[0], sampleDesc.channels, sampleDesc.shapes[1]); + const InterWrap interWrap1 + = batch_wrapper::tensor::CreateDenseWrap( + intermediate[1], sampleDesc.channels, sampleDesc.shapes[2]); RunPass(sampleDesc, interWrap0, inWrap, numSamples, stream); RunPass(sampleDesc, interWrap1, interWrap0, numSamples, stream); RunPass(sampleDesc, outWrap, interWrap1, 
numSamples, stream); @@ -2209,8 +2269,8 @@ private: } } - template + template std::enable_if_t RunPasses(const SampleDescT *sampleDescsCpu, const SampleDescT *sampleDescsGpu, const BatchDataStridedCuda &dstData, const BatchDataStridedCuda &srcData, IntermediateBaseT *intermediate[kNumTmpBuffers], @@ -2223,20 +2283,20 @@ private: constexpr int kWrapNDim = 1 + kSpatialNDim + kHasDynamicChannels; using BatchWrapOutT = std::conditional_t, - batch_wrapper::ImageBatchVarShapeWrapAdapter, - batch_wrapper::TensorBatchWrapAdapter>; + batch_wrapper::ImageBatchVarShapeWrapAdapter, + batch_wrapper::TensorBatchWrapAdapter>; using BatchWrapInT = std::conditional_t, - batch_wrapper::ImageBatchVarShapeWrapAdapter, - batch_wrapper::TensorBatchWrapAdapter>; - using DynamicBatchWrap = batch_wrapper::dynamic::DynamicBatchWrap; + batch_wrapper::ImageBatchVarShapeWrapAdapter, + batch_wrapper::TensorBatchWrapAdapter>; + using DynamicBatchWrap = batch_wrapper::dynamic::DynamicBatchWrap; static_assert(std::is_trivially_copyable_v); static_assert(std::is_trivially_copyable_v); static_assert(std::is_trivially_copyable_v); const BatchWrapOutT outWrap(dstData); const BatchWrapInT inWrap(srcData); const DynamicBatchWrap intermediateWrap - = batch_wrapper::dynamic::CreateDynamicBatchWrap( + = batch_wrapper::dynamic::CreateDynamicBatchWrap( 0, intermediate[0], intermediateMeta[0], sampleDescsCpu, numSamples, stream); if (ws.pinnedMem.ready != nullptr) { @@ -2250,7 +2310,8 @@ private: } } - template + template std::enable_if_t RunPasses(const SampleDescT *sampleDescsCpu, const SampleDescT *sampleDescsGpu, const nvcv::TensorBatchDataStridedCuda &dstData, const nvcv::TensorBatchDataStridedCuda &srcData, @@ -2262,19 +2323,19 @@ private: constexpr bool kHasDynamicChannels = kNumStaticChannels == -1; // sample extent, spatial extents, optional dynamic channel extent constexpr int kWrapNDim = 1 + kSpatialNDim + kHasDynamicChannels; - using TensorBatchWrapOutT = batch_wrapper::TensorBatchWrapAdapter; - using TensorBatchWrapInT = batch_wrapper::TensorBatchWrapAdapter; - using DynamicBatchWrap = batch_wrapper::dynamic::DynamicBatchWrap; + using TensorBatchWrapOutT = batch_wrapper::TensorBatchWrapAdapter; + using TensorBatchWrapInT = batch_wrapper::TensorBatchWrapAdapter; + using DynamicBatchWrap = batch_wrapper::dynamic::DynamicBatchWrap; static_assert(std::is_trivially_copyable_v); static_assert(std::is_trivially_copyable_v); static_assert(std::is_trivially_copyable_v); const TensorBatchWrapOutT outWrap(dstData); const TensorBatchWrapInT inWrap(srcData); const DynamicBatchWrap intermediateWrap0 - = batch_wrapper::dynamic::CreateDynamicBatchWrap( + = batch_wrapper::dynamic::CreateDynamicBatchWrap( 0, intermediate[0], intermediateMeta[0], sampleDescsCpu, numSamples, stream); const DynamicBatchWrap intermediateWrap1 - = batch_wrapper::dynamic::CreateDynamicBatchWrap( + = batch_wrapper::dynamic::CreateDynamicBatchWrap( 1, intermediate[1], intermediateMeta[1], sampleDescsCpu, numSamples, stream); if (ws.pinnedMem.ready != nullptr) { diff --git a/src/cvcuda/priv/OpHQResizeBatchWrap.cuh b/src/cvcuda/priv/OpHQResizeBatchWrap.cuh index 8f7b69411..02232975f 100644 --- a/src/cvcuda/priv/OpHQResizeBatchWrap.cuh +++ b/src/cvcuda/priv/OpHQResizeBatchWrap.cuh @@ -77,127 +77,134 @@ auto ComputeDenseStrides(VecI<3> shape, Channels... 
channels) } namespace tensor { -template +template auto CreateDenseWrap(cuda::BaseType *base, const std::array strides) { constexpr int N = kNStrides + 1; for (auto stride : strides) { - NVCV_ASSERT(stride <= cuda::TypeTraits::max); + NVCV_ASSERT(stride <= cuda::TypeTraits::max); } static_assert(2 <= N && N <= 5); if constexpr (N == 5) { - return cuda::TensorNDWrap(base, static_cast(strides[3]), static_cast(strides[2]), - static_cast(strides[1]), static_cast(strides[0])); + return cuda::TensorNDWrap(base, static_cast(strides[3]), + static_cast(strides[2]), static_cast(strides[1]), + static_cast(strides[0])); } else if constexpr (N == 4) { - return cuda::TensorNDWrap(base, static_cast(strides[2]), static_cast(strides[1]), - static_cast(strides[0])); + return cuda::TensorNDWrap(base, static_cast(strides[2]), + static_cast(strides[1]), static_cast(strides[0])); } else if constexpr (N == 3) { - return cuda::TensorNDWrap(base, static_cast(strides[1]), static_cast(strides[0])); + return cuda::TensorNDWrap(base, static_cast(strides[1]), + static_cast(strides[0])); } else if constexpr (N == 2) { - return cuda::TensorNDWrap(base, static_cast(strides[0])); + return cuda::TensorNDWrap(base, static_cast(strides[0])); } } -template +template auto CreateDenseWrap(cuda::BaseType *base, int numChannels, ShapeT shape) { static constexpr int kNStrides = cuda::NumElements + kHasDynamicChannels; if constexpr (kHasDynamicChannels) { auto strides = ComputeDenseStrides(shape, numChannels); - return CreateDenseWrap(base, strides); + return CreateDenseWrap(base, strides); } else if constexpr (!kHasDynamicChannels) { auto strides = ComputeDenseStrides(shape); - return CreateDenseWrap(base, strides); + return CreateDenseWrap(base, strides); } } -template -std::enable_if_t> WrapTensor( +template +std::enable_if_t> WrapTensor( const nvcv::TensorDataAccessStridedImagePlanar &tensorAccess, const ptrdiff_t roiOffset = 0) { - NVCV_ASSERT(tensorAccess.sampleStride() <= cuda::TypeTraits::max); - NVCV_ASSERT(tensorAccess.rowStride() <= cuda::TypeTraits::max); - NVCV_ASSERT(tensorAccess.colStride() <= cuda::TypeTraits::max); + NVCV_ASSERT(tensorAccess.sampleStride() <= cuda::TypeTraits::max); + NVCV_ASSERT(tensorAccess.rowStride() <= cuda::TypeTraits::max); + NVCV_ASSERT(tensorAccess.colStride() <= cuda::TypeTraits::max); if constexpr (kHasDynamicChannels) { - return cuda::TensorNDWrap( - tensorAccess.sampleData(0) + roiOffset, static_cast(tensorAccess.sampleStride()), - static_cast(tensorAccess.rowStride()), static_cast(tensorAccess.colStride())); + return cuda::TensorNDWrap( + tensorAccess.sampleData(0) + roiOffset, static_cast(tensorAccess.sampleStride()), + static_cast(tensorAccess.rowStride()), static_cast(tensorAccess.colStride())); } else { - return cuda::TensorNDWrap(tensorAccess.sampleData(0) + roiOffset, - static_cast(tensorAccess.sampleStride()), - static_cast(tensorAccess.rowStride())); + return cuda::TensorNDWrap(tensorAccess.sampleData(0) + roiOffset, + static_cast(tensorAccess.sampleStride()), + static_cast(tensorAccess.rowStride())); } } -template -std::enable_if_t> WrapTensor( +template +std::enable_if_t> WrapTensor( const nvcv::TensorDataAccessStridedImagePlanar &tensorAccess, const ptrdiff_t roiOffset = 0) { - NVCV_ASSERT(tensorAccess.sampleStride() <= cuda::TypeTraits::max); - NVCV_ASSERT(tensorAccess.depthStride() <= cuda::TypeTraits::max); - NVCV_ASSERT(tensorAccess.rowStride() <= cuda::TypeTraits::max); - NVCV_ASSERT(tensorAccess.colStride() <= cuda::TypeTraits::max); + 
NVCV_ASSERT(tensorAccess.sampleStride() <= cuda::TypeTraits::max); + NVCV_ASSERT(tensorAccess.depthStride() <= cuda::TypeTraits::max); + NVCV_ASSERT(tensorAccess.rowStride() <= cuda::TypeTraits::max); + NVCV_ASSERT(tensorAccess.colStride() <= cuda::TypeTraits::max); if constexpr (kHasDynamicChannels) { - return cuda::TensorNDWrap( - tensorAccess.sampleData(0) + roiOffset, static_cast(tensorAccess.sampleStride()), - static_cast(tensorAccess.depthStride()), static_cast(tensorAccess.rowStride()), - static_cast(tensorAccess.colStride())); + return cuda::TensorNDWrap( + tensorAccess.sampleData(0) + roiOffset, static_cast(tensorAccess.sampleStride()), + static_cast(tensorAccess.depthStride()), static_cast(tensorAccess.rowStride()), + static_cast(tensorAccess.colStride())); } else { - return cuda::TensorNDWrap( - tensorAccess.sampleData(0) + roiOffset, static_cast(tensorAccess.sampleStride()), - static_cast(tensorAccess.depthStride()), static_cast(tensorAccess.rowStride())); + return cuda::TensorNDWrap( + tensorAccess.sampleData(0) + roiOffset, static_cast(tensorAccess.sampleStride()), + static_cast(tensorAccess.depthStride()), static_cast(tensorAccess.rowStride())); } } -template -std::enable_if_t> WrapTensor( +template +std::enable_if_t> WrapTensor( const nvcv::TensorDataAccessStridedImagePlanar &tensorAccess, const VecI<2> &roiOffset) { ptrdiff_t offset = tensorAccess.rowStride() * roiOffset.y + tensorAccess.colStride() * roiOffset.x; - return WrapTensor(tensorAccess, offset); + return WrapTensor(tensorAccess, offset); } -template -std::enable_if_t> WrapTensor( +template +std::enable_if_t> WrapTensor( const nvcv::TensorDataAccessStridedImagePlanar &tensorAccess, const VecI<3> &roiOffset) { ptrdiff_t offset = tensorAccess.depthStride() * roiOffset.z + tensorAccess.rowStride() * roiOffset.y + tensorAccess.colStride() * roiOffset.x; - return WrapTensor(tensorAccess, offset); + return WrapTensor(tensorAccess, offset); } template auto __device__ GetSampleView(const TensorWrap &batchTensorWrap, const int sampleIdx) { using T = typename TensorWrap::ValueType; + using StrideType = typename TensorWrap::StrideType; static constexpr int kNumDimensions = TensorWrap::kNumDimensions; static constexpr int kNumSampleDim = kNumDimensions - 1; // not including sample (N) dim static constexpr int kVariableStrides = kNumSampleDim - 1; // the innermost stride is static - sizeof type - using TensorWrapT = cuda::TensorNDWrap; + using TensorWrapT = cuda::TensorNDWrap; static_assert(kVariableStrides == TensorWrapT::kVariableStrides); static_assert(kVariableStrides + 1 == TensorWrap::kVariableStrides); static_assert(1 <= kVariableStrides && kVariableStrides <= 3); - auto *basePtr = batchTensorWrap.ptr(sampleIdx); - const int *strides = batchTensorWrap.strides(); + auto *basePtr = batchTensorWrap.ptr(sampleIdx); + const auto *strides = batchTensorWrap.strides(); if constexpr (kVariableStrides == 1) { return TensorWrapT{basePtr, strides[1]}; @@ -215,25 +222,30 @@ auto __device__ GetSampleView(const TensorWrap &batchTensorWrap, const int sampl } // namespace tensor namespace dynamic { -struct TensorAccessDesc -{ - static constexpr int kMaxNStrides = 3; +struct TensorAccessDescBase +{ unsigned char *basePtr; - int strides[kMaxNStrides]; }; -template -void SetupTensorAccessStrides(TensorAccessDesc &tensorAccessDesc, const std::array strides) +template +struct TensorAccessDesc : public TensorAccessDescBase +{ + static constexpr int kMaxNStrides = 3; + StrideT strides[kMaxNStrides]; +}; + +template +void 
SetupTensorAccessStrides(TensorAccessDesc *tensorAccessDesc, const std::array strides) { // we ignore the last stride (sample stride), it's not needed for a single sample // as the samples are not assumed to be uniform static constexpr int kNSampleStrides = kNStrides - 1; - static_assert(kNSampleStrides <= TensorAccessDesc::kMaxNStrides); + static_assert(kNSampleStrides <= TensorAccessDesc::kMaxNStrides); for (int d = 0; d < kNSampleStrides; d++) { - NVCV_ASSERT(strides[d] <= cuda::TypeTraits::max); - tensorAccessDesc.strides[kNSampleStrides - 1 - d] = strides[d]; + NVCV_ASSERT(strides[d] <= cuda::TypeTraits::max); + tensorAccessDesc->strides[kNSampleStrides - 1 - d] = strides[d]; } } @@ -241,18 +253,19 @@ void SetupTensorAccessStrides(TensorAccessDesc &tensorAccessDesc, const std::arr * @brief Wrapper for batch of dynamically created samples * (here, batch of intermediate samples between resampling passes) */ -template +template struct DynamicBatchWrap { using ValueType = T; + using StrideType = StrideT; static constexpr int kNumDimensions = N; static constexpr int kNumSampleDim = kNumDimensions - 1; // not including sample (N) dim static constexpr int kVariableStrides = kNumSampleDim - 1; // the innermost stride is static - sizeof type - using TensorWrapT = cuda::TensorNDWrap; + using TensorWrapT = cuda::TensorNDWrap; static_assert(kVariableStrides == TensorWrapT::kVariableStrides); - static_assert(kVariableStrides >= 1 && kVariableStrides <= TensorAccessDesc::kMaxNStrides); + static_assert(kVariableStrides >= 1 && kVariableStrides <= TensorAccessDesc::kMaxNStrides); - DynamicBatchWrap(TensorAccessDesc *samples) + DynamicBatchWrap(TensorAccessDesc *samples) : m_samples{samples} { } @@ -279,34 +292,43 @@ struct DynamicBatchWrap } private: - TensorAccessDesc *m_samples; + TensorAccessDesc *m_samples; }; struct DynamicBatchWrapMeta { - TensorAccessDesc *cpu; - TensorAccessDesc *gpu; + TensorAccessDescBase *cpu; + TensorAccessDescBase *gpu; }; inline void AddDynamicBatchWrapMeta(WorkspaceEstimator &est, int numSamples) { - est.addPinned(numSamples); - est.addCuda(numSamples); + est.addPinned>(numSamples); + est.addCuda>(numSamples); } -inline DynamicBatchWrapMeta AllocateDynamicBatchWrapMeta(WorkspaceAllocator &allocator, int numSamples) +inline DynamicBatchWrapMeta AllocateDynamicBatchWrapMeta(WorkspaceAllocator &allocator, int numSamples, bool wideStride) { DynamicBatchWrapMeta meta; - meta.cpu = allocator.getPinned(numSamples); - meta.gpu = allocator.getCuda(numSamples); + if (wideStride) + { + meta.cpu = allocator.getPinned>(numSamples); + meta.gpu = allocator.getCuda>(numSamples); + } + else + { + meta.cpu = allocator.getPinned>(numSamples); + meta.gpu = allocator.getCuda>(numSamples); + } return meta; } -template -DynamicBatchWrap CreateDynamicBatchWrap(int pass, cuda::BaseType *intermediate, - const DynamicBatchWrapMeta tensorBatchMeta, - const SampleDescT *sampleDescsCpu, int numSamples, cudaStream_t stream) +DynamicBatchWrap CreateDynamicBatchWrap(int pass, cuda::BaseType *intermediate, + const DynamicBatchWrapMeta tensorBatchMeta, + const SampleDescT *sampleDescsCpu, int numSamples, + cudaStream_t stream) { static constexpr int kSpatialNDim = SampleDescT::kSpatialNDim; static_assert(N == 1 + kSpatialNDim + kHasDynamicChannels); @@ -316,8 +338,9 @@ DynamicBatchWrap CreateDynamicBatchWrap(int pass, cuda::BaseType *inter { const SampleDescT &sampleDesc = sampleDescsCpu[sampleIdx]; VecI outputShape = sampleDesc.shapes[pass + 1]; - TensorAccessDesc &tensorAccess = 
tensorBatchMeta.cpu[sampleIdx]; - tensorAccess.basePtr = reinterpret_cast(intermediate) + sampleOffset; + auto *cpuMeta = reinterpret_cast *>(tensorBatchMeta.cpu); + auto *tensorAccess = &cpuMeta[sampleIdx]; + tensorAccess->basePtr = reinterpret_cast(intermediate) + sampleOffset; if constexpr (kHasDynamicChannels) { constexpr int kNStrides = kSpatialNDim + 1; @@ -333,21 +356,22 @@ DynamicBatchWrap CreateDynamicBatchWrap(int pass, cuda::BaseType *inter sampleOffset += strides[kNStrides - 1]; } } - NVCV_CHECK_THROW(cudaMemcpyAsync(tensorBatchMeta.gpu, tensorBatchMeta.cpu, numSamples * sizeof(TensorAccessDesc), - cudaMemcpyHostToDevice, stream)); + NVCV_CHECK_THROW(cudaMemcpyAsync(tensorBatchMeta.gpu, tensorBatchMeta.cpu, + numSamples * sizeof(TensorAccessDesc), cudaMemcpyHostToDevice, stream)); - return {tensorBatchMeta.gpu}; + return {reinterpret_cast *>(tensorBatchMeta.gpu)}; } } // namespace dynamic -template +template struct ImageBatchVarShapeWrapAdapter { using ValueType = T; + using StrideType = StrideT; static constexpr int kNumDimensions = 3; // NHW static constexpr int kNumSampleDim = 2; // HW static constexpr int kVariableStrides = 1; // the innermost stride is static - sizeof type - using TensorWrapT = cuda::TensorNDWrap; + using TensorWrapT = cuda::TensorNDWrap; static_assert(kVariableStrides == TensorWrapT::kVariableStrides); ImageBatchVarShapeWrapAdapter(const nvcv::ImageBatchVarShapeDataStridedCuda &batchData) @@ -369,15 +393,16 @@ private: cuda::ImageBatchVarShapeWrap m_batch; }; -template +template struct TensorBatchWrapAdapter { using ValueType = T; + using StrideType = StrideT; static constexpr int kNumDimensions = N; static constexpr int kNumSampleDim = kNumDimensions - 1; // not including sample (N) dim static constexpr int kVariableStrides = kNumSampleDim - 1; - using TensorWrapT = cuda::TensorNDWrap; - using TensorBatchWrapT = cuda::TensorBatchNDWrap; + using TensorWrapT = cuda::TensorNDWrap; + using TensorBatchWrapT = cuda::TensorBatchNDWrap; static_assert(kVariableStrides == TensorWrapT::kVariableStrides); static_assert(kVariableStrides == TensorBatchWrapT::kVariableStrides); diff --git a/src/cvcuda/priv/OpLabel.cu b/src/cvcuda/priv/OpLabel.cu index b552e8656..0ea67c5f8 100644 --- a/src/cvcuda/priv/OpLabel.cu +++ b/src/cvcuda/priv/OpLabel.cu @@ -72,6 +72,9 @@ constexpr int REGION_NOT_MARKED = 0; constexpr int REGION_REMOVED = 1; constexpr int REGION_INSIDE_MASK = 2; +template +using ArgWrap = cuda::Tensor1DWrap; + // CUDA kernels ---------------------------------------------------------------- template @@ -126,10 +129,10 @@ __device__ DT Reduction(DT *labels, DT label1, DT label2) // -- 2D kernels -- -template -__global__ void BlockLabel2D(cuda::Tensor3DWrap
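The hunks above thread a `StrideT` template parameter through the tensor/batch wrappers and the dynamic `TensorAccessDesc`, so intermediate buffers whose byte offsets exceed the 32-bit range can switch to 64-bit stride descriptors while smaller batches keep the narrow ones (`AllocateDynamicBatchWrapMeta`'s `wideStride` flag). A minimal, self-contained sketch of that selection logic, using hypothetical `AccessDesc`/`NeedsWideStride` names rather than the library's types:

```cpp
#include <cstdint>
#include <iostream>
#include <limits>

// Hypothetical stand-in for a stride-typed access descriptor: the stride
// array is stored in either 32-bit or 64-bit integers.
template<typename StrideT>
struct AccessDesc
{
    StrideT strides[3];
};

// Pick the narrow descriptor only when every byte offset fits in int32_t,
// mirroring the idea behind the wideStride flag in the diff.
inline bool NeedsWideStride(int64_t totalBytes)
{
    return totalBytes > std::numeric_limits<int32_t>::max();
}

int main()
{
    // Example: a 4096 x 4096 RGBA float intermediate buffer per sample.
    int64_t sampleBytes = 4096LL * 4096LL * 4 * (int64_t)sizeof(float);
    int     numSamples  = 64;

    bool wide = NeedsWideStride(sampleBytes * numSamples);
    std::cout << (wide ? "use AccessDesc<int64_t>\n" : "use AccessDesc<int32_t>\n");
    std::cout << "descriptor size: "
              << (wide ? sizeof(AccessDesc<int64_t>) : sizeof(AccessDesc<int32_t>))
              << " bytes\n";
    return 0;
}
```

Keeping the 32-bit descriptor as the default presumably halves the per-sample metadata and keeps the kernel-side index arithmetic in 32 bits, paying for 64-bit strides only when the batch is actually that large.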
dst, cuda::Tensor3DWrap src, cuda::Tensor1DWrap minThresh, - cuda::Tensor1DWrap maxThresh, int2 size) +template +__global__ void BlockLabel2D(DstWrap dst, SrcWrap src, ArgWrap minThresh, ArgWrap maxThresh, int2 size) { + using DT = typename DstWrap::ValueType; __shared__ DT labels[BW * BH]; int2 tc = cuda::StaticCast(cuda::DropCast<2>(threadIdx)); @@ -211,10 +214,10 @@ __global__ void BlockLabel2D(cuda::Tensor3DWrap
dst, cuda::Tensor3DWrap } } -template -__global__ void YLabelReduction2D(cuda::Tensor3DWrap
dst, cuda::Tensor3DWrap src, - cuda::Tensor1DWrap minThresh, cuda::Tensor1DWrap maxThresh, int2 size) +template +__global__ void YLabelReduction2D(DstWrap dst, SrcWrap src, ArgWrap minThresh, ArgWrap maxThresh, int2 size) { + using DT = typename DstWrap::ValueType; int3 gc; gc.x = blockIdx.x * blockDim.x + threadIdx.x; gc.y = (blockIdx.y * blockDim.y + threadIdx.y) * blockDim.y + blockDim.y; @@ -261,10 +264,11 @@ __global__ void YLabelReduction2D(cuda::Tensor3DWrap
dst, cuda::Tensor3DWrap } } -template -__global__ void XLabelReduction2D(cuda::Tensor3DWrap
dst, cuda::Tensor3DWrap src, - cuda::Tensor1DWrap minThresh, cuda::Tensor1DWrap maxThresh, int2 size) +template +__global__ void XLabelReduction2D(DstWrap dst, SrcWrap src, ArgWrap minThresh, ArgWrap maxThresh, int2 size) { + using DT = typename DstWrap::ValueType; + int3 gc; gc.x = (blockIdx.y * blockDim.y + threadIdx.y) * blockDim.x + blockDim.x; gc.y = blockIdx.x * blockDim.x + threadIdx.x; @@ -313,8 +317,8 @@ __global__ void XLabelReduction2D(cuda::Tensor3DWrap
dst, cuda::Tensor3DWrap } } -template -__global__ void ResolveLabels2D(cuda::Tensor3DWrap
dst, int2 size) +template +__global__ void ResolveLabels2D(DstWrap dst, int2 size) { int3 gc; gc.x = blockIdx.x * blockDim.x + threadIdx.x; @@ -329,11 +333,12 @@ __global__ void ResolveLabels2D(cuda::Tensor3DWrap
dst, int2 size) dst[gc] = FindRoot(dst.ptr(gc.z), dst[gc]); } -template -__global__ void ReplaceBgLabels2D(cuda::Tensor3DWrap
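Throughout OpLabel.cu the kernels are refactored so that the wrapper types themselves (`DstWrap`, `SrcWrap`, `ArgWrap`, `StatsWrap`) are template parameters, and the element type is recovered inside the kernel via `typename DstWrap::ValueType`; this lets one kernel body accept wrappers with different stride types. A rough host-side sketch of the same pattern with a hypothetical `SimpleWrap` (not CV-CUDA's wrapper):

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical wrapper: carries both the element type and the stride type.
template<typename T, typename StrideT = int32_t>
struct SimpleWrap
{
    using ValueType  = T;
    using StrideType = StrideT;

    T      *data;
    StrideT rowStride; // in elements, for simplicity

    T &at(int y, int x) { return data[static_cast<StrideT>(y) * rowStride + x]; }
};

// The kernel-style function is templated on the wrapper, not on the element
// type; the element type is recovered from the wrapper, as the refactored
// OpLabel kernels do with `using DT = typename DstWrap::ValueType`.
template<class DstWrap, class SrcWrap>
void FillLabels(DstWrap dst, SrcWrap src, int w, int h)
{
    using DT = typename DstWrap::ValueType;
    for (int y = 0; y < h; ++y)
        for (int x = 0; x < w; ++x)
            dst.at(y, x) = static_cast<DT>(src.at(y, x) > 0 ? y * w + x : 0);
}

int main()
{
    std::vector<uint8_t>  src(16, 1);
    std::vector<uint32_t> dst(16, 0);
    FillLabels(SimpleWrap<uint32_t>{dst.data(), 4}, SimpleWrap<uint8_t>{src.data(), 4}, 4, 4);
    std::cout << "label at (3,3): " << dst[15] << "\n"; // prints 15
    return 0;
}
```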
dst, cuda::Tensor3DWrap src, - cuda::Tensor1DWrap bgLabel, cuda::Tensor1DWrap minThresh, - cuda::Tensor1DWrap maxThresh, int2 size) +template +__global__ void ReplaceBgLabels2D(DstWrap dst, SrcWrap src, ArgWrap bgLabel, ArgWrap minThresh, + ArgWrap maxThresh, int2 size) { + using DT = typename DstWrap::ValueType; + int3 gc; gc.x = blockIdx.x * blockDim.x + threadIdx.x; gc.y = blockIdx.y * blockDim.y + threadIdx.y; @@ -379,9 +384,9 @@ __global__ void ReplaceBgLabels2D(cuda::Tensor3DWrap
dst, cuda::Tensor3DWrap } } -template -__global__ void CountLabels2D(cuda::Tensor1DWrap
count, cuda::Tensor3DWrap<DT> stats, cuda::Tensor3DWrap<DT>
dst, - cuda::Tensor1DWrap bgLabel, int2 size, int maxCapacity) +template +__global__ void CountLabels2D(ArgWrap
count, StatsWrap stats, DstWrap dst, ArgWrap bgLabel, int2 size, + int maxCapacity) { int3 gc; gc.x = blockIdx.x * blockDim.x + threadIdx.x; @@ -440,10 +445,11 @@ __global__ void CountLabels2D(cuda::Tensor1DWrap
count, cuda::Tensor3DWrap -__global__ void ComputeStats2D(cuda::Tensor3DWrap
stats, cuda::Tensor3DWrap<DT>
dst, cuda::Tensor3DWrap mask, - cuda::Tensor1DWrap bgLabel, int2 size, int maskN, bool relabel) +template +__global__ void ComputeStats2D(StatsWrap stats, DstWrap dst, MaskWrap mask, ArgWrap bgLabel, int2 size, int maskN, + bool relabel) { + using DT = typename DstWrap::ValueType; int3 gc; gc.x = blockIdx.x * blockDim.x + threadIdx.x; gc.y = blockIdx.y * blockDim.y + threadIdx.y; @@ -525,11 +531,11 @@ __global__ void ComputeStats2D(cuda::Tensor3DWrap
stats, cuda::Tensor3DWrap< } } -template -__global__ void RemoveIslands2D(cuda::Tensor3DWrap
stats, cuda::Tensor3DWrap<DT>
dst, - cuda::Tensor1DWrap bgLabel, cuda::Tensor1DWrap
minSize, int2 size, bool relabel, - bool hasMask) +template +__global__ void RemoveIslands2D(StatsWrap stats, DstWrap dst, ArgWrap bgLabel, ArgWrap
minSize, int2 size, + bool relabel, bool hasMask) { + static_assert(std::is_same_v); int3 gc; gc.x = blockIdx.x * blockDim.x + threadIdx.x; gc.y = blockIdx.y * blockDim.y + threadIdx.y; @@ -604,10 +610,11 @@ __global__ void RemoveIslands2D(cuda::Tensor3DWrap
stats, cuda::Tensor3DWrap } } -template -__global__ void Relabel2D(cuda::Tensor3DWrap
stats, cuda::Tensor3DWrap<DT>
dst, cuda::Tensor1DWrap bgLabel, - cuda::Tensor1DWrap
minSize, int2 size, bool relabel, bool hasMask) +template +__global__ void Relabel2D(StatsWrap stats, DstWrap dst, ArgWrap bgLabel, ArgWrap
minSize, int2 size, + bool relabel, bool hasMask) { + static_assert(std::is_same_v); int3 gc; gc.x = blockIdx.x * blockDim.x + threadIdx.x; gc.y = blockIdx.y * blockDim.y + threadIdx.y; @@ -665,10 +672,11 @@ __global__ void Relabel2D(cuda::Tensor3DWrap
stats, cuda::Tensor3DWrap
d // -- 3D kernels -- -template -__global__ void BlockLabel3D(cuda::Tensor4DWrap
dst, cuda::Tensor4DWrap src, cuda::Tensor1DWrap minThresh, - cuda::Tensor1DWrap maxThresh, int4 shape) +template +__global__ void BlockLabel3D(DstWrap dst, SrcWrap src, ArgWrap minThresh, ArgWrap maxThresh, int4 shape) { + using DT = typename DstWrap::ValueType; + __shared__ DT labels[BW * BH * BD]; int3 tc = cuda::StaticCast(threadIdx); @@ -775,10 +783,10 @@ __global__ void BlockLabel3D(cuda::Tensor4DWrap
dst, cuda::Tensor4DWrap } } -template -__global__ void ZLabelReduction3D(cuda::Tensor4DWrap
dst, cuda::Tensor4DWrap src, - cuda::Tensor1DWrap minThresh, cuda::Tensor1DWrap maxThresh, int4 shape) +template +__global__ void ZLabelReduction3D(DstWrap dst, SrcWrap src, ArgWrap minThresh, ArgWrap maxThresh, int4 shape) { + using DT = typename DstWrap::ValueType; int4 gc; gc.x = ((blockIdx.x * blockDim.x) + threadIdx.x); gc.y = ((blockIdx.y * blockDim.y) + threadIdx.y); @@ -857,10 +865,11 @@ __global__ void ZLabelReduction3D(cuda::Tensor4DWrap
dst, cuda::Tensor4DWrap } } -template -__global__ void YLabelReduction3D(cuda::Tensor4DWrap
dst, cuda::Tensor4DWrap src, - cuda::Tensor1DWrap minThresh, cuda::Tensor1DWrap maxThresh, int4 shape) +template +__global__ void YLabelReduction3D(DstWrap dst, SrcWrap src, ArgWrap minThresh, ArgWrap maxThresh, int4 shape) { + using DT = typename DstWrap::ValueType; + int4 gc; gc.x = ((blockIdx.x * blockDim.x) + threadIdx.x); gc.y = ((blockIdx.z * blockDim.z) + threadIdx.z) * blockDim.y + blockDim.y; @@ -939,10 +948,11 @@ __global__ void YLabelReduction3D(cuda::Tensor4DWrap
dst, cuda::Tensor4DWrap } } -template -__global__ void XLabelReduction3D(cuda::Tensor4DWrap
dst, cuda::Tensor4DWrap src, - cuda::Tensor1DWrap minThresh, cuda::Tensor1DWrap maxThresh, int4 shape) +template +__global__ void XLabelReduction3D(DstWrap dst, SrcWrap src, ArgWrap minThresh, ArgWrap maxThresh, int4 shape) { + using DT = typename DstWrap::ValueType; + int4 gc; gc.x = ((blockIdx.z * blockDim.z) + threadIdx.z) * blockDim.x + blockDim.x; gc.y = ((blockIdx.y * blockDim.y) + threadIdx.y); @@ -1021,8 +1031,8 @@ __global__ void XLabelReduction3D(cuda::Tensor4DWrap
dst, cuda::Tensor4DWrap } } -template -__global__ void ResolveLabels3D(cuda::Tensor4DWrap
dst, int4 shape) +template +__global__ void ResolveLabels3D(DstWrap dst, int4 shape) { int4 gc; gc.x = blockIdx.x * blockDim.x + threadIdx.x; @@ -1040,11 +1050,12 @@ __global__ void ResolveLabels3D(cuda::Tensor4DWrap
dst, int4 shape) } } -template -__global__ void ReplaceBgLabels3D(cuda::Tensor4DWrap
dst, cuda::Tensor4DWrap src, - cuda::Tensor1DWrap bgLabel, cuda::Tensor1DWrap minThresh, - cuda::Tensor1DWrap maxThresh, int4 shape) +template +__global__ void ReplaceBgLabels3D(DstWrap dst, SrcWrap src, ArgWrap bgLabel, ArgWrap minThresh, + ArgWrap maxThresh, int4 shape) { + using DT = typename DstWrap::ValueType; + int4 gc; gc.x = blockIdx.x * blockDim.x + threadIdx.x; gc.y = blockIdx.y * blockDim.y + threadIdx.y; @@ -1094,10 +1105,11 @@ __global__ void ReplaceBgLabels3D(cuda::Tensor4DWrap
dst, cuda::Tensor4DWrap } } -template -__global__ void CountLabels3D(cuda::Tensor1DWrap
count, cuda::Tensor3DWrap<DT> stats, cuda::Tensor4DWrap<DT>
dst, - cuda::Tensor1DWrap bgLabel, int4 shape, int maxCapacity) +template +__global__ void CountLabels3D(ArgWrap
count, StatsWrap stats, DstWrap dst, ArgWrap bgLabel, int4 shape, + int maxCapacity) { + static_assert(std::is_same_v); int4 gc; gc.x = blockIdx.x * blockDim.x + threadIdx.x; gc.y = blockIdx.y * blockDim.y + threadIdx.y; @@ -1161,10 +1173,14 @@ __global__ void CountLabels3D(cuda::Tensor1DWrap
count, cuda::Tensor3DWrap -__global__ void ComputeStats3D(cuda::Tensor3DWrap
stats, cuda::Tensor4DWrap<DT>
dst, cuda::Tensor4DWrap mask, - cuda::Tensor1DWrap bgLabel, int4 shape, int maskN, bool relabel) +template +__global__ void ComputeStats3D(StatsWrap stats, DstWrap dst, MaskWrap mask, ArgWrap bgLabel, int4 shape, int maskN, + bool relabel) { + using DT = typename DstWrap::ValueType; + using MT = typename MaskWrap::ValueType; + static_assert(std::is_same_v); + int4 gc; gc.x = blockIdx.x * blockDim.x + threadIdx.x; gc.y = blockIdx.y * blockDim.y + threadIdx.y; @@ -1251,11 +1267,11 @@ __global__ void ComputeStats3D(cuda::Tensor3DWrap
stats, cuda::Tensor4DWrap< } } -template -__global__ void RemoveIslands3D(cuda::Tensor3DWrap
stats, cuda::Tensor4DWrap<DT>
dst, - cuda::Tensor1DWrap bgLabel, cuda::Tensor1DWrap
minSize, int4 shape, +template +__global__ void RemoveIslands3D(StatsWrap stats, DstWrap dst, ArgWrap bgLabel, ArgWrap
minSize, int4 shape, bool relabel, bool hasMask) { + static_assert(std::is_same_v); int4 gc; gc.x = blockIdx.x * blockDim.x + threadIdx.x; gc.y = blockIdx.y * blockDim.y + threadIdx.y; @@ -1333,10 +1349,11 @@ __global__ void RemoveIslands3D(cuda::Tensor3DWrap
stats, cuda::Tensor4DWrap } } -template -__global__ void Relabel3D(cuda::Tensor3DWrap
stats, cuda::Tensor4DWrap<DT>
dst, cuda::Tensor1DWrap bgLabel, - cuda::Tensor1DWrap
minSize, int4 shape, bool relabel, bool hasMask) +template +__global__ void Relabel3D(StatsWrap stats, DstWrap dst, ArgWrap bgLabel, ArgWrap
minSize, int4 shape, + bool relabel, bool hasMask) { + static_assert(std::is_same_v); int4 gc; gc.x = blockIdx.x * blockDim.x + threadIdx.x; gc.y = blockIdx.y * blockDim.y + threadIdx.y; @@ -1431,6 +1448,8 @@ inline void RunLabelForType(cudaStream_t stream, const nvcv::TensorDataStridedCu throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Too big out tensor"); } + using SType = int32_t; + nvcv::Optional mskData; int4 mskIdsNDHW = {0, 0, 0, 0}; int maskN = 0; @@ -1456,9 +1475,9 @@ inline void RunLabelForType(cudaStream_t stream, const nvcv::TensorDataStridedCu maskN = mskIdsNDHW.x == -1 ? 1 : (int)mskData->shape()[mskIdsNDHW.x]; } - cuda::Tensor1DWrap bgLabelWrap, minThreshWrap, maxThreshWrap; - cuda::Tensor1DWrap minSizeWrap, countWrap; - cuda::Tensor3DWrap statsWrap; + ArgWrap bgLabelWrap, minThreshWrap, maxThreshWrap; + ArgWrap minSizeWrap, countWrap; + cuda::Tensor3DWrap statsWrap; int maxCapacity = 0; @@ -1473,10 +1492,10 @@ inline void RunLabelForType(cudaStream_t stream, const nvcv::TensorDataStridedCu TENSORWRAP = WRAPPER(data->basePtr()); \ } - CVCUDA_LABEL_WRAP(bgLabel, cuda::Tensor1DWrap, bgLabelWrap); - CVCUDA_LABEL_WRAP(minThresh, cuda::Tensor1DWrap, minThreshWrap); - CVCUDA_LABEL_WRAP(maxThresh, cuda::Tensor1DWrap, maxThreshWrap); - CVCUDA_LABEL_WRAP(minSize, cuda::Tensor1DWrap, minSizeWrap); + CVCUDA_LABEL_WRAP(bgLabel, ArgWrap, bgLabelWrap); + CVCUDA_LABEL_WRAP(minThresh, ArgWrap, minThreshWrap); + CVCUDA_LABEL_WRAP(maxThresh, ArgWrap, maxThreshWrap); + CVCUDA_LABEL_WRAP(minSize, ArgWrap, minSizeWrap); #undef CVCUDA_LABEL_WRAP @@ -1488,7 +1507,7 @@ inline void RunLabelForType(cudaStream_t stream, const nvcv::TensorDataStridedCu throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "count tensor must be cuda-accessible"); } - countWrap = cuda::Tensor1DWrap(data->basePtr()); + countWrap = ArgWrap(data->basePtr()); NVCV_CHECK_THROW(cudaMemsetAsync(data->basePtr(), 0, sizeof(DstT) * shapeWHDN.w, stream)); } @@ -1500,7 +1519,7 @@ inline void RunLabelForType(cudaStream_t stream, const nvcv::TensorDataStridedCu throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "stats tensor must be cuda-accessible"); } - statsWrap = cuda::Tensor3DWrap(data->basePtr(), (int)data->stride(0), (int)data->stride(1)); + statsWrap = cuda::Tensor3DWrap(data->basePtr(), (int)data->stride(0), (int)data->stride(1)); maxCapacity = data->shape(1); } @@ -1519,16 +1538,16 @@ inline void RunLabelForType(cudaStream_t stream, const nvcv::TensorDataStridedCu dim3 redBlocksX(util::DivUp(sizeWH.y, BW), util::DivUp((int)labBlocks.x, BH), shapeWHDN.w); dim3 redBlocksY(util::DivUp(sizeWH.x, BW), util::DivUp((int)labBlocks.y, BH), shapeWHDN.w); - cuda::Tensor3DWrap srcWrap(srcData.basePtr(), srcStridesNH.x, srcStridesNH.y); - cuda::Tensor3DWrap dstWrap(dstData.basePtr(), dstStridesNH.x, dstStridesNH.y); - cuda::Tensor3DWrap mskWrap; + cuda::Tensor3DWrap srcWrap(srcData.basePtr(), srcStridesNH.x, srcStridesNH.y); + cuda::Tensor3DWrap dstWrap(dstData.basePtr(), dstStridesNH.x, dstStridesNH.y); + cuda::Tensor3DWrap mskWrap; if (hasMask) { int2 mskStridesNH{0, (int)mskData->stride(mskIdsNDHW.z)}; mskStridesNH.x = mskIdsNDHW.x == -1 ? 
mskStridesNH.y * shapeWHDN.y : (int)mskData->stride(mskIdsNDHW.x); - mskWrap = cuda::Tensor3DWrap(mskData->basePtr(), mskStridesNH.x, mskStridesNH.y); + mskWrap = cuda::Tensor3DWrap(mskData->basePtr(), mskStridesNH.x, mskStridesNH.y); } BlockLabel2D @@ -1582,16 +1601,17 @@ inline void RunLabelForType(cudaStream_t stream, const nvcv::TensorDataStridedCu dim3 redBlocksY(util::DivUp(shapeWHDN.x, BW), util::DivUp(shapeWHDN.z, BH), util::DivUp((int)labBlocks.y, BD)); dim3 redBlocksZ(util::DivUp(shapeWHDN.x, BW), util::DivUp(shapeWHDN.y, BH), util::DivUp((int)labBlocks.z, BD)); - cuda::Tensor4DWrap srcWrap(srcData.basePtr(), srcStridesNDH.x, srcStridesNDH.y, srcStridesNDH.z); - cuda::Tensor4DWrap dstWrap(dstData.basePtr(), dstStridesNDH.x, dstStridesNDH.y, dstStridesNDH.z); - cuda::Tensor4DWrap mskWrap; + cuda::Tensor4DWrap srcWrap(srcData.basePtr(), srcStridesNDH.x, srcStridesNDH.y, srcStridesNDH.z); + cuda::Tensor4DWrap dstWrap(dstData.basePtr(), dstStridesNDH.x, dstStridesNDH.y, dstStridesNDH.z); + cuda::Tensor4DWrap mskWrap; if (hasMask) { int3 mskStridesNDH{0, (int)mskData->stride(mskIdsNDHW.y), (int)mskData->stride(mskIdsNDHW.z)}; mskStridesNDH.x = mskIdsNDHW.x == -1 ? mskStridesNDH.y * shapeWHDN.z : (int)mskData->stride(mskIdsNDHW.x); - mskWrap = cuda::Tensor4DWrap(mskData->basePtr(), mskStridesNDH.x, mskStridesNDH.y, mskStridesNDH.z); + mskWrap = cuda::Tensor4DWrap(mskData->basePtr(), mskStridesNDH.x, mskStridesNDH.y, + mskStridesNDH.z); } BlockLabel3D diff --git a/src/cvcuda/priv/OpMinMaxLoc.cu b/src/cvcuda/priv/OpMinMaxLoc.cu index ba16fbe76..60107e85d 100644 --- a/src/cvcuda/priv/OpMinMaxLoc.cu +++ b/src/cvcuda/priv/OpMinMaxLoc.cu @@ -571,15 +571,15 @@ inline void RunMinMaxLocForType(cudaStream_t stream, const DataStridedCuda &inDa if (minValData && maxValData) { - cuda::Tensor1DWrap> minWrap(minValData->get().basePtr()); - cuda::Tensor1DWrap> maxWrap(maxValData->get().basePtr()); + cuda::Tensor1DWrap, int32_t> minWrap(minValData->get().basePtr()); + cuda::Tensor1DWrap, int32_t> maxWrap(maxValData->get().basePtr()); auto outWrap = OutputWrapper(minWrap, maxWrap); - cuda::Tensor2DWrap minLocWrap(minLocData->get().basePtr(), (int)minLocData->get().stride(0)); - cuda::Tensor2DWrap maxLocWrap(maxLocData->get().basePtr(), (int)maxLocData->get().stride(0)); - cuda::Tensor1DWrap numMinWrap(numMinData->get().basePtr()); - cuda::Tensor1DWrap numMaxWrap(numMaxData->get().basePtr()); + cuda::Tensor2DWrap minLocWrap(minLocData->get().basePtr(), (int)minLocData->get().stride(0)); + cuda::Tensor2DWrap maxLocWrap(maxLocData->get().basePtr(), (int)maxLocData->get().stride(0)); + cuda::Tensor1DWrap numMinWrap(numMinData->get().basePtr()); + cuda::Tensor1DWrap numMaxWrap(numMaxData->get().basePtr()); int minLocCapacity = minLocData->get().shape(GetCapacityIdx(minLocData->get().rank())); int maxLocCapacity = maxLocData->get().shape(GetCapacityIdx(maxLocData->get().rank())); @@ -597,12 +597,12 @@ inline void RunMinMaxLocForType(cudaStream_t stream, const DataStridedCuda &inDa } else if (minValData) { - cuda::Tensor1DWrap> minWrap(minValData->get().basePtr()); + cuda::Tensor1DWrap, int32_t> minWrap(minValData->get().basePtr()); auto outWrap = OutputWrapper(minWrap); - cuda::Tensor2DWrap minLocWrap(minLocData->get().basePtr(), (int)minLocData->get().stride(0)); - cuda::Tensor1DWrap numMinWrap(numMinData->get().basePtr()); + cuda::Tensor2DWrap minLocWrap(minLocData->get().basePtr(), (int)minLocData->get().stride(0)); + cuda::Tensor1DWrap numMinWrap(numMinData->get().basePtr()); int minLocCapacity = 
minLocData->get().shape(GetCapacityIdx(minLocData->get().rank())); @@ -618,12 +618,12 @@ inline void RunMinMaxLocForType(cudaStream_t stream, const DataStridedCuda &inDa } else if (maxValData) { - cuda::Tensor1DWrap> maxWrap(maxValData->get().basePtr()); + cuda::Tensor1DWrap, int32_t> maxWrap(maxValData->get().basePtr()); auto outWrap = OutputWrapper(maxWrap); - cuda::Tensor2DWrap maxLocWrap(maxLocData->get().basePtr(), (int)maxLocData->get().stride(0)); - cuda::Tensor1DWrap numMaxWrap(numMaxData->get().basePtr()); + cuda::Tensor2DWrap maxLocWrap(maxLocData->get().basePtr(), (int)maxLocData->get().stride(0)); + cuda::Tensor1DWrap numMaxWrap(numMaxData->get().basePtr()); int maxLocCapacity = maxLocData->get().shape(GetCapacityIdx(maxLocData->get().rank())); diff --git a/src/cvcuda/priv/OpNonMaximumSuppression.cu b/src/cvcuda/priv/OpNonMaximumSuppression.cu index b8c65739f..d21f77d43 100644 --- a/src/cvcuda/priv/OpNonMaximumSuppression.cu +++ b/src/cvcuda/priv/OpNonMaximumSuppression.cu @@ -63,9 +63,10 @@ inline __device__ float ComputeIoU(const T &box1, const T &box2) } template -__global__ void NonMaximumSuppression(cuda::Tensor2DWrap inBBoxes, cuda::Tensor2DWrap outMask, - cuda::Tensor2DWrap inScores, int numBBoxes, float scoreThreshold, - float iouThreshold) +__global__ void NonMaximumSuppression(cuda::Tensor2DWrap inBBoxes, + cuda::Tensor2DWrap outMask, + cuda::Tensor2DWrap inScores, int numBBoxes, + float scoreThreshold, float iouThreshold) { const int bboxX = blockDim.x * blockIdx.x + threadIdx.x; if (bboxX >= numBBoxes) @@ -118,9 +119,9 @@ inline __host__ void RunNonMaximumSuppresion(const nvcv::TensorDataStridedCuda & const nvcv::TensorDataStridedCuda &scores, float scThresh, float iouThresh, cudaStream_t stream) { - cuda::Tensor2DWrap inWrap(in); - cuda::Tensor2DWrap outWrap(out); - cuda::Tensor2DWrap scoresWrap(scores); + cuda::Tensor2DWrap inWrap(in); + cuda::Tensor2DWrap outWrap(out); + cuda::Tensor2DWrap scoresWrap(scores); int numSamples = in.shape(0); int numBBoxes = in.shape(1); diff --git a/src/cvcuda/priv/OpRemap.cu b/src/cvcuda/priv/OpRemap.cu index 427c5c593..36488589e 100644 --- a/src/cvcuda/priv/OpRemap.cu +++ b/src/cvcuda/priv/OpRemap.cu @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -147,19 +148,12 @@ __global__ void Remap(SrcWrapper src, DstWrapper dst, MapWrapper map, int2 mapSi // Host run remap functions ---------------------------------------------------- -template +template void RunRemap(cudaStream_t stream, const DataStridedCuda &srcData, const DataStridedCuda &dstData, - const nvcv::TensorDataStridedCuda &mapData, NVCVRemapMapValueType mapValueType, bool alignCorners, - const T &borderValue) + const MapWrapper &mapWrap, NVCVRemapMapValueType mapValueType, bool alignCorners, const T &borderValue, + int2 mapSize, int mapNumSamples) { - auto mapAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(mapData); - int2 mapSize = cuda::StaticCast(long2{mapAccess->numCols(), mapAccess->numRows()}); - int mapNumSamples = mapAccess->numSamples(); - dim3 block(32, 4, 1); - - auto map = cuda::CreateInterpolationWrapNHW(mapData); - if constexpr (std::is_same_v) { auto srcAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(srcData); @@ -172,10 +166,20 @@ void RunRemap(cudaStream_t stream, const DataStridedCuda &srcData, const DataStr dim3 grid(util::DivUp(dstSize.x, block.x), util::DivUp(dstSize.y, block.y), dstAccess->numSamples()); - auto src = cuda::CreateInterpolationWrapNHW(srcData, borderValue); - auto dst = 
cuda::CreateTensorWrapNHW(dstData); - - Remap<<>>(src, dst, map, dstSize, mapNumSamples, params); + int64_t srcMaxStride = srcAccess->sampleStride() * srcAccess->numSamples(); + int64_t dstMaxStride = dstAccess->sampleStride() * dstAccess->numSamples(); + if (std::max(srcMaxStride, dstMaxStride) <= cuda::TypeTraits::max) + { + auto src = cuda::CreateInterpolationWrapNHW(srcData, borderValue); + auto dst = cuda::CreateTensorWrapNHW(dstData); + + Remap<<>>(src, dst, mapWrap, dstSize, mapNumSamples, params); + } + else + { + throw nvcv::Exception(nvcv::Status::ERROR_OVERFLOW, "Input or output size exceeds %d. Tensor is too large.", + cuda::TypeTraits::max); + } } else { @@ -188,7 +192,29 @@ void RunRemap(cudaStream_t stream, const DataStridedCuda &srcData, const DataStr cuda::InterpolationVarShapeWrap src(srcData, borderValue); cuda::ImageBatchVarShapeWrap dst(dstData); - Remap<<>>(src, dst, map, mapSize, mapNumSamples, alignCorners, mapValueType); + Remap<<>>(src, dst, mapWrap, mapSize, mapNumSamples, alignCorners, mapValueType); + } +} + +template +void RunRemap(cudaStream_t stream, const DataStridedCuda &srcData, const DataStridedCuda &dstData, + const nvcv::TensorDataStridedCuda &mapData, NVCVRemapMapValueType mapValueType, bool alignCorners, + const T &borderValue) +{ + auto mapAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(mapData); + int2 mapSize = cuda::StaticCast(long2{mapAccess->numCols(), mapAccess->numRows()}); + int mapNumSamples = mapAccess->numSamples(); + + if (mapAccess->sampleStride() * mapAccess->numSamples() <= cuda::TypeTraits::max) + { + auto map = cuda::CreateInterpolationWrapNHW(mapData); + RunRemap(stream, srcData, dstData, map, mapValueType, alignCorners, borderValue, mapSize, + mapNumSamples); + } + else + { + throw nvcv::Exception(nvcv::Status::ERROR_OVERFLOW, "Map size exceeds %d. Tensor is too large.", + cuda::TypeTraits::max); } } diff --git a/src/cvcuda/priv/OpResize.cpp b/src/cvcuda/priv/OpResize.cpp index f8ee58f2a..f136f7414 100644 --- a/src/cvcuda/priv/OpResize.cpp +++ b/src/cvcuda/priv/OpResize.cpp @@ -29,9 +29,8 @@ namespace legacy = nvcv::legacy::cuda_op; Resize::Resize() { - legacy::DataShape maxIn, maxOut; - // maxIn/maxOut not used by op. - m_legacyOp = std::make_unique(maxIn, maxOut); + legacy::DataShape maxIn, maxOut; // maxIn/maxOut not used by op. + m_legacyOpVarShape = std::make_unique(maxIn, maxOut); } @@ -52,7 +51,7 @@ void Resize::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv: "Output must be cuda-accessible, pitch-linear tensor"); } - NVCV_CHECK_THROW(m_legacyOp->infer(*inData, *outData, interpolation, stream)); + RunResize(stream, *inData, *outData, interpolation); } void Resize::operator()(cudaStream_t stream, const nvcv::ImageBatchVarShape &in, const nvcv::ImageBatchVarShape &out, diff --git a/src/cvcuda/priv/OpResize.cu b/src/cvcuda/priv/OpResize.cu new file mode 100644 index 000000000..6f18839ef --- /dev/null +++ b/src/cvcuda/priv/OpResize.cu @@ -0,0 +1,527 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "OpResize.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +namespace cuda = nvcv::cuda; +namespace util = nvcv::util; + +// Destination pack type given the source and destination type T +template +using DPT = std::conditional_t; + +// Number of items in x written by each thread +template +constexpr int NIX = sizeof(DPT) / sizeof(T); + +// Write a pack of N elements of type T as a different pack type DPT +template +__device__ void WritePack(T &u, const T (&v)[NIX]) +{ + reinterpret_cast &>(u) = reinterpret_cast &>(v); +} + +// Nearest --------------------------------------------------------------------- + +template +inline __device__ void NearestInterpolatePack(T *dstRow, SrcWrapper src, int3 iSrcCoord, float srcCoordX, int srcSizeX, + int dstCoordX, int dstSizeX, float scaleRatioX) +{ + int iPrevCoordX; + T srcPack; + + if (dstCoordX + NIX - 1 < dstSizeX) + { + T dstPack[NIX]; +#pragma unroll + for (int x = 0; x < NIX; ++x) + { + iSrcCoord.x = floor(srcCoordX + x * scaleRatioX); + iSrcCoord.x = cuda::min(iSrcCoord.x, srcSizeX - 1); + + if constexpr (INTERSECT) + { + if (x == 0 || iSrcCoord.x != iPrevCoordX) + { + srcPack = src[iSrcCoord]; + } + + dstPack[x] = srcPack; + + iPrevCoordX = iSrcCoord.x; + } + else + { + dstPack[x] = src[iSrcCoord]; + } + } + + WritePack(dstRow[dstCoordX], dstPack); + } + else + { +#pragma unroll + for (int x = 0; x < NIX; ++x) + { + if (dstCoordX + x < dstSizeX) + { + iSrcCoord.x = floor(srcCoordX + x * scaleRatioX); + iSrcCoord.x = cuda::min(iSrcCoord.x, srcSizeX - 1); + + if constexpr (INTERSECT) + { + if (x == 0 || iSrcCoord.x != iPrevCoordX) + { + srcPack = src[iSrcCoord]; + } + + dstRow[dstCoordX + x] = srcPack; + + iPrevCoordX = iSrcCoord.x; + } + else + { + dstRow[dstCoordX + x] = src[iSrcCoord]; + } + } + } + } +} + +template +__global__ void NearestResize(SrcWrapper src, DstWrapper dst, int2 srcSize, int2 dstSize, float2 scaleRatio) +{ + using T = typename DstWrapper::ValueType; + + int3 dstCoord; + dstCoord.z = blockIdx.z; + dstCoord.y = (blockIdx.y * blockDim.y + threadIdx.y); + + if (dstCoord.y < dstSize.y) + { + dstCoord.x = (blockIdx.x * blockDim.x + threadIdx.x) * NIX; + + float2 srcCoord = (cuda::DropCast<2>(dstCoord) + 0.5f) * scaleRatio; + int3 iSrcCoord{0, (int)floor(srcCoord.y), dstCoord.z}; + + iSrcCoord.y = cuda::min(iSrcCoord.y, srcSize.y - 1); + + T *dstRow = dst.ptr(dstCoord.z, dstCoord.y); + + NearestInterpolatePack(dstRow, src, iSrcCoord, srcCoord.x, srcSize.x, dstCoord.x, dstSize.x, + scaleRatio.x); + } +} + +// Linear ---------------------------------------------------------------------- + +template +inline __device__ void LinearReadPack(SrcWrapper src, T (&srcPack)[4], int3 iSrcCoord) +{ + srcPack[0] = src[int3{iSrcCoord.x, iSrcCoord.y, iSrcCoord.z}]; + srcPack[1] = src[int3{iSrcCoord.x + 1, iSrcCoord.y, iSrcCoord.z}]; + srcPack[2] = src[int3{iSrcCoord.x, iSrcCoord.y + 1, iSrcCoord.z}]; + srcPack[3] = src[int3{iSrcCoord.x + 1, iSrcCoord.y + 1, iSrcCoord.z}]; +} + +template +inline __device__ T 
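In the new nearest-neighbor resize path each thread produces `NIX` horizontally adjacent output pixels and, when the whole group fits inside the row, stores them with a single wider write through `WritePack`; only the ragged right edge falls back to per-pixel stores. A CPU-side illustration of the same idea for single-byte pixels (hypothetical helper names, not the kernel's actual code):

```cpp
#include <cstdint>
#include <cstring>
#include <iostream>

// Pack four single-byte pixels into one 32-bit store: the host-side analogue
// of writing a uchar4 pack per thread instead of four uchar1 stores.
inline void WritePack4(uint8_t *dstRow, int x, const uint8_t (&pack)[4])
{
    uint32_t word;
    std::memcpy(&word, pack, sizeof(word)); // safe equivalent of the reinterpret_cast
    std::memcpy(dstRow + x, &word, sizeof(word));
}

int main()
{
    uint8_t row[8]  = {};
    uint8_t pack[4] = {10, 20, 30, 40};

    int dstX = 0, dstW = 8;
    if (dstX + 4 - 1 < dstW) // the full pack fits: one wide store
        WritePack4(row, dstX, pack);
    else                     // tail of the row: scalar stores guarded by the width
        for (int i = 0; i < 4 && dstX + i < dstW; ++i) row[dstX + i] = pack[i];

    std::cout << int(row[0]) << ' ' << int(row[3]) << "\n"; // prints 10 40
    return 0;
}
```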
LinearInterpolatePack(T *dstRow, SrcWrapper src, int3 iSrcCoord, float srcCoordX, int srcSizeX, + int dstCoordX, int dstSizeX, float scaleRatioX, float2 w) +{ + float sx; + int iPrevCoordX; + T srcPack[4]; + + if (dstCoordX + NIX - 1 < dstSizeX) + { + T dstPack[NIX]; +#pragma unroll + for (int x = 0; x < NIX; ++x) + { + sx = srcCoordX + x * scaleRatioX; + iSrcCoord.x = floor(sx); + + w.x = sx - iSrcCoord.x; + w.x = (iSrcCoord.x < 0 || iSrcCoord.x >= srcSizeX - 1) ? 0 : w.x; + + iSrcCoord.x = cuda::max(0, cuda::min(iSrcCoord.x, srcSizeX - 2)); + + if constexpr (INTERSECT) + { + if (x == 0) + { + LinearReadPack(src, srcPack, iSrcCoord); + } + else + { + if (iSrcCoord.x != iPrevCoordX) + { + if (iSrcCoord.x == (iPrevCoordX + 1)) + { + srcPack[0] = srcPack[1]; + srcPack[2] = srcPack[3]; + srcPack[1] = src[int3{iSrcCoord.x + 1, iSrcCoord.y, iSrcCoord.z}]; + srcPack[3] = src[int3{iSrcCoord.x + 1, iSrcCoord.y + 1, iSrcCoord.z}]; + } + else + { + LinearReadPack(src, srcPack, iSrcCoord); + } + } + } + dstPack[x] + = cuda::SaturateCast(srcPack[0] * ((1.f - w.x) * (1.f - w.y)) + srcPack[1] * (w.x * (1.f - w.y)) + + srcPack[2] * ((1.f - w.x) * w.y) + srcPack[3] * (w.x * w.y)); + + iPrevCoordX = iSrcCoord.x; + } + else + { + dstPack[x] = cuda::SaturateCast( + src[int3{iSrcCoord.x, iSrcCoord.y, iSrcCoord.z}] * ((1.f - w.x) * (1.f - w.y)) + + src[int3{iSrcCoord.x + 1, iSrcCoord.y, iSrcCoord.z}] * (w.x * (1.f - w.y)) + + src[int3{iSrcCoord.x, iSrcCoord.y + 1, iSrcCoord.z}] * ((1.f - w.x) * w.y) + + src[int3{iSrcCoord.x + 1, iSrcCoord.y + 1, iSrcCoord.z}] * (w.x * w.y)); + } + } + + WritePack(dstRow[dstCoordX], dstPack); + } + else + { +#pragma unroll + for (int x = 0; x < NIX; ++x) + { + if (dstCoordX + x < dstSizeX) + { + sx = srcCoordX + x * scaleRatioX; + iSrcCoord.x = floor(sx); + + w.x = sx - iSrcCoord.x; + w.x = (iSrcCoord.x < 0 || iSrcCoord.x >= srcSizeX - 1) ? 
0 : w.x; + + iSrcCoord.x = cuda::max(0, cuda::min(iSrcCoord.x, srcSizeX - 2)); + + if constexpr (INTERSECT) + { + if (x == 0) + { + LinearReadPack(src, srcPack, iSrcCoord); + } + else + { + if (iSrcCoord.x != iPrevCoordX) + { + if (iSrcCoord.x == (iPrevCoordX + 1)) + { + srcPack[0] = srcPack[1]; + srcPack[2] = srcPack[3]; + srcPack[1] = src[int3{iSrcCoord.x + 1, iSrcCoord.y, iSrcCoord.z}]; + srcPack[3] = src[int3{iSrcCoord.x + 1, iSrcCoord.y + 1, iSrcCoord.z}]; + } + else + { + LinearReadPack(src, srcPack, iSrcCoord); + } + } + } + dstRow[dstCoordX + x] = cuda::SaturateCast( + srcPack[0] * ((1.f - w.x) * (1.f - w.y)) + srcPack[1] * (w.x * (1.f - w.y)) + + srcPack[2] * ((1.f - w.x) * w.y) + srcPack[3] * (w.x * w.y)); + + iPrevCoordX = iSrcCoord.x; + } + else + { + dstRow[dstCoordX + x] = cuda::SaturateCast( + src[int3{iSrcCoord.x, iSrcCoord.y, iSrcCoord.z}] * ((1.f - w.x) * (1.f - w.y)) + + src[int3{iSrcCoord.x + 1, iSrcCoord.y, iSrcCoord.z}] * (w.x * (1.f - w.y)) + + src[int3{iSrcCoord.x, iSrcCoord.y + 1, iSrcCoord.z}] * ((1.f - w.x) * w.y) + + src[int3{iSrcCoord.x + 1, iSrcCoord.y + 1, iSrcCoord.z}] * (w.x * w.y)); + } + } + } + } +} + +template +__global__ void LinearResize(SrcWrapper src, DstWrapper dst, int2 srcSize, int2 dstSize, float2 scaleRatio) +{ + using T = typename DstWrapper::ValueType; + + int3 dstCoord; + dstCoord.z = blockIdx.z; + dstCoord.y = (blockIdx.y * blockDim.y + threadIdx.y); + + if (dstCoord.y < dstSize.y) + { + dstCoord.x = (blockIdx.x * blockDim.x + threadIdx.x) * NIX; + + float2 srcCoord = (cuda::DropCast<2>(dstCoord) + .5f) * scaleRatio - .5f; + int3 iSrcCoord{0, (int)floor(srcCoord.y), dstCoord.z}; + + float2 w; + w.y = srcCoord.y - iSrcCoord.y; + + iSrcCoord.y = cuda::max(0, cuda::min(iSrcCoord.y, srcSize.y - 2)); + + T *dstRow = dst.ptr(dstCoord.z, dstCoord.y); + + LinearInterpolatePack(dstRow, src, iSrcCoord, srcCoord.x, srcSize.x, dstCoord.x, dstSize.x, + scaleRatio.x, w); + } +} + +// Cubic ----------------------------------------------------------------------- + +inline __device__ void GetCubicCoeffs(float delta, float &w0, float &w1, float &w2, float &w3) +{ + constexpr float A = -0.75f; + + w0 = ((A * (delta + 1) - 5 * A) * (delta + 1) + 8 * A) * (delta + 1) - 4 * A; + w1 = ((A + 2) * delta - (A + 3)) * delta * delta + 1; + w2 = ((A + 2) * (1 - delta) - (A + 3)) * (1 - delta) * (1 - delta) + 1; + w3 = 1.f - w0 - w1 - w2; +} + +template +__global__ void CubicResize(SrcWrapper src, DstWrapper dst, int2 srcSize, int2 dstSize, float2 scaleRatio) +{ + using T = typename DstWrapper::ValueType; + using FT = nvcv::cuda::ConvertBaseTypeTo; + + int3 dstCoord; + dstCoord.z = blockIdx.z; + dstCoord.y = blockIdx.y * blockDim.y + threadIdx.y; + dstCoord.x = blockIdx.x * blockDim.x + threadIdx.x; + + if (dstCoord.y < dstSize.y && dstCoord.x < dstSize.x) + { + float2 srcCoord = (cuda::DropCast<2>(dstCoord) + .5f) * scaleRatio - .5f; + int3 iSrcCoord{(int)floor(srcCoord.x), (int)floor(srcCoord.y), dstCoord.z}; + + float fx = srcCoord.x - iSrcCoord.x; + float fy = srcCoord.y - iSrcCoord.y; + + fx = (iSrcCoord.x < 1 || iSrcCoord.x >= srcSize.x - 3) ? 
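`GetCubicCoeffs` above evaluates a cubic convolution kernel with A = -0.75 (the value commonly used for bicubic resampling, e.g. by OpenCV) at the four taps around the sample position and defines the last weight as one minus the other three, so the weights always sum to one. A quick host-side check of those polynomials, mirroring the device function shown above:

```cpp
#include <cstdio>

// Same polynomials as GetCubicCoeffs above, evaluated on the host so the
// weights can be inspected; A = -0.75 is the usual bicubic choice.
static void CubicCoeffs(float delta, float &w0, float &w1, float &w2, float &w3)
{
    constexpr float A = -0.75f;
    w0 = ((A * (delta + 1) - 5 * A) * (delta + 1) + 8 * A) * (delta + 1) - 4 * A;
    w1 = ((A + 2) * delta - (A + 3)) * delta * delta + 1;
    w2 = ((A + 2) * (1 - delta) - (A + 3)) * (1 - delta) * (1 - delta) + 1;
    w3 = 1.f - w0 - w1 - w2;
}

int main()
{
    float w0, w1, w2, w3;
    CubicCoeffs(0.5f, w0, w1, w2, w3);
    // Weights of the four taps when the sample lands exactly between two
    // source pixels; they sum to 1 by construction of w3.
    std::printf("%f %f %f %f (sum %f)\n", w0, w1, w2, w3, w0 + w1 + w2 + w3);
    return 0;
}
```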
0 : fx; + + iSrcCoord.y = cuda::max(1, cuda::min(iSrcCoord.y, srcSize.y - 3)); + iSrcCoord.x = cuda::max(1, cuda::min(iSrcCoord.x, srcSize.x - 3)); + + float wx[4]; + float wy[4]; + + GetCubicCoeffs(fx, wx[0], wx[1], wx[2], wx[3]); + GetCubicCoeffs(fy, wy[0], wy[1], wy[2], wy[3]); + + FT sum = FT{}; + +#pragma unroll + for (int cy = -1; cy <= 2; cy++) + { +#pragma unroll + for (int cx = -1; cx <= 2; cx++) + { + sum += src[int3{iSrcCoord.x + cx, iSrcCoord.y + cy, iSrcCoord.z}] * (wx[cx + 1] * wy[cy + 1]); + } + } + + dst[dstCoord] = cuda::SaturateCast(cuda::abs(sum)); + } +} + +// Area ------------------------------------------------------------------------ + +template +__global__ void AreaResize(SrcWrapper src, DstWrapper dst, int2 dstSize) +{ + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; + int z = blockIdx.z; + + if (x >= dstSize.x || y >= dstSize.y) + return; + + int3 coord{x, y, z}; + + dst[coord] = src[cuda::StaticCast(coord)]; +} + +// Host run resize functions --------------------------------------------------- + +template +void RunResizeInterp(cudaStream_t stream, const nvcv::TensorDataStridedCuda &srcData, + const nvcv::TensorDataStridedCuda &dstData, int2 srcSize, int2 dstSize, int batchSize, + const NVCVInterpolationType interpolation) +{ + float2 scaleRatio{(float)srcSize.x / dstSize.x, (float)srcSize.y / dstSize.y}; + + auto srcTW = cuda::CreateTensorWrapNHW(srcData); + auto dstTW = cuda::CreateTensorWrapNHW(dstData); + auto srcIW = cuda::CreateInterpolationWrapNHW( + srcData, T{}, scaleRatio.x, scaleRatio.y); + + dim3 threads1(32, 4, 1); + dim3 blocks1(util::DivUp(dstSize.x, threads1.x * NIX), util::DivUp(dstSize.y, threads1.y), batchSize); + + dim3 threads2(128, 1, 1); + dim3 blocks2(util::DivUp(dstSize.x, threads2.x), util::DivUp(dstSize.y, threads2.y), batchSize); + + switch (interpolation) + { + case NVCV_INTERP_NEAREST: + if (scaleRatio.x < 1) + NearestResize<<>>(srcTW, dstTW, srcSize, dstSize, scaleRatio); + else + NearestResize<<>>(srcTW, dstTW, srcSize, dstSize, scaleRatio); + break; + + case NVCV_INTERP_LINEAR: + if (scaleRatio.x < 2) + LinearResize<<>>(srcTW, dstTW, srcSize, dstSize, scaleRatio); + else + LinearResize<<>>(srcTW, dstTW, srcSize, dstSize, scaleRatio); + break; + + case NVCV_INTERP_CUBIC: + CubicResize<<>>(srcTW, dstTW, srcSize, dstSize, scaleRatio); + break; + + case NVCV_INTERP_AREA: + AreaResize<<>>(srcIW, dstTW, dstSize); + break; + + default: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid interpolation"); + } +} + +inline void RunResizeInterpType(cudaStream_t stream, const nvcv::TensorDataStridedCuda &srcData, + const nvcv::TensorDataStridedCuda &dstData, int2 srcSize, int2 dstSize, int numChannels, + int batchSize, const NVCVInterpolationType interpolation) +{ + // The data type may contain the channels baked in or the number of channels is in the tensor shape + + // clang-format off + +#define CVCUDA_RUN_RESIZE(BT, DT, T) \ + ((srcData.dtype() == nvcv::TYPE_##BT && numChannels == cuda::NumElements) \ + || (srcData.dtype() == nvcv::TYPE_##DT && numChannels == 1)) \ + RunResizeInterp(stream, srcData, dstData, srcSize, dstSize, batchSize, interpolation); + + if CVCUDA_RUN_RESIZE(U8, U8, uchar1) + else if CVCUDA_RUN_RESIZE(U8, 3U8, uchar3) + else if CVCUDA_RUN_RESIZE(U8, 4U8, uchar4) + else if CVCUDA_RUN_RESIZE(U16, U16, ushort) + else if CVCUDA_RUN_RESIZE(U16, 3U16, ushort3) + else if CVCUDA_RUN_RESIZE(U16, 4U16, ushort4) + else if CVCUDA_RUN_RESIZE(S16, S16, short) + else 
if CVCUDA_RUN_RESIZE(S16, 3S16, short3) + else if CVCUDA_RUN_RESIZE(S16, 4S16, short4) + else if CVCUDA_RUN_RESIZE(F32, F32, float) + else if CVCUDA_RUN_RESIZE(F32, 3F32, float3) + else if CVCUDA_RUN_RESIZE(F32, 4F32, float4) + else + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid input data type"); + } + +#undef CVCUDA_RUN_RESIZE + + // clang-format on +} + +} // anonymous namespace + +namespace cvcuda::priv { + +// Tensor operator ------------------------------------------------------------- + +void Resize::RunResize(cudaStream_t stream, const nvcv::TensorDataStridedCuda &srcData, + const nvcv::TensorDataStridedCuda &dstData, const NVCVInterpolationType interpolation) const +{ + if (srcData.dtype() != dstData.dtype()) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input and output data type are different"); + } + if (srcData.layout() != dstData.layout()) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input and output data layout are different"); + } + if (srcData.layout() != nvcv::TENSOR_HWC && srcData.layout() != nvcv::TENSOR_NHWC) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input must have (N)HWC layout"); + } + if (dstData.layout() != nvcv::TENSOR_HWC && dstData.layout() != nvcv::TENSOR_NHWC) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Output must have (N)HWC layout"); + } + + auto srcAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(srcData); + auto dstAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(dstData); + NVCV_ASSERT(srcAccess && dstAccess); + + if (srcAccess->numSamples() != dstAccess->numSamples()) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input and output samples are different"); + } + if (srcAccess->numChannels() != dstAccess->numChannels()) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input and output channels are different"); + } + if (srcAccess->numChannels() > 4 || srcAccess->numChannels() < 1) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid number of channels"); + } + + constexpr int32_t kIntMax = cuda::TypeTraits::max; + + int64_t srcMaxStride = srcAccess->sampleStride() * srcAccess->numSamples(); + int64_t dstMaxStride = dstAccess->sampleStride() * dstAccess->numSamples(); + + if (std::max(srcMaxStride, dstMaxStride) > kIntMax || srcAccess->numSamples() > kIntMax + || srcAccess->numCols() > kIntMax || srcAccess->numRows() > kIntMax || dstAccess->numCols() > kIntMax + || dstAccess->numRows() > kIntMax) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input or output tensors are too large"); + } + + int numChannels{(int)srcAccess->numChannels()}; + int batchSize{(int)srcAccess->numSamples()}; + int2 srcSize{(int)srcAccess->numCols(), (int)srcAccess->numRows()}; + int2 dstSize{(int)dstAccess->numCols(), (int)dstAccess->numRows()}; + + RunResizeInterpType(stream, srcData, dstData, srcSize, dstSize, numChannels, batchSize, interpolation); +} + +} // namespace cvcuda::priv diff --git a/src/cvcuda/priv/OpResize.hpp b/src/cvcuda/priv/OpResize.hpp index 3adcc0b69..eaebaf932 100644 --- a/src/cvcuda/priv/OpResize.hpp +++ b/src/cvcuda/priv/OpResize.hpp @@ -46,7 +46,9 @@ class Resize final : public IOperator const NVCVInterpolationType interpolation) const; private: - std::unique_ptr m_legacyOp; + void RunResize(cudaStream_t stream, const nvcv::TensorDataStridedCuda &srcData, + const nvcv::TensorDataStridedCuda &dstData, const NVCVInterpolationType 
interpolation) const; + std::unique_ptr m_legacyOpVarShape; }; diff --git a/src/cvcuda/priv/OpResizeCropConvertReformat.cu b/src/cvcuda/priv/OpResizeCropConvertReformat.cu index 624624284..571c2d63f 100644 --- a/src/cvcuda/priv/OpResizeCropConvertReformat.cu +++ b/src/cvcuda/priv/OpResizeCropConvertReformat.cu @@ -31,6 +31,7 @@ #include #include +#include // for numeric_limits #include namespace cuda = nvcv::cuda; @@ -41,189 +42,256 @@ namespace helpers = nvcv::legacy::helpers; namespace { -//******************** NN = Nearest Neighbor (TensorWrap src) +// clang-format off -template -__global__ void resizeCrop_NN(SrcWrapper src, DstT *dst, const int src_w, const int src_h, const int dst_w, - const int dst_h, const float scale_x, const float scale_y, const int crop_x, - const int crop_y, const size_t incrN, const size_t incrH, const size_t incrW, - const size_t incrC, const uchar4 mapC) +template +uchar4 remapC(const NVCVChannelManip manip) { - const int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; + static_assert(N > 0 && N <= 4, "Number of remap channels must be >= 1 and <= 4."); - if ((dst_x < dst_w) && (dst_y < dst_h)) - { // Generic copy pixel to pixel. - const int sample = blockIdx.z; + if (manip == NVCV_CHANNEL_REVERSE) + { + if constexpr (N == 1) return uchar4{0, 0, 0, 0}; + if constexpr (N == 2) return uchar4{1, 0, 0, 0}; + if constexpr (N == 3) return uchar4{2, 1, 0, 0}; + if constexpr (N == 4) return uchar4{3, 2, 1, 0}; + } + return uchar4{0, 1, 2, 3}; +} - dst += sample * incrN + dst_y * incrH + dst_x * incrW; +template +class DstMap { +public: + using DstType = DstT; + + static_assert(N > 0 && N <= 4, "Number of DstMap channels must be >= 1 and <= 4."); + + DstMap(DstT *dst, size_t addN, int addH, int addW, size_t addC, + uchar4 mapC, int width, int height) + : m_dst {dst}, + m_addN{addN}, + m_addY{addH}, + m_addX{addW}, + m_wdth{width}, + m_hght{height} {_init(addC, mapC); } + + DstMap(DstT *dst, size_t addN, int addH, int addW, size_t addC, + const NVCVChannelManip manip, int width, int height) + : m_dst {dst}, + m_addN{addN}, + m_addY{addH}, + m_addX{addW}, + m_wdth{width}, + m_hght{height} {_init(addC, remapC(manip)); } + + __host__ __device__ __forceinline__ + int width() const { return m_wdth; } + + __host__ __device__ __forceinline__ + int height() const { return m_hght; } + + __host__ __device__ __forceinline__ + DstT *ptr(const uint n, const int y, const int x) { return m_dst + n * m_addN + (y * m_addY + x * m_addX); } + + template > > + __host__ __device__ __forceinline__ + void operator()(const uint n, const int y, const int x, const SrcT val) + { + static_assert(cuda::NumElements == N); - const int sx = cuda::min(cuda::round((dst_x + crop_x) * scale_x), src_w - 1); - const int sy = cuda::min(cuda::round((dst_y + crop_y) * scale_y), src_h - 1); + // Set destination pointer to correct pixel (batch, row, & column). + DstT *dst = ptr(n, y, x); - SrcT v = *src.ptr(sample, sy, sx); + // Shuffle pixel channels. + if constexpr (cuda::NumComponents > 1) { + dst[m_mapC[0]] = cuda::SaturateCast(val.x); + dst[m_mapC[1]] = cuda::SaturateCast(val.y); + if constexpr (N >= 3) dst[m_mapC[2]] = cuda::SaturateCast(val.z); + if constexpr (N == 4) dst[m_mapC[3]] = cuda::SaturateCast(val.w); + } + else if constexpr (cuda::NumComponents == 1) + *dst = cuda::SaturateCast(val.x); + else *dst = cuda::SaturateCast(val); + } - // Channel manipulation, convert type, and reformat. 
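`remapC` plus the `DstMap` helper replace the old `mapC`/`incrC` pointer arithmetic: each source channel c is scattered to destination slot `mapC[c] * addC`, so `NVCV_CHANNEL_REVERSE` turns interleaved RGB into BGR simply by reversing the index map. A small stand-alone sketch of that scatter (a hypothetical `Manip` enum standing in for `NVCVChannelManip`):

```cpp
#include <array>
#include <cstdio>

enum class Manip { Keep, Reverse }; // stand-in for NVCVChannelManip

// Destination channel order for N interleaved channels: identity, or
// reversed (RGB -> BGR), as remapC does for NVCV_CHANNEL_REVERSE.
template<int N>
std::array<int, N> RemapChannels(Manip manip)
{
    std::array<int, N> map{};
    for (int c = 0; c < N; ++c) map[c] = (manip == Manip::Reverse) ? N - 1 - c : c;
    return map;
}

int main()
{
    auto map = RemapChannels<3>(Manip::Reverse);
    unsigned char srcPixel[3] = {255, 128, 0}; // R, G, B
    unsigned char dstPixel[3];
    for (int c = 0; c < 3; ++c) dstPixel[map[c]] = srcPixel[c]; // scatter into B, G, R slots
    std::printf("%d %d %d\n", dstPixel[0], dstPixel[1], dstPixel[2]); // prints 0 128 255
    return 0;
}
```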
- dst[mapC.x * incrC] = (DstT)v.x; - dst[mapC.y * incrC] = (DstT)v.y; - dst[mapC.z * incrC] = (DstT)v.z; +private: + void _init(size_t addC, uchar4 mapC) + { + m_mapC[0] = mapC.x * addC; + if constexpr (N >= 2) m_mapC[1] = mapC.y * addC; + if constexpr (N >= 3) m_mapC[2] = mapC.z * addC; + if constexpr (N == 4) m_mapC[3] = mapC.w * addC; } -} // resizeCrop_NN -//******************** Bilinear (TensorWrap src) + size_t m_mapC[N]; + size_t m_addN; + int m_addY, m_addX; + int m_wdth, m_hght; + DstT *m_dst; +}; -template -__global__ void resizeCrop_bilinear(SrcWrapper src, DstT *dst, const int src_w, const int src_h, const int dst_w, - const int dst_h, const float scale_x, const float scale_y, const int crop_x, - const int crop_y, const size_t incrN, const size_t incrH, const size_t incrW, - const size_t incrC, const uchar4 mapC) +//******************** Tensor Source ********************// + +//******************** NN = Nearest Neighbor (TensorWrap) +template +__global__ void resizeCrop_NN(DstMap dst, SrcWrapper src, + const float2 resize, const int2 crop, + const float scale, const float offset) { const int dst_x = blockIdx.x * blockDim.x + threadIdx.x; const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x < dst_w && dst_y < dst_h) + if (dst_x < dst.width() && dst_y < dst.height()) { - const int sample = blockIdx.z; - - // Float space for weighted addition. - // Compute y coordinate. - float fy = (float)((dst_y + crop_y + 0.5f) * scale_y - 0.5f); - int sy = cuda::round(fy); - fy -= sy; - sy = cuda::max(0, cuda::min(sy, src_h - 2)); + // Copy nearest pixel to pixel. + // Compute source pixel positions. + const int sx = __float2int_rd((dst_x + crop.x + 0.5f) * resize.x); + const int sy = __float2int_rd((dst_y + crop.y + 0.5f) * resize.y); - // Row pointers. - const SrcT *aPtr = src.ptr(sample, sy, 0); // Start of upper row. - const SrcT *bPtr = src.ptr(sample, sy + 1, 0); // Start of lower row. - - dst += sample * incrN + dst_y * incrH + dst_x * incrW; + // Rescale, channel manipulation, convert type, and reformat. + dst(blockIdx.z, dst_y, dst_x, scale * *src.ptr((int)blockIdx.z, sy, sx) + offset); + } +} // resizeCrop_NN - { // Compute source data position and weight for [x0] components. - float fx = (float)((dst_x + crop_x + 0.5f) * scale_x - 0.5f); - int sx = cuda::round(fx); - fx -= sx; +//******************** Bilinear (TensorWrap; WITH normalization) +template +__global__ void resizeCrop_bilinear(DstMap dst, SrcWrapper src, const int src_w, const int src_h, + const float2 resize, const int2 crop, + const float scale, const float offset, bool src_cast) +{ + using SrcT = typename SrcWrapper::ValueType; - fx *= ((sx >= 0) && (sx < src_w - 1)); - sx = cuda::max(0, cuda::min(sx, src_w - 2)); + const int dst_x = blockIdx.x * blockDim.x + threadIdx.x; + const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - SrcT v = cuda::SaturateCast((1.0f - fx) * (aPtr[sx] * (1.0f - fy) + bPtr[sx] * fy) - + fx * (aPtr[sx + 1] * (1.0f - fy) + bPtr[sx + 1] * fy)); - // Channel manipulation, convert type, and reformat. - dst[mapC.x * incrC] = (DstT)v.x; - dst[mapC.y * incrC] = (DstT)v.y; - dst[mapC.z * incrC] = (DstT)v.z; - } + if (dst_x < dst.width() && dst_y < dst.height()) + { + // Use floating-point space for bi-linear interpolation computation. + // Compute x and y coordinates, source data position, and weights. 
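The rewritten bilinear path computes a half-pixel-centred source coordinate, splits it into an integer base (sx0, sy0) and fractional weights (fx, fy), clamps the neighbours to the image, and then forms the usual 2x2 weighted sum; with p_ij denoting the source pixel at row sy0+i, column sx0+j, the interpolated value is:

```latex
\mathrm{out} = (1-f_y)\bigl[(1-f_x)\,p_{00} + f_x\,p_{01}\bigr]
             + f_y\bigl[(1-f_x)\,p_{10} + f_x\,p_{11}\bigr],
\qquad f_x = s_x - \lfloor s_x \rfloor,\; f_y = s_y - \lfloor s_y \rfloor
```

The result is then rescaled by `scale` and shifted by `offset`, optionally after a saturating cast back to the source type when `src_cast` is set.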
+ float fx = (dst_x + crop.x + 0.5f) * resize.x - 0.5f; + float fy = (dst_y + crop.y + 0.5f) * resize.y - 0.5f; + + int sx0 = __float2int_rd(fx); + int sy0 = __float2int_rd(fy); + int sx1 = cuda::min(sx0 + 1, src_w - 1); + int sy1 = cuda::min(sy0 + 1, src_h - 1); + + fx -= sx0; + fy -= sy0; + sx0 = cuda::max(0, sx0); + sy0 = cuda::max(0, sy0); + sx1 = (sx1 > sx0); + + // Set up source row pointers. + const SrcT *ptr0 = src.ptr((int)blockIdx.z, sy0, sx0); // Pointer in upper row. + const SrcT *ptr1 = src.ptr((int)blockIdx.z, sy1, sx0); // Pointer in lower row. + + // Bi-linear interpolation, rescale, channel manipulation, convert type, and reformat. + if (src_cast) + dst(blockIdx.z, dst_y, dst_x, + scale * cuda::SaturateCast((1-fy) * ((1-fx) * ptr0[0] + ptr0[sx1] * fx) + + fy * ((1-fx) * ptr1[0] + ptr1[sx1] * fx)) + offset); + else + dst(blockIdx.z, dst_y, dst_x, scale * (1-fy) * ((1-fx) * ptr0[0] + ptr0[sx1] * fx) + + fy * ((1-fx) * ptr1[0] + ptr1[sx1] * fx) + offset); } } // resizeCrop_bilinear -//******************** NN = Nearest Neighbor (ImageBatchVarShape src) +//******************** ImageBatchVarShape Source ********************// -template -__global__ void resizeCrop_NN_varShape(SrcWrapper src, DstT *dst, const int dst_w, const int dst_h, - const float resize_w, const float resize_h, const int crop_x, const int crop_y, - const size_t incrN, const size_t incrH, const size_t incrW, const size_t incrC, - const uchar4 mapC) +//******************** NN = Nearest Neighbor (ImageBatchVarShapeWrap) +template +__global__ void resizeCrop_NN_varShape(DstMap dst, SrcWrapper src, + const NVCVSize2D resize, const int2 crop, + float scale, float offset) { const int dst_x = blockIdx.x * blockDim.x + threadIdx.x; const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x < dst_w && dst_y < dst_h) + if (dst_x < dst.width() && dst_y < dst.height()) { // Generic copy pixel to pixel. - const int sample = blockIdx.z; - const int src_w = src.width(sample); - const int src_h = src.height(sample); - - const float scale_x = static_cast(src_w) / resize_w; - const float scale_y = static_cast(src_h) / resize_h; - - dst += sample * incrN + dst_y * incrH + dst_x * incrW; + const int src_w = src.width (blockIdx.z); + const int src_h = src.height(blockIdx.z); - const int sx = cuda::min(cuda::round((dst_x + crop_x) * scale_x), src_w - 1); - const int sy = cuda::min(cuda::round((dst_y + crop_y) * scale_y), src_h - 1); + // Compute scale factors. + const float resize_x = static_cast(src_w) / resize.w; + const float resize_y = static_cast(src_h) / resize.h; - SrcT v = *src.ptr(sample, sy, sx); + // Compute source pixel positions. + const int sx = __float2int_rd((dst_x + crop.x + 0.5f) * resize_x); + const int sy = __float2int_rd((dst_y + crop.y + 0.5f) * resize_y); - // Channel manipulation, convert type, and reformat. - dst[mapC.x * incrC] = (DstT)v.x; - dst[mapC.y * incrC] = (DstT)v.y; - dst[mapC.z * incrC] = (DstT)v.z; + // Rescale, channel manipulation, convert type, and reformat. 
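Because every interpolated value is written as `scale * v + offset`, normalization is fused into the resize-crop instead of requiring a separate pass over the output. Assuming 8-bit input and a float destination, typical parameter choices look like this (a usage sketch, not the operator's public signature):

```cpp
#include <cstdio>

// Fused normalization as applied by the kernels above: out = scale * v + offset.
static float Normalize(unsigned char v, float scale, float offset)
{
    return scale * v + offset;
}

int main()
{
    // Map 8-bit values into [0, 1]:
    std::printf("%.4f\n", Normalize(255, 1.f / 255.f, 0.f)); // 1.0000
    // Map 8-bit values into [-1, 1] (common for DNN front ends):
    std::printf("%.4f\n", Normalize(0, 2.f / 255.f, -1.f));  // -1.0000
    return 0;
}
```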
+ dst(blockIdx.z, dst_y, dst_x, scale * *src.ptr((int)blockIdx.z, sy, sx) + offset); } } // resizeCrop_NN_varShape -//******************** Bilinear (ImageBatchVarShape src) - -template -__global__ void resizeCrop_bilinear_varShape(SrcWrapper src, DstT *dst, const int dst_w, const int dst_h, - const float resize_w, const float resize_h, const int crop_x, - const int crop_y, const size_t incrN, const size_t incrH, - const size_t incrW, const size_t incrC, const uchar4 mapC) +//******************** Bilinear (ImageBatchVarShapeWrap; WITH normalization) +template +__global__ void resizeCrop_bilinear_varShape(DstMap dst, SrcWrapper src, + const NVCVSize2D resize, const int2 crop, + float scale, float offset, bool src_cast) { + using SrcT = typename SrcWrapper::ValueType; + const int dst_x = blockIdx.x * blockDim.x + threadIdx.x; const int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if ((dst_x < dst_w) && (dst_y < dst_h)) + if (dst_x < dst.width() && dst_y < dst.height()) { - const int sample = blockIdx.z; - const int src_w = src.width(sample); - const int src_h = src.height(sample); - - // Float space for weighted addition. - float scale_x = static_cast(src_w) / resize_w; - float scale_y = static_cast(src_h) / resize_h; - - // Compute y coordinate. - float fy = (float)((dst_y + crop_y + 0.5f) * scale_y - 0.5f); - int sy = cuda::round(fy); - fy -= sy; - sy = cuda::max(0, cuda::min(sy, src_h - 2)); - - // Row pointers. - const SrcT *aPtr = src.ptr(sample, sy, 0); // Start of upper row. - const SrcT *bPtr = src.ptr(sample, sy + 1, 0); // Start of lower row. - - dst += sample * incrN + dst_y * incrH + dst_x * incrW; - - { // Cimpute source data position and weight for [x0] components. - float fx = (float)((dst_x + crop_x + 0.5f) * scale_x - 0.5f); - int sx = cuda::round(fx); - fx -= sx; - - fx *= ((sx >= 0) && (sx < src_w - 1)); - sx = cuda::max(0, cuda::min(sx, src_w - 2)); - - SrcT v = cuda::SaturateCast((1.0f - fx) * (aPtr[sx] * (1.0f - fy) + bPtr[sx] * fy) - + fx * (aPtr[sx + 1] * (1.0f - fy) + bPtr[sx + 1] * fy)); - // Channel manipulation, convert type, and reformat. - dst[mapC.x * incrC] = (DstT)v.x; - dst[mapC.y * incrC] = (DstT)v.y; - dst[mapC.z * incrC] = (DstT)v.z; - } + const int src_w = src.width (blockIdx.z); + const int src_h = src.height(blockIdx.z); + + // Compute resize scale factors. + float resize_x = static_cast(src_w) / resize.w; + float resize_y = static_cast(src_h) / resize.h; + + // Use floating-point space for bi-linear interpolation computation. + // Compute x and y coordinates, source data position, and weights. + float fx = (dst_x + crop.x + 0.5f) * resize_x - 0.5f; + float fy = (dst_y + crop.y + 0.5f) * resize_y - 0.5f; + + int sx0 = __float2int_rd(fx); + int sy0 = __float2int_rd(fy); + int sx1 = cuda::min(sx0 + 1, src_w - 1); + int sy1 = cuda::min(sy0 + 1, src_h - 1); + + fx -= sx0; + fy -= sy0; + sx0 = cuda::max(0, sx0); + sy0 = cuda::max(0, sy0); + sx1 = (sx1 > sx0); + + // Set up source row pointers. + const SrcT *ptr0 = src.ptr((int)blockIdx.z, sy0, sx0); // Pointer in upper row. + const SrcT *ptr1 = src.ptr((int)blockIdx.z, sy1, sx0); // Pointer in lower row. + + // Bi-linear interpolation, rescale, channel manipulation, convert type, and reformat. 
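// The src_cast flag selects where rounding happens in the bilinear path. A scalar sketch
// with hypothetical numbers, uint8 source and float destination:
//
//   float v = 0.5f * 10 + 0.5f * 11;                       // interpolated value = 10.5
//   float a = scale * (unsigned char)(v + 0.5f) + offset;  // src_cast == true : 11 is used
//   float b = scale * v + offset;                           // src_cast == false: 10.5 is used
//
// The true path reproduces the earlier kernels, which saturate-cast the interpolant back to
// the source type; the false path keeps sub-integer precision for float outputs.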
+ if (src_cast) + dst(blockIdx.z, dst_y, dst_x, + scale * cuda::SaturateCast((1-fy) * ((1-fx) * ptr0[0] + ptr0[sx1] * fx) + + fy * ((1-fx) * ptr1[0] + ptr1[sx1] * fx)) + offset); + else + dst(blockIdx.z, dst_y, dst_x, scale * (1-fy) * ((1-fx) * ptr0[0] + ptr0[sx1] * fx) + + fy * ((1-fx) * ptr1[0] + ptr1[sx1] * fx) + offset); } } // resizeCrop_bilinear_varShape -#define MAP(m, i, v) ((uint8_t *)&(m))[i] = (v) - -inline uchar4 remapChannels(const NVCVChannelManip manip, int channels) -{ - uchar4 map = make_uchar4(0, 1, 2, 3); - - if (manip == NVCV_CHANNEL_REVERSE) - { - for (int c = 0; c < channels; ++c) MAP(map, c, channels - c - 1); - } - return map; -} - -#undef MAP +// clang-format on template void resizeCropConvertReformat(const nvcv::TensorDataStridedCuda &srcData, const nvcv::TensorDataStridedCuda &dstData, - const NVCVSize2D resizeDim, NVCVInterpolationType interpolation, const int2 cropPos, - const NVCVChannelManip manip, cudaStream_t stream) + const NVCVSize2D resizeDim, NVCVInterpolationType interp, const int2 cropPos, + const NVCVChannelManip manip, float scale, float offset, bool srcCast, + cudaStream_t stream) { + constexpr uint NumElems = cuda::NumElements; + using SrcBaseT = cuda::BaseType; using DstBaseT = cuda::BaseType; + using DstMapT = DstMap; + using StrideT = int32_t; auto srcAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(srcData); NVCV_ASSERT(srcAccess); @@ -241,54 +309,40 @@ void resizeCropConvertReformat(const nvcv::TensorDataStridedCuda &srcData, const NVCV_ASSERT(samples == dstAccess->numSamples()); NVCV_ASSERT(channels == dstAccess->numChannels()); - float scale_x = (float)src_w / resizeDim.w; - float scale_y = (float)src_h / resizeDim.h; + float2 resize{(float)src_w / resizeDim.w, (float)src_h / resizeDim.h}; const int planes = dstAccess->numPlanes(); - const uchar4 remap = remapChannels(manip, channels); + const size_t addC = (planes > 1 ? dstAccess->planeStride() / sizeof(DstBaseT) : 1); + const int addW = channels / planes; // 1 if planar; channels if not. + const int addH = static_cast(dstAccess->rowStride() / sizeof(DstBaseT)); + const size_t addN = dstAccess->rowStride() * dst_h * dstAccess->numPlanes() / sizeof(DstBaseT); - const size_t incrC = (planes > 1 ? dstAccess->planeStride() / sizeof(DstBaseT) : 1); - const size_t incrW = channels / planes; // 1 if planar; channels if not. - const size_t incrH = dstAccess->rowStride() / sizeof(DstBaseT); - const size_t incrN = dstAccess->rowStride() * dst_h * dstAccess->numPlanes() / sizeof(DstBaseT); + DstBaseT *dstPtr = reinterpret_cast(dstData.basePtr()); - const int THREADS_PER_BLOCK = 256; //256? 64? - const int BLOCK_WIDTH = 16; //as in 32x4 or 32x8. 16x8 and 16x16 are also viable + DstMapT dst{dstPtr, addN, addH, addW, addC, manip, dst_w, dst_h}; + + const int THREADS_PER_BLOCK = 256; // 256? 64? + const int BLOCK_WIDTH = 16; // as in 32x4 or 32x8. 
16x8 and 16x16 are also viable const dim3 blockSize(BLOCK_WIDTH, THREADS_PER_BLOCK / BLOCK_WIDTH, 1); const dim3 gridSize(util::DivUp(dst_w, blockSize.x), util::DivUp(dst_h, blockSize.y), samples); - auto src = cuda::CreateTensorWrapNHW(srcData); - - DstBaseT *dst = reinterpret_cast(dstData.basePtr()); - - //Note: resize is fundamentally a gather memory operation, with a little bit of compute - // our goals are to (a) maximize throughput, and (b) minimize occupancy for the same performance + auto src = cuda::CreateTensorWrapNHW(srcData); - switch (interpolation) + // Note: resize is fundamentally a gather memory operation, with a little bit of compute + // our goals are to (a) maximize throughput, and (b) minimize occupancy for the same performance + switch (interp) { case NVCV_INTERP_NEAREST: - resizeCrop_NN<<>>(src, dst, src_w, src_h, dst_w, dst_h, scale_x, scale_y, - cropPos.x, cropPos.y, incrN, incrH, incrW, incrC, remap); + resizeCrop_NN<<>>(dst, src, resize, cropPos, scale, offset); break; case NVCV_INTERP_LINEAR: - resizeCrop_bilinear<<>>(src, dst, src_w, src_h, dst_w, dst_h, scale_x, scale_y, - cropPos.x, cropPos.y, incrN, incrH, incrW, incrC, - remap); - break; - - case NVCV_INTERP_CUBIC: - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Interpolation not implemented: NVCV_INTERP_CUBIC"); + resizeCrop_bilinear<<>>(dst, src, src_w, src_h, resize, cropPos, scale, offset, + srcCast); break; - - case NVCV_INTERP_AREA: - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Interpolation not implemented: NVCV_INTERP_AREA"); - break; - default: - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid interpolation"); break; } //switch } //resize @@ -296,11 +350,14 @@ void resizeCropConvertReformat(const nvcv::TensorDataStridedCuda &srcData, const template void resizeCropConvertReformat(const nvcv::ImageBatchVarShapeDataStridedCuda &srcData, const nvcv::TensorDataStridedCuda &dstData, const NVCVSize2D resizeDim, - const NVCVInterpolationType interpolation, const int2 cropPos, - const NVCVChannelManip manip, cudaStream_t stream) + const NVCVInterpolationType interp, const int2 cropPos, const NVCVChannelManip manip, + float scale, float offset, bool srcCast, cudaStream_t stream) { + constexpr uint NumElems = cuda::NumElements; + using SrcBaseT = cuda::BaseType; using DstBaseT = cuda::BaseType; + using DstMapT = DstMap; auto dstAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(dstData); NVCV_ASSERT(dstAccess); @@ -316,49 +373,38 @@ void resizeCropConvertReformat(const nvcv::ImageBatchVarShapeDataStridedCuda &sr NVCV_ASSERT(samples == dstAccess->numSamples()); NVCV_ASSERT(channels == dstAccess->numChannels()); - const int planes = dstAccess->numPlanes(); - const uchar4 remap = remapChannels(manip, channels); + const int planes = dstAccess->numPlanes(); + + const size_t addC = (planes > 1 ? dstAccess->planeStride() / sizeof(DstBaseT) : 1); + const int addW = channels / planes; // 1 if planar; channels if not. + const int addH = static_cast(dstAccess->rowStride() / sizeof(DstBaseT)); + const size_t addN = dstAccess->rowStride() * dst_h * dstAccess->numPlanes() / sizeof(DstBaseT); + + DstBaseT *dstPtr = reinterpret_cast(dstData.basePtr()); - const size_t incrC = (planes > 1 ? dstAccess->planeStride() / sizeof(DstBaseT) : 1); - const size_t incrW = channels / planes; // 1 if planar; channels if not. 
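// A host-side sketch of the destination addressing that DstMap encapsulates (names are
// hypothetical): one base offset per pixel plus a per-channel offset that already folds in
// the channel permutation (m_mapC[c] = mapC[c] * addC in the class above).
#include <cstddef>

template<typename DstT, int N>
void writePixel(DstT *dst, std::size_t addN, int addH, int addW,
                const std::size_t (&mapC)[N], int sample, int y, int x, const float (&v)[N])
{
    DstT *p = dst + sample * addN + static_cast<std::size_t>(y) * addH
            + static_cast<std::size_t>(x) * addW;
    for (int c = 0; c < N; ++c)
        p[mapC[c]] = static_cast<DstT>(v[c]);
}
// For interleaved HWC output with NVCV_CHANNEL_REVERSE (RGB <-> BGR): addW = 3, addC = 1,
// mapC = {2, 1, 0}; for planar output: addW = 1 and addC = planeStride / sizeof(DstT).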
- const size_t incrH = dstAccess->rowStride() / sizeof(DstBaseT); - const size_t incrN = dstAccess->rowStride() * dst_h * dstAccess->numPlanes() / sizeof(DstBaseT); + DstMapT dst{dstPtr, addN, addH, addW, addC, manip, dst_w, dst_h}; - const int THREADS_PER_BLOCK = 256; //Performance degrades above 256 and below 16 (GMEM speed limited) - const int BLOCK_WIDTH = 8; //as in 32x4 or 32x8 or 8x32. + const int THREADS_PER_BLOCK = 256; // Performance degrades above 256 and below 16 (GMEM speed limited) + const int BLOCK_WIDTH = 8; // as in 32x4 or 32x8 or 8x32. const dim3 blockSize(BLOCK_WIDTH, THREADS_PER_BLOCK / BLOCK_WIDTH, 1); const dim3 gridSize(util::DivUp(dst_w, blockSize.x), util::DivUp(dst_h, blockSize.y), samples); cuda::ImageBatchVarShapeWrap src(srcData); - DstBaseT *dst = reinterpret_cast(dstData.basePtr()); - - switch (interpolation) + switch (interp) { case NVCV_INTERP_NEAREST: - resizeCrop_NN_varShape<<>>( - src, dst, dst_w, dst_h, resizeDim.w, resizeDim.h, cropPos.x, cropPos.y, incrN, incrH, incrW, incrC, remap); + resizeCrop_NN_varShape<<>>(dst, src, resizeDim, cropPos, scale, offset); break; case NVCV_INTERP_LINEAR: - resizeCrop_bilinear_varShape<<>>( - src, dst, dst_w, dst_h, resizeDim.w, resizeDim.h, cropPos.x, cropPos.y, incrN, incrH, incrW, incrC, remap); - break; - - case NVCV_INTERP_CUBIC: - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Interpolation not implemented: NVCV_INTERP_CUBIC"); + resizeCrop_bilinear_varShape<<>>(dst, src, resizeDim, cropPos, scale, offset, + srcCast); break; - - case NVCV_INTERP_AREA: - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Interpolation not implemented: NVCV_INTERP_AREA"); - break; - default: - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid interpolation"); break; - - } //switch interpolation + } // switch } } // anonymous namespace @@ -370,8 +416,9 @@ ResizeCropConvertReformat::ResizeCropConvertReformat() { } // clang-format on void ResizeCropConvertReformat::operator()(cudaStream_t stream, const nvcv::Tensor &src, const nvcv::Tensor &dst, - const NVCVSize2D resizeDim, const NVCVInterpolationType interpolation, - const int2 cropPos, const NVCVChannelManip manip) const + const NVCVSize2D resizeDim, const NVCVInterpolationType interp, + const int2 cropPos, const NVCVChannelManip manip, float scale, float offset, + bool srcCast) const { auto srcData = src.exportData(); if (!srcData) @@ -450,6 +497,13 @@ void ResizeCropConvertReformat::operator()(cudaStream_t stream, const nvcv::Tens throw nvcv::Exception(nvcv::Status::ERROR_NOT_COMPATIBLE, "%s", msg.c_str()); } + if (resizeDim.w <= 1 || resizeDim.h <= 1) + { + std::string msg = "Invalid resize dimensions: width x hight = " + std::to_string(resizeDim.w) + " x " + + std::to_string(resizeDim.h) + " dimensions must be larger than 1."; + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "%s", msg.c_str()); + } + if (cropPos.x < 0 || cropPos.y < 0 || cropPos.x + dst_w > resizeDim.w || cropPos.y + dst_h > resizeDim.h) { std::string msg = "Invalid crop region: crop region(x, y, w, h) = (" + std::to_string(cropPos.x) + ", " @@ -458,25 +512,51 @@ void ResizeCropConvertReformat::operator()(cudaStream_t stream, const nvcv::Tens throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "%s", msg.c_str()); } + if (srcAccess->sampleStride() * samples > cuda::TypeTraits::max) + { + throw nvcv::Exception(nvcv::Status::ERROR_OVERFLOW, "Input size exceeds %d. 
Tensor is too large.", + cuda::TypeTraits::max); + } + + if (interp != NVCV_INTERP_NEAREST && interp != NVCV_INTERP_LINEAR) + { + switch (interp) + { + case NVCV_INTERP_CUBIC: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Interpolation not implemented: NVCV_INTERP_CUBIC"); + break; + + case NVCV_INTERP_AREA: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Interpolation not implemented: NVCV_INTERP_AREA"); + break; + + default: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid interpolation"); + break; + } // switch + } + if (srcType == cuda_op::kCV_8U) { if (dstType == cuda_op::kCV_8U) { - resizeCropConvertReformat(*srcData, *dstData, resizeDim, interpolation, cropPos, manip, - stream); + resizeCropConvertReformat(*srcData, *dstData, resizeDim, interp, cropPos, manip, scale, + offset, srcCast, stream); } else if (dstType == cuda_op::kCV_32F) { - resizeCropConvertReformat(*srcData, *dstData, resizeDim, interpolation, cropPos, manip, - stream); + resizeCropConvertReformat(*srcData, *dstData, resizeDim, interp, cropPos, manip, scale, + offset, srcCast, stream); } } } void ResizeCropConvertReformat::operator()(cudaStream_t stream, const nvcv::ImageBatchVarShape &src, const nvcv::Tensor &dst, const NVCVSize2D resizeDim, - const NVCVInterpolationType interpolation, const int2 cropPos, - const NVCVChannelManip manip) const + const NVCVInterpolationType interp, const int2 cropPos, + const NVCVChannelManip manip, float scale, float offset, bool srcCast) const { auto srcData = src.exportData(stream); if (!srcData) @@ -559,7 +639,7 @@ void ResizeCropConvertReformat::operator()(cudaStream_t stream, const nvcv::Imag throw nvcv::Exception(nvcv::Status::ERROR_NOT_COMPATIBLE, "%s", msg.c_str()); } - if (cropPos.x < 0 || cropPos.y < 0 || cropPos.x + dst_w > resizeDim.w || cropPos.y + dst_h > resizeDim.h) + if (cropPos.x < 0 || cropPos.y < 0 || cropPos.x + dst_w > abs(resizeDim.w) || cropPos.y + dst_h > abs(resizeDim.h)) { std::string msg = "Invalid crop region: crop region(x, y, w, h) = (" + std::to_string(cropPos.x) + ", " + std::to_string(cropPos.y) + ", " + std::to_string(dst_w) + ", " + std::to_string(dst_h) @@ -567,17 +647,37 @@ void ResizeCropConvertReformat::operator()(cudaStream_t stream, const nvcv::Imag throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "%s", msg.c_str()); } + if (interp != NVCV_INTERP_NEAREST && interp != NVCV_INTERP_LINEAR) + { + switch (interp) + { + case NVCV_INTERP_CUBIC: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Interpolation not implemented: NVCV_INTERP_CUBIC"); + break; + + case NVCV_INTERP_AREA: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Interpolation not implemented: NVCV_INTERP_AREA"); + break; + + default: + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid interpolation"); + break; + } // switch + } + if (srcType == cuda_op::kCV_8U) { if (dstType == cuda_op::kCV_8U) { - resizeCropConvertReformat(*srcData, *dstData, resizeDim, interpolation, cropPos, manip, - stream); + resizeCropConvertReformat(*srcData, *dstData, resizeDim, interp, cropPos, manip, scale, + offset, srcCast, stream); } else if (dstType == cuda_op::kCV_32F) { - resizeCropConvertReformat(*srcData, *dstData, resizeDim, interpolation, cropPos, manip, - stream); + resizeCropConvertReformat(*srcData, *dstData, resizeDim, interp, cropPos, manip, scale, + offset, srcCast, stream); } } } diff --git a/src/cvcuda/priv/OpResizeCropConvertReformat.hpp 
b/src/cvcuda/priv/OpResizeCropConvertReformat.hpp index eea8d7df5..88a6eaafa 100644 --- a/src/cvcuda/priv/OpResizeCropConvertReformat.hpp +++ b/src/cvcuda/priv/OpResizeCropConvertReformat.hpp @@ -42,11 +42,13 @@ class ResizeCropConvertReformat final : public IOperator void operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, const NVCVSize2D resizeDim, const NVCVInterpolationType interpolation, const int2 cropPos, - const NVCVChannelManip manip = NVCV_CHANNEL_NO_OP) const; + const NVCVChannelManip manip = NVCV_CHANNEL_NO_OP, const float scale = 1, const float offset = 0, + const bool srcCast = true) const; void operator()(cudaStream_t stream, const nvcv::ImageBatchVarShape &in, const nvcv::Tensor &out, const NVCVSize2D resizeDim, const NVCVInterpolationType interpolation, const int2 cropPos, - const NVCVChannelManip manip = NVCV_CHANNEL_NO_OP) const; + const NVCVChannelManip manip = NVCV_CHANNEL_NO_OP, const float scale = 1, const float offset = 0, + const bool srcCast = true) const; }; } // namespace cvcuda::priv diff --git a/src/cvcuda/priv/OpSIFT.cu b/src/cvcuda/priv/OpSIFT.cu index cfe2499b7..ec988b1e2 100644 --- a/src/cvcuda/priv/OpSIFT.cu +++ b/src/cvcuda/priv/OpSIFT.cu @@ -80,14 +80,14 @@ constexpr int kDescHistTotalBins = (kDescWidth + 2) * (kDescWidth + 2) * (kDescH // Tensor wrap for LNHWC tensors with C inside its type template -using TensorWrapLNHW = cuda::TensorWrap; +using TensorWrapLNHW = cuda::TensorWrap32; // Border wrap for LNHWC tensors using the corresponding tensor wrap template using BorderWrapLNHW = cuda::BorderWrap, kBorderGauss, false, false, true, true>; // Tensor wrap for descriptor has 2 compile-time strides: per 128B descriptor and per 1B within descriptor -using TensorWrapForDescriptor = cuda::TensorWrap; +using TensorWrapForDescriptor = cuda::TensorWrap32; // CPU functions --------------------------------------------------------------- @@ -189,8 +189,8 @@ __global__ void DoComputePyramids(BorderWrapLNHW prevGauss, TensorWrapLNH __shared__ float gaussOutData[SH * BW]; // plain 1D Gaussian output (intermediary) data in SMEM // Using TensorWrap with compile-time strides for easy multi-dimensional access of Gaussian data in SMEM - cuda::TensorWrap gaussIn(&gaussInData[0]); - cuda::TensorWrap gaussOut(&gaussOutData[0]); + cuda::TensorWrap32 gaussIn(&gaussInData[0]); + cuda::TensorWrap32 gaussOut(&gaussOutData[0]); int half = currKernelSize / 2; // i.e. 
the halo or support data outside block to compute Gaussian filter @@ -316,8 +316,10 @@ __forceinline__ __device__ int4 adj(const int4 ¢er, int dLayer, int dRow, in } // Compute descriptors, using the Gaussian pyramid, the previously computed angle and feature radius and coordinates -__global__ void DoComputeDescriptors(TensorWrapForDescriptor featDescriptors, cuda::Tensor2DWrap featCoords, - cuda::Tensor2DWrap featMetadata, cuda::Tensor1DWrap numFeatures, +__global__ void DoComputeDescriptors(TensorWrapForDescriptor featDescriptors, + cuda::Tensor2DWrap featCoords, + cuda::Tensor2DWrap featMetadata, + cuda::Tensor1DWrap numFeatures, TensorWrapLNHW currGauss, int3 currShape, int featOctave, float unscaleOctave) { @@ -334,7 +336,7 @@ __global__ void DoComputeDescriptors(TensorWrapForDescriptor featDescriptors, cu __shared__ float histogram[kDescHistTotalBins]; // Histogram with intermediary output for descriptors // Using TensorWrap with compile-time strides for easy multi-dimensional access of Gaussian data in SMEM - cuda::TensorWrap gaussIn(&gaussInData[0]); + cuda::TensorWrap32 gaussIn(&gaussInData[0]); int featIdx = blockIdx.x; // each block thru x computes one feature descriptor int sampleIdx = blockIdx.z; // each block thru z computes one image sample @@ -556,11 +558,12 @@ __global__ void DoComputeDescriptors(TensorWrapForDescriptor featDescriptors, cu // Find extrema (feature coordinates + metadata) using Gaussian + DoG (Difference of Gaussians) pyramids template -__global__ void DoFindExtrema(cuda::Tensor2DWrap featCoords, cuda::Tensor2DWrap featMetadata, - int maxCapacity, cuda::Tensor1DWrap numFeatures, - TensorWrapLNHW currGauss, TensorWrapLNHW currDoG, - int3 currShape, int featOctave, float scaleOctave, int numOctaveLayers, int thr, - float contrastThreshold, float edgeThreshold, float initSigma) +__global__ void DoFindExtrema(cuda::Tensor2DWrap featCoords, + cuda::Tensor2DWrap featMetadata, int maxCapacity, + cuda::Tensor1DWrap numFeatures, TensorWrapLNHW currGauss, + TensorWrapLNHW currDoG, int3 currShape, int featOctave, float scaleOctave, + int numOctaveLayers, int thr, float contrastThreshold, float edgeThreshold, + float initSigma) { constexpr float kImageScale = 1.f / cuda::TypeTraits
::max; // source images data type scale constexpr float kDScale1 = kImageScale * .5f; // first derivative scale @@ -884,10 +887,10 @@ void SIFT::FindExtrema(const nvcv::TensorDataStridedCuda &featCoordsData, dim3 compBlocks1; dim3 compBlocks2(maxCapacity, 1, currShape.z); - cuda::Tensor2DWrap featCoordsWrap(featCoordsData.basePtr(), (int)featCoordsData.stride(0)); - cuda::Tensor2DWrap featMetadataWrap(featMetadataData.basePtr(), (int)featMetadataData.stride(0)); - cuda::Tensor1DWrap numFeaturesWrap(numFeaturesData.basePtr()); - TensorWrapForDescriptor featDescriptorsWrap(featDescriptorsData.basePtr(), (int)featDescriptorsData.stride(0)); + cuda::Tensor2DWrap featCoordsWrap(featCoordsData.basePtr(), (int)featCoordsData.stride(0)); + cuda::Tensor2DWrap featMetadataWrap(featMetadataData.basePtr(), (int)featMetadataData.stride(0)); + cuda::Tensor1DWrap numFeaturesWrap(numFeaturesData.basePtr()); + TensorWrapForDescriptor featDescriptorsWrap(featDescriptorsData.basePtr(), (int)featDescriptorsData.stride(0)); // Initially set to zero the number of features for each image within source tensor, currShape.z = # of images NVCV_CHECK_THROW(cudaMemsetAsync(numFeaturesData.basePtr(), 0, sizeof(int) * currShape.z, stream)); @@ -958,13 +961,13 @@ void SIFT::ComputePyramids(const nvcv::TensorDataStridedCuda &inData, int3 currS if (expandInput) { - auto srcBaseWrap = cuda::CreateInterpolationWrapNHW(inData); + auto srcBaseWrap = cuda::CreateInterpolationWrapNHW(inData); UpCopy<<>>(dstBaseWrap, srcBaseWrap, currShape); // upscale copy } else { - auto srcBaseWrap = cuda::CreateTensorWrapNHW(inData); + auto srcBaseWrap = cuda::CreateTensorWrapNHW(inData); Copy<<>>(dstBaseWrap, srcBaseWrap, currShape); // direct copy } @@ -1210,6 +1213,12 @@ void SIFT::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::T throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input tensor must have 1 channel and 1 plane"); } + if (inAccess->sampleStride() * inAccess->numSamples() > cuda::TypeTraits::max) + { + throw nvcv::Exception(nvcv::Status::ERROR_OVERFLOW, "Input size exceeds %d. Tensor is too large.", + cuda::TypeTraits::max); + } + int3 inShape{(int)inAccess->numCols(), (int)inAccess->numRows(), (int)inAccess->numSamples()}; bool expandInput = (flags == NVCV_SIFT_USE_EXPANDED_INPUT); diff --git a/src/cvcuda/priv/legacy/CMakeLists.txt b/src/cvcuda/priv/legacy/CMakeLists.txt index 53bed6c41..d05db4a9e 100644 --- a/src/cvcuda/priv/legacy/CMakeLists.txt +++ b/src/cvcuda/priv/legacy/CMakeLists.txt @@ -19,7 +19,6 @@ set(CV_CUDA_PRIV_LEGACY_OP_FILES filter_utils.cu custom_crop.cu reformat.cu - resize.cu resize_var_shape.cu convert_to.cu normalize.cu @@ -53,7 +52,6 @@ set(CV_CUDA_PRIV_LEGACY_OP_FILES composite_var_shape.cu custom_crop.cu reformat.cu - resize.cu resize_var_shape.cu convert_to.cu normalize.cu diff --git a/src/cvcuda/priv/legacy/CvCudaLegacy.h b/src/cvcuda/priv/legacy/CvCudaLegacy.h index 5d2f42c33..657e608f1 100644 --- a/src/cvcuda/priv/legacy/CvCudaLegacy.h +++ b/src/cvcuda/priv/legacy/CvCudaLegacy.h @@ -652,74 +652,6 @@ class Reformat : public CudaBaseOp void checkDataFormat(DataFormat format); }; -class Resize : public CudaBaseOp -{ -public: - Resize() = delete; - - Resize(DataShape max_input_shape, DataShape max_output_shape) - : CudaBaseOp(max_input_shape, max_output_shape) - { - } - - /** - * @brief Resizes the input images. This class resizes the images down to or up to the specified size. 
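// Several operators in this change now guard against 32-bit stride overflow before building
// their int32-stride tensor wraps. A standalone restatement of the check (function name is
// hypothetical):
#include <cstdint>
#include <limits>

inline bool fitsInt32Strides(std::int64_t sampleStride, std::int64_t numSamples)
{
    // The wraps address memory with int32_t offsets, so the largest per-tensor byte span
    // must stay within INT32_MAX; larger tensors are rejected rather than silently wrapping.
    return sampleStride * numSamples <= std::numeric_limits<std::int32_t>::max();
}
// Public C++ operators throw nvcv::Status::ERROR_OVERFLOW; the legacy cuda_op paths log the
// failure and return ErrorCode::INVALID_PARAMETER.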
- * - * - * Limitations: - * - * - * Input: - * Data Layout: [kNHWC, kHWC] - * Channels: [1, 3, 4] - * - * Data Type | Allowed - * -------------- | ------------- - * 8bit Unsigned | Yes - * 8bit Signed | No - * 16bit Unsigned | Yes - * 16bit Signed | Yes - * 32bit Unsigned | No - * 32bit Signed | No - * 32bit Float | Yes - * 64bit Float | No - * - * Output: - * Data Layout: [kNHWC, kHWC] - * Channels: [1, 3, 4] - * - * Data Type | Allowed - * -------------- | ------------- - * 8bit Unsigned | Yes - * 8bit Signed | No - * 16bit Unsigned | Yes - * 16bit Signed | Yes - * 32bit Unsigned | No - * 32bit Signed | No - * 32bit Float | Yes - * 64bit Float | No - * - * Input/Output dependency - * - * Property | Input == Output - * -------------- | ------------- - * Data Layout | Yes - * Data Type | Yes - * Number | Yes - * Channels | Yes - * Width | No - * Height | No - * - * @param [in] inData input tensor. - * @param [out] outData Output tensor. - * @param [in] interpolation Interpolation method. See \ref NVCVInterpolationType for more details. - * @param [in] stream Stream for the asynchronous execution. - * - */ - ErrorCode infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, - const NVCVInterpolationType interpolation, cudaStream_t stream); -}; - class Morphology : public CudaBaseOp { public: diff --git a/src/cvcuda/priv/legacy/adaptive_threshold.cu b/src/cvcuda/priv/legacy/adaptive_threshold.cu index 56bdb1263..455404655 100644 --- a/src/cvcuda/priv/legacy/adaptive_threshold.cu +++ b/src/cvcuda/priv/legacy/adaptive_threshold.cu @@ -126,30 +126,46 @@ __global__ void adaptive_threshold(SrcWrapper src, DstWrapper dst, Size2D dstSiz } template -void adaptive_threshold_caller(const TensorDataStridedCuda &in, const TensorDataStridedCuda &out, const uchar maxValue, - KernelWrapper kernel, const int blockSize, const int idelta, cudaStream_t stream) +ErrorCode adaptive_threshold_caller(const TensorDataStridedCuda &in, const TensorDataStridedCuda &out, + const uchar maxValue, KernelWrapper kernel, const int blockSize, const int idelta, + cudaStream_t stream) { auto outAccess = TensorDataAccessStridedImagePlanar::Create(out); NVCV_ASSERT(outAccess); - Size2D dstSize{outAccess->numCols(), outAccess->numRows()}; + auto inAccess = TensorDataAccessStridedImagePlanar::Create(in); + NVCV_ASSERT(inAccess); - auto src = cuda::CreateBorderWrapNHW(in, cuda::SetAll(0.f)); - auto dst = cuda::CreateTensorWrapNHW(out); + Size2D dstSize{outAccess->numCols(), outAccess->numRows()}; dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); dim3 grid(divUp(dstSize.w, BLOCK_DIM_X * X_STEPS), divUp(dstSize.h, block.y), outAccess->numSamples()); int s_mem_size = (blockSize - 1 + BLOCK_DIM_X * X_STEPS) * (blockSize - 1 + BLOCK_DIM_Y) + blockSize * blockSize * sizeof(float); - adaptive_threshold - <<>>(src, dst, dstSize, maxValue, kernel, blockSize, idelta); + + int64_t inMaxStride = inAccess->sampleStride() * inAccess->numSamples(); + int64_t outMaxStride = outAccess->sampleStride() * outAccess->numSamples(); + if (std::max(inMaxStride, outMaxStride) <= cuda::TypeTraits::max) + { + auto src = cuda::CreateBorderWrapNHW(in, cuda::SetAll(0.f)); + auto dst = cuda::CreateTensorWrapNHW(out); + + adaptive_threshold + <<>>(src, dst, dstSize, maxValue, kernel, blockSize, idelta); + } + else + { + LOG_ERROR("Input or output size exceeds " << cuda::TypeTraits::max << ". 
Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; + } checkKernelErrors(); #ifdef CUDA_DEBUG_LOG checkCudaErrors(cudaStreamSynchronize(stream)); checkCudaErrors(cudaGetLastError()); #endif + return ErrorCode::SUCCESS; } AdaptiveThreshold::AdaptiveThreshold(DataShape maxInputShape, DataShape maxOutputShape, int32_t maxBlockSize) @@ -158,7 +174,7 @@ AdaptiveThreshold::AdaptiveThreshold(DataShape maxInputShape, DataShape maxOutpu if (maxBlockSize <= 0) { LOG_ERROR("Invalid num of max block size " << maxBlockSize); - throw std::runtime_error("Parameter error!"); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "maxBlockSize must be >= 0"); } size_t bufferSize = maxBlockSize * maxBlockSize * sizeof(float); NVCV_CHECK_THROW(cudaMalloc(&m_kernel, bufferSize)); @@ -265,16 +281,14 @@ ErrorCode AdaptiveThreshold::infer(const TensorDataStridedCuda &in, const Tensor int idelta = thresholdType == NVCV_THRESH_BINARY ? (int)std::ceil(c) : (int)std::floor(c); if (thresholdType == NVCV_THRESH_BINARY) { - adaptive_threshold_caller>(in, out, imaxval, kernelPtr, blockSize, - idelta, stream); + return adaptive_threshold_caller>(in, out, imaxval, kernelPtr, + blockSize, idelta, stream); } else { - adaptive_threshold_caller>(in, out, imaxval, kernelPtr, - blockSize, idelta, stream); + return adaptive_threshold_caller>(in, out, imaxval, kernelPtr, + blockSize, idelta, stream); } - - return ErrorCode::SUCCESS; } } // namespace nvcv::legacy::cuda_op diff --git a/src/cvcuda/priv/legacy/adaptive_threshold_var_shape.cu b/src/cvcuda/priv/legacy/adaptive_threshold_var_shape.cu index 8f372ea1d..092af389e 100644 --- a/src/cvcuda/priv/legacy/adaptive_threshold_var_shape.cu +++ b/src/cvcuda/priv/legacy/adaptive_threshold_var_shape.cu @@ -53,9 +53,10 @@ struct MyLessEqual }; template -__global__ void adaptive_threshold(const SrcWrapper src, DstWrapper dst, cuda::Tensor1DWrap maxValueArr, - cuda::Tensor1DWrap blockSizeArr, cuda::Tensor1DWrap cArr, - cuda::Tensor3DWrap kernel) +__global__ void adaptive_threshold(const SrcWrapper src, DstWrapper dst, + cuda::Tensor1DWrap maxValueArr, + cuda::Tensor1DWrap blockSizeArr, + cuda::Tensor1DWrap cArr, cuda::Tensor3DWrap kernel) { const int batch_idx = get_batch_idx(); int out_x = blockIdx.x * BLOCK_DIM_X * X_STEPS; @@ -138,10 +139,11 @@ __global__ void adaptive_threshold(const SrcWrapper src, DstWrapper dst, cuda::T template void adaptive_threshold_caller(const ImageBatchVarShapeDataStridedCuda &in, - const ImageBatchVarShapeDataStridedCuda &out, cuda::Tensor1DWrap maxValueArr, + const ImageBatchVarShapeDataStridedCuda &out, + cuda::Tensor1DWrap maxValueArr, NVCVAdaptiveThresholdType adaptiveMethod, NVCVThresholdType thresholdType, - cuda::Tensor1DWrap blockSizeArr, cuda::Tensor1DWrap cArr, - cuda::Tensor3DWrap kernel, int maxBlockSize, cudaStream_t stream) + cuda::Tensor1DWrap blockSizeArr, cuda::Tensor1DWrap cArr, + cuda::Tensor3DWrap kernel, int maxBlockSize, cudaStream_t stream) { float borderValue = .0f; cuda::BorderVarShapeWrap src(in, cuda::SetAll(borderValue)); @@ -181,7 +183,7 @@ AdaptiveThresholdVarShape::AdaptiveThresholdVarShape(DataShape maxInputShape, Da if (maxBlockSize <= 0) { LOG_ERROR("Invalid num of max block size " << maxBlockSize); - throw std::runtime_error("Parameter error!"); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "maxBlockSize must be >= 0"); } if (maxVarShapeBatchSize > 0) { @@ -221,7 +223,7 @@ ErrorCode AdaptiveThresholdVarShape::infer(const ImageBatchVarShapeDataStridedCu if (!(format == kNHWC || format == 
kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } @@ -265,15 +267,15 @@ ErrorCode AdaptiveThresholdVarShape::infer(const ImageBatchVarShapeDataStridedCu return ErrorCode::INVALID_PARAMETER; } - cuda::Tensor1DWrap maxValueArr(maxValue); - cuda::Tensor1DWrap blockSizeArr(blockSize); - cuda::Tensor1DWrap cArr(c); + cuda::Tensor1DWrap maxValueArr(maxValue); + cuda::Tensor1DWrap blockSizeArr(blockSize); + cuda::Tensor1DWrap cArr(c); int kernelPitch2 = static_cast(m_maxBlockSize * sizeof(float)); int kernelPitch1 = m_maxBlockSize * kernelPitch2; - float *kernelPtr = (float *)m_kernel; - cuda::Tensor3DWrap kernelTensor(kernelPtr, kernelPitch1, kernelPitch2); + float *kernelPtr = (float *)m_kernel; + cuda::Tensor3DWrap kernelTensor(kernelPtr, kernelPitch1, kernelPitch2); dim3 block(32, 4); dim3 grid(divUp(m_maxBlockSize, block.x), divUp(m_maxBlockSize, block.y), out.numImages()); diff --git a/src/cvcuda/priv/legacy/bilateral_filter.cu b/src/cvcuda/priv/legacy/bilateral_filter.cu index 0fc6a54f2..0bf2e8338 100644 --- a/src/cvcuda/priv/legacy/bilateral_filter.cu +++ b/src/cvcuda/priv/legacy/bilateral_filter.cu @@ -25,6 +25,8 @@ #include "CvCudaUtils.cuh" +#include + using namespace nvcv::legacy::cuda_op; using namespace nvcv::legacy::helpers; @@ -160,16 +162,16 @@ __global__ void BilateralFilterKernel(SrcWrapper src, DstWrapper dst, const int } } -template -void BilateralFilterCaller(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, const int batch, - int rows, int columns, int radius, float sigmaColor, float sigmaSpace, float borderValue, - cudaStream_t stream) +template +void BilateralFilterCallerS(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, const int batch, + int rows, int columns, int radius, float sigmaColor, float sigmaSpace, float borderValue, + cudaStream_t stream) { dim3 block(8, 8); dim3 grid(divUp(columns, block.x * 2), divUp(rows, block.y * 2), batch); - auto src = cuda::CreateBorderWrapNHW(inData, cuda::SetAll(borderValue)); - auto dst = cuda::CreateTensorWrapNHW(outData); + auto src = cuda::CreateBorderWrapNHW(inData, cuda::SetAll(borderValue)); + auto dst = cuda::CreateTensorWrapNHW(outData); #ifdef CUDA_DEBUG_LOG checkCudaErrors(cudaStreamSynchronize(stream)); @@ -184,6 +186,25 @@ void BilateralFilterCaller(const TensorDataStridedCuda &inData, const TensorData #endif } +template +ErrorCode BilateralFilterCaller(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + const int batch, int rows, int columns, int radius, float sigmaColor, float sigmaSpace, + float borderValue, cudaStream_t stream) +{ + auto inAccess = TensorDataAccessStridedImagePlanar::Create(inData); + if (inAccess->sampleStride() * inAccess->numSamples() <= cuda::TypeTraits::max) + { + BilateralFilterCallerS(inData, outData, batch, rows, columns, radius, sigmaColor, sigmaSpace, + borderValue, stream); + } + else + { + LOG_ERROR("Input size exceeds " << cuda::TypeTraits::max << ". 
Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; + } + return ErrorCode::SUCCESS; +} + ErrorCode BilateralFilter::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, int d, float sigmaColor, float sigmaSpace, NVCVBorderType borderMode, cudaStream_t stream) { @@ -266,9 +287,9 @@ ErrorCode BilateralFilter::infer(const TensorDataStridedCuda &inData, const Tens float borderValue = .0f; - typedef void (*bilateral_filter_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, - int batch, int rows, int columns, int radius, float sigmaColor, float sigmaSpace, - float borderValue, cudaStream_t stream); + typedef ErrorCode (*bilateral_filter_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + int batch, int rows, int columns, int radius, float sigmaColor, + float sigmaSpace, float borderValue, cudaStream_t stream); // All templated functions instantiated here to remove one level of indirection that just hides the same lookup // table in 5 parts @@ -359,9 +380,8 @@ ErrorCode BilateralFilter::infer(const TensorDataStridedCuda &inData, const Tens BilateralFilterCaller}, }, }; - funcs[borderMode][data_type][channels - 1](inData, outData, batch, rows, columns, radius, sigmaColor, sigmaSpace, - borderValue, stream); - return ErrorCode::SUCCESS; + return funcs[borderMode][data_type][channels - 1](inData, outData, batch, rows, columns, radius, sigmaColor, + sigmaSpace, borderValue, stream); } } // namespace nvcv::legacy::cuda_op diff --git a/src/cvcuda/priv/legacy/box_blur.cu b/src/cvcuda/priv/legacy/box_blur.cu index aa3637d4e..5dda7ff0a 100644 --- a/src/cvcuda/priv/legacy/box_blur.cu +++ b/src/cvcuda/priv/legacy/box_blur.cu @@ -216,6 +216,33 @@ static void cuosd_apply(cuOSDContext_t context, cudaStream_t stream) } } +template +inline void RenderBlur_RGB(SrcWrap src, DstWrap dst, const cuda_op::DataShape &inputShape, cuOSDContext_t context, + cudaStream_t stream) +{ + if (src.ptr(0) != dst.ptr(0)) + { + dim3 blockSize(32, 32); + dim3 gridSize(divUp(int(inputShape.W + 1), (int)blockSize.x), divUp(int(inputShape.H + 1), (int)blockSize.y), + inputShape.N); + + render_p2p_kernel<<>>(src, dst, inputShape.N, inputShape.H, inputShape.W, + inputShape.C); + checkKernelErrors(); + } + + if (context->blur_commands.size() > 0) + { + dim3 blockSize(32, 32); + dim3 gridSize(context->blur_commands.size(), 1); + + render_blur_rgb_kernel<<>>( + src, dst, context->gpu_blur_commands ? 
context->gpu_blur_commands->device() : nullptr, + context->blur_commands.size(), inputShape.N, inputShape.W, inputShape.H); + checkKernelErrors(); + } +} + inline ErrorCode ApplyBoxBlur_RGB(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, cuOSDContext_t context, cudaStream_t stream) { @@ -245,10 +272,29 @@ inline ErrorCode ApplyBoxBlur_RGB(const nvcv::TensorDataStridedCuda &inData, con cuosd_apply(context, stream); - auto src = nvcv::cuda::CreateTensorWrapNHWC(inData); - auto dst = nvcv::cuda::CreateTensorWrapNHWC(outData); + int64_t srcMaxStride = inAccess->sampleStride() * inAccess->numSamples(); + int64_t dstMaxStride = outAccess->sampleStride() * outAccess->numSamples(); + + if (std::max(srcMaxStride, dstMaxStride) <= cuda::TypeTraits::max) + { + auto src = nvcv::cuda::CreateTensorWrapNHWC(inData); + auto dst = nvcv::cuda::CreateTensorWrapNHWC(outData); + + RenderBlur_RGB(src, dst, inputShape, context, stream); + return ErrorCode::SUCCESS; + } + else + { + LOG_ERROR("Input or output size exceeds " << cuda::TypeTraits::max << ". Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; + } +} - if (inData.basePtr() != outData.basePtr()) +template +inline void RenderBlur_RGBA(SrcWrap src, DstWrap dst, const cuda_op::DataShape &inputShape, cuOSDContext_t context, + cudaStream_t stream) +{ + if (src.ptr(0) != dst.ptr(0)) { dim3 blockSize(32, 32); dim3 gridSize(divUp(int(inputShape.W + 1), (int)blockSize.x), divUp(int(inputShape.H + 1), (int)blockSize.y), @@ -264,12 +310,11 @@ inline ErrorCode ApplyBoxBlur_RGB(const nvcv::TensorDataStridedCuda &inData, con dim3 blockSize(32, 32); dim3 gridSize(context->blur_commands.size(), 1); - render_blur_rgb_kernel<<>>( + render_blur_rgba_kernel<<>>( src, dst, context->gpu_blur_commands ? context->gpu_blur_commands->device() : nullptr, context->blur_commands.size(), inputShape.N, inputShape.W, inputShape.H); checkKernelErrors(); } - return ErrorCode::SUCCESS; } inline ErrorCode ApplyBoxBlur_RGBA(const nvcv::TensorDataStridedCuda &inData, @@ -302,31 +347,22 @@ inline ErrorCode ApplyBoxBlur_RGBA(const nvcv::TensorDataStridedCuda &inData, cuosd_apply(context, stream); - auto src = nvcv::cuda::CreateTensorWrapNHWC(inData); - auto dst = nvcv::cuda::CreateTensorWrapNHWC(outData); + int64_t srcMaxStride = inAccess->sampleStride() * inAccess->numSamples(); + int64_t dstMaxStride = outAccess->sampleStride() * outAccess->numSamples(); - if (inData.basePtr() != outData.basePtr()) + if (std::max(srcMaxStride, dstMaxStride) <= cuda::TypeTraits::max) { - dim3 blockSize(32, 32); - dim3 gridSize(divUp(int(inputShape.W + 1), (int)blockSize.x), divUp(int(inputShape.H + 1), (int)blockSize.y), - inputShape.N); + auto src = nvcv::cuda::CreateTensorWrapNHWC(inData); + auto dst = nvcv::cuda::CreateTensorWrapNHWC(outData); - render_p2p_kernel<<>>(src, dst, inputShape.N, inputShape.H, inputShape.W, - inputShape.C); - checkKernelErrors(); + RenderBlur_RGBA(src, dst, inputShape, context, stream); + return ErrorCode::SUCCESS; } - - if (context->blur_commands.size() > 0) + else { - dim3 blockSize(32, 32); - dim3 gridSize(context->blur_commands.size(), 1); - - render_blur_rgba_kernel<<>>( - src, dst, context->gpu_blur_commands ? context->gpu_blur_commands->device() : nullptr, - context->blur_commands.size(), inputShape.N, inputShape.W, inputShape.H); - checkKernelErrors(); + LOG_ERROR("Input or output size exceeds " << cuda::TypeTraits::max << ". 
Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; } - return ErrorCode::SUCCESS; } static ErrorCode cuosd_draw_boxblur(cuOSDContext_t context, int width, int height, NVCVBlurBoxesImpl *bboxes) diff --git a/src/cvcuda/priv/legacy/center_crop.cu b/src/cvcuda/priv/legacy/center_crop.cu index aa8e6542c..668a80d6a 100644 --- a/src/cvcuda/priv/legacy/center_crop.cu +++ b/src/cvcuda/priv/legacy/center_crop.cu @@ -50,17 +50,20 @@ template void center_crop(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, int crop_rows, int crop_columns, const int batch_size, const int rows, const int columns, cudaStream_t stream) { + using StrideType = int32_t; + int top_indices = (rows - crop_rows) / 2; int left_indices = (columns - crop_columns) / 2; dim3 blockSize(BLOCK, BLOCK / 4, 1); dim3 gridSize(divUp(crop_columns, (int)blockSize.x), divUp(crop_rows, (int)blockSize.y), batch_size); - auto src = nvcv::cuda::CreateTensorWrapNHW(inData); - auto dst = nvcv::cuda::CreateTensorWrapNHW(outData); + auto src = nvcv::cuda::CreateTensorWrapNHW(inData); + auto dst = nvcv::cuda::CreateTensorWrapNHW(outData); center_crop_kernel_nhwc<<>>(src, dst, left_indices, top_indices, crop_rows, crop_columns); + checkKernelErrors(); #ifdef CUDA_DEBUG_LOG checkCudaErrors(cudaDeviceSynchronize()); @@ -115,6 +118,14 @@ ErrorCode CenterCrop::infer(const TensorDataStridedCuda &inData, const TensorDat return ErrorCode::INVALID_DATA_SHAPE; } + int64_t inMaxStride = inAccess->sampleStride() * inAccess->numSamples(); + int64_t outMaxStride = outAccess->sampleStride() * outAccess->numSamples(); + if (std::max(inMaxStride, outMaxStride) > nvcv::cuda::TypeTraits::max) + { + LOG_ERROR("Input or output size exceeds " << nvcv::cuda::TypeTraits::max << ". Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; + } + typedef void (*func_t)(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, int crop_rows, int crop_columns, const int batch_size, const int rows, const int columns, cudaStream_t stream); diff --git a/src/cvcuda/priv/legacy/composite.cu b/src/cvcuda/priv/legacy/composite.cu index 1c0ddffb6..4679c1a11 100644 --- a/src/cvcuda/priv/legacy/composite.cu +++ b/src/cvcuda/priv/legacy/composite.cu @@ -115,7 +115,7 @@ ErrorCode Composite::infer(const TensorDataStridedCuda &foreground, const Tensor if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid foreground DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } diff --git a/src/cvcuda/priv/legacy/composite_var_shape.cu b/src/cvcuda/priv/legacy/composite_var_shape.cu index d5a10ad64..c4ce78a64 100644 --- a/src/cvcuda/priv/legacy/composite_var_shape.cu +++ b/src/cvcuda/priv/legacy/composite_var_shape.cu @@ -120,7 +120,7 @@ ErrorCode CompositeVarShape::infer(const ImageBatchVarShapeDataStridedCuda &fore if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid foreground DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } diff --git a/src/cvcuda/priv/legacy/convert_to.cu b/src/cvcuda/priv/legacy/convert_to.cu index 5e510d04a..607934a89 100644 --- a/src/cvcuda/priv/legacy/convert_to.cu +++ b/src/cvcuda/priv/legacy/convert_to.cu @@ -59,12 +59,15 @@ __global__ void convertFormat(SrcWrapper src, DstWrapper dst, UnOp op, int2 size } template -void convertToScaleCN(const 
nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, - const double alpha, const double beta, cudaStream_t stream) +ErrorCode convertToScaleCN(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, + const double alpha, const double beta, cudaStream_t stream) { auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(inData); NVCV_ASSERT(inAccess); + auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(outData); + NVCV_ASSERT(outAccess); + const int2 size = {inAccess->numCols(), inAccess->numRows()}; const int batch_size = inAccess->numSamples(); @@ -77,39 +80,47 @@ void convertToScaleCN(const nvcv::TensorDataStridedCuda &inData, const nvcv::Ten Convertor op; - auto src = nvcv::cuda::CreateTensorWrapNHW(inData); - auto dst = nvcv::cuda::CreateTensorWrapNHW(outData); - op.alpha = nvcv::cuda::SaturateCast(alpha); op.beta = nvcv::cuda::SaturateCast(beta); - convertFormat<<>>(src, dst, op, size); + + auto outMaxStride = outAccess->sampleStride() * outAccess->numSamples(); + auto inMaxStride = inAccess->sampleStride() * inAccess->numSamples(); + if (std::max(outMaxStride, inMaxStride) <= nvcv::cuda::TypeTraits::max) + { + auto src = nvcv::cuda::CreateTensorWrapNHW(inData); + auto dst = nvcv::cuda::CreateTensorWrapNHW(outData); + + convertFormat<<>>(src, dst, op, size); + } + else + { + LOG_ERROR("Input or output size exceeds " << nvcv::cuda::TypeTraits::max << ". Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; + } + return ErrorCode::SUCCESS; } template // -void convertToScale(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, - int numChannels, const double alpha, const double beta, cudaStream_t stream) +ErrorCode convertToScale(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, + int numChannels, const double alpha, const double beta, cudaStream_t stream) { switch (numChannels) { case 1: - convertToScaleCN(inData, outData, alpha, beta, stream); - break; + return convertToScaleCN(inData, outData, alpha, beta, stream); case 2: - convertToScaleCN(inData, outData, alpha, beta, stream); - break; + return convertToScaleCN(inData, outData, alpha, beta, stream); case 3: - convertToScaleCN(inData, outData, alpha, beta, stream); - break; + return convertToScaleCN(inData, outData, alpha, beta, stream); case 4: - convertToScaleCN(inData, outData, alpha, beta, stream); - break; + return convertToScaleCN(inData, outData, alpha, beta, stream); default: LOG_ERROR("Unknown number of channels"); - return; + return ErrorCode::INVALID_PARAMETER; } #ifdef CUDA_DEBUG_LOG @@ -163,8 +174,8 @@ ErrorCode ConvertTo::infer(const TensorDataStridedCuda &inData, const TensorData return ErrorCode::INVALID_DATA_TYPE; } - typedef void (*func_t)(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, - int numChannels, const double alpha, const double beta, cudaStream_t stream); + typedef ErrorCode (*func_t)(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, + int numChannels, const double alpha, const double beta, cudaStream_t stream); // clang-format off static const func_t funcs[7][7] = { @@ -179,9 +190,7 @@ ErrorCode ConvertTo::infer(const TensorDataStridedCuda &inData, const TensorData // clang-format on const func_t func = funcs[input_datatype][output_datatype]; - func(inData, outData, channels, alpha, beta, stream); - - return ErrorCode::SUCCESS; + return func(inData, outData, channels, 
alpha, beta, stream); } } // namespace nvcv::legacy::cuda_op diff --git a/src/cvcuda/priv/legacy/copy_make_border.cu b/src/cvcuda/priv/legacy/copy_make_border.cu index 256f79edb..15e3153a6 100644 --- a/src/cvcuda/priv/legacy/copy_make_border.cu +++ b/src/cvcuda/priv/legacy/copy_make_border.cu @@ -42,33 +42,50 @@ __global__ void copyMakeBorderKernel(SrcWrapper src, DstWrapper dst, int2 dstSiz template struct copyMakeBorderDispatcher { - static void call(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, const T &borderValue, - const int left, const int top, cudaStream_t stream) + static ErrorCode call(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + const T &borderValue, const int left, const int top, cudaStream_t stream) { - auto src = cuda::CreateBorderWrapNHW(inData, borderValue); - auto dst = cuda::CreateTensorWrapNHW(outData); - auto outAccess = TensorDataAccessStridedImagePlanar::Create(outData); NVCV_ASSERT(outAccess); + auto inAccess = TensorDataAccessStridedImagePlanar::Create(inData); + NVCV_ASSERT(inAccess); + int2 dstSize{outAccess->numCols(), outAccess->numRows()}; dim3 blockSize(BLOCK, BLOCK / 4, 1); dim3 gridSize(divUp(dstSize.x, blockSize.x), divUp(dstSize.y, blockSize.y), outAccess->numSamples()); - copyMakeBorderKernel<<>>(src, dst, dstSize, left, top); + int64_t srcMaxStride = inAccess->sampleStride() * inAccess->numSamples(); + int64_t dstMaxStride = outAccess->sampleStride() * outAccess->numSamples(); + + if (std::max(srcMaxStride, dstMaxStride) <= cuda::TypeTraits::max) + { + auto src = cuda::CreateBorderWrapNHW(inData, borderValue); + auto dst = cuda::CreateTensorWrapNHW(outData); + + copyMakeBorderKernel<<>>(src, dst, dstSize, left, top); + } + else + { + LOG_ERROR("Input or output size exceeds " << cuda::TypeTraits::max << ". 
Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; + } + checkKernelErrors(); + return ErrorCode::SUCCESS; } }; template // uchar3 float3 uchar float -void copyMakeBorder(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, const int top, - const int left, const NVCVBorderType border_type, const float4 &borderValue, cudaStream_t stream) +ErrorCode copyMakeBorder(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, const int top, + const int left, const NVCVBorderType border_type, const float4 &borderValue, + cudaStream_t stream) { const T bvalue = cuda::DropCast>(cuda::StaticCast>(borderValue)); - typedef void (*func_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, - const T &borderValue, const int left, const int top, cudaStream_t stream); + typedef ErrorCode (*func_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + const T &borderValue, const int left, const int top, cudaStream_t stream); static const func_t funcs[] = {copyMakeBorderDispatcher::call, @@ -76,7 +93,7 @@ void copyMakeBorder(const TensorDataStridedCuda &inData, const TensorDataStrided copyMakeBorderDispatcher::call, copyMakeBorderDispatcher::call, copyMakeBorderDispatcher::call}; - funcs[border_type](inData, outData, bvalue, left, top, stream); + return funcs[border_type](inData, outData, bvalue, left, top, stream); } ErrorCode CopyMakeBorder::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, @@ -94,7 +111,7 @@ ErrorCode CopyMakeBorder::infer(const TensorDataStridedCuda &inData, const Tenso if (!(input_format == kNHWC || input_format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << input_format); + LOG_ERROR("Invalid input DataFormat " << input_format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } @@ -148,9 +165,9 @@ ErrorCode CopyMakeBorder::infer(const TensorDataStridedCuda &inData, const Tenso return ErrorCode::INVALID_PARAMETER; } - typedef void (*func_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, const int top, - const int left, const NVCVBorderType border_type, const float4 &borderValue, - cudaStream_t stream); + typedef ErrorCode (*func_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + const int top, const int left, const NVCVBorderType border_type, + const float4 &borderValue, cudaStream_t stream); // clang-format off static const func_t funcs[6][4] = { @@ -166,9 +183,7 @@ ErrorCode CopyMakeBorder::infer(const TensorDataStridedCuda &inData, const Tenso const func_t func = funcs[data_type][channels - 1]; NVCV_ASSERT(func != 0); - func(inData, outData, top, left, border_type, borderValue, stream); - - return SUCCESS; + return func(inData, outData, top, left, border_type, borderValue, stream); } } // namespace nvcv::legacy::cuda_op diff --git a/src/cvcuda/priv/legacy/copy_make_border_var_shape.cu b/src/cvcuda/priv/legacy/copy_make_border_var_shape.cu index 28242a05a..c0323fdf3 100644 --- a/src/cvcuda/priv/legacy/copy_make_border_var_shape.cu +++ b/src/cvcuda/priv/legacy/copy_make_border_var_shape.cu @@ -33,8 +33,8 @@ namespace nvcv::legacy::cuda_op { namespace { template -__global__ void copyMakeBorderKernel(const SrcWrapper src, DstWrapper dst, const cuda::Tensor3DWrap left_, - const cuda::Tensor3DWrap top_, int out_height, int out_width) +__global__ void copyMakeBorderKernel(const SrcWrapper src, DstWrapper dst, const cuda::Tensor3DWrap left_, + const 
cuda::Tensor3DWrap top_, int out_height, int out_width) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -53,8 +53,8 @@ __global__ void copyMakeBorderKernel(const SrcWrapper src, DstWrapper dst, const } template -__global__ void copyMakeBorderKernel(const SrcWrapper src, DstWrapper dst, const cuda::Tensor3DWrap left_, - const cuda::Tensor3DWrap top_) +__global__ void copyMakeBorderKernel(const SrcWrapper src, DstWrapper dst, const cuda::Tensor3DWrap left_, + const cuda::Tensor3DWrap top_) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -78,8 +78,8 @@ template struct copyMakeBorderDispatcher { static void call(const ImageBatchVarShapeDataStridedCuda &src, cuda::Tensor3DWrap dst, const T &borderValue, - const cuda::Tensor3DWrap &left, const cuda::Tensor3DWrap &top, int max_height, - int max_width, cudaStream_t stream) + const cuda::Tensor3DWrap &left, const cuda::Tensor3DWrap &top, + int max_height, int max_width, cudaStream_t stream) { dim3 blockSize(BLOCK, BLOCK / 4, 1); dim3 gridSize(divUp(max_width, blockSize.x), divUp(max_height, blockSize.y), src.numImages()); @@ -96,8 +96,8 @@ struct copyMakeBorderDispatcher } static void call(const ImageBatchVarShapeDataStridedCuda &src, cuda::ImageBatchVarShapeWrap dst, - const T &borderValue, const cuda::Tensor3DWrap &left, const cuda::Tensor3DWrap &top, - int max_height, int max_width, cudaStream_t stream) + const T &borderValue, const cuda::Tensor3DWrap &left, + const cuda::Tensor3DWrap &top, int max_height, int max_width, cudaStream_t stream) { dim3 blockSize(BLOCK, BLOCK / 4, 1); dim3 gridSize(divUp(max_width, blockSize.x), divUp(max_height, blockSize.y), src.numImages()); @@ -124,8 +124,8 @@ void copyMakeBorder(const ImageBatchVarShapeDataStridedCuda &inData, const OutTy #pragma unroll for (int i = 0; i < cn; i++) cuda::GetElement(brdVal, i) = cuda::GetElement(value, i); - cuda::Tensor3DWrap topVec(top); - cuda::Tensor3DWrap leftVec(left); + cuda::Tensor3DWrap topVec(top); + cuda::Tensor3DWrap leftVec(left); auto outSize = GetMaxImageSize(outData); @@ -136,8 +136,8 @@ void copyMakeBorder(const ImageBatchVarShapeDataStridedCuda &inData, const OutTy out_type dstWrap(outData); typedef void (*func_t)(const ImageBatchVarShapeDataStridedCuda &src, out_type dst, const src_type &borderValue, - const cuda::Tensor3DWrap &left, const cuda::Tensor3DWrap &top, int max_height, - int max_width, cudaStream_t stream); + const cuda::Tensor3DWrap &left, const cuda::Tensor3DWrap &top, + int max_height, int max_width, cudaStream_t stream); static const func_t funcs[] = {copyMakeBorderDispatcher::call, copyMakeBorderDispatcher::call, @@ -168,7 +168,7 @@ ErrorCode CopyMakeBorderVarShape::inferWarp(const ImageBatchVarShapeDataStridedC auto format = input_format; if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } diff --git a/src/cvcuda/priv/legacy/custom_crop.cu b/src/cvcuda/priv/legacy/custom_crop.cu index 3695007a3..14d0aea44 100644 --- a/src/cvcuda/priv/legacy/custom_crop.cu +++ b/src/cvcuda/priv/legacy/custom_crop.cu @@ -46,20 +46,34 @@ __global__ void custom_crop_kernel(const SrcWrapper src, DstWrapper dst, int sta } template -void customCrop(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, NVCVRectI roi, - cudaStream_t 
stream) +ErrorCode customCrop(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, + NVCVRectI roi, cudaStream_t stream) { auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(outData); NVCV_ASSERT(outAccess); - auto src = nvcv::cuda::CreateTensorWrapNHW(inData); - auto dst = nvcv::cuda::CreateTensorWrapNHW(outData); + auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(inData); + NVCV_ASSERT(inAccess); dim3 block(16, 16); dim3 grid(divUp(roi.width, block.x), divUp(roi.height, block.y), outAccess->numSamples()); - custom_crop_kernel<<>>(src, dst, roi.x, roi.y, roi.width, roi.height); + auto outMaxStride = outAccess->sampleStride() * outAccess->numSamples(); + auto inMaxStride = inAccess->sampleStride() * inAccess->numSamples(); + if (std::max(outMaxStride, inMaxStride) <= nvcv::cuda::TypeTraits::max) + { + auto src = nvcv::cuda::CreateTensorWrapNHW(inData); + auto dst = nvcv::cuda::CreateTensorWrapNHW(outData); + + custom_crop_kernel<<>>(src, dst, roi.x, roi.y, roi.width, roi.height); + } + else + { + LOG_ERROR("Input or output size exceeds " << nvcv::cuda::TypeTraits::max << ". Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; + } checkKernelErrors(); + return ErrorCode::SUCCESS; } namespace nvcv::legacy::cuda_op { @@ -128,8 +142,8 @@ ErrorCode CustomCrop::infer(const TensorDataStridedCuda &inData, const TensorDat return ErrorCode::INVALID_PARAMETER; } - typedef void (*func_t)(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, - NVCVRectI roi, cudaStream_t stream); + typedef ErrorCode (*func_t)(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, + NVCVRectI roi, cudaStream_t stream); static const func_t funcs[6][4] = { {customCrop, customCrop, customCrop, customCrop}, @@ -139,9 +153,7 @@ ErrorCode CustomCrop::infer(const TensorDataStridedCuda &inData, const TensorDat {customCrop, customCrop, customCrop, customCrop} }; - funcs[data_size / 2][channels - 1](inData, outData, roi, stream); - - return ErrorCode::SUCCESS; + return funcs[data_size / 2][channels - 1](inData, outData, roi, stream); } } // namespace nvcv::legacy::cuda_op diff --git a/src/cvcuda/priv/legacy/cvt_color.cu b/src/cvcuda/priv/legacy/cvt_color.cu index 8b794c631..2abb235f1 100644 --- a/src/cvcuda/priv/legacy/cvt_color.cu +++ b/src/cvcuda/priv/legacy/cvt_color.cu @@ -1510,7 +1510,7 @@ ErrorCode CvtColor::infer(const TensorDataStridedCuda &inData, const TensorDataS if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } diff --git a/src/cvcuda/priv/legacy/cvt_color_var_shape.cu b/src/cvcuda/priv/legacy/cvt_color_var_shape.cu index 0d469ca71..8113fe43a 100644 --- a/src/cvcuda/priv/legacy/cvt_color_var_shape.cu +++ b/src/cvcuda/priv/legacy/cvt_color_var_shape.cu @@ -1662,7 +1662,7 @@ ErrorCode CvtColorVarShape::infer(const ImageBatchVarShapeDataStridedCuda &inDat if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } diff --git a/src/cvcuda/priv/legacy/erase.cu b/src/cvcuda/priv/legacy/erase.cu index 4dfe8a55f..336440a8e 100644 --- a/src/cvcuda/priv/legacy/erase.cu +++ b/src/cvcuda/priv/legacy/erase.cu @@ -118,7 +118,7 @@ 
Erase::Erase(DataShape max_input_shape, DataShape max_output_shape, int num_eras { cudaFree(d_max_values); LOG_ERROR("Invalid num of erasing area" << max_num_erasing_area); - throw std::runtime_error("Parameter error!"); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "max_num_erasing_area must be >= 0"); } temp_storage = NULL; storage_bytes = 0; @@ -153,12 +153,19 @@ ErrorCode Erase::infer(const TensorDataStridedCuda &inData, const TensorDataStri const TensorDataStridedCuda &values, const TensorDataStridedCuda &imgIdx, bool random, unsigned int seed, bool inplace, cudaStream_t stream) { - DataFormat format = GetLegacyDataFormat(inData.layout()); - DataType data_type = GetLegacyDataType(inData.dtype()); + DataFormat format = GetLegacyDataFormat(inData.layout()); + DataFormat out_format = GetLegacyDataFormat(outData.layout()); + DataType data_type = GetLegacyDataType(inData.dtype()); + DataType out_data_type = GetLegacyDataType(outData.dtype()); if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); + return ErrorCode::INVALID_DATA_FORMAT; + } + if (!(out_format == kNHWC || out_format == kHWC)) + { + LOG_ERROR("Invalid output DataFormat " << out_format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } @@ -168,6 +175,11 @@ ErrorCode Erase::infer(const TensorDataStridedCuda &inData, const TensorDataStri LOG_ERROR("Invalid DataType " << data_type); return ErrorCode::INVALID_DATA_TYPE; } + if (data_type != out_data_type) + { + LOG_ERROR("DataType of input and output must be equal, but got " << data_type << " and " << out_data_type); + return ErrorCode::INVALID_DATA_TYPE; + } DataType anchor_data_type = GetLegacyDataType(anchor.dtype()); if (anchor_data_type != kCV_32S) diff --git a/src/cvcuda/priv/legacy/erase_var_shape.cu b/src/cvcuda/priv/legacy/erase_var_shape.cu index 518b95454..e08c405c8 100644 --- a/src/cvcuda/priv/legacy/erase_var_shape.cu +++ b/src/cvcuda/priv/legacy/erase_var_shape.cu @@ -118,7 +118,7 @@ EraseVarShape::EraseVarShape(DataShape max_input_shape, DataShape max_output_sha { cudaFree(d_max_values); LOG_ERROR("Invalid num of erasing area" << max_num_erasing_area); - throw std::runtime_error("Parameter error!"); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "max_num_erasing_area must be >= 0"); } temp_storage = NULL; storage_bytes = 0; @@ -164,10 +164,16 @@ ErrorCode EraseVarShape::infer(const nvcv::ImageBatchVarShape &inbatch, const nv LOG_ERROR("Output must be varshape image batch"); } - DataFormat format = helpers::GetLegacyDataFormat(*inData); + DataFormat format = helpers::GetLegacyDataFormat(*inData); + DataFormat out_format = helpers::GetLegacyDataFormat(*outData); if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); + return ErrorCode::INVALID_DATA_FORMAT; + } + if (!(out_format == kNHWC || out_format == kHWC)) + { + LOG_ERROR("Invalid input DataFormat " << out_format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } @@ -176,14 +182,25 @@ ErrorCode EraseVarShape::infer(const nvcv::ImageBatchVarShape &inbatch, const nv LOG_ERROR("Images in input batch must all have the same format "); return ErrorCode::INVALID_DATA_FORMAT; } + if (!outData->uniqueFormat()) + { + 
LOG_ERROR("Images in output batch must all have the same format "); + return ErrorCode::INVALID_DATA_FORMAT; + } - DataType data_type = helpers::GetLegacyDataType(inData->uniqueFormat()); + DataType data_type = helpers::GetLegacyDataType(inData->uniqueFormat()); + DataType out_data_type = GetLegacyDataType(outData->uniqueFormat()); if (!(data_type == kCV_8U || data_type == kCV_16U || data_type == kCV_16S || data_type == kCV_32S || data_type == kCV_32F)) { LOG_ERROR("Invalid DataType " << data_type); return ErrorCode::INVALID_DATA_TYPE; } + if (data_type != out_data_type) + { + LOG_ERROR("DataType of input and output must be equal, but got " << data_type << " and " << out_data_type); + return ErrorCode::INVALID_DATA_TYPE; + } DataType anchor_data_type = GetLegacyDataType(anchor.dtype()); if (anchor_data_type != kCV_32S) diff --git a/src/cvcuda/priv/legacy/filter.cu b/src/cvcuda/priv/legacy/filter.cu index 3ec059bec..643b5dbe4 100644 --- a/src/cvcuda/priv/legacy/filter.cu +++ b/src/cvcuda/priv/legacy/filter.cu @@ -26,6 +26,8 @@ #include "CvCudaUtils.cuh" #include "filter_utils.cuh" +#include + using namespace nvcv::legacy::cuda_op; using namespace nvcv::legacy::helpers; @@ -65,39 +67,53 @@ __global__ void filter2D(SrcWrapper src, DstWrapper dst, Size2D dstSize, KernelW } template -void Filter2DCaller(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, KernelWrapper kernel, - Size2D kernelSize, int2 kernelAnchor, float borderValue, cudaStream_t stream) +ErrorCode Filter2DCaller(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + KernelWrapper kernel, Size2D kernelSize, int2 kernelAnchor, float borderValue, + cudaStream_t stream) { auto outAccess = TensorDataAccessStridedImagePlanar::Create(outData); NVCV_ASSERT(outAccess); - Size2D dstSize{outAccess->numCols(), outAccess->numRows()}; + auto inAccess = TensorDataAccessStridedImagePlanar::Create(inData); + NVCV_ASSERT(inAccess); - auto src = cuda::CreateBorderWrapNHW(inData, cuda::SetAll(borderValue)); - auto dst = cuda::CreateTensorWrapNHW(outData); + Size2D dstSize{outAccess->numCols(), outAccess->numRows()}; dim3 block(16, 16); dim3 grid(divUp(dstSize.w, block.x), divUp(dstSize.h, block.y), outAccess->numSamples()); - filter2D<<>>(src, dst, dstSize, kernel, kernelSize, kernelAnchor); + auto outMaxStride = outAccess->sampleStride() * outAccess->numSamples(); + auto inMaxStride = inAccess->sampleStride() * inAccess->numSamples(); + if (std::max(outMaxStride, inMaxStride) <= cuda::TypeTraits::max) + { + auto src = cuda::CreateBorderWrapNHW(inData, cuda::SetAll(borderValue)); + auto dst = cuda::CreateTensorWrapNHW(outData); + filter2D<<>>(src, dst, dstSize, kernel, kernelSize, kernelAnchor); + } + else + { + LOG_ERROR("Input or output size exceeds " << cuda::TypeTraits::max << ". 
Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; + } checkKernelErrors(); #ifdef CUDA_DEBUG_LOG checkCudaErrors(cudaStreamSynchronize(stream)); checkCudaErrors(cudaGetLastError()); #endif + return ErrorCode::SUCCESS; } template -void Filter2D(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, KernelWrapper kernel, - Size2D kernelSize, int2 kernelAnchor, NVCVBorderType borderMode, float borderValue, cudaStream_t stream) +ErrorCode Filter2D(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, KernelWrapper kernel, + Size2D kernelSize, int2 kernelAnchor, NVCVBorderType borderMode, float borderValue, + cudaStream_t stream) { switch (borderMode) { -#define NVCV_FILTER_CASE(BORDERTYPE) \ - case BORDERTYPE: \ - Filter2DCaller(inData, outData, kernel, kernelSize, kernelAnchor, borderValue, stream); \ - break +#define NVCV_FILTER_CASE(BORDERTYPE) \ + case BORDERTYPE: \ + return Filter2DCaller(inData, outData, kernel, kernelSize, kernelAnchor, borderValue, stream); NVCV_FILTER_CASE(NVCV_BORDER_CONSTANT); NVCV_FILTER_CASE(NVCV_BORDER_REPLICATE); @@ -109,6 +125,7 @@ void Filter2D(const TensorDataStridedCuda &inData, const TensorDataStridedCuda & default: break; } + return ErrorCode::SUCCESS; } // Laplacian ------------------------------------------------------------------- @@ -159,7 +176,7 @@ ErrorCode Laplacian::infer(const TensorDataStridedCuda &inData, const TensorData if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } @@ -188,9 +205,9 @@ ErrorCode Laplacian::infer(const TensorDataStridedCuda &inData, const TensorData normalizeAnchor(kernelAnchor, kLaplacianKernelSize); float borderValue = .0f; - typedef void (*filter2D_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, - cuda::math::Vector kernel, Size2D kernelSize, int2 kernelAnchor, - NVCVBorderType borderMode, float borderValue, cudaStream_t stream); + typedef ErrorCode (*filter2D_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + cuda::math::Vector kernel, Size2D kernelSize, int2 kernelAnchor, + NVCVBorderType borderMode, float borderValue, cudaStream_t stream); static const filter2D_t funcs[6][4] = { { Filter2D, 0, Filter2D, Filter2D}, @@ -217,10 +234,8 @@ ErrorCode Laplacian::infer(const TensorDataStridedCuda &inData, const TensorData kernel *= scale; } - funcs[data_type][channels - 1](inData, outData, kernel, kLaplacianKernelSize, kernelAnchor, borderMode, borderValue, - stream); - - return ErrorCode::SUCCESS; + return funcs[data_type][channels - 1](inData, outData, kernel, kLaplacianKernelSize, kernelAnchor, borderMode, + borderValue, stream); } // Gaussian -------------------------------------------------------------------- @@ -259,7 +274,7 @@ ErrorCode Gaussian::infer(const TensorDataStridedCuda &inData, const TensorDataS if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } @@ -321,9 +336,9 @@ ErrorCode Gaussian::infer(const TensorDataStridedCuda &inData, const TensorDataS normalizeAnchor(kernelAnchor, kernelSize); float borderValue = .0f; - typedef void (*filter2D_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, float *kernel, - Size2D 
kernelSize, int2 kernelAnchor, NVCVBorderType borderMode, float borderValue, - cudaStream_t stream); + typedef ErrorCode (*filter2D_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + float *kernel, Size2D kernelSize, int2 kernelAnchor, NVCVBorderType borderMode, + float borderValue, cudaStream_t stream); static const filter2D_t funcs[6][4] = { { Filter2D, 0, Filter2D, Filter2D}, @@ -334,10 +349,8 @@ ErrorCode Gaussian::infer(const TensorDataStridedCuda &inData, const TensorDataS { Filter2D, 0, Filter2D, Filter2D}, }; - funcs[data_type][channels - 1](inData, outData, m_kernel, kernelSize, kernelAnchor, borderMode, borderValue, - stream); - - return ErrorCode::SUCCESS; + return funcs[data_type][channels - 1](inData, outData, m_kernel, kernelSize, kernelAnchor, borderMode, borderValue, + stream); } // Average Blur ---------------------------------------------------------------- @@ -376,7 +389,7 @@ ErrorCode AverageBlur::infer(const TensorDataStridedCuda &inData, const TensorDa if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } @@ -424,9 +437,9 @@ ErrorCode AverageBlur::infer(const TensorDataStridedCuda &inData, const TensorDa normalizeAnchor(kernelAnchor, kernelSize); float borderValue = .0f; - typedef void (*filter2D_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, float *kernel, - Size2D kernelSize, int2 kernelAnchor, NVCVBorderType borderMode, float borderValue, - cudaStream_t stream); + typedef ErrorCode (*filter2D_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + float *kernel, Size2D kernelSize, int2 kernelAnchor, NVCVBorderType borderMode, + float borderValue, cudaStream_t stream); static const filter2D_t funcs[6][4] = { { Filter2D, 0, Filter2D, Filter2D}, @@ -448,10 +461,8 @@ ErrorCode AverageBlur::infer(const TensorDataStridedCuda &inData, const TensorDa m_curKernelSize = kernelSize; } - funcs[data_type][channels - 1](inData, outData, m_kernel, kernelSize, kernelAnchor, borderMode, borderValue, - stream); - - return ErrorCode::SUCCESS; + return funcs[data_type][channels - 1](inData, outData, m_kernel, kernelSize, kernelAnchor, borderMode, borderValue, + stream); } } // namespace nvcv::legacy::cuda_op diff --git a/src/cvcuda/priv/legacy/filter_utils.cu b/src/cvcuda/priv/legacy/filter_utils.cu index d85de2afd..5a88b24b4 100644 --- a/src/cvcuda/priv/legacy/filter_utils.cu +++ b/src/cvcuda/priv/legacy/filter_utils.cu @@ -72,8 +72,9 @@ __global__ void computeGaussianKernel(float *kernel, Size2D kernelSize, double2 kernel[coord.y * kernelSize.w + coord.x] = computeSingleGaussianValue(coord, half, sigma); } -__global__ void computeMeanKernelVarShape(cuda::Tensor3DWrap kernel, cuda::Tensor1DWrap kernelSizeArr, - cuda::Tensor1DWrap kernelAnchorArr) +__global__ void computeMeanKernelVarShape(cuda::Tensor3DWrap kernel, + cuda::Tensor1DWrap kernelSizeArr, + cuda::Tensor1DWrap kernelAnchorArr) { int3 coord = cuda::StaticCast(blockIdx * blockDim + threadIdx); @@ -107,9 +108,9 @@ __global__ void computeMeanKernelVarShape(cuda::Tensor3DWrap kernel, cuda kernel[coord] = 1.f / (kernelSize.x * kernelSize.y); } -__global__ void computeGaussianKernelVarShape(cuda::Tensor3DWrap kernel, int dataKernelSize, - Size2D maxKernelSize, cuda::Tensor1DWrap kernelSizeArr, - cuda::Tensor1DWrap sigmaArr) +__global__ void 
computeGaussianKernelVarShape(cuda::Tensor3DWrap kernel, int dataKernelSize, + Size2D maxKernelSize, cuda::Tensor1DWrap kernelSizeArr, + cuda::Tensor1DWrap sigmaArr) { int3 coord = cuda::StaticCast(blockIdx * blockDim + threadIdx); @@ -144,7 +145,8 @@ __global__ void computeGaussianKernelVarShape(cuda::Tensor3DWrap kernel, kernel[coord] = computeSingleGaussianValue(coord, half, sigma); } -__global__ void computeMeanKernelVarShape(cuda::Tensor3DWrap kernel, cuda::Tensor1DWrap blockSizeArr) +__global__ void computeMeanKernelVarShape(cuda::Tensor3DWrap kernel, + cuda::Tensor1DWrap blockSizeArr) { int3 coord = cuda::StaticCast(blockIdx * blockDim + threadIdx); @@ -158,7 +160,8 @@ __global__ void computeMeanKernelVarShape(cuda::Tensor3DWrap kernel, cuda kernel[coord] = 1.f / (blockSize * blockSize); } -__global__ void computeGaussianKernelVarShape(cuda::Tensor3DWrap kernel, cuda::Tensor1DWrap blockSizeArr) +__global__ void computeGaussianKernelVarShape(cuda::Tensor3DWrap kernel, + cuda::Tensor1DWrap blockSizeArr) { int3 coord = cuda::StaticCast(blockIdx * blockDim + threadIdx); diff --git a/src/cvcuda/priv/legacy/filter_utils.cuh b/src/cvcuda/priv/legacy/filter_utils.cuh index 1263be31d..a6248dc44 100644 --- a/src/cvcuda/priv/legacy/filter_utils.cuh +++ b/src/cvcuda/priv/legacy/filter_utils.cuh @@ -26,16 +26,19 @@ __global__ void computeMeanKernel(float *kernel_ptr, int k_size); __global__ void computeGaussianKernel(float *kernel, Size2D kernelSize, double2 sigma); -__global__ void computeMeanKernelVarShape(cuda::Tensor3DWrap kernel, cuda::Tensor1DWrap kernelSizeArr, - cuda::Tensor1DWrap kernelAnchorArr); +__global__ void computeMeanKernelVarShape(cuda::Tensor3DWrap kernel, + cuda::Tensor1DWrap kernelSizeArr, + cuda::Tensor1DWrap kernelAnchorArr); -__global__ void computeGaussianKernelVarShape(cuda::Tensor3DWrap kernel, int dataKernelSize, - Size2D maxKernelSize, cuda::Tensor1DWrap kernelSizeArr, - cuda::Tensor1DWrap sigmaArr); +__global__ void computeGaussianKernelVarShape(cuda::Tensor3DWrap kernel, int dataKernelSize, + Size2D maxKernelSize, cuda::Tensor1DWrap kernelSizeArr, + cuda::Tensor1DWrap sigmaArr); -__global__ void computeMeanKernelVarShape(cuda::Tensor3DWrap kernel, cuda::Tensor1DWrap blockSizeArr); +__global__ void computeMeanKernelVarShape(cuda::Tensor3DWrap kernel, + cuda::Tensor1DWrap blockSizeArr); -__global__ void computeGaussianKernelVarShape(cuda::Tensor3DWrap kernel, cuda::Tensor1DWrap blockSizeArr); +__global__ void computeGaussianKernelVarShape(cuda::Tensor3DWrap kernel, + cuda::Tensor1DWrap blockSizeArr); } // namespace nvcv::legacy::cuda_op diff --git a/src/cvcuda/priv/legacy/filter_var_shape.cu b/src/cvcuda/priv/legacy/filter_var_shape.cu index e86114238..9be8f0434 100644 --- a/src/cvcuda/priv/legacy/filter_var_shape.cu +++ b/src/cvcuda/priv/legacy/filter_var_shape.cu @@ -34,7 +34,7 @@ namespace nvcv::legacy::cuda_op { template __global__ void filter2D(const SrcWrapper src, DstWrapper dst, cuda::ImageBatchVarShapeWrap kernel, - cuda::Tensor1DWrap kernelAnchor) + cuda::Tensor1DWrap kernelAnchor) { using work_type = cuda::ConvertBaseTypeTo; work_type res = cuda::SetAll(0); @@ -81,7 +81,7 @@ void Filter2DCaller(const ImageBatchVarShapeDataStridedCuda &inData, const Image cuda::BorderVarShapeWrap src(inData, cuda::SetAll(borderValue)); cuda::ImageBatchVarShapeWrap dst(outData); cuda::ImageBatchVarShapeWrap kernel(kernelData); - cuda::Tensor1DWrap kernelAnchor(kernelAnchorData); + cuda::Tensor1DWrap kernelAnchor(kernelAnchorData); using work_type = cuda::ConvertBaseTypeTo; 
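// [Editor's note, not part of the diff] The hunks above repeat one guard pattern:
// before building 32-bit-stride tensor wraps, the total byte span of each tensor
// (sample stride * number of samples) is checked against the int32_t maximum, and
// the operator reports INVALID_PARAMETER instead of overflowing. The sketch below
// is a simplified, self-contained illustration of that arithmetic only; guardedLaunch()
// and launchWith32BitStrides() are hypothetical names, not CV-CUDA API.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <limits>

enum class ErrorCode
{
    SUCCESS,
    INVALID_PARAMETER
};

// Stand-in for a kernel launch that uses wraps created with 32-bit strides,
// e.g. the CreateTensorWrapNHW calls in the sources above.
static void launchWith32BitStrides() {}

ErrorCode guardedLaunch(int64_t inSampleStride, int64_t outSampleStride, int numSamples)
{
    // Largest byte offset a kernel may have to address in each tensor.
    const int64_t inMaxStride  = inSampleStride * numSamples;
    const int64_t outMaxStride = outSampleStride * numSamples;

    if (std::max(inMaxStride, outMaxStride) <= std::numeric_limits<int32_t>::max())
    {
        launchWith32BitStrides();
        return ErrorCode::SUCCESS;
    }

    // Mirrors the diff's error handling: log and fail instead of silently wrapping.
    std::cerr << "Input or output size exceeds " << std::numeric_limits<int32_t>::max()
              << ". Tensor is too large." << std::endl;
    return ErrorCode::INVALID_PARAMETER;
}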
@@ -138,7 +138,7 @@ ErrorCode Conv2DVarShape::infer(const ImageBatchVarShapeDataStridedCuda &inData, if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } @@ -218,8 +218,8 @@ __device__ cuda::math::Vector kLaplacianKernel3{ // Laplacian kernels are either one or the other (above) template -__global__ void laplacianFilter2D(const SrcWrapper src, DstWrapper dst, cuda::Tensor1DWrap ksize, - cuda::Tensor1DWrap scale) +__global__ void laplacianFilter2D(const SrcWrapper src, DstWrapper dst, cuda::Tensor1DWrap ksize, + cuda::Tensor1DWrap scale) { using work_type = cuda::ConvertBaseTypeTo; work_type res = cuda::SetAll(0); @@ -268,8 +268,8 @@ void LaplacianFilter2DCaller(const ImageBatchVarShapeDataStridedCuda &inData, { cuda::BorderVarShapeWrap src(inData, cuda::SetAll(borderValue)); cuda::ImageBatchVarShapeWrap dst(outData); - cuda::Tensor1DWrap kernelApertureSize(ksize); - cuda::Tensor1DWrap kernelScale(scale); + cuda::Tensor1DWrap kernelApertureSize(ksize); + cuda::Tensor1DWrap kernelScale(scale); using work_type = cuda::ConvertBaseTypeTo; @@ -323,7 +323,7 @@ ErrorCode LaplacianVarShape::infer(const ImageBatchVarShapeDataStridedCuda &inDa if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } @@ -384,8 +384,8 @@ ErrorCode LaplacianVarShape::infer(const ImageBatchVarShapeDataStridedCuda &inDa // GaussianVarShape ------------------------------------------------------------ template -__global__ void gaussianFilter2D(const SrcWrapper src, DstWrapper dst, cuda::Tensor3DWrap kernel, - cuda::Tensor1DWrap kernelSizeArr) +__global__ void gaussianFilter2D(const SrcWrapper src, DstWrapper dst, cuda::Tensor3DWrap kernel, + cuda::Tensor1DWrap kernelSizeArr) { using work_type = cuda::ConvertBaseTypeTo; work_type res = cuda::SetAll(0); @@ -419,10 +419,11 @@ __global__ void gaussianFilter2D(const SrcWrapper src, DstWrapper dst, cuda::Ten } template -void GaussianFilter2DCaller(const ImageBatchVarShapeDataStridedCuda &inData, - const ImageBatchVarShapeDataStridedCuda &outData, - const cuda::Tensor3DWrap &kernelTensor, - const cuda::Tensor1DWrap &kernelSizeTensor, float borderValue, cudaStream_t stream) +void GaussianFilter2DCaller(const ImageBatchVarShapeDataStridedCuda &inData, + const ImageBatchVarShapeDataStridedCuda &outData, + const cuda::Tensor3DWrap &kernelTensor, + const cuda::Tensor1DWrap &kernelSizeTensor, float borderValue, + cudaStream_t stream) { cuda::BorderVarShapeWrap src(inData, cuda::SetAll(borderValue)); cuda::ImageBatchVarShapeWrap dst(outData); @@ -447,13 +448,14 @@ void GaussianFilter2DCaller(const ImageBatchVarShapeDataStridedCuda &inData, template void GaussianFilter2D(const ImageBatchVarShapeDataStridedCuda &inData, const ImageBatchVarShapeDataStridedCuda &outData, - const cuda::Tensor3DWrap &kernelTensor, const cuda::Tensor1DWrap &kernelSizeTensor, - NVCVBorderType borderMode, float borderValue, cudaStream_t stream) + const cuda::Tensor3DWrap &kernelTensor, + const cuda::Tensor1DWrap &kernelSizeTensor, NVCVBorderType borderMode, + float borderValue, cudaStream_t stream) { - typedef void (*func_t)(const ImageBatchVarShapeDataStridedCuda &inData, - const ImageBatchVarShapeDataStridedCuda &outData, - const cuda::Tensor3DWrap 
&kernelTensor, - const cuda::Tensor1DWrap &kernelSizeTensor, float borderValue, cudaStream_t stream); + typedef void (*func_t)( + const ImageBatchVarShapeDataStridedCuda &inData, const ImageBatchVarShapeDataStridedCuda &outData, + const cuda::Tensor3DWrap &kernelTensor, + const cuda::Tensor1DWrap &kernelSizeTensor, float borderValue, cudaStream_t stream); static const func_t funcs[] = {GaussianFilter2DCaller, GaussianFilter2DCaller, @@ -503,7 +505,7 @@ ErrorCode GaussianVarShape::infer(const ImageBatchVarShapeDataStridedCuda &inDat if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } @@ -544,23 +546,24 @@ ErrorCode GaussianVarShape::infer(const ImageBatchVarShapeDataStridedCuda &inDat dim3 block(32, 4); dim3 grid(divUp(m_maxKernelSize.w, block.x), divUp(m_maxKernelSize.h, block.y), outData.numImages()); - cuda::Tensor1DWrap kernelSizeTensor(kernelSize); - cuda::Tensor1DWrap sigmaTensor(sigma); + cuda::Tensor1DWrap kernelSizeTensor(kernelSize); + cuda::Tensor1DWrap sigmaTensor(sigma); int kernelPitch2 = static_cast(m_maxKernelSize.w * sizeof(float)); int kernelPitch1 = m_maxKernelSize.h * kernelPitch2; - cuda::Tensor3DWrap kernelTensor(m_kernel, kernelPitch1, kernelPitch2); + cuda::Tensor3DWrap kernelTensor(m_kernel, kernelPitch1, kernelPitch2); computeGaussianKernelVarShape<<>>(kernelTensor, dataKernelSize, m_maxKernelSize, kernelSizeTensor, sigmaTensor); checkKernelErrors(); - typedef void (*filter2D_t)( - const ImageBatchVarShapeDataStridedCuda &inData, const ImageBatchVarShapeDataStridedCuda &outData, - const cuda::Tensor3DWrap &kernelTensor, const cuda::Tensor1DWrap &kernelSizeTensor, - NVCVBorderType borderMode, float borderValue, cudaStream_t stream); + typedef void (*filter2D_t)(const ImageBatchVarShapeDataStridedCuda &inData, + const ImageBatchVarShapeDataStridedCuda &outData, + const cuda::Tensor3DWrap &kernelTensor, + const cuda::Tensor1DWrap &kernelSizeTensor, NVCVBorderType borderMode, + float borderValue, cudaStream_t stream); static const filter2D_t funcs[6][4] = { { GaussianFilter2D, 0, GaussianFilter2D, GaussianFilter2D}, @@ -583,8 +586,9 @@ ErrorCode GaussianVarShape::infer(const ImageBatchVarShapeDataStridedCuda &inDat // AverageBlurVarShape --------------------------------------------------------- template -__global__ void avgBlurFilter2D(const SrcWrapper src, DstWrapper dst, cuda::Tensor3DWrap kernel, - cuda::Tensor1DWrap kernelSizeArr, cuda::Tensor1DWrap kernelAnchorArr) +__global__ void avgBlurFilter2D(const SrcWrapper src, DstWrapper dst, cuda::Tensor3DWrap kernel, + cuda::Tensor1DWrap kernelSizeArr, + cuda::Tensor1DWrap kernelAnchorArr) { using work_type = cuda::ConvertBaseTypeTo; work_type res = cuda::SetAll(0); @@ -618,11 +622,11 @@ __global__ void avgBlurFilter2D(const SrcWrapper src, DstWrapper dst, cuda::Tens } template -void AverageBlurFilter2DCaller(const ImageBatchVarShapeDataStridedCuda &inData, - const ImageBatchVarShapeDataStridedCuda &outData, - const cuda::Tensor3DWrap &kernelTensor, - const cuda::Tensor1DWrap &kernelSizeTensor, - const cuda::Tensor1DWrap &kernelAnchorTensor, float borderValue, +void AverageBlurFilter2DCaller(const ImageBatchVarShapeDataStridedCuda &inData, + const ImageBatchVarShapeDataStridedCuda &outData, + const cuda::Tensor3DWrap &kernelTensor, + const cuda::Tensor1DWrap &kernelSizeTensor, + const cuda::Tensor1DWrap &kernelAnchorTensor, float 
borderValue, cudaStream_t stream) { cuda::BorderVarShapeWrap src(inData, cuda::SetAll(borderValue)); @@ -647,17 +651,18 @@ void AverageBlurFilter2DCaller(const ImageBatchVarShapeDataStridedCuda &inData, } template -void AverageBlurFilter2D(const ImageBatchVarShapeDataStridedCuda &inData, - const ImageBatchVarShapeDataStridedCuda &outData, - const cuda::Tensor3DWrap &kernelTensor, - const cuda::Tensor1DWrap &kernelSizeTensor, - const cuda::Tensor1DWrap &kernelAnchorTensor, NVCVBorderType borderMode, +void AverageBlurFilter2D(const ImageBatchVarShapeDataStridedCuda &inData, + const ImageBatchVarShapeDataStridedCuda &outData, + const cuda::Tensor3DWrap &kernelTensor, + const cuda::Tensor1DWrap &kernelSizeTensor, + const cuda::Tensor1DWrap &kernelAnchorTensor, NVCVBorderType borderMode, float borderValue, cudaStream_t stream) { typedef void (*func_t)( const ImageBatchVarShapeDataStridedCuda &inData, const ImageBatchVarShapeDataStridedCuda &outData, - const cuda::Tensor3DWrap &kernelTensor, const cuda::Tensor1DWrap &kernelSizeTensor, - const cuda::Tensor1DWrap &kernelAnchorTensor, float borderValue, cudaStream_t stream); + const cuda::Tensor3DWrap &kernelTensor, + const cuda::Tensor1DWrap &kernelSizeTensor, + const cuda::Tensor1DWrap &kernelAnchorTensor, float borderValue, cudaStream_t stream); static const func_t funcs[] = {AverageBlurFilter2DCaller, AverageBlurFilter2DCaller, @@ -707,7 +712,7 @@ ErrorCode AverageBlurVarShape::infer(const ImageBatchVarShapeDataStridedCuda &in if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } @@ -746,23 +751,27 @@ ErrorCode AverageBlurVarShape::infer(const ImageBatchVarShapeDataStridedCuda &in dim3 block(32, 4); dim3 grid(divUp(m_maxKernelSize.w, block.x), divUp(m_maxKernelSize.h, block.y), outData.numImages()); - cuda::Tensor1DWrap kernelSizeTensor(kernelSize); - cuda::Tensor1DWrap kernelAnchorTensor(kernelAnchor); + cuda::Tensor1DWrap kernelSizeTensor(kernelSize); + cuda::Tensor1DWrap kernelAnchorTensor(kernelAnchor); int kernelPitch2 = static_cast(m_maxKernelSize.w * sizeof(float)); int kernelPitch1 = m_maxKernelSize.h * kernelPitch2; - cuda::Tensor3DWrap kernelTensor(m_kernel, kernelPitch1, kernelPitch2); + cuda::Tensor3DWrap kernelTensor(m_kernel, kernelPitch1, kernelPitch2); computeMeanKernelVarShape<<>>(kernelTensor, kernelSizeTensor, kernelAnchorTensor); checkKernelErrors(); - - typedef void (*filter2D_t)( - const ImageBatchVarShapeDataStridedCuda &inData, const ImageBatchVarShapeDataStridedCuda &outData, - const cuda::Tensor3DWrap &kernelTensor, const cuda::Tensor1DWrap &kernelSizeTensor, - const cuda::Tensor1DWrap &kernelAnchorTensor, NVCVBorderType borderMode, float borderValue, - cudaStream_t stream); + // clang-format off + typedef void (*filter2D_t)(const ImageBatchVarShapeDataStridedCuda &inData, + const ImageBatchVarShapeDataStridedCuda &outData, + const cuda::Tensor3DWrap &kernelTensor, + const cuda::Tensor1DWrap &kernelSizeTensor, + const cuda::Tensor1DWrap &kernelAnchorTensor, + NVCVBorderType borderMode, + float borderValue, + cudaStream_t stream); + // clang-format on static const filter2D_t funcs[6][4] = { { AverageBlurFilter2D, 0, AverageBlurFilter2D, AverageBlurFilter2D}, diff --git a/src/cvcuda/priv/legacy/flip.cu b/src/cvcuda/priv/legacy/flip.cu index eaba1ccfd..f7fded805 100644 --- a/src/cvcuda/priv/legacy/flip.cu +++ b/src/cvcuda/priv/legacy/flip.cu @@ 
-76,22 +76,13 @@ __global__ void flipHorizontalVertical(SrcWrapper src, DstWrapper dst, Size2D ds } } -template -void flip(const TensorDataStridedCuda &input, const TensorDataStridedCuda &output, const int32_t flipCode, - cudaStream_t stream) +template +void runFlipKernel(SrcWrap src, DstWrap dst, Size2D dstSize, int numSamples, int32_t flipCode, cudaStream_t stream) { constexpr uint32_t BLOCK = 32; - auto outputWrapper = TensorDataAccessStridedImagePlanar::Create(output); - NVCV_ASSERT(outputWrapper); - - Size2D dstSize{outputWrapper->numCols(), outputWrapper->numRows()}; - - auto src = cuda::CreateTensorWrapNHW(input); - auto dst = cuda::CreateTensorWrapNHW(output); - dim3 blockSize(BLOCK, BLOCK / 4, 1); - dim3 gridSize(divUp(dstSize.w, blockSize.x), divUp(dstSize.h, blockSize.y), outputWrapper->numSamples()); + dim3 gridSize(divUp(dstSize.w, blockSize.x), divUp(dstSize.h, blockSize.y), numSamples); if (flipCode > 0) { @@ -108,11 +99,41 @@ void flip(const TensorDataStridedCuda &input, const TensorDataStridedCuda &outpu flipHorizontalVertical<<>>(src, dst, dstSize); checkKernelErrors(); } +} + +template +ErrorCode flip(const TensorDataStridedCuda &input, const TensorDataStridedCuda &output, const int32_t flipCode, + cudaStream_t stream) +{ + auto outAccess = TensorDataAccessStridedImagePlanar::Create(output); + NVCV_ASSERT(outAccess); + + auto inAccess = TensorDataAccessStridedImagePlanar::Create(input); + NVCV_ASSERT(inAccess); + + Size2D dstSize{outAccess->numCols(), outAccess->numRows()}; + + int64_t inMaxStride = inAccess->sampleStride() * inAccess->numSamples(); + int64_t outMaxStride = outAccess->sampleStride() * outAccess->numSamples(); + if (std::max(inMaxStride, outMaxStride) <= cuda::TypeTraits::max) + { + auto src = cuda::CreateTensorWrapNHW(input); + auto dst = cuda::CreateTensorWrapNHW(output); + + runFlipKernel(src, dst, dstSize, outAccess->numSamples(), flipCode, stream); + } + else + { + LOG_ERROR("Input or output size exceeds " << cuda::TypeTraits::max << ". 
Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; + } #ifdef CUDA_DEBUG_LOG checkCudaErrors(cudaStreamSynchronize(stream)); checkCudaErrors(cudaGetLastError()); #endif // CUDA_DEBUG_LOG + + return ErrorCode::SUCCESS; } ErrorCode Flip::infer(const TensorDataStridedCuda &input, const TensorDataStridedCuda &output, const int32_t flipCode, @@ -135,7 +156,7 @@ ErrorCode Flip::infer(const TensorDataStridedCuda &input, const TensorDataStride DataFormat format = inputFormat; if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } @@ -159,8 +180,8 @@ ErrorCode Flip::infer(const TensorDataStridedCuda &input, const TensorDataStride // using flip_t = void(const TensorDataStridedCuda & input, // const TensorDataStridedCuda & output, // const int32_t flipCode, cudaStream_t stream); - typedef void (*flip_t)(const TensorDataStridedCuda &input, const TensorDataStridedCuda &output, - const int32_t flipCode, cudaStream_t stream); + typedef ErrorCode (*flip_t)(const TensorDataStridedCuda &input, const TensorDataStridedCuda &output, + const int32_t flipCode, cudaStream_t stream); static const flip_t funcs[6][4] = { { flip, 0, flip, flip}, @@ -172,9 +193,7 @@ ErrorCode Flip::infer(const TensorDataStridedCuda &input, const TensorDataStride }; const int32_t channels = inputShape.C; - funcs[dataType][channels - 1](input, output, flipCode, stream); - - return ErrorCode::SUCCESS; + return funcs[dataType][channels - 1](input, output, flipCode, stream); } } // namespace nvcv::legacy::cuda_op diff --git a/src/cvcuda/priv/legacy/flip_or_copy_var_shape.cu b/src/cvcuda/priv/legacy/flip_or_copy_var_shape.cu index 595e06723..4cf143b65 100644 --- a/src/cvcuda/priv/legacy/flip_or_copy_var_shape.cu +++ b/src/cvcuda/priv/legacy/flip_or_copy_var_shape.cu @@ -101,7 +101,7 @@ ErrorCode FlipOrCopyVarShape::infer(const ImageBatchVarShapeDataStridedCuda &inp DataFormat format = inputFormat; if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } diff --git a/src/cvcuda/priv/legacy/gaussian_noise.cu b/src/cvcuda/priv/legacy/gaussian_noise.cu index 77d09fef1..2af397b5b 100644 --- a/src/cvcuda/priv/legacy/gaussian_noise.cu +++ b/src/cvcuda/priv/legacy/gaussian_noise.cu @@ -33,9 +33,10 @@ using namespace nvcv::cuda; #define BLOCK 512 -template -__global__ void gaussian_noise_kernel(const Tensor3DWrap src, Tensor3DWrap dst, curandState *state, - Tensor1DWrap mu, Tensor1DWrap sigma, int rows, int cols) +template +__global__ void gaussian_noise_kernel(const Tensor3DWrap src, Tensor3DWrap dst, + curandState *state, Tensor1DWrap mu, + Tensor1DWrap sigma, int rows, int cols) { int offset = threadIdx.x; int batch_idx = blockIdx.x; @@ -54,10 +55,11 @@ __global__ void gaussian_noise_kernel(const Tensor3DWrap src, Tensor3DWrap state[id] = localState; } -template -__global__ void gaussian_noise_per_channel_kernel(const Tensor4DWrap src, Tensor4DWrap dst, curandState *state, - Tensor1DWrap mu, Tensor1DWrap sigma, int rows, int cols, - int channel) +template +__global__ void gaussian_noise_per_channel_kernel(const Tensor4DWrap src, + Tensor4DWrap dst, curandState *state, + Tensor1DWrap mu, Tensor1DWrap sigma, + int rows, int cols, int channel) { int offset = threadIdx.x; int batch_idx = 
blockIdx.x; @@ -79,9 +81,10 @@ __global__ void gaussian_noise_per_channel_kernel(const Tensor4DWrap src, Ten state[id] = localState; } -template -__global__ void gaussian_noise_float_kernel(const Tensor3DWrap src, Tensor3DWrap dst, curandState *state, - Tensor1DWrap mu, Tensor1DWrap sigma, int rows, int cols) +template +__global__ void gaussian_noise_float_kernel(const Tensor3DWrap src, Tensor3DWrap dst, + curandState *state, Tensor1DWrap mu, + Tensor1DWrap sigma, int rows, int cols) { int offset = threadIdx.x; int batch_idx = blockIdx.x; @@ -101,10 +104,12 @@ __global__ void gaussian_noise_float_kernel(const Tensor3DWrap src, Tensor3DW state[id] = localState; } -template -__global__ void gaussian_noise_float_per_channel_kernel(const Tensor4DWrap src, Tensor4DWrap dst, - curandState *state, Tensor1DWrap mu, - Tensor1DWrap sigma, int rows, int cols, int channel) +template +__global__ void gaussian_noise_float_per_channel_kernel(const Tensor4DWrap src, + Tensor4DWrap dst, curandState *state, + Tensor1DWrap mu, + Tensor1DWrap sigma, int rows, int cols, + int channel) { int offset = threadIdx.x; int batch_idx = blockIdx.x; @@ -127,60 +132,60 @@ __global__ void gaussian_noise_float_per_channel_kernel(const Tensor4DWrap sr state[id] = localState; } -template +template void gaussian_noise(const nvcv::TensorDataStridedCuda &d_in, const nvcv::TensorDataStridedCuda &d_out, int batch, int rows, int cols, curandState *m_states, const nvcv::TensorDataStridedCuda &_mu, const nvcv::TensorDataStridedCuda &_sigma, cudaStream_t stream) { - auto src_ptr = CreateTensorWrapNHW(d_in); - auto dst_ptr = CreateTensorWrapNHW(d_out); - Tensor1DWrap mu(_mu); - Tensor1DWrap sigma(_sigma); + auto src_ptr = CreateTensorWrapNHW(d_in); + auto dst_ptr = CreateTensorWrapNHW(d_out); + Tensor1DWrap mu(_mu); + Tensor1DWrap sigma(_sigma); gaussian_noise_kernel<<>>(src_ptr, dst_ptr, m_states, mu, sigma, rows, cols); checkKernelErrors(); } -template +template void gaussian_noise_per_channel(const nvcv::TensorDataStridedCuda &d_in, const nvcv::TensorDataStridedCuda &d_out, int batch, int channels, int rows, int cols, curandState *m_states, const nvcv::TensorDataStridedCuda &_mu, const nvcv::TensorDataStridedCuda &_sigma, cudaStream_t stream) { - auto src_ptr = CreateTensorWrapNHWC(d_in); - auto dst_ptr = CreateTensorWrapNHWC(d_out); - Tensor1DWrap mu(_mu); - Tensor1DWrap sigma(_sigma); + auto src_ptr = CreateTensorWrapNHWC(d_in); + auto dst_ptr = CreateTensorWrapNHWC(d_out); + Tensor1DWrap mu(_mu); + Tensor1DWrap sigma(_sigma); gaussian_noise_per_channel_kernel <<>>(src_ptr, dst_ptr, m_states, mu, sigma, rows, cols, channels); checkKernelErrors(); } -template +template void gaussian_noise_float(const nvcv::TensorDataStridedCuda &d_in, const nvcv::TensorDataStridedCuda &d_out, int batch, int rows, int cols, curandState *m_states, const nvcv::TensorDataStridedCuda &_mu, const nvcv::TensorDataStridedCuda &_sigma, cudaStream_t stream) { - auto src_ptr = CreateTensorWrapNHW(d_in); - auto dst_ptr = CreateTensorWrapNHW(d_out); - Tensor1DWrap mu(_mu); - Tensor1DWrap sigma(_sigma); + auto src_ptr = CreateTensorWrapNHW(d_in); + auto dst_ptr = CreateTensorWrapNHW(d_out); + Tensor1DWrap mu(_mu); + Tensor1DWrap sigma(_sigma); gaussian_noise_float_kernel<<>>(src_ptr, dst_ptr, m_states, mu, sigma, rows, cols); checkKernelErrors(); } -template +template void gaussian_noise_float_per_channel(const nvcv::TensorDataStridedCuda &d_in, const nvcv::TensorDataStridedCuda &d_out, int batch, int channels, int rows, int cols, curandState *m_states, const 
nvcv::TensorDataStridedCuda &_mu, const nvcv::TensorDataStridedCuda &_sigma, cudaStream_t stream) { - auto src_ptr = CreateTensorWrapNHWC(d_in); - auto dst_ptr = CreateTensorWrapNHWC(d_out); - Tensor1DWrap mu(_mu); - Tensor1DWrap sigma(_sigma); + auto src_ptr = CreateTensorWrapNHWC(d_in); + auto dst_ptr = CreateTensorWrapNHWC(d_out); + Tensor1DWrap mu(_mu); + Tensor1DWrap sigma(_sigma); gaussian_noise_float_per_channel_kernel <<>>(src_ptr, dst_ptr, m_states, mu, sigma, rows, cols, channels); @@ -224,12 +229,12 @@ ErrorCode GaussianNoise::infer(const TensorDataStridedCuda &inData, const Tensor DataFormat out_format = GetLegacyDataFormat(outData.layout()); if (!(in_format == kNHWC || in_format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << in_format); + LOG_ERROR("Invalid input DataFormat " << in_format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } if (!(out_format == kNHWC || out_format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << out_format); + LOG_ERROR("Invalid output DataFormat " << out_format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } @@ -242,6 +247,23 @@ ErrorCode GaussianNoise::infer(const TensorDataStridedCuda &inData, const Tensor return ErrorCode::INVALID_DATA_SHAPE; } + auto inMaxStride = inAccess->sampleStride() * inAccess->numSamples(); + if (inMaxStride > cuda::TypeTraits::max) + { + LOG_ERROR("Input size exceeds " << nvcv::cuda::TypeTraits::max << ". Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; + } + + auto outAccess = TensorDataAccessStridedImagePlanar::Create(outData); + NVCV_ASSERT(outAccess); + + auto outMaxStride = outAccess->sampleStride() * outAccess->numSamples(); + if (outMaxStride > cuda::TypeTraits::max) + { + LOG_ERROR("Output size exceeds " << nvcv::cuda::TypeTraits::max << ". 
Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; + } + DataType in_data_type = GetLegacyDataType(inData.dtype()); if (!(in_data_type == kCV_8U || in_data_type == kCV_16U || in_data_type == kCV_16S || in_data_type == kCV_32S || in_data_type == kCV_32F)) diff --git a/src/cvcuda/priv/legacy/gaussian_noise_var_shape.cu b/src/cvcuda/priv/legacy/gaussian_noise_var_shape.cu index c97515cfa..526e4ad84 100644 --- a/src/cvcuda/priv/legacy/gaussian_noise_var_shape.cu +++ b/src/cvcuda/priv/legacy/gaussian_noise_var_shape.cu @@ -205,7 +205,7 @@ GaussianNoiseVarShape::GaussianNoiseVarShape(DataShape max_input_shape, DataShap if (maxBatchSize < 0) { LOG_ERROR("Invalid num of max batch size " << maxBatchSize); - throw std::runtime_error("Parameter error!"); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "maxBatchSize must be >= 0"); } cudaError_t err = cudaMalloc((void **)&m_states, sizeof(curandState) * BLOCK * maxBatchSize); if (err != cudaSuccess) @@ -231,12 +231,12 @@ ErrorCode GaussianNoiseVarShape::infer(const ImageBatchVarShapeDataStridedCuda & DataFormat out_format = helpers::GetLegacyDataFormat(outData); if (!(in_format == kNHWC || in_format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << in_format); + LOG_ERROR("Invalid input DataFormat " << in_format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } if (!(out_format == kNHWC || out_format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << out_format); + LOG_ERROR("Invalid output DataFormat " << out_format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } diff --git a/src/cvcuda/priv/legacy/histogram_eq.cu b/src/cvcuda/priv/legacy/histogram_eq.cu index 5a4157101..89ce26f05 100644 --- a/src/cvcuda/priv/legacy/histogram_eq.cu +++ b/src/cvcuda/priv/legacy/histogram_eq.cu @@ -178,7 +178,7 @@ HistogramEq::HistogramEq(int maxBatchSize) if (maxBatchSize < 0) { LOG_ERROR("Invalid num of max batch size " << maxBatchSize); - throw std::runtime_error("Parameter error!"); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "maxBatchSize must be >= 0"); } m_maxBatchSize = maxBatchSize; @@ -201,6 +201,63 @@ HistogramEq::~HistogramEq() } } +template +ErrorCode infer_histogram(SrcWrap src, DstWrap dst, HistWrap histo, int batch, nvcv::Size2D dstSize, int channels, + cudaStream_t stream) +{ + { + //compute the histogram for each image in the batch into m_histoArray + int bsX = 32; //1024 ( 4 ch of 256 bins) + int bsY = 32; + + switch (channels) + { + case 1: + bsX = 16; // 256 (1 ch) + bsY = 16; + break; + case 2: + bsX = 32; // 512 (2 ch) + bsY = 16; + break; + case 3: + bsX = 32; // 768 (3 ch) + bsY = 24; + break; + default: + break; + } + + // each block is going to be 256bins * channels = threads + dim3 histBlockSize(bsX, bsY, 1); + dim3 histGridSize(divUp(dstSize.w, histBlockSize.x), divUp(dstSize.h, histBlockSize.y), batch); + size_t sharedMemSize = 256 * channels * sizeof(int); + hist_kernel<<>>(src, histo, channels, dstSize); + checkKernelErrors(); + } + + //compute cfd + { + int bsX = 256; + int bsY = 1; + int bsZ = 1; + dim3 prefixSumBlockSize(bsX, bsY, bsZ); + dim3 prefixSumGridSize(channels, 1, batch); + prefix_sum_with_norm_kernel<<>>(histo, dstSize); + checkKernelErrors(); + } + + { + dim3 lookupBlockSize(32, 32, 1); + dim3 lookupGridSize(divUp(dstSize.w, lookupBlockSize.x), divUp(dstSize.h, lookupBlockSize.y), batch); + lookup<<>>(src, dst, histo, channels, + dstSize); + checkKernelErrors(); + } + + return 
ErrorCode::SUCCESS; +} + ErrorCode HistogramEq::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, cudaStream_t stream) { @@ -261,62 +318,24 @@ ErrorCode HistogramEq::infer(const TensorDataStridedCuda &inData, const TensorDa //clear the histogram. checkCudaErrors(cudaMemsetAsync(m_histoArray, 0, m_sizeOfHisto, stream)); - auto src = nvcv::cuda::CreateTensorWrapNHWC(inData); - auto dst = nvcv::cuda::CreateTensorWrapNHWC(outData); // 2d wrap since its an array of [256 * channels] = width, height = samples - auto histo = nvcv::cuda::Tensor2DWrap(m_histoArray, (int)(256 * channels * sizeof(int))); + auto histo = nvcv::cuda::Tensor2DWrap(m_histoArray, (int)(256 * channels * sizeof(int))); - { - //compute the histogram for each image in the batch into m_histoArray - int bsX = 32; //1024 ( 4 ch of 256 bins) - int bsY = 32; - - switch (channels) - { - case 1: - bsX = 16; // 256 (1 ch) - bsY = 16; - break; - case 2: - bsX = 32; // 512 (2 ch) - bsY = 16; - break; - case 3: - bsX = 32; // 768 (3 ch) - bsY = 24; - break; - default: - break; - } + int64_t srcMaxStride = inAccess->sampleStride() * inAccess->numSamples(); + int64_t dstMaxStride = outAccess->sampleStride() * outAccess->numSamples(); - // each block is going to be 256bins * channels = threads - dim3 histBlockSize(bsX, bsY, 1); - dim3 histGridSize(divUp(width, histBlockSize.x), divUp(height, histBlockSize.y), batch); - size_t sharedMemSize = 256 * channels * sizeof(int); - hist_kernel<<>>(src, histo, channels, dstSize); - checkKernelErrors(); - } - - //compute cfd + if (std::max(srcMaxStride, dstMaxStride) <= cuda::TypeTraits::max) { - int bsX = 256; - int bsY = 1; - int bsZ = 1; - dim3 prefixSumBlockSize(bsX, bsY, bsZ); - dim3 prefixSumGridSize(channels, 1, batch); - prefix_sum_with_norm_kernel<<>>(histo, dstSize); - checkKernelErrors(); - } + auto src = nvcv::cuda::CreateTensorWrapNHWC(inData); + auto dst = nvcv::cuda::CreateTensorWrapNHWC(outData); + return infer_histogram(src, dst, histo, batch, dstSize, channels, stream); + } + else { - dim3 lookupBlockSize(32, 32, 1); - dim3 lookupGridSize(divUp(width, lookupBlockSize.x), divUp(height, lookupBlockSize.y), batch); - lookup<<>>(src, dst, histo, channels, - dstSize); - checkKernelErrors(); + LOG_ERROR("Input or output size exceeds " << cuda::TypeTraits::max << ". 
Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; } - - return ErrorCode::SUCCESS; } } // namespace nvcv::legacy::cuda_op diff --git a/src/cvcuda/priv/legacy/histogram_eq_var_shape.cu b/src/cvcuda/priv/legacy/histogram_eq_var_shape.cu index b9b4b4d3c..3d81c13fc 100644 --- a/src/cvcuda/priv/legacy/histogram_eq_var_shape.cu +++ b/src/cvcuda/priv/legacy/histogram_eq_var_shape.cu @@ -174,7 +174,7 @@ HistogramEqVarShape::HistogramEqVarShape(int maxBatchSize) if (maxBatchSize < 0) { LOG_ERROR("Invalid num of max batch size " << maxBatchSize); - throw std::runtime_error("Parameter error!"); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "maxBatchSize must be >= 0"); } m_maxBatchSize = maxBatchSize; @@ -259,7 +259,7 @@ ErrorCode HistogramEqVarShape::infer(const nvcv::ImageBatchVarShapeDataStridedCu cuda::ImageBatchVarShapeWrapNHWC dst(outData, channels); cuda::ImageBatchVarShapeWrapNHWC src(inData, channels); - auto histo = nvcv::cuda::Tensor2DWrap(m_histoArray, (int)(256 * channels * sizeof(int))); + auto histo = nvcv::cuda::Tensor2DWrap(m_histoArray, (int)(256 * channels * sizeof(int))); { //compute the histogram for each image in the batch into m_histoArray diff --git a/src/cvcuda/priv/legacy/inpaint.cu b/src/cvcuda/priv/legacy/inpaint.cu index 909c2c987..19c77e64f 100644 --- a/src/cvcuda/priv/legacy/inpaint.cu +++ b/src/cvcuda/priv/legacy/inpaint.cu @@ -26,6 +26,8 @@ #include "inpaint_utils.cuh" #include "reduce_kernel_utils.cuh" +#include + using namespace nvcv::legacy::helpers; using namespace nvcv::legacy::cuda_op; @@ -41,9 +43,8 @@ using namespace nvcv::cuda; #define BLOCK_S 16 #define REDUCE_GRID_SIZE 64 -template -__global__ void copy_mask_data(Tensor4DWrap src, Ptr2dNHWC dst, int row_offset, int col_offset, int value, - int2 size) +template +__global__ void copy_mask_data(MaskWrapper src, Ptr2dNHWC dst, int row_offset, int col_offset, int value, int2 size) { int src_x = blockIdx.x * blockDim.x + threadIdx.x; int src_y = blockIdx.y * blockDim.y + threadIdx.y; @@ -61,9 +62,8 @@ __global__ void copy_mask_data(Tensor4DWrap src, Ptr2dNHWC dst, int row_of } } -template -__device__ void inpaint(Ptr2dNHWC f, Ptr2dNHWC t, Tensor4DWrap out, int i, int j, int range, - int ch) +template +__device__ void inpaint(Ptr2dNHWC f, Ptr2dNHWC t, OutWrapper out, int i, int j, int range, int ch) { const int batch_idx = get_batch_idx(); @@ -206,8 +206,8 @@ __device__ void inpaint(Ptr2dNHWC f, Ptr2dNHWC t, Tensor4D } } -template -__global__ void TeleaInpaintFMM(Ptr2dNHWC f, Ptr2dNHWC t, Tensor4DWrap out, int range, +template +__global__ void TeleaInpaintFMM(Ptr2dNHWC f, Ptr2dNHWC t, OutWrapper out, int range, Ptr2dNHWC band, int ch) { int i = 0, j = 0; @@ -446,18 +446,14 @@ inline int finish_flag_reduce(Ptr2dNHWC src_ptr, int *d_out, int } template -void inpaint_helper(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &mask, - const nvcv::TensorDataStridedCuda &outData, void *workspace, unsigned char *kernel_ptr, int range, - bool &init_flag, int batch, int height, int width, int channel, int maxBatchSize, - cudaStream_t stream) +ErrorCode inpaint_helper(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &mask, + const nvcv::TensorDataStridedCuda &outData, void *workspace, unsigned char *kernel_ptr, + int range, bool &init_flag, int batch, int height, int width, int channel, int maxBatchSize, + cudaStream_t stream) { dim3 blockSize(BLOCK, BLOCK / 4, 1); dim3 gridSize(divUp(width + 2, blockSize.x), divUp(height + 2, blockSize.y), 
batch); - auto dst = CreateTensorWrapNHWC(outData); - // data type for mask is 8UC1 - auto org_mask = CreateTensorWrapNHWC(mask); - // create t and f pointer int ecols = width + 2; int erows = height + 2; @@ -499,8 +495,20 @@ void inpaint_helper(const nvcv::TensorDataStridedCuda &inData, const nvcv::Tenso stream)); // cvSet(mask,cvScalar(KNOWN,0,0,0)); // copy !=0 value to mask int2 size = {width, height}; - copy_mask_data<<>>( - org_mask, inpaint_mask, 1, 1, INSIDE, size); // COPY_MASK_BORDER1_C1(inpaint_mask,mask,uchar); + + auto maskAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(mask); + NVCV_ASSERT(maskAccess); + if (maskAccess->sampleStride() * batch <= nvcv::cuda::TypeTraits::max) + { + auto org_mask = CreateTensorWrapNHWC(mask); + copy_mask_data<<>>(org_mask, inpaint_mask, 1, 1, INSIDE, + size); // COPY_MASK_BORDER1_C1(inpaint_mask,mask,uchar); + } + else + { + LOG_ERROR("Mask size exceeds " << nvcv::cuda::TypeTraits::max << ". Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; + } checkKernelErrors(); // set border to 0 @@ -545,17 +553,27 @@ void inpaint_helper(const nvcv::TensorDataStridedCuda &inData, const nvcv::Tenso dim3 grid(divUp(f.rows, block.x), divUp(f.cols, block.y), f.batches); int flag = 1; - while (flag) + if (outAccess->sampleStride() * batch <= nvcv::cuda::TypeTraits::max) { - for (int i = 0; i < iteration; i++) + auto dst = CreateTensorWrapNHWC(outData); + while (flag) { - TeleaInpaintFMM<<>>( - inpaint_mask, t, dst, range, band, channel); // icvTeleaInpaintFMM(mask,t,output_img,range,Heap); + for (int i = 0; i < iteration; i++) + { + TeleaInpaintFMM<<>>(inpaint_mask, t, dst, range, band, channel); + /* icvTeleaInpaintFMM(mask,t,output_img,range,Heap); */ + } + flag = finish_flag_reduce(band, block_reduce_buffer1, block_reduce_buffer2, stream); } - flag = finish_flag_reduce(band, block_reduce_buffer1, block_reduce_buffer2, stream); + } + else + { + LOG_ERROR("Output size exceeds " << nvcv::cuda::TypeTraits::max << ". 
Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; } checkKernelErrors(); + return ErrorCode::SUCCESS; } namespace nvcv::legacy::cuda_op { @@ -604,7 +622,7 @@ ErrorCode Inpaint::infer(const TensorDataStridedCuda &inData, const TensorDataSt DataType in_data_type = GetLegacyDataType(inData.dtype()); if (!(in_format == kNHWC || in_format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << in_format); + LOG_ERROR("Invalid input DataFormat " << in_format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } @@ -628,7 +646,7 @@ ErrorCode Inpaint::infer(const TensorDataStridedCuda &inData, const TensorDataSt DataType out_data_type = GetLegacyDataType(outData.dtype()); if (!(out_format == kNHWC || out_format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << out_format); + LOG_ERROR("Invalid output DataFormat " << out_format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } @@ -663,10 +681,10 @@ ErrorCode Inpaint::infer(const TensorDataStridedCuda &inData, const TensorDataSt return ErrorCode::INVALID_DATA_SHAPE; } - typedef void (*inpaint_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &mask, - const TensorDataStridedCuda &outData, void *workspace, unsigned char *kernel_ptr, - int range, bool &init_flag, int batch, int height, int width, int channel, - int maxBatchSize, cudaStream_t stream); + typedef ErrorCode (*inpaint_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &mask, + const TensorDataStridedCuda &outData, void *workspace, unsigned char *kernel_ptr, + int range, bool &init_flag, int batch, int height, int width, int channel, + int maxBatchSize, cudaStream_t stream); static const inpaint_t funcs[6] = { inpaint_helper, inpaint_helper, 0, 0, inpaint_helper, inpaint_helper, @@ -675,9 +693,9 @@ ErrorCode Inpaint::infer(const TensorDataStridedCuda &inData, const TensorDataSt int range = (int)std::round(inpaintRadius); range = std::max(range, 1); range = std::min(range, 100); - funcs[in_data_type](inData, masks, outData, m_workspace, m_kernel_ptr, range, m_init_dilate, inAccess->numSamples(), - inAccess->numRows(), inAccess->numCols(), in_channels, m_maxBatchSize, stream); - return SUCCESS; + return funcs[in_data_type](inData, masks, outData, m_workspace, m_kernel_ptr, range, m_init_dilate, + inAccess->numSamples(), inAccess->numRows(), inAccess->numCols(), in_channels, + m_maxBatchSize, stream); } } // namespace nvcv::legacy::cuda_op diff --git a/src/cvcuda/priv/legacy/inpaint_var_shape.cu b/src/cvcuda/priv/legacy/inpaint_var_shape.cu index 193f19e7e..b4304323e 100644 --- a/src/cvcuda/priv/legacy/inpaint_var_shape.cu +++ b/src/cvcuda/priv/legacy/inpaint_var_shape.cu @@ -619,7 +619,7 @@ ErrorCode InpaintVarShape::infer(const nvcv::ImageBatchVarShape &inBatc DataType in_data_type = helpers::GetLegacyDataType(inData->uniqueFormat()); if (!(in_format == kNHWC || in_format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << in_format); + LOG_ERROR("Invalid input DataFormat " << in_format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } @@ -641,7 +641,7 @@ ErrorCode InpaintVarShape::infer(const nvcv::ImageBatchVarShape &inBatc DataType out_data_type = helpers::GetLegacyDataType(outData->uniqueFormat()); if (out_format != in_format) { - LOG_ERROR("Invalid DataFormat " << out_format); + LOG_ERROR("Invalid DataFormat between input (" << in_format << ") and output (" << out_format << ")"); return ErrorCode::INVALID_DATA_FORMAT; } 
diff --git a/src/cvcuda/priv/legacy/joint_bilateral_filter.cu b/src/cvcuda/priv/legacy/joint_bilateral_filter.cu index ce64c3779..226d9182a 100644 --- a/src/cvcuda/priv/legacy/joint_bilateral_filter.cu +++ b/src/cvcuda/priv/legacy/joint_bilateral_filter.cu @@ -167,29 +167,42 @@ __global__ void JointBilateralFilterKernel(SrcWrapper src, SrcWrapper srcColor, } template -void JointBilateralFilterCaller(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &inColorData, - const TensorDataStridedCuda &outData, const int batch, int rows, int columns, - int radius, float sigmaColor, float sigmaSpace, float borderValue, cudaStream_t stream) +ErrorCode JointBilateralFilterCaller(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &inColorData, + const TensorDataStridedCuda &outData, const int batch, int rows, int columns, + int radius, float sigmaColor, float sigmaSpace, float borderValue, + cudaStream_t stream) { dim3 block(8, 8); dim3 grid(divUp(columns, block.x * 2), divUp(rows, block.y * 2), batch); - auto src = cuda::CreateBorderWrapNHW(inData, cuda::SetAll(borderValue)); - auto srcColor = cuda::CreateBorderWrapNHW(inColorData, cuda::SetAll(borderValue)); - auto dst = cuda::CreateTensorWrapNHW(outData); + auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(inData); + NVCV_ASSERT(inAccess); #ifdef CUDA_DEBUG_LOG checkCudaErrors(cudaStreamSynchronize(stream)); checkCudaErrors(cudaGetLastError()); #endif - JointBilateralFilterKernel<<>>(src, srcColor, dst, radius, sigmaColor, sigmaSpace, rows, - columns); + if (inAccess->sampleStride() * inAccess->numSamples() <= cuda::TypeTraits::max) + { + auto src = cuda::CreateBorderWrapNHW(inData, cuda::SetAll(borderValue)); + auto srcColor = cuda::CreateBorderWrapNHW(inColorData, cuda::SetAll(borderValue)); + auto dst = cuda::CreateTensorWrapNHW(outData); + + JointBilateralFilterKernel<<>>(src, srcColor, dst, radius, sigmaColor, sigmaSpace, rows, + columns); + } + else + { + LOG_ERROR("Input size exceeds " << cuda::TypeTraits::max << ". 
Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; + } #ifdef CUDA_DEBUG_LOG checkCudaErrors(cudaStreamSynchronize(stream)); checkCudaErrors(cudaGetLastError()); #endif + return ErrorCode::SUCCESS; } ErrorCode JointBilateralFilter::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &inColorData, @@ -296,7 +309,7 @@ ErrorCode JointBilateralFilter::infer(const TensorDataStridedCuda &inData, const float borderValue = .0f; - typedef void (*joint_bilateral_filter_t)( + typedef ErrorCode (*joint_bilateral_filter_t)( const TensorDataStridedCuda &inData, const TensorDataStridedCuda &inColorData, const TensorDataStridedCuda &outData, int batch, int rows, int columns, int radius, float sigmaColor, float sigmaSpace, float borderValue, cudaStream_t stream); @@ -428,9 +441,8 @@ ErrorCode JointBilateralFilter::infer(const TensorDataStridedCuda &inData, const JointBilateralFilterCaller}, }, }; - funcs[borderMode][data_type][channels - 1](inData, inColorData, outData, batch, rows, columns, radius, sigmaColor, - sigmaSpace, borderValue, stream); - return ErrorCode::SUCCESS; + return funcs[borderMode][data_type][channels - 1](inData, inColorData, outData, batch, rows, columns, radius, + sigmaColor, sigmaSpace, borderValue, stream); } } // namespace nvcv::legacy::cuda_op diff --git a/src/cvcuda/priv/legacy/median_blur.cu b/src/cvcuda/priv/legacy/median_blur.cu index 3f05eeb13..9ee0540bc 100644 --- a/src/cvcuda/priv/legacy/median_blur.cu +++ b/src/cvcuda/priv/legacy/median_blur.cu @@ -379,7 +379,7 @@ ErrorCode MedianBlur::infer(const TensorDataStridedCuda &inData, const TensorDat if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } diff --git a/src/cvcuda/priv/legacy/median_blur_var_shape.cu b/src/cvcuda/priv/legacy/median_blur_var_shape.cu index 3abf39adb..f73216373 100644 --- a/src/cvcuda/priv/legacy/median_blur_var_shape.cu +++ b/src/cvcuda/priv/legacy/median_blur_var_shape.cu @@ -434,7 +434,7 @@ ErrorCode MedianBlurVarShape::infer(const ImageBatchVarShapeDataStridedCuda &inD if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } diff --git a/src/cvcuda/priv/legacy/morphology.cu b/src/cvcuda/priv/legacy/morphology.cu index 968850e72..4f4074265 100644 --- a/src/cvcuda/priv/legacy/morphology.cu +++ b/src/cvcuda/priv/legacy/morphology.cu @@ -85,24 +85,12 @@ __global__ void erode(SrcWrapper src, DstWrapper dst, Size2D dstSize, Size2D ker *dst.ptr(batch_idx, y, x) = cuda::SaturateCast(res); } -template -void MorphFilter2DCaller(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, - NVCVMorphologyType morph_type, Size2D kernelSize, int2 kernelAnchor, cudaStream_t stream) +template +void MorphFilter2DCaller(const SrcWrapper &src, const DstWrapper &dst, NVCVMorphologyType morph_type, Size2D kernelSize, + int2 kernelAnchor, BT maxmin, Size2D dstSize, int numSamples, cudaStream_t stream) { - using BT = cuda::BaseType; - - BT val = (morph_type == NVCVMorphologyType::NVCV_DILATE) ? 
std::numeric_limits::min() - : std::numeric_limits::max(); - - auto src = cuda::CreateBorderWrapNHW(inData, cuda::SetAll(val)); - auto dst = cuda::CreateTensorWrapNHW(outData); - - auto outAccess = TensorDataAccessStridedImagePlanar::Create(outData); - NVCV_ASSERT(outAccess); - Size2D dstSize{outAccess->numCols(), outAccess->numRows()}; - dim3 block(16, 16); - dim3 grid(divUp(dstSize.w, block.x), divUp(dstSize.h, block.y), outAccess->numSamples()); - + dim3 block(16, 16); + dim3 grid(divUp(dstSize.w, block.x), divUp(dstSize.h, block.y), numSamples); #ifdef CUDA_DEBUG_LOG checkCudaErrors(cudaStreamSynchronize(stream)); checkCudaErrors(cudaGetLastError()); @@ -110,12 +98,12 @@ void MorphFilter2DCaller(const TensorDataStridedCuda &inData, const TensorDataSt if (morph_type == NVCVMorphologyType::NVCV_ERODE) { - erode<<>>(src, dst, dstSize, kernelSize, kernelAnchor, val); + erode<<>>(src, dst, dstSize, kernelSize, kernelAnchor, maxmin); checkKernelErrors(); } else if (morph_type == NVCVMorphologyType::NVCV_DILATE) { - dilate<<>>(src, dst, dstSize, kernelSize, kernelAnchor, val); + dilate<<>>(src, dst, dstSize, kernelSize, kernelAnchor, maxmin); checkKernelErrors(); } @@ -125,17 +113,50 @@ void MorphFilter2DCaller(const TensorDataStridedCuda &inData, const TensorDataSt #endif } +template +ErrorCode MorphFilter2DCaller(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + NVCVMorphologyType morph_type, Size2D kernelSize, int2 kernelAnchor, cudaStream_t stream) +{ + using BT = cuda::BaseType; + + BT val = (morph_type == NVCVMorphologyType::NVCV_DILATE) ? std::numeric_limits::min() + : std::numeric_limits::max(); + + auto outAccess = TensorDataAccessStridedImagePlanar::Create(outData); + NVCV_ASSERT(outAccess); + + auto inAccess = TensorDataAccessStridedImagePlanar::Create(inData); + NVCV_ASSERT(inAccess); + Size2D dstSize{outAccess->numCols(), outAccess->numRows()}; + int numSamples = outAccess->numSamples(); + + auto outMaxStride = outAccess->sampleStride() * numSamples; + auto inMaxStride = inAccess->sampleStride() * numSamples; + if (std::max(inMaxStride, outMaxStride) <= cuda::TypeTraits::max) + { + auto src = cuda::CreateBorderWrapNHW(inData, cuda::SetAll(val)); + auto dst = cuda::CreateTensorWrapNHW(outData); + + MorphFilter2DCaller(src, dst, morph_type, kernelSize, kernelAnchor, val, dstSize, numSamples, stream); + } + else + { + LOG_ERROR("Input or output size exceeds " << cuda::TypeTraits::max << ". 
Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; + } + return ErrorCode::SUCCESS; +} + template -void MorphFilter2D(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, - NVCVMorphologyType morph_type, Size2D kernelSize, int2 kernelAnchor, NVCVBorderType borderMode, - cudaStream_t stream) +ErrorCode MorphFilter2D(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + NVCVMorphologyType morph_type, Size2D kernelSize, int2 kernelAnchor, NVCVBorderType borderMode, + cudaStream_t stream) { switch (borderMode) { -#define NVCV_MORPH_CASE(BORDERTYPE) \ - case BORDERTYPE: \ - MorphFilter2DCaller(inData, outData, morph_type, kernelSize, kernelAnchor, stream); \ - break +#define NVCV_MORPH_CASE(BORDERTYPE) \ + case BORDERTYPE: \ + return MorphFilter2DCaller(inData, outData, morph_type, kernelSize, kernelAnchor, stream); NVCV_MORPH_CASE(NVCV_BORDER_CONSTANT); NVCV_MORPH_CASE(NVCV_BORDER_REPLICATE); @@ -148,6 +169,7 @@ void MorphFilter2D(const TensorDataStridedCuda &inData, const TensorDataStridedC NVCV_ASSERT("Unknown bortertype"); break; } + return ErrorCode::SUCCESS; } ErrorCode Morphology::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, @@ -176,7 +198,7 @@ ErrorCode Morphology::infer(const TensorDataStridedCuda &inData, const TensorDat DataFormat format = input_format; if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } @@ -233,9 +255,9 @@ ErrorCode Morphology::infer(const TensorDataStridedCuda &inData, const TensorDat return SUCCESS; } - typedef void (*filter2D_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, - NVCVMorphologyType morph_type, Size2D kernelSize, int2 kernelAnchor, - NVCVBorderType borderMode, cudaStream_t stream); + typedef ErrorCode (*filter2D_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + NVCVMorphologyType morph_type, Size2D kernelSize, int2 kernelAnchor, + NVCVBorderType borderMode, cudaStream_t stream); static const filter2D_t funcs[6][4] = { { MorphFilter2D, 0, MorphFilter2D, MorphFilter2D}, @@ -246,9 +268,7 @@ ErrorCode Morphology::infer(const TensorDataStridedCuda &inData, const TensorDat { MorphFilter2D, 0, MorphFilter2D, MorphFilter2D}, }; - funcs[data_type][channels - 1](inData, outData, morph_type, mask_size_, anchor_, borderMode, stream); - - return SUCCESS; + return funcs[data_type][channels - 1](inData, outData, morph_type, mask_size_, anchor_, borderMode, stream); } } // namespace nvcv::legacy::cuda_op diff --git a/src/cvcuda/priv/legacy/morphology_var_shape.cu b/src/cvcuda/priv/legacy/morphology_var_shape.cu index 04f5a2aa5..2792412fe 100644 --- a/src/cvcuda/priv/legacy/morphology_var_shape.cu +++ b/src/cvcuda/priv/legacy/morphology_var_shape.cu @@ -218,7 +218,7 @@ ErrorCode MorphologyVarShape::infer(const nvcv::ImageBatchVarShape &inBatch, con DataFormat format = input_format; if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } diff --git a/src/cvcuda/priv/legacy/normalize.cu b/src/cvcuda/priv/legacy/normalize.cu index 60eaf3cdf..d9498996e 100644 --- a/src/cvcuda/priv/legacy/normalize.cu +++ b/src/cvcuda/priv/legacy/normalize.cu @@ -23,10 +23,12 @@ 
#include "CvCudaUtils.cuh" -#include // for CVCUDA_NORMALIZE_SCALE_IS_STDDEV, etc. +#include // for CVCUDA_NORMALIZE_SCALE_IS_STDDEV, etc. +#include // for TypeTraits using namespace nvcv::legacy::cuda_op; using namespace nvcv::legacy::helpers; +namespace cuda = nvcv::cuda; // (float3 - float3) * float3 / (float3 - float) * float3 / (float3 - float3) * float / (float3 - float) * float template @@ -98,8 +100,8 @@ void normalizeWrap(WrapInput srcWrap, WrapOutput dstWrap, DataShape input_shape, dim3 block(32, 8); dim3 grid(divUp(input_shape.W, block.x), divUp(input_shape.H, block.y), input_shape.N); - auto baseWrap = nvcv::cuda::CreateTensorWrapNHW(baseData); - auto scaleWrap = nvcv::cuda::CreateTensorWrapNHW(scaleData); + auto baseWrap = nvcv::cuda::CreateTensorWrapNHW(baseData); + auto scaleWrap = nvcv::cuda::CreateTensorWrapNHW(scaleData); auto baseAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(baseData); NVCV_ASSERT(baseAccess); @@ -126,8 +128,8 @@ void normalizeInvStdDevWrap(WrapInput srcWrap, WrapOutput dstWrap, DataShape inp dim3 block(32, 8); dim3 grid(divUp(input_shape.W, block.x), divUp(input_shape.H, block.y), input_shape.N); - auto baseWrap = nvcv::cuda::CreateTensorWrapNHW(baseData); - auto scaleWrap = nvcv::cuda::CreateTensorWrapNHW(scaleData); + auto baseWrap = nvcv::cuda::CreateTensorWrapNHW(baseData); + auto scaleWrap = nvcv::cuda::CreateTensorWrapNHW(scaleData); auto baseAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(baseData); NVCV_ASSERT(baseAccess); @@ -146,108 +148,154 @@ void normalizeInvStdDevWrap(WrapInput srcWrap, WrapOutput dstWrap, DataShape inp checkKernelErrors(); } -template -void normalize(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &baseData, - const nvcv::TensorDataStridedCuda &scaleData, const nvcv::TensorDataStridedCuda &outData, - float global_scale, float shift, cudaStream_t stream) +template +void callNormalizeWrap(const input_wrapper &input, const DataShape &inputShape, + const nvcv::TensorDataStridedCuda &baseData, const nvcv::TensorDataStridedCuda &scaleData, + const output_wrapper &output, float global_scale, float shift, cudaStream_t stream) { - auto srcWrap = nvcv::cuda::CreateTensorWrapNHW(inData); - auto dstWrap = nvcv::cuda::CreateTensorWrapNHW(outData); - - auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(inData); - NVCV_ASSERT(inAccess); - auto baseAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(baseData); NVCV_ASSERT(baseAccess); auto scaleAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(scaleData); NVCV_ASSERT(scaleAccess); - DataShape input_shape = GetLegacyDataShape(inAccess->infoShape()); - - using work_type = nvcv::cuda::ConvertBaseTypeTo; + using input_type = typename input_wrapper::ValueType; + using work_type = nvcv::cuda::ConvertBaseTypeTo; if (baseAccess->numChannels() != 1 && scaleAccess->numChannels() != 1) { using base_type = work_type; using scale_type = work_type; - normalizeWrap(srcWrap, dstWrap, input_shape, baseData, scaleData, global_scale, shift, + normalizeWrap(input, output, inputShape, baseData, scaleData, global_scale, shift, stream); } else if (baseAccess->numChannels() != 1) { using base_type = work_type; using scale_type = float; - normalizeWrap(srcWrap, dstWrap, input_shape, baseData, scaleData, global_scale, shift, + normalizeWrap(input, output, inputShape, baseData, scaleData, global_scale, shift, stream); } else if (scaleAccess->numChannels() != 1) { using base_type = float; using scale_type = work_type; - 
normalizeWrap(srcWrap, dstWrap, input_shape, baseData, scaleData, global_scale, shift, + normalizeWrap(input, output, inputShape, baseData, scaleData, global_scale, shift, stream); } else { using base_type = float; using scale_type = float; - normalizeWrap(srcWrap, dstWrap, input_shape, baseData, scaleData, global_scale, shift, + normalizeWrap(input, output, inputShape, baseData, scaleData, global_scale, shift, stream); } } template -void normalizeInvStdDev(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &baseData, - const nvcv::TensorDataStridedCuda &scaleData, const nvcv::TensorDataStridedCuda &outData, - float global_scale, float shift, float epsilon, cudaStream_t stream) +ErrorCode normalize(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &baseData, + const nvcv::TensorDataStridedCuda &scaleData, const nvcv::TensorDataStridedCuda &outData, + float global_scale, float shift, cudaStream_t stream) { - auto srcWrap = nvcv::cuda::CreateTensorWrapNHW(inData); - auto dstWrap = nvcv::cuda::CreateTensorWrapNHW(outData); - auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(inData); NVCV_ASSERT(inAccess); + auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(outData); + NVCV_ASSERT(outAccess); + + DataShape inputShape = GetLegacyDataShape(inAccess->infoShape()); + + auto inMaxStride = inAccess->sampleStride() * inAccess->numSamples(); + auto outMaxStride = outAccess->sampleStride() * outAccess->numSamples(); + if (std::max(inMaxStride, outMaxStride) <= cuda::TypeTraits::max) + { + auto srcWrap = nvcv::cuda::CreateTensorWrapNHW(inData); + auto dstWrap = nvcv::cuda::CreateTensorWrapNHW(outData); + callNormalizeWrap(srcWrap, inputShape, baseData, scaleData, dstWrap, global_scale, shift, stream); + } + else + { + LOG_ERROR("Input or output size exceeds " << cuda::TypeTraits::max << ". 
Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; + } + return ErrorCode::SUCCESS; +} + +template +void callNormalizeInvStdDevWrap(const input_wrapper &input, const DataShape &inputShape, + const nvcv::TensorDataStridedCuda &baseData, + const nvcv::TensorDataStridedCuda &scaleData, const output_wrapper &output, + float global_scale, float shift, float epsilon, cudaStream_t stream) +{ auto baseAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(baseData); NVCV_ASSERT(baseAccess); auto scaleAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(scaleData); NVCV_ASSERT(scaleAccess); - DataShape input_shape = GetLegacyDataShape(inAccess->infoShape()); - - using work_type = nvcv::cuda::ConvertBaseTypeTo; + using input_type = typename input_wrapper::ValueType; + using work_type = nvcv::cuda::ConvertBaseTypeTo; if (baseAccess->numChannels() != 1 && scaleAccess->numChannels() != 1) { using base_type = work_type; using scale_type = work_type; - normalizeInvStdDevWrap(srcWrap, dstWrap, input_shape, baseData, scaleData, global_scale, + normalizeInvStdDevWrap(input, output, inputShape, baseData, scaleData, global_scale, shift, epsilon, stream); } else if (baseAccess->numChannels() != 1) { using base_type = work_type; using scale_type = float; - normalizeInvStdDevWrap(srcWrap, dstWrap, input_shape, baseData, scaleData, global_scale, + normalizeInvStdDevWrap(input, output, inputShape, baseData, scaleData, global_scale, shift, epsilon, stream); } else if (scaleAccess->numChannels() != 1) { using base_type = float; using scale_type = work_type; - normalizeInvStdDevWrap(srcWrap, dstWrap, input_shape, baseData, scaleData, global_scale, + normalizeInvStdDevWrap(input, output, inputShape, baseData, scaleData, global_scale, shift, epsilon, stream); } else { using base_type = float; using scale_type = float; - normalizeInvStdDevWrap(srcWrap, dstWrap, input_shape, baseData, scaleData, global_scale, + normalizeInvStdDevWrap(input, output, inputShape, baseData, scaleData, global_scale, shift, epsilon, stream); } } +template +ErrorCode normalizeInvStdDev(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &baseData, + const nvcv::TensorDataStridedCuda &scaleData, const nvcv::TensorDataStridedCuda &outData, + float global_scale, float shift, float epsilon, cudaStream_t stream) +{ + auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(inData); + NVCV_ASSERT(inAccess); + + auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(outData); + NVCV_ASSERT(outAccess); + + DataShape inputShape = GetLegacyDataShape(inAccess->infoShape()); + + auto inMaxStride = inAccess->sampleStride() * inAccess->numSamples(); + auto outMaxStride = outAccess->sampleStride() * outAccess->numSamples(); + if (std::max(inMaxStride, outMaxStride) <= cuda::TypeTraits::max) + { + auto srcWrap = nvcv::cuda::CreateTensorWrapNHW(inData); + auto dstWrap = nvcv::cuda::CreateTensorWrapNHW(outData); + callNormalizeInvStdDevWrap(srcWrap, inputShape, baseData, scaleData, dstWrap, global_scale, shift, epsilon, + stream); + } + else + { + LOG_ERROR("Input or output size exceeds " << cuda::TypeTraits::max << ". 
Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; + } + return ErrorCode::SUCCESS; +} + namespace nvcv::legacy::cuda_op { void Normalize::checkParamShape(DataShape input_shape, DataShape param_shape) @@ -267,7 +315,7 @@ ErrorCode Normalize::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } @@ -322,13 +370,14 @@ ErrorCode Normalize::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv checkParamShape(input_shape, base_param_shape); checkParamShape(input_shape, scale_param_shape); - typedef void (*normalize_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &baseData, - const TensorDataStridedCuda &scaleData, const TensorDataStridedCuda &outData, - float global_scale, float shift, cudaStream_t stream); + typedef ErrorCode (*normalize_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &baseData, + const TensorDataStridedCuda &scaleData, const TensorDataStridedCuda &outData, + float global_scale, float shift, cudaStream_t stream); - typedef void (*normalizeInvStdDev_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &baseData, - const TensorDataStridedCuda &scaleData, const TensorDataStridedCuda &outData, - float global_scale, float shift, float epsilon, cudaStream_t stream); + typedef ErrorCode (*normalizeInvStdDev_t)( + const TensorDataStridedCuda &inData, const TensorDataStridedCuda &baseData, + const TensorDataStridedCuda &scaleData, const TensorDataStridedCuda &outData, float global_scale, float shift, + float epsilon, cudaStream_t stream); static const normalize_t funcs_normalize[6][4] = { { normalize, 0 /*normalize*/, normalize, normalize}, @@ -355,15 +404,14 @@ ErrorCode Normalize::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv if (flags & CVCUDA_NORMALIZE_SCALE_IS_STDDEV) { - funcs_normalize_stddev[data_type][channels - 1](inData, baseData, scaleData, outData, global_scale, shift, - epsilon, stream); + return funcs_normalize_stddev[data_type][channels - 1](inData, baseData, scaleData, outData, global_scale, + shift, epsilon, stream); } else { - funcs_normalize[data_type][channels - 1](inData, baseData, scaleData, outData, global_scale, shift, stream); + return funcs_normalize[data_type][channels - 1](inData, baseData, scaleData, outData, global_scale, shift, + stream); } - - return SUCCESS; } } // namespace nvcv::legacy::cuda_op diff --git a/src/cvcuda/priv/legacy/pad_and_stack.cu b/src/cvcuda/priv/legacy/pad_and_stack.cu index 623dd4a33..98641b9c3 100644 --- a/src/cvcuda/priv/legacy/pad_and_stack.cu +++ b/src/cvcuda/priv/legacy/pad_and_stack.cu @@ -42,14 +42,12 @@ __global__ void padAndStack(SrcWrapper src, DstWrapper dst, VecWrapper topVec, V } template -void padAndStackCaller(const ImageBatchVarShapeDataStridedCuda &inData, const TensorDataStridedCuda &outData, - const TensorDataStridedCuda &top, const TensorDataStridedCuda &left, const float borderValue, - cudaStream_t stream) +ErrorCode padAndStackCaller(const ImageBatchVarShapeDataStridedCuda &inData, const TensorDataStridedCuda &outData, + const TensorDataStridedCuda &top, const TensorDataStridedCuda &left, + const float borderValue, cudaStream_t stream) { cuda::BorderVarShapeWrap src(inData, cuda::SetAll(borderValue)); - auto dst = cuda::CreateTensorWrapNHW(outData); - auto topVec = 
cuda::CreateTensorWrapNHW(top); auto leftVec = cuda::CreateTensorWrapNHW(left); @@ -61,24 +59,35 @@ void padAndStackCaller(const ImageBatchVarShapeDataStridedCuda &inData, const Te dim3 block(16, 16); dim3 grid(divUp(dstSize.x, block.x), divUp(dstSize.y, block.y), outAccess->numSamples()); - padAndStack<<>>(src, dst, topVec, leftVec, dstSize); + if (outAccess->sampleStride() * outAccess->numSamples() <= cuda::TypeTraits::max) + { + auto dst = cuda::CreateTensorWrapNHW(outData); + padAndStack<<>>(src, dst, topVec, leftVec, dstSize); + } + else + { + LOG_ERROR("Output size exceeds " << cuda::TypeTraits::max << ". Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; + } + return ErrorCode::SUCCESS; } template -void padAndStack(const ImageBatchVarShapeDataStridedCuda &inData, const TensorDataStridedCuda &outData, - const TensorDataStridedCuda &top, const TensorDataStridedCuda &left, const NVCVBorderType borderMode, - const float borderValue, cudaStream_t stream) +ErrorCode padAndStack(const ImageBatchVarShapeDataStridedCuda &inData, const TensorDataStridedCuda &outData, + const TensorDataStridedCuda &top, const TensorDataStridedCuda &left, + const NVCVBorderType borderMode, const float borderValue, cudaStream_t stream) { - typedef void (*padAndStack_caller)(const ImageBatchVarShapeDataStridedCuda &inData, - const TensorDataStridedCuda &outData, const TensorDataStridedCuda &top, - const TensorDataStridedCuda &left, const float borderValue, cudaStream_t stream); + typedef ErrorCode (*padAndStack_caller)(const ImageBatchVarShapeDataStridedCuda &inData, + const TensorDataStridedCuda &outData, const TensorDataStridedCuda &top, + const TensorDataStridedCuda &left, const float borderValue, + cudaStream_t stream); static const padAndStack_caller funcs[] = {padAndStackCaller, padAndStackCaller, padAndStackCaller, padAndStackCaller, padAndStackCaller}; - funcs[borderMode](inData, outData, top, left, borderValue, stream); + return funcs[borderMode](inData, outData, top, left, borderValue, stream); } ErrorCode PadAndStack::infer(const ImageBatchVarShapeDataStridedCuda &inData, const TensorDataStridedCuda &outData, @@ -90,7 +99,7 @@ ErrorCode PadAndStack::infer(const ImageBatchVarShapeDataStridedCuda &inData, co if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } @@ -156,9 +165,9 @@ ErrorCode PadAndStack::infer(const ImageBatchVarShapeDataStridedCuda &inData, co return ErrorCode::INVALID_DATA_SHAPE; } - typedef void (*func_t)(const ImageBatchVarShapeDataStridedCuda &inData, const TensorDataStridedCuda &outData, - const TensorDataStridedCuda &top, const TensorDataStridedCuda &left, - const NVCVBorderType borderMode, const float borderValue, cudaStream_t stream); + typedef ErrorCode (*func_t)(const ImageBatchVarShapeDataStridedCuda &inData, const TensorDataStridedCuda &outData, + const TensorDataStridedCuda &top, const TensorDataStridedCuda &left, + const NVCVBorderType borderMode, const float borderValue, cudaStream_t stream); static const func_t funcs[6][4] = { { padAndStack, padAndStack, padAndStack, padAndStack}, @@ -172,9 +181,7 @@ ErrorCode PadAndStack::infer(const ImageBatchVarShapeDataStridedCuda &inData, co const func_t func = funcs[data_type][channels - 1]; NVCV_ASSERT(func != 0); - func(inData, outData, top, left, borderMode, borderValue, stream); - - return SUCCESS; + return func(inData, outData, top, left, 
borderMode, borderValue, stream); } } // namespace nvcv::legacy::cuda_op diff --git a/src/cvcuda/priv/legacy/pillow_resize.cu b/src/cvcuda/priv/legacy/pillow_resize.cu index fcf583e54..6c05ef187 100644 --- a/src/cvcuda/priv/legacy/pillow_resize.cu +++ b/src/cvcuda/priv/legacy/pillow_resize.cu @@ -404,7 +404,7 @@ ErrorCode PillowResize::infer(const TensorDataStridedCuda &inData, const TensorD } if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } diff --git a/src/cvcuda/priv/legacy/pillow_resize_var_shape.cu b/src/cvcuda/priv/legacy/pillow_resize_var_shape.cu index 7539e30f5..473851350 100644 --- a/src/cvcuda/priv/legacy/pillow_resize_var_shape.cu +++ b/src/cvcuda/priv/legacy/pillow_resize_var_shape.cu @@ -579,7 +579,7 @@ ErrorCode PillowResizeVarShape::infer(const nvcv::ImageBatchVarShape &inDataBase if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } diff --git a/src/cvcuda/priv/legacy/random_resized_crop.cu b/src/cvcuda/priv/legacy/random_resized_crop.cu index 4d3240b14..af270b46e 100644 --- a/src/cvcuda/priv/legacy/random_resized_crop.cu +++ b/src/cvcuda/priv/legacy/random_resized_crop.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +/* Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES * SPDX-License-Identifier: Apache-2.0 @@ -146,8 +146,9 @@ __global__ void resize_nearest_v1(const SrcWrapper src, DstWrapper dst, int2 src const int top = top_[batch_idx]; const int left = left_[batch_idx]; - const int sx = cuda::min(cuda::round(dst_x * scale_x + left), srcSize.x - 1); - const int sy = cuda::min(cuda::round(dst_y * scale_y + top), srcSize.y - 1); + const int sx = cuda::min(__float2int_rd((dst_x + 0.5f) * scale_x) + left, srcSize.x - 1); + const int sy = cuda::min(__float2int_rd((dst_y + 0.5f) * scale_y) + top, srcSize.y - 1); + *dst.ptr(batch_idx, dst_y, dst_x) = *src.ptr(batch_idx, sy, sx); } } @@ -221,27 +222,14 @@ __global__ void resize_cubic_v1(const SrcWrapper src, DstWrapper dst, int2 srcSi } } -template -void resize(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, - const NVCVInterpolationType interpolation, cudaStream_t stream, const int *top, const int *left, - const float *scale_x, const float *scale_y) +template +void resize(const SrcWrapper &src, const DstWrapper &dst, const NVCVInterpolationType interpolation, + cudaStream_t stream, const int *top, const int *left, const float *scale_x, const float *scale_y, + int2 srcSize, int2 dstSize, int batchSize) { - auto inAccess = TensorDataAccessStridedImagePlanar::Create(inData); - NVCV_ASSERT(inAccess); - - auto outAccess = TensorDataAccessStridedImagePlanar::Create(outData); - NVCV_ASSERT(outAccess); - - const int2 srcSize{inAccess->numCols(), inAccess->numRows()}; - const int2 dstSize{outAccess->numCols(), outAccess->numRows()}; - const int batchSize{static_cast(outAccess->numSamples())}; - dim3 blockSize(BLOCK, BLOCK / 4, 1); dim3 gridSize(divUp(dstSize.x, blockSize.x), divUp(dstSize.y, blockSize.y), batchSize); - auto src = cuda::CreateTensorWrapNHW(inData); - auto dst = cuda::CreateTensorWrapNHW(outData); 
- if (interpolation == NVCV_INTERP_LINEAR) { resize_linear_v1<<>>(src, dst, srcSize, dstSize, top, left, scale_x, scale_y); @@ -264,6 +252,38 @@ void resize(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &ou #endif } +template +ErrorCode resize(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + const NVCVInterpolationType interpolation, cudaStream_t stream, const int *top, const int *left, + const float *scale_x, const float *scale_y) +{ + auto inAccess = TensorDataAccessStridedImagePlanar::Create(inData); + NVCV_ASSERT(inAccess); + + auto outAccess = TensorDataAccessStridedImagePlanar::Create(outData); + NVCV_ASSERT(outAccess); + + const int2 srcSize{inAccess->numCols(), inAccess->numRows()}; + const int2 dstSize{outAccess->numCols(), outAccess->numRows()}; + const int batchSize{static_cast(outAccess->numSamples())}; + + int64_t srcMaxStride = inAccess->sampleStride() * inAccess->numSamples(); + int64_t dstMaxStride = outAccess->sampleStride() * outAccess->numSamples(); + + if (std::max(srcMaxStride, dstMaxStride) <= cuda::TypeTraits::max) + { + auto src = cuda::CreateTensorWrapNHW(inData); + auto dst = cuda::CreateTensorWrapNHW(outData); + resize(src, dst, interpolation, stream, top, left, scale_x, scale_y, srcSize, dstSize, batchSize); + } + else + { + LOG_ERROR("Input or output size exceeds " << cuda::TypeTraits::max << ". Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; + } + return ErrorCode::SUCCESS; +} + RandomResizedCrop::RandomResizedCrop(DataShape max_input_shape, DataShape max_output_shape, const double min_scale, const double max_scale, const double min_ratio, const double max_ratio, int32_t maxBatchSize, uint32_t seed) @@ -383,7 +403,7 @@ ErrorCode RandomResizedCrop::infer(const TensorDataStridedCuda &inData, const Te if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } @@ -455,9 +475,9 @@ ErrorCode RandomResizedCrop::infer(const TensorDataStridedCuda &inData, const Te checkCudaErrors( cudaMemcpyAsync((void *)m_gpuCropParams, (void *)m_cpuCropParams, buffer_size, cudaMemcpyHostToDevice, stream)); - typedef void (*func_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, - const NVCVInterpolationType interpolation, cudaStream_t stream, const int *top, - const int *left, const float *scale_x, const float *scale_y); + typedef ErrorCode (*func_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + const NVCVInterpolationType interpolation, cudaStream_t stream, const int *top, + const int *left, const float *scale_x, const float *scale_y); static const func_t funcs[6][4] = { { resize, 0 /*resize*/, resize, resize}, @@ -469,8 +489,7 @@ ErrorCode RandomResizedCrop::infer(const TensorDataStridedCuda &inData, const Te }; const func_t func = funcs[in_data_type][channels - 1]; - func(inData, outData, interpolation, stream, tops_gpu, lefts_gpu, scale_x_gpu, scale_y_gpu); - return SUCCESS; + return func(inData, outData, interpolation, stream, tops_gpu, lefts_gpu, scale_x_gpu, scale_y_gpu); } } // namespace nvcv::legacy::cuda_op diff --git a/src/cvcuda/priv/legacy/random_resized_crop_var_shape.cu b/src/cvcuda/priv/legacy/random_resized_crop_var_shape.cu index f759a1a02..0c65b0be4 100644 --- a/src/cvcuda/priv/legacy/random_resized_crop_var_shape.cu +++ 
b/src/cvcuda/priv/legacy/random_resized_crop_var_shape.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +/* Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES * SPDX-License-Identifier: Apache-2.0 @@ -154,8 +154,8 @@ __global__ void resize_nearest_v1(const SrcWrapper src, DstWrapper dst, const in const int top = top_[batch_idx]; const int left = left_[batch_idx]; - const int sx = cuda::min(cuda::round(dst_x * scale_x + left), width - 1); - const int sy = cuda::min(cuda::round(dst_y * scale_y + top), height - 1); + const int sx = cuda::min(__float2int_rd((dst_x + 0.5f) * scale_x) + left, width - 1); + const int sy = cuda::min(__float2int_rd((dst_y + 0.5f) * scale_y) + top, height - 1); *dst.ptr(batch_idx, dst_y, dst_x) = *src.ptr(batch_idx, sy, sx); } @@ -316,7 +316,7 @@ ErrorCode RandomResizedCropVarShape::infer(const ImageBatchVarShape &in, const I if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } diff --git a/src/cvcuda/priv/legacy/reformat.cu b/src/cvcuda/priv/legacy/reformat.cu index 826e0f0c0..a37f81471 100644 --- a/src/cvcuda/priv/legacy/reformat.cu +++ b/src/cvcuda/priv/legacy/reformat.cu @@ -69,9 +69,11 @@ __global__ void transformFormat(const SrcWrapper src, DstWrapper dst, int3 inout } } +namespace nvcv::legacy::cuda_op { + template // k(N)CHW k(N)HWC, uchar float -void transform(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, - cudaStream_t stream) +ErrorCode transform(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorDataStridedCuda &outData, + cudaStream_t stream) { auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(inData); NVCV_ASSERT(inAccess); @@ -84,10 +86,20 @@ void transform(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorData dim3 block(32, 8); dim3 grid(cuda_op::divUp(inout_size.x, block.x), cuda_op::divUp(inout_size.y, block.y), inAccess->numSamples()); - cuda::TensorNDWrap> src(inData); - cuda::TensorNDWrap> dst(outData); + auto inMaxStride = inAccess->sampleStride() * inAccess->numSamples(); + auto outMaxStride = outAccess->sampleStride() * outAccess->numSamples(); + if (std::max(inMaxStride, outMaxStride) <= cuda::TypeTraits::max) + { + cuda::TensorNDWrap, int32_t> src(inData); + cuda::TensorNDWrap, int32_t> dst(outData); - transformFormat<<>>(src, dst, inout_size); + transformFormat<<>>(src, dst, inout_size); + } + else + { + LOG_ERROR("Input or output size exceeds " << cuda::TypeTraits::max << ". 
Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; + } checkKernelErrors(); @@ -95,10 +107,9 @@ void transform(const nvcv::TensorDataStridedCuda &inData, const nvcv::TensorData checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaGetLastError()); #endif + return ErrorCode::SUCCESS; } -namespace nvcv::legacy::cuda_op { - void Reformat::checkDataFormat(DataFormat format) { NVCV_ASSERT(format == kNHWC || format == kHWC || format == kNCHW || format == kCHW); @@ -160,8 +171,8 @@ ErrorCode Reformat::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv: return ErrorCode::INVALID_DATA_TYPE; } - typedef void (*transform_t)(const TensorDataStridedCuda &input, const TensorDataStridedCuda &output, - cudaStream_t stream); + typedef ErrorCode (*transform_t)(const TensorDataStridedCuda &input, const TensorDataStridedCuda &output, + cudaStream_t stream); static const transform_t funcs[4][7] = { {transform, transform, transform, transform, @@ -175,9 +186,7 @@ ErrorCode Reformat::infer(const nvcv::TensorDataStridedCuda &inData, const nvcv: }; transform_t func = funcs[input_format][data_type]; - func(inData, outData, stream); - - return SUCCESS; + return func(inData, outData, stream); } } // namespace nvcv::legacy::cuda_op diff --git a/src/cvcuda/priv/legacy/resize_var_shape.cu b/src/cvcuda/priv/legacy/resize_var_shape.cu index e4627ab58..bec72da1f 100644 --- a/src/cvcuda/priv/legacy/resize_var_shape.cu +++ b/src/cvcuda/priv/legacy/resize_var_shape.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +/* Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES * SPDX-License-Identifier: Apache-2.0 @@ -104,8 +104,8 @@ __global__ void resize_NN(cuda::ImageBatchVarShapeWrap src, cuda::Image const float scale_x = static_cast(width) / dstWidth; const float scale_y = static_cast(height) / dstHeight; - const int sx = cuda::min(cuda::round(dst_x * scale_x), width - 1); - const int sy = cuda::min(cuda::round(dst_y * scale_y), height - 1); + const int sx = cuda::min(__float2int_rd((dst_x + 0.5f) * scale_x), width - 1); + const int sy = cuda::min(__float2int_rd((dst_y + 0.5f) * scale_y), height - 1); *dst.ptr(batch_idx, dst_y, dst_x) = *src.ptr(batch_idx, sy, sx); } @@ -494,7 +494,7 @@ ErrorCode ResizeVarShape::infer(const ImageBatchVarShapeDataStridedCuda &inData, if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } diff --git a/src/cvcuda/priv/legacy/rotate.cu b/src/cvcuda/priv/legacy/rotate.cu index a4dc8950a..90bb893fc 100644 --- a/src/cvcuda/priv/legacy/rotate.cu +++ b/src/cvcuda/priv/legacy/rotate.cu @@ -54,7 +54,7 @@ __global__ void rotate(SrcWrapper src, DstWrapper dst, int2 dstSize, const doubl static_cast(dst_x_shift * (-d_aCoeffs[3]) + dst_y_shift * d_aCoeffs[4]), static_cast(dstCoord.z)}; - const int2 srcSize{src.borderWrap().tensorShape()[1], src.borderWrap().tensorShape()[0]}; + const long2 srcSize{src.borderWrap().tensorShape()[1], src.borderWrap().tensorShape()[0]}; if (srcCoord.x > -0.5 && srcCoord.x < srcSize.x && srcCoord.y > -0.5 && srcCoord.y < srcSize.y) { @@ -63,12 +63,15 @@ __global__ void rotate(SrcWrapper src, DstWrapper dst, int2 dstSize, const doubl } template -void rotate(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, double 
*d_aCoeffs, - const double angleDeg, const double2 shift, cudaStream_t stream) +ErrorCode rotate(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, double *d_aCoeffs, + const double angleDeg, const double2 shift, cudaStream_t stream) { auto outAccess = TensorDataAccessStridedImagePlanar::Create(outData); NVCV_ASSERT(outAccess); + auto inAccess = TensorDataAccessStridedImagePlanar::Create(inData); + NVCV_ASSERT(inAccess); + const int2 dstSize{outAccess->numCols(), outAccess->numRows()}; const int batchSize{static_cast(outAccess->numSamples())}; @@ -78,40 +81,48 @@ void rotate(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &ou dim3 blockSize(BLOCK, BLOCK / 4, 1); dim3 gridSize(divUp(dstSize.x, blockSize.x), divUp(dstSize.y, blockSize.y), batchSize); - auto src = cuda::CreateInterpolationWrapNHW(inData); - auto dst = cuda::CreateTensorWrapNHW(outData); - - rotate<<>>(src, dst, dstSize, d_aCoeffs); + int64_t inMaxStride = inAccess->sampleStride() * inAccess->numSamples(); + int64_t outMaxStride = outAccess->sampleStride() * outAccess->numSamples(); + if (std::max(inMaxStride, outMaxStride) <= cuda::TypeTraits::max) + { + auto src = cuda::CreateInterpolationWrapNHW(inData); + auto dst = cuda::CreateTensorWrapNHW(outData); + rotate<<>>(src, dst, dstSize, d_aCoeffs); + } + else + { + LOG_ERROR("Input or output size exceeds " << cuda::TypeTraits::max << ". Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; + } checkKernelErrors(); #ifdef CUDA_DEBUG_LOG checkCudaErrors(cudaStreamSynchronize(stream)); checkCudaErrors(cudaGetLastError()); #endif + return ErrorCode::SUCCESS; } template // uchar3 float3 uchar1 float3 -void rotate(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, double *d_aCoeffs, - const double angleDeg, const double2 shift, const NVCVInterpolationType interpolation, cudaStream_t stream) +ErrorCode rotate(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, double *d_aCoeffs, + const double angleDeg, const double2 shift, const NVCVInterpolationType interpolation, + cudaStream_t stream) { switch (interpolation) { case NVCV_INTERP_NEAREST: - rotate(inData, outData, d_aCoeffs, angleDeg, shift, stream); - break; + return rotate(inData, outData, d_aCoeffs, angleDeg, shift, stream); case NVCV_INTERP_LINEAR: - rotate(inData, outData, d_aCoeffs, angleDeg, shift, stream); - break; + return rotate(inData, outData, d_aCoeffs, angleDeg, shift, stream); case NVCV_INTERP_CUBIC: - rotate(inData, outData, d_aCoeffs, angleDeg, shift, stream); - break; + return rotate(inData, outData, d_aCoeffs, angleDeg, shift, stream); default: LOG_ERROR("Invalid rotate interpolation " << interpolation); - break; + return ErrorCode::INVALID_PARAMETER; } } @@ -163,7 +174,7 @@ ErrorCode Rotate::infer(const TensorDataStridedCuda &inData, const TensorDataStr if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } @@ -194,9 +205,9 @@ ErrorCode Rotate::infer(const TensorDataStridedCuda &inData, const TensorDataStr return ErrorCode::INVALID_PARAMETER; } - typedef void (*func_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, double *d_aCoeffs, - const double angleDeg, const double2 shift, const NVCVInterpolationType interpolation, - cudaStream_t stream); + typedef ErrorCode (*func_t)(const TensorDataStridedCuda &inData, 
const TensorDataStridedCuda &outData, + double *d_aCoeffs, const double angleDeg, const double2 shift, + const NVCVInterpolationType interpolation, cudaStream_t stream); static const func_t funcs[6][4] = { { rotate, 0 /*rotate*/, rotate, rotate}, @@ -210,9 +221,7 @@ ErrorCode Rotate::infer(const TensorDataStridedCuda &inData, const TensorDataStr const func_t func = funcs[data_type][channels - 1]; NVCV_ASSERT(func != 0); - func(inData, outData, d_aCoeffs, angleDeg, shift, interpolation, stream); - - return SUCCESS; + return func(inData, outData, d_aCoeffs, angleDeg, shift, interpolation, stream); } } // namespace nvcv::legacy::cuda_op diff --git a/src/cvcuda/priv/legacy/rotate_var_shape.cu b/src/cvcuda/priv/legacy/rotate_var_shape.cu index 53b605c2b..3ac80828a 100644 --- a/src/cvcuda/priv/legacy/rotate_var_shape.cu +++ b/src/cvcuda/priv/legacy/rotate_var_shape.cu @@ -180,7 +180,7 @@ ErrorCode RotateVarShape::infer(const ImageBatchVarShapeDataStridedCuda &inData, if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } diff --git a/src/cvcuda/priv/legacy/threshold.cu b/src/cvcuda/priv/legacy/threshold.cu index fd6c85791..443fa6c26 100644 --- a/src/cvcuda/priv/legacy/threshold.cu +++ b/src/cvcuda/priv/legacy/threshold.cu @@ -31,10 +31,13 @@ using namespace nvcv::legacy::cuda_op; using namespace nvcv::cuda; -template -__global__ void Binary_overflow(Tensor4DWrap src, Tensor4DWrap dst, Tensor1DWrap _thresh, - Tensor1DWrap _maxval, int height, int width, int channel) +template +__global__ void Binary_overflow(SrcWrap src, DstWrap dst, Tensor1DWrap _thresh, + Tensor1DWrap _maxval, int height, int width, int channel) { + static_assert(std::is_same_v); + using T = typename SrcWrap::ValueType; + int cn = NumElements
<T>
; int globalid = blockIdx.x * blockDim.x + threadIdx.x; if (globalid * cn >= height * width * channel) @@ -80,10 +83,12 @@ __global__ void Binary_overflow(Tensor4DWrap src, Tensor4DWrap dst, Tensor *((P *)dst.ptr(batch, h, w, c)) = out; } -template -__global__ void Binary_Generic(Tensor4DWrap src, Tensor4DWrap dst, Tensor1DWrap _thresh, - Tensor1DWrap _maxval, int height, int width, int channel) +template +__global__ void Binary_Generic(SrcWrap src, DstWrap dst, Tensor1DWrap _thresh, + Tensor1DWrap _maxval, int height, int width, int channel) { + static_assert(std::is_same_v); + using T = typename SrcWrap::ValueType; int cn = NumElements
<T>
; int globalid = blockIdx.x * blockDim.x + threadIdx.x; if (globalid * cn >= height * width * channel) @@ -111,10 +116,13 @@ __global__ void Binary_Generic(Tensor4DWrap src, Tensor4DWrap dst, Tensor1 return; } -template -__global__ void BinaryInv_overflow(Tensor4DWrap src, Tensor4DWrap dst, Tensor1DWrap _thresh, - Tensor1DWrap _maxval, int height, int width, int channel) +template +__global__ void BinaryInv_overflow(SrcWrap src, DstWrap dst, Tensor1DWrap _thresh, + Tensor1DWrap _maxval, int height, int width, int channel) { + static_assert(std::is_same_v); + using T = typename SrcWrap::ValueType; + int cn = NumElements
<T>
; int globalid = blockIdx.x * blockDim.x + threadIdx.x; if (globalid * cn >= height * width * channel) @@ -159,10 +167,13 @@ __global__ void BinaryInv_overflow(Tensor4DWrap src, Tensor4DWrap dst, Ten *((P *)dst.ptr(batch, h, w, c)) = out; } -template -__global__ void BinaryInv_Generic(Tensor4DWrap src, Tensor4DWrap dst, Tensor1DWrap _thresh, - Tensor1DWrap _maxval, int height, int width, int channel) +template +__global__ void BinaryInv_Generic(SrcWrap src, DstWrap dst, Tensor1DWrap _thresh, + Tensor1DWrap _maxval, int height, int width, int channel) { + static_assert(std::is_same_v); + using T = typename SrcWrap::ValueType; + int cn = NumElements
<T>
; int globalid = blockIdx.x * blockDim.x + threadIdx.x; if (globalid * cn >= height * width * channel) @@ -190,10 +201,13 @@ __global__ void BinaryInv_Generic(Tensor4DWrap src, Tensor4DWrap dst, Tens return; } -template -__global__ void Trunc_overflow(Tensor4DWrap src, Tensor4DWrap dst, Tensor1DWrap _thresh, int height, - int width, int channel) +template +__global__ void Trunc_overflow(SrcWrap src, DstWrap dst, Tensor1DWrap _thresh, int height, int width, + int channel) { + static_assert(std::is_same_v); + using T = typename SrcWrap::ValueType; + int cn = NumElements
<T>
; int globalid = blockIdx.x * blockDim.x + threadIdx.x; if (globalid * cn >= height * width * channel) @@ -234,10 +248,13 @@ __global__ void Trunc_overflow(Tensor4DWrap src, Tensor4DWrap dst, Tensor1 *((P *)dst.ptr(batch, h, w, c)) = *((P *)src.ptr(batch, h, w, c)); } -template -__global__ void Trunc_Generic(Tensor4DWrap src, Tensor4DWrap dst, Tensor1DWrap _thresh, int height, - int width, int channel) +template +__global__ void Trunc_Generic(SrcWrap src, DstWrap dst, Tensor1DWrap _thresh, int height, int width, + int channel) { + static_assert(std::is_same_v); + using T = typename SrcWrap::ValueType; + int cn = NumElements
<T>
; int globalid = blockIdx.x * blockDim.x + threadIdx.x; if (globalid * cn >= height * width * channel) @@ -263,10 +280,13 @@ __global__ void Trunc_Generic(Tensor4DWrap src, Tensor4DWrap dst, Tensor1D return; } -template -__global__ void Tozero_overflow(Tensor4DWrap src, Tensor4DWrap dst, Tensor1DWrap _thresh, int height, - int width, int channel) +template +__global__ void Tozero_overflow(SrcWrap src, DstWrap dst, Tensor1DWrap _thresh, int height, int width, + int channel) { + static_assert(std::is_same_v); + using T = typename SrcWrap::ValueType; + int cn = NumElements
<T>
; int globalid = blockIdx.x * blockDim.x + threadIdx.x; if (globalid * cn >= height * width * channel) @@ -307,10 +327,13 @@ __global__ void Tozero_overflow(Tensor4DWrap src, Tensor4DWrap dst, Tensor *((P *)dst.ptr(batch, h, w, c)) = out; } -template -__global__ void Tozero_Generic(Tensor4DWrap src, Tensor4DWrap dst, Tensor1DWrap _thresh, int height, - int width, int channel) +template +__global__ void Tozero_Generic(SrcWrap src, DstWrap dst, Tensor1DWrap _thresh, int height, int width, + int channel) { + static_assert(std::is_same_v); + using T = typename SrcWrap::ValueType; + int cn = NumElements
<T>
; int globalid = blockIdx.x * blockDim.x + threadIdx.x; if (globalid * cn >= height * width * channel) @@ -337,10 +360,13 @@ __global__ void Tozero_Generic(Tensor4DWrap src, Tensor4DWrap dst, Tensor1 return; } -template -__global__ void TozeroInv_overflow(Tensor4DWrap src, Tensor4DWrap dst, Tensor1DWrap _thresh, int height, +template +__global__ void TozeroInv_overflow(SrcWrap src, DstWrap dst, Tensor1DWrap _thresh, int height, int width, int channel) { + static_assert(std::is_same_v); + using T = typename SrcWrap::ValueType; + int cn = NumElements
<T>
; int globalid = blockIdx.x * blockDim.x + threadIdx.x; if (globalid * cn >= height * width * channel) @@ -381,10 +407,13 @@ __global__ void TozeroInv_overflow(Tensor4DWrap src, Tensor4DWrap dst, Ten *((P *)dst.ptr(batch, h, w, c)) = *((P *)src.ptr(batch, h, w, c)); } -template -__global__ void TozeroInv_Generic(Tensor4DWrap src, Tensor4DWrap dst, Tensor1DWrap _thresh, int height, +template +__global__ void TozeroInv_Generic(SrcWrap src, DstWrap dst, Tensor1DWrap _thresh, int height, int width, int channel) { + static_assert(std::is_same_v); + using T = typename SrcWrap::ValueType; + int cn = NumElements
<T>
; int globalid = blockIdx.x * blockDim.x + threadIdx.x; if (globalid * cn >= height * width * channel) @@ -411,7 +440,7 @@ __global__ void TozeroInv_Generic(Tensor4DWrap src, Tensor4DWrap dst, Tens return; } -__global__ void hist_kernel(Tensor3DWrap img, int *histogram, int rows, int cols) +__global__ void hist_kernel(Tensor3DWrap img, int *histogram, int rows, int cols) { __shared__ int hist[256]; int localid = threadIdx.x; @@ -444,7 +473,7 @@ __global__ void hist_kernel(Tensor3DWrap img, int *histogram, int rows, i atomicAdd(&histogram[blockIdx.z * 256 + localid], val); } -__global__ void otsu_cal(int *histogram, Tensor1DWrap thresh, int size) +__global__ void otsu_cal(int *histogram, Tensor1DWrap thresh, int size) { int localid = threadIdx.y * blockDim.x + threadIdx.x; __shared__ int hist[256]; @@ -665,20 +694,30 @@ __global__ void otsu_cal(int *histogram, Tensor1DWrap thresh, int size) } template -void thresholdDispatch(const nvcv::TensorDataStridedCuda &input, const nvcv::TensorDataStridedCuda &output, - const nvcv::TensorDataStridedCuda &_thresh, const nvcv::TensorDataStridedCuda &_maxval, - int batch, int rows, int cols, int channel, NVCVThresholdType type, DataType data_type, - cudaStream_t stream) +ErrorCode thresholdDispatch(const nvcv::TensorDataStridedCuda &input, const nvcv::TensorDataStridedCuda &output, + const nvcv::TensorDataStridedCuda &_thresh, const nvcv::TensorDataStridedCuda &_maxval, + int batch, int rows, int cols, int channel, NVCVThresholdType type, DataType data_type, + cudaStream_t stream) { - int size = rows * cols * channel; - Tensor1DWrap thresh(_thresh); - Tensor1DWrap maxval(_maxval); + int size = rows * cols * channel; + Tensor1DWrap thresh(_thresh); + Tensor1DWrap maxval(_maxval); - using vectype = nvcv::cuda::MakeType; - auto src_ptr = CreateTensorWrapNHWC(input); - auto dst_ptr = CreateTensorWrapNHWC(output); auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(input); auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(output); + + auto outMaxStride = outAccess->sampleStride() * outAccess->numSamples(); + auto inMaxStride = inAccess->sampleStride() * inAccess->numSamples(); + if (std::max(outMaxStride, inMaxStride) > TypeTraits::max) + { + LOG_ERROR("Input or output size exceeds " << TypeTraits::max << ". 
Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; + } + + using vectype = nvcv::cuda::MakeType; + using StrideType = int32_t; + auto src_ptr = CreateTensorWrapNHWC(input); + auto dst_ptr = CreateTensorWrapNHWC(output); dim3 block(256); dim3 grid(divUp(size, block.x * N), 1, batch); @@ -686,63 +725,65 @@ void thresholdDispatch(const nvcv::TensorDataStridedCuda &input, const nvcv::Ten { case NVCV_THRESH_BINARY: if (data_type == kCV_32F || data_type == kCV_64F) - Binary_Generic - <<>>(src_ptr, dst_ptr, thresh, maxval, rows, cols, channel); + Binary_Generic<<>>(src_ptr, dst_ptr, thresh, maxval, rows, cols, channel); else - Binary_overflow - <<>>(src_ptr, dst_ptr, thresh, maxval, rows, cols, channel); + Binary_overflow<<>>(src_ptr, dst_ptr, thresh, maxval, rows, cols, channel); break; case NVCV_THRESH_BINARY_INV: if (data_type == kCV_32F || data_type == kCV_64F) - BinaryInv_Generic + BinaryInv_Generic <<>>(src_ptr, dst_ptr, thresh, maxval, rows, cols, channel); else - BinaryInv_overflow + BinaryInv_overflow <<>>(src_ptr, dst_ptr, thresh, maxval, rows, cols, channel); break; case NVCV_THRESH_TRUNC: if (data_type == kCV_32F || data_type == kCV_64F) - Trunc_Generic<<>>(src_ptr, dst_ptr, thresh, rows, cols, channel); + Trunc_Generic<<>>(src_ptr, dst_ptr, thresh, rows, cols, channel); else - Trunc_overflow<<>>(src_ptr, dst_ptr, thresh, rows, cols, channel); + Trunc_overflow<<>>(src_ptr, dst_ptr, thresh, rows, cols, channel); break; case NVCV_THRESH_TOZERO: if (data_type == kCV_32F || data_type == kCV_64F) - Tozero_Generic<<>>(src_ptr, dst_ptr, thresh, rows, cols, channel); + Tozero_Generic<<>>(src_ptr, dst_ptr, thresh, rows, cols, channel); else - Tozero_overflow<<>>(src_ptr, dst_ptr, thresh, rows, cols, channel); + Tozero_overflow<<>>(src_ptr, dst_ptr, thresh, rows, cols, channel); break; default: //NVCV_THRESH_TOZERO_INV if (data_type == kCV_32F || data_type == kCV_64F) - TozeroInv_Generic<<>>(src_ptr, dst_ptr, thresh, rows, cols, channel); + TozeroInv_Generic<<>>(src_ptr, dst_ptr, thresh, rows, cols, channel); else - TozeroInv_overflow<<>>(src_ptr, dst_ptr, thresh, rows, cols, channel); + TozeroInv_overflow<<>>(src_ptr, dst_ptr, thresh, rows, cols, channel); break; } checkKernelErrors(); + return ErrorCode::SUCCESS; } template -void thresholdScale(const nvcv::TensorDataStridedCuda &input, const nvcv::TensorDataStridedCuda &output, - const nvcv::TensorDataStridedCuda &threshold, const nvcv::TensorDataStridedCuda &maxval, int batch, - int rows, int cols, int channel, NVCVThresholdType type, DataType data_type, cudaStream_t stream) +ErrorCode thresholdScale(const nvcv::TensorDataStridedCuda &input, const nvcv::TensorDataStridedCuda &output, + const nvcv::TensorDataStridedCuda &threshold, const nvcv::TensorDataStridedCuda &maxval, + int batch, int rows, int cols, int channel, NVCVThresholdType type, DataType data_type, + cudaStream_t stream) { int stride = cols * channel; if (stride % 4 == 0) { if (std::is_same::value) - thresholdDispatch(input, output, threshold, maxval, batch, rows, cols, channel, type, data_type, - stream); + return thresholdDispatch(input, output, threshold, maxval, batch, rows, cols, channel, type, + data_type, stream); else - thresholdDispatch(input, output, threshold, maxval, batch, rows, cols, channel, type, data_type, - stream); + return thresholdDispatch(input, output, threshold, maxval, batch, rows, cols, channel, type, + data_type, stream); } else if (stride % 2 == 0) - thresholdDispatch(input, output, threshold, maxval, batch, rows, cols, channel, type, 
data_type, stream); + return thresholdDispatch(input, output, threshold, maxval, batch, rows, cols, channel, type, data_type, + stream); else - thresholdDispatch(input, output, threshold, maxval, batch, rows, cols, channel, type, data_type, stream); + return thresholdDispatch(input, output, threshold, maxval, batch, rows, cols, channel, type, data_type, + stream); } static void getThreshVal_Triangle(const nvcv::TensorDataStridedCuda &inData, @@ -751,8 +792,8 @@ static void getThreshVal_Triangle(const nvcv::TensorDataStridedCuda &inData, { checkCudaErrors(cudaMemsetAsync(histogram, 0, sizeof(int) * 256 * batch, stream)); - auto wrap = CreateTensorWrapNHW(inData); - Tensor1DWrap thresh(threshold); + auto wrap = CreateTensorWrapNHW(inData); + Tensor1DWrap thresh(threshold); dim3 block(256); int td = divUp(cols, 16) * rows; @@ -770,8 +811,8 @@ static void getThreshVal_Otsu(const nvcv::TensorDataStridedCuda &inData, const n int size = rows * cols; checkCudaErrors(cudaMemsetAsync(histogram, 0, sizeof(int) * 256 * batch, stream)); - auto wrap = CreateTensorWrapNHW(inData); - Tensor1DWrap thresh(threshold); + auto wrap = CreateTensorWrapNHW(inData); + Tensor1DWrap thresh(threshold); dim3 block(256); int td = divUp(cols, 16) * rows; @@ -891,6 +932,11 @@ ErrorCode Threshold::infer(const TensorDataStridedCuda &inData, const TensorData LOG_ERROR("Only support 1 channel"); return ErrorCode::INVALID_DATA_FORMAT; } + if (inAccess->sampleStride() * inAccess->numSamples() > TypeTraits::max) + { + LOG_ERROR("Input size exceeds " << TypeTraits::max << ". Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; + } getThreshVal_Otsu(inData, thresh, m_histogram, inAccess->numRows(), inAccess->numCols(), inAccess->numSamples(), stream); } @@ -906,24 +952,27 @@ ErrorCode Threshold::infer(const TensorDataStridedCuda &inData, const TensorData LOG_ERROR("Only support 1 channel"); return ErrorCode::INVALID_DATA_FORMAT; } + if (inAccess->sampleStride() * inAccess->numSamples() > TypeTraits::max) + { + LOG_ERROR("Input size exceeds " << TypeTraits::max << ". 
Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; + } getThreshVal_Triangle(inData, thresh, m_histogram, inAccess->numRows(), inAccess->numCols(), inAccess->numSamples(), stream); } - typedef void (*threshold_t)(const TensorDataStridedCuda &input, const TensorDataStridedCuda &output, - const TensorDataStridedCuda &threshold, const TensorDataStridedCuda &maxval, int batch, - int rows, int cols, int channel, NVCVThresholdType type, DataType data_type, - cudaStream_t stream); + typedef ErrorCode (*threshold_t)(const TensorDataStridedCuda &input, const TensorDataStridedCuda &output, + const TensorDataStridedCuda &threshold, const TensorDataStridedCuda &maxval, + int batch, int rows, int cols, int channel, NVCVThresholdType type, + DataType data_type, cudaStream_t stream); static const threshold_t funcs[7] = {thresholdScale, 0, thresholdScale, thresholdScale, 0, thresholdScale, thresholdScale}; threshold_t func = funcs[in_data_type]; NVCVThresholdType th_type = NVCVThresholdType(m_type); - func(inData, outData, thresh, maxval, inAccess->numSamples(), inAccess->numRows(), inAccess->numCols(), - inAccess->numChannels(), th_type, in_data_type, stream); - - return SUCCESS; + return func(inData, outData, thresh, maxval, inAccess->numSamples(), inAccess->numRows(), inAccess->numCols(), + inAccess->numChannels(), th_type, in_data_type, stream); } } // namespace nvcv::legacy::cuda_op diff --git a/src/cvcuda/priv/legacy/threshold_util.cu b/src/cvcuda/priv/legacy/threshold_util.cu index 77a55a415..6956d124c 100644 --- a/src/cvcuda/priv/legacy/threshold_util.cu +++ b/src/cvcuda/priv/legacy/threshold_util.cu @@ -21,7 +21,7 @@ #include "CvCudaUtils.cuh" #include "cub/cub.cuh" -__global__ void triangle_cal(int *histogram, nvcv::cuda::Tensor1DWrap thresh) +__global__ void triangle_cal(int *histogram, nvcv::cuda::Tensor1DWrap thresh) { int localid = threadIdx.y * blockDim.x + threadIdx.x; __shared__ int hist[256]; diff --git a/src/cvcuda/priv/legacy/threshold_util.cuh b/src/cvcuda/priv/legacy/threshold_util.cuh index 6da4963b8..374be7bf4 100644 --- a/src/cvcuda/priv/legacy/threshold_util.cuh +++ b/src/cvcuda/priv/legacy/threshold_util.cuh @@ -24,6 +24,6 @@ #include "CvCudaUtils.cuh" #include "cub/cub.cuh" -__global__ void triangle_cal(int *histogram, nvcv::cuda::Tensor1DWrap thresh); +__global__ void triangle_cal(int *histogram, nvcv::cuda::Tensor1DWrap thresh); #endif // THRESHOLD_UTILS_CUH diff --git a/src/cvcuda/priv/legacy/threshold_var_shape.cu b/src/cvcuda/priv/legacy/threshold_var_shape.cu index deaf70db3..737278876 100644 --- a/src/cvcuda/priv/legacy/threshold_var_shape.cu +++ b/src/cvcuda/priv/legacy/threshold_var_shape.cu @@ -33,7 +33,8 @@ using namespace nvcv::cuda; template> __global__ void Binary_overflow(ImageBatchVarShapeWrapNHWC src, ImageBatchVarShapeWrapNHWC dst, - Tensor1DWrap _thresh, Tensor1DWrap _maxval, int channel) + Tensor1DWrap _thresh, Tensor1DWrap _maxval, + int channel) { int cn = NumElements
; int globalid = blockIdx.x * blockDim.x + threadIdx.x; @@ -112,7 +113,8 @@ __global__ void Binary_overflow(ImageBatchVarShapeWrapNHWC src, ImageBatchVar template> __global__ void Binary_Generic(ImageBatchVarShapeWrapNHWC src, ImageBatchVarShapeWrapNHWC dst, - Tensor1DWrap _thresh, Tensor1DWrap _maxval, int channel) + Tensor1DWrap _thresh, Tensor1DWrap _maxval, + int channel) { int cn = NumElements
; int globalid = blockIdx.x * blockDim.x + threadIdx.x; @@ -158,7 +160,8 @@ __global__ void Binary_Generic(ImageBatchVarShapeWrapNHWC src, ImageBatchVarS template> __global__ void BinaryInv_overflow(ImageBatchVarShapeWrapNHWC src, ImageBatchVarShapeWrapNHWC dst, - Tensor1DWrap _thresh, Tensor1DWrap _maxval, int channel) + Tensor1DWrap _thresh, Tensor1DWrap _maxval, + int channel) { int cn = NumElements
; int globalid = blockIdx.x * blockDim.x + threadIdx.x; @@ -234,7 +237,8 @@ __global__ void BinaryInv_overflow(ImageBatchVarShapeWrapNHWC src, ImageBatch template> __global__ void BinaryInv_Generic(ImageBatchVarShapeWrapNHWC src, ImageBatchVarShapeWrapNHWC dst, - Tensor1DWrap _thresh, Tensor1DWrap _maxval, int channel) + Tensor1DWrap _thresh, Tensor1DWrap _maxval, + int channel) { int cn = NumElements
; int globalid = blockIdx.x * blockDim.x + threadIdx.x; @@ -280,7 +284,7 @@ __global__ void BinaryInv_Generic(ImageBatchVarShapeWrapNHWC src, ImageBatchV template> __global__ void Trunc_overflow(ImageBatchVarShapeWrapNHWC src, ImageBatchVarShapeWrapNHWC dst, - Tensor1DWrap _thresh, int channel) + Tensor1DWrap _thresh, int channel) { int cn = NumElements
; int globalid = blockIdx.x * blockDim.x + threadIdx.x; @@ -355,7 +359,7 @@ __global__ void Trunc_overflow(ImageBatchVarShapeWrapNHWC src, ImageBatchVarS template> __global__ void Trunc_Generic(ImageBatchVarShapeWrapNHWC src, ImageBatchVarShapeWrapNHWC dst, - Tensor1DWrap _thresh, int channel) + Tensor1DWrap _thresh, int channel) { int cn = NumElements
; int globalid = blockIdx.x * blockDim.x + threadIdx.x; @@ -400,7 +404,7 @@ __global__ void Trunc_Generic(ImageBatchVarShapeWrapNHWC src, ImageBatchVarSh template> __global__ void Tozero_overflow(ImageBatchVarShapeWrapNHWC src, ImageBatchVarShapeWrapNHWC dst, - Tensor1DWrap _thresh, int channel) + Tensor1DWrap _thresh, int channel) { int cn = NumElements
; int globalid = blockIdx.x * blockDim.x + threadIdx.x; @@ -475,7 +479,7 @@ __global__ void Tozero_overflow(ImageBatchVarShapeWrapNHWC src, ImageBatchVar template> __global__ void Tozero_Generic(ImageBatchVarShapeWrapNHWC src, ImageBatchVarShapeWrapNHWC dst, - Tensor1DWrap _thresh, int channel) + Tensor1DWrap _thresh, int channel) { int cn = NumElements
; int globalid = blockIdx.x * blockDim.x + threadIdx.x; @@ -520,7 +524,7 @@ __global__ void Tozero_Generic(ImageBatchVarShapeWrapNHWC src, ImageBatchVarS template> __global__ void TozeroInv_overflow(ImageBatchVarShapeWrapNHWC src, ImageBatchVarShapeWrapNHWC dst, - Tensor1DWrap _thresh, int channel) + Tensor1DWrap _thresh, int channel) { int cn = NumElements
; int globalid = blockIdx.x * blockDim.x + threadIdx.x; @@ -595,7 +599,7 @@ __global__ void TozeroInv_overflow(ImageBatchVarShapeWrapNHWC src, ImageBatch template> __global__ void TozeroInv_Generic(ImageBatchVarShapeWrapNHWC src, ImageBatchVarShapeWrapNHWC dst, - Tensor1DWrap _thresh, int channel) + Tensor1DWrap _thresh, int channel) { int cn = NumElements
; int globalid = blockIdx.x * blockDim.x + threadIdx.x; @@ -673,7 +677,8 @@ __global__ void hist_kernel(ImageBatchVarShapeWrapNHWC img, int *histogra atomicAdd(&histogram[blockIdx.z * 256 + localid], val); } -__global__ void otsu_cal_varshape(int *histogram, Tensor1DWrap thresh, ImageBatchVarShapeWrapNHWC img) +__global__ void otsu_cal_varshape(int *histogram, Tensor1DWrap thresh, + ImageBatchVarShapeWrapNHWC img) { int localid = threadIdx.y * blockDim.x + threadIdx.x; int size = img.width((int)blockIdx.z) * img.height((int)blockIdx.z); @@ -900,8 +905,8 @@ void thresholdDispatch(const nvcv::ImageBatchVarShapeDataStridedCuda &input, const nvcv::TensorDataStridedCuda &_thresh, const nvcv::TensorDataStridedCuda &_maxval, NVCVThresholdType type, DataType data_type, cudaStream_t stream) { - Tensor1DWrap thresh(_thresh); - Tensor1DWrap maxval(_maxval); + Tensor1DWrap thresh(_thresh); + Tensor1DWrap maxval(_maxval); nvcv::Size2D maxsize = input.maxSize(); int batch = input.numImages(); @@ -959,7 +964,7 @@ static void getThreshVal_Triangle(const nvcv::ImageBatchVarShapeDataStridedCuda checkCudaErrors(cudaMemsetAsync(histogram, 0, sizeof(int) * 256 * batch, stream)); ImageBatchVarShapeWrapNHWC wrap(inData, inData.uniqueFormat().numChannels()); - Tensor1DWrap thresh(threshold); + Tensor1DWrap thresh(threshold); nvcv::Size2D maxsize = inData.maxSize(); dim3 block(256); @@ -979,7 +984,7 @@ static void getThreshVal_Otsu(const nvcv::ImageBatchVarShapeDataStridedCuda &inD checkCudaErrors(cudaMemsetAsync(histogram, 0, sizeof(int) * 256 * batch, stream)); ImageBatchVarShapeWrapNHWC wrap(inData, inData.uniqueFormat().numChannels()); - Tensor1DWrap thresh(threshold); + Tensor1DWrap thresh(threshold); nvcv::Size2D maxsize = inData.maxSize(); dim3 block(256); diff --git a/src/cvcuda/priv/legacy/warp.cu b/src/cvcuda/priv/legacy/warp.cu index e1dbc8a17..2c89e2c2b 100644 --- a/src/cvcuda/priv/legacy/warp.cu +++ b/src/cvcuda/priv/legacy/warp.cu @@ -51,12 +51,15 @@ __global__ void warp(SrcWrapper src, DstWrapper dst, int2 dstSize, Transform tra template struct WarpDispatcher { - static void call(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, Transform transform, - const float4 &borderValue, cudaStream_t stream) + static ErrorCode call(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + Transform transform, const float4 &borderValue, cudaStream_t stream) { auto outAccess = TensorDataAccessStridedImagePlanar::Create(outData); NVCV_ASSERT(outAccess); + auto inAccess = TensorDataAccessStridedImagePlanar::Create(outData); + NVCV_ASSERT(inAccess); + const int2 dstSize{outAccess->numCols(), outAccess->numRows()}; const int batchSize{static_cast(outAccess->numSamples())}; @@ -65,22 +68,34 @@ struct WarpDispatcher auto bVal = cuda::StaticCast>(cuda::DropCast>(borderValue)); - auto src = cuda::CreateInterpolationWrapNHW(inData, bVal); - auto dst = cuda::CreateTensorWrapNHW(outData); - int smem_size = 9 * sizeof(float); - warp<<>>(src, dst, dstSize, transform); + int64_t srcMaxStride = inAccess->sampleStride() * batchSize; + int64_t dstMaxStride = outAccess->sampleStride() * batchSize; + + if (std::max(srcMaxStride, dstMaxStride) <= cuda::TypeTraits::max) + { + auto src = cuda::CreateInterpolationWrapNHW(inData, bVal); + auto dst = cuda::CreateTensorWrapNHW(outData); + + warp<<>>(src, dst, dstSize, transform); + } + else + { + LOG_ERROR("Input or output size exceeds " << cuda::TypeTraits::max << ". 
Tensor is too large."); + return ErrorCode::INVALID_PARAMETER; + } checkKernelErrors(); + return ErrorCode::SUCCESS; } }; template -void warp_caller(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, Transform transform, - int interpolation, int borderMode, const float4 &borderValue, cudaStream_t stream) +ErrorCode warp_caller(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, Transform transform, + int interpolation, int borderMode, const float4 &borderValue, cudaStream_t stream) { - typedef void (*func_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, - Transform transform, const float4 &borderValue, cudaStream_t stream); + typedef ErrorCode (*func_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + Transform transform, const float4 &borderValue, cudaStream_t stream); static const func_t funcs[3][5] = { {WarpDispatcher::call, @@ -100,23 +115,25 @@ void warp_caller(const TensorDataStridedCuda &inData, const TensorDataStridedCud WarpDispatcher::call}, }; - funcs[interpolation][borderMode](inData, outData, transform, borderValue, stream); + return funcs[interpolation][borderMode](inData, outData, transform, borderValue, stream); } template -void warpAffine(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, - WarpAffineTransform transform, const int interpolation, int borderMode, const float4 &borderValue, - cudaStream_t stream) +ErrorCode warpAffine(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + WarpAffineTransform transform, const int interpolation, int borderMode, const float4 &borderValue, + cudaStream_t stream) { - warp_caller(inData, outData, transform, interpolation, borderMode, borderValue, stream); + return warp_caller(inData, outData, transform, interpolation, borderMode, borderValue, + stream); } template -void warpPerspective(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, - PerspectiveTransform transform, const int interpolation, int borderMode, const float4 &borderValue, - cudaStream_t stream) +ErrorCode warpPerspective(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + PerspectiveTransform transform, const int interpolation, int borderMode, + const float4 &borderValue, cudaStream_t stream) { - warp_caller(inData, outData, transform, interpolation, borderMode, borderValue, stream); + return warp_caller(inData, outData, transform, interpolation, borderMode, borderValue, + stream); } static void invertMat(const float *M, float *h_aCoeffs) @@ -149,7 +166,7 @@ ErrorCode WarpAffine::infer(const TensorDataStridedCuda &inData, const TensorDat if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } @@ -181,9 +198,9 @@ ErrorCode WarpAffine::infer(const TensorDataStridedCuda &inData, const TensorDat || borderMode == NVCV_BORDER_CONSTANT || borderMode == NVCV_BORDER_REFLECT || borderMode == NVCV_BORDER_WRAP); - typedef void (*func_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, - WarpAffineTransform transform, const int interpolation, int borderMode, - const float4 &borderValue, cudaStream_t stream); + typedef ErrorCode (*func_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + WarpAffineTransform transform, const int interpolation, int borderMode, 
+ const float4 &borderValue, cudaStream_t stream); static const func_t funcs[6][4] = { { warpAffine, 0, warpAffine, warpAffine}, @@ -211,9 +228,7 @@ ErrorCode WarpAffine::infer(const TensorDataStridedCuda &inData, const TensorDat invertMat(xform, transform.xform); } - func(inData, outData, transform, interpolation, borderMode, borderValue, stream); - - return ErrorCode::SUCCESS; + return func(inData, outData, transform, interpolation, borderMode, borderValue, stream); } ErrorCode WarpPerspective::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, @@ -233,7 +248,7 @@ ErrorCode WarpPerspective::infer(const TensorDataStridedCuda &inData, const Tens if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } @@ -265,9 +280,9 @@ ErrorCode WarpPerspective::infer(const TensorDataStridedCuda &inData, const Tens || borderMode == NVCV_BORDER_CONSTANT || borderMode == NVCV_BORDER_REFLECT || borderMode == NVCV_BORDER_WRAP); - typedef void (*func_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, - PerspectiveTransform transform, const int interpolation, int borderMode, - const float4 &borderValue, cudaStream_t stream); + typedef ErrorCode (*func_t)(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + PerspectiveTransform transform, const int interpolation, int borderMode, + const float4 &borderValue, cudaStream_t stream); static const func_t funcs[6][4] = { { warpPerspective, 0 /*warpPerspective*/, warpPerspective,warpPerspective }, @@ -296,9 +311,7 @@ ErrorCode WarpPerspective::infer(const TensorDataStridedCuda &inData, const Tens tempMatrixForInverse.store(transform.xform); } - func(inData, outData, transform, interpolation, borderMode, borderValue, stream); - - return ErrorCode::SUCCESS; + return func(inData, outData, transform, interpolation, borderMode, borderValue, stream); } } // namespace nvcv::legacy::cuda_op diff --git a/src/cvcuda/priv/legacy/warp_var_shape.cu b/src/cvcuda/priv/legacy/warp_var_shape.cu index e99dd18c3..58ecb0a26 100644 --- a/src/cvcuda/priv/legacy/warp_var_shape.cu +++ b/src/cvcuda/priv/legacy/warp_var_shape.cu @@ -30,8 +30,8 @@ namespace nvcv::legacy::cuda_op { -__global__ void inverseMatWarpPerspective(const int numImages, const cuda::Tensor2DWrap in, - cuda::Tensor2DWrap out) +__global__ void inverseMatWarpPerspective(const int numImages, const cuda::Tensor2DWrap in, + cuda::Tensor2DWrap out) { int index = threadIdx.x + blockIdx.x * blockDim.x; if (index >= numImages) @@ -48,8 +48,8 @@ __global__ void inverseMatWarpPerspective(const int numImages, const cuda::Tenso transMatrix.store(out.ptr(index)); } -__global__ void inverseMatWarpAffine(const int numImages, const cuda::Tensor2DWrap in, - cuda::Tensor2DWrap out) +__global__ void inverseMatWarpAffine(const int numImages, const cuda::Tensor2DWrap in, + cuda::Tensor2DWrap out) { int index = threadIdx.x + blockIdx.x * blockDim.x; if (index >= numImages) @@ -77,7 +77,7 @@ __global__ void inverseMatWarpAffine(const int numImages, const cuda::Tensor2DWr } template -__global__ void warp(SrcWrapper src, DstWrapper dst, const cuda::Tensor2DWrap coeffs) +__global__ void warp(SrcWrapper src, DstWrapper dst, const cuda::Tensor2DWrap coeffs) { int3 dstCoord = cuda::StaticCast(blockDim * blockIdx + threadIdx); const int lid = threadIdx.y * blockDim.x + threadIdx.x; @@ -104,7 +104,7 
@@ template struct WarpDispatcher { static void call(const ImageBatchVarShapeDataStridedCuda &inData, const ImageBatchVarShapeDataStridedCuda &outData, - const cuda::Tensor2DWrap transform, const float4 &borderValue, cudaStream_t stream) + const cuda::Tensor2DWrap transform, const float4 &borderValue, cudaStream_t stream) { Size2D outMaxSize = outData.maxSize(); @@ -125,12 +125,12 @@ struct WarpDispatcher template void warp_caller(const ImageBatchVarShapeDataStridedCuda &inData, const ImageBatchVarShapeDataStridedCuda &outData, - cuda::Tensor2DWrap transform, const int interpolation, const int borderMode, + cuda::Tensor2DWrap transform, const int interpolation, const int borderMode, const float4 &borderValue, cudaStream_t stream) { - typedef void (*func_t)(const ImageBatchVarShapeDataStridedCuda &inData, - const ImageBatchVarShapeDataStridedCuda &outData, cuda::Tensor2DWrap transform, - const float4 &borderValue, cudaStream_t stream); + typedef void (*func_t)( + const ImageBatchVarShapeDataStridedCuda &inData, const ImageBatchVarShapeDataStridedCuda &outData, + cuda::Tensor2DWrap transform, const float4 &borderValue, cudaStream_t stream); static const func_t funcs[3][5] = { {WarpDispatcher::call, @@ -155,7 +155,7 @@ void warp_caller(const ImageBatchVarShapeDataStridedCuda &inData, const ImageBat template void warpAffine(const ImageBatchVarShapeDataStridedCuda &inData, const ImageBatchVarShapeDataStridedCuda &outData, - cuda::Tensor2DWrap transform, const int interpolation, const int borderMode, + cuda::Tensor2DWrap transform, const int interpolation, const int borderMode, const float4 &borderValue, cudaStream_t stream) { warp_caller(inData, outData, transform, interpolation, borderMode, borderValue, stream); @@ -163,7 +163,7 @@ void warpAffine(const ImageBatchVarShapeDataStridedCuda &inData, const ImageBatc template void warpPerspective(const ImageBatchVarShapeDataStridedCuda &inData, const ImageBatchVarShapeDataStridedCuda &outData, - cuda::Tensor2DWrap transform, const int interpolation, const int borderMode, + cuda::Tensor2DWrap transform, const int interpolation, const int borderMode, const float4 &borderValue, cudaStream_t stream) { warp_caller(inData, outData, transform, interpolation, borderMode, borderValue, stream); @@ -222,7 +222,7 @@ ErrorCode WarpAffineVarShape::infer(const ImageBatchVarShapeDataStridedCuda &inD if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } @@ -261,8 +261,8 @@ ErrorCode WarpAffineVarShape::infer(const ImageBatchVarShapeDataStridedCuda &inD bool performInverse = !(flags & NVCV_WARP_INVERSE_MAP); // Wrap the matrix in 2D wrappers with proper pitch - cuda::Tensor2DWrap transMatrixInput(transMatrix); - cuda::Tensor2DWrap transMatrixOutput(m_transformationMatrix, static_cast(sizeof(float) * 9)); + cuda::Tensor2DWrap transMatrixInput(transMatrix); + cuda::Tensor2DWrap transMatrixOutput(m_transformationMatrix, static_cast(sizeof(float) * 9)); if (performInverse) { @@ -278,9 +278,9 @@ ErrorCode WarpAffineVarShape::infer(const ImageBatchVarShapeDataStridedCuda &inD } typedef void (*func_t)(const ImageBatchVarShapeDataStridedCuda &inData, - const ImageBatchVarShapeDataStridedCuda &outData, const cuda::Tensor2DWrap transform, - const int interpolation, const int borderMode, const float4 &borderValue, - cudaStream_t stream); + const ImageBatchVarShapeDataStridedCuda &outData, + const 
cuda::Tensor2DWrap transform, const int interpolation, + const int borderMode, const float4 &borderValue, cudaStream_t stream); static const func_t funcs[6][4] = { { warpAffine, 0 /*warpAffine*/, warpAffine, warpAffine}, @@ -349,7 +349,7 @@ ErrorCode WarpPerspectiveVarShape::infer(const ImageBatchVarShapeDataStridedCuda if (!(format == kNHWC || format == kHWC)) { - LOG_ERROR("Invalid DataFormat " << format); + LOG_ERROR("Invalid input DataFormat " << format << ", the valid DataFormats are: \"NHWC\", \"HWC\""); return ErrorCode::INVALID_DATA_FORMAT; } @@ -388,8 +388,8 @@ ErrorCode WarpPerspectiveVarShape::infer(const ImageBatchVarShapeDataStridedCuda bool performInverse = flags & NVCV_WARP_INVERSE_MAP; // Wrap the matrix in 2D wrappers with proper pitch - cuda::Tensor2DWrap transMatrixInput(transMatrix); - cuda::Tensor2DWrap transMatrixOutput(m_transformationMatrix, static_cast(sizeof(float) * 9)); + cuda::Tensor2DWrap transMatrixInput(transMatrix); + cuda::Tensor2DWrap transMatrixOutput(m_transformationMatrix, static_cast(sizeof(float) * 9)); if (!performInverse) { @@ -405,9 +405,9 @@ ErrorCode WarpPerspectiveVarShape::infer(const ImageBatchVarShapeDataStridedCuda } typedef void (*func_t)(const ImageBatchVarShapeDataStridedCuda &inData, - const ImageBatchVarShapeDataStridedCuda &outData, cuda::Tensor2DWrap transform, - const int interpolation, const int borderMode, const float4 &borderValue, - cudaStream_t stream); + const ImageBatchVarShapeDataStridedCuda &outData, + cuda::Tensor2DWrap transform, const int interpolation, const int borderMode, + const float4 &borderValue, cudaStream_t stream); static const func_t funcs[6][4] = { { warpPerspective, 0 /*warpPerspective*/, warpPerspective,warpPerspective }, diff --git a/src/nvcv_types/include/nvcv/cuda/BorderWrap.hpp b/src/nvcv_types/include/nvcv/cuda/BorderWrap.hpp index eb0009f4a..ce2a02820 100644 --- a/src/nvcv_types/include/nvcv/cuda/BorderWrap.hpp +++ b/src/nvcv_types/include/nvcv/cuda/BorderWrap.hpp @@ -136,6 +136,7 @@ class BorderWrapImpl public: using TensorWrapper = TW; using ValueType = typename TensorWrapper::ValueType; + using StrideType = typename TensorWrapper::StrideType; static constexpr int kNumDimensions = TensorWrapper::kNumDimensions; static constexpr NVCVBorderType kBorderType = B; @@ -170,7 +171,7 @@ class BorderWrapImpl template explicit __host__ __device__ BorderWrapImpl(TensorWrapper tensorWrap, Args... 
tensorShape) : m_tensorWrap(tensorWrap) - , m_tensorShape{std::forward(tensorShape)...} + , m_tensorShape{std::forward(tensorShape)...} { if constexpr (sizeof...(Args) == 0) { @@ -187,7 +188,7 @@ class BorderWrapImpl } else { - static_assert(std::conjunction_v...>); + static_assert((IsIndexType && ...)); static_assert(sizeof...(Args) == kNumActiveDimensions); } } @@ -215,7 +216,7 @@ class BorderWrapImpl return m_tensorWrap; } - inline __host__ __device__ const int *tensorShape() const + inline const __host__ __device__ StrideType *tensorShape() const { return m_tensorShape; } @@ -227,7 +228,7 @@ class BorderWrapImpl protected: const TensorWrapper m_tensorWrap = {}; - int m_tensorShape[kNumActiveDimensions] = {0}; + StrideType m_tensorShape[kNumActiveDimensions] = {0}; }; } // namespace detail @@ -270,6 +271,7 @@ class BorderWrap : public detail::BorderWrapImpl using Base = detail::BorderWrapImpl; public: + using typename Base::StrideType; using typename Base::TensorWrapper; using typename Base::ValueType; @@ -372,8 +374,8 @@ class BorderWrap : public detail::BorderWrapImpl template inline __host__ __device__ ValueType *doGetPtr(std::index_sequence, Args... c) const { - return Base::m_tensorWrap.ptr( - GetIndexWithBorder(c, Base::m_tensorShape[kMap.from[Is]])...); + return Base::m_tensorWrap.ptr(GetIndexWithBorder( + static_cast(c), Base::m_tensorShape[kMap.from[Is]])...); } }; @@ -390,6 +392,7 @@ class BorderWrap using Base = detail::BorderWrapImpl; public: + using typename Base::StrideType; using typename Base::TensorWrapper; using typename Base::ValueType; @@ -513,7 +516,7 @@ class BorderWrap template inline __host__ __device__ ValueType *doGetPtr(std::index_sequence, Args... c) const { - if ((IsOutside(c, Base::m_tensorShape[kMap.from[Is]]) || ...)) + if ((IsOutside(static_cast(c), Base::m_tensorShape[kMap.from[Is]]) || ...)) { return nullptr; } @@ -537,24 +540,26 @@ class BorderWrap * * @tparam T Type of the values to be accessed in the border wrap. * @tparam B Border extension to be used when accessing H and W, one of \ref NVCVBorderType + * @tparam StrideType Type of the strdies used in the underlying TensorWrap. * * @param[in] tensor Reference to the tensor that will be wrapped. * @param[in] borderValue Border value to be used when accessing outside elements in constant border type * * @return Border wrap useful to access tensor data border aware in H and W in CUDA kernels. */ -template>> +template>> __host__ auto CreateBorderWrapNHW(const TensorDataStridedCuda &tensor, T borderValue = {}) { auto tensorAccess = TensorDataAccessStridedImagePlanar::Create(tensor); assert(tensorAccess); - assert(tensorAccess->numRows() <= TypeTraits::max); - assert(tensorAccess->numCols() <= TypeTraits::max); + assert(tensorAccess->numRows() <= TypeTraits::max); + assert(tensorAccess->numCols() <= TypeTraits::max); - auto tensorWrap = CreateTensorWrapNHW(tensor); + auto tensorWrap = CreateTensorWrapNHW(tensor); - return BorderWrap( - tensorWrap, borderValue, static_cast(tensorAccess->numRows()), static_cast(tensorAccess->numCols())); + return BorderWrap(tensorWrap, borderValue, + static_cast(tensorAccess->numRows()), + static_cast(tensorAccess->numCols())); } /** @@ -569,24 +574,26 @@ __host__ auto CreateBorderWrapNHW(const TensorDataStridedCuda &tensor, T borderV * * @tparam T Type of the values to be accessed in the border wrap. * @tparam B Border extension to be used when accessing H and W, one of \ref NVCVBorderType + * @tparam StrideType Type of the strdies used in the underlying TensorWrap. 
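// Usage sketch for the border-wrap factory documented above (not part of the patch).
// The template-parameter order (value type, border mode, stride type) follows the
// @tparam list; the explicit int32_t/int64_t arguments illustrate the new StrideType
// parameter, and the exact defaults are assumptions.
#include <cstdint>

#include <nvcv/cuda/BorderWrap.hpp>

void makeBorderWraps(const nvcv::TensorDataStridedCuda &tensor)
{
    // Compact 32-bit offsets: valid only while the wrapped rows/columns and strides fit in int32_t.
    auto small = nvcv::cuda::CreateBorderWrapNHW<const uchar3, NVCV_BORDER_REPLICATE, int32_t>(tensor);

    // 64-bit offsets for tensors whose byte extents exceed INT32_MAX.
    auto large = nvcv::cuda::CreateBorderWrapNHW<const uchar3, NVCV_BORDER_REPLICATE, int64_t>(tensor);

    (void)small;
    (void)large;
}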
* * @param[in] tensor Reference to the tensor that will be wrapped. * @param[in] borderValue Border value to be used when accessing outside elements in constant border type * * @return Border wrap useful to access tensor data border aware in H and W in CUDA kernels. */ -template>> +template>> __host__ auto CreateBorderWrapNHWC(const TensorDataStridedCuda &tensor, T borderValue = {}) { auto tensorAccess = TensorDataAccessStridedImagePlanar::Create(tensor); assert(tensorAccess); - assert(tensorAccess->numRows() <= TypeTraits::max); - assert(tensorAccess->numCols() <= TypeTraits::max); + assert(tensorAccess->numRows() <= TypeTraits::max); + assert(tensorAccess->numCols() <= TypeTraits::max); - auto tensorWrap = CreateTensorWrapNHWC(tensor); + auto tensorWrap = CreateTensorWrapNHWC(tensor); return BorderWrap( - tensorWrap, borderValue, static_cast(tensorAccess->numRows()), static_cast(tensorAccess->numCols())); + tensorWrap, borderValue, static_cast(tensorAccess->numRows()), + static_cast(tensorAccess->numCols())); } } // namespace nvcv::cuda diff --git a/src/nvcv_types/include/nvcv/cuda/DropCast.hpp b/src/nvcv_types/include/nvcv/cuda/DropCast.hpp index 4caf13e75..9d69754e8 100644 --- a/src/nvcv_types/include/nvcv/cuda/DropCast.hpp +++ b/src/nvcv_types/include/nvcv/cuda/DropCast.hpp @@ -64,11 +64,13 @@ __host__ __device__ auto DropCast(T v) { RT out{}; -#pragma unroll - for (int e = 0; e < NumElements; ++e) - { - GetElement(out, e) = GetElement(v, e); - } + GetElement<0>(out) = GetElement<0>(v); + if constexpr (nvcv::cuda::NumElements >= 2) + GetElement<1>(out) = GetElement<1>(v); + if constexpr (nvcv::cuda::NumElements >= 3) + GetElement<2>(out) = GetElement<2>(v); + if constexpr (nvcv::cuda::NumElements == 4) + GetElement<3>(out) = GetElement<3>(v); return out; } diff --git a/src/nvcv_types/include/nvcv/cuda/FullTensorWrap.hpp b/src/nvcv_types/include/nvcv/cuda/FullTensorWrap.hpp index ce315a39d..7953160b5 100644 --- a/src/nvcv_types/include/nvcv/cuda/FullTensorWrap.hpp +++ b/src/nvcv_types/include/nvcv/cuda/FullTensorWrap.hpp @@ -61,7 +61,8 @@ class FullTensorWrap public: // The type provided as template parameter is the value type, i.e. the type of each element inside this wrapper. - using ValueType = const T; + using ValueType = const T; + using StrideType = int32_t; // The number of dimensions is provided as a template parameter. 
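// Illustrative sketch (not part of the patch): the size checks added to Threshold and
// the warp operators, the new `using StrideType = int32_t;` members, and the StrideType
// template parameters all enforce one constraint: with 32-bit strides, every byte offset
// a wrapper can compute must fit in int32_t. A caller-side form of that check could look
// like this (the helper name is hypothetical):
#include <cstdint>
#include <limits>

// True when the largest byte offset (number of samples times the per-sample stride in
// bytes) still fits in a signed 32-bit stride type; otherwise a 64-bit wrap is needed,
// or the operator returns ErrorCode::INVALID_PARAMETER as in the hunks above.
inline bool fitsInInt32Strides(int64_t sampleStrideBytes, int64_t numSamples)
{
    return sampleStrideBytes * numSamples <= std::numeric_limits<int32_t>::max();
}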
static constexpr int kNumDimensions = N; @@ -220,6 +221,7 @@ class FullTensorWrap : public FullTensorWrap public: using ValueType = T; + using typename Base::StrideType; using Base::kConstantStrides; using Base::kNumDimensions; diff --git a/src/nvcv_types/include/nvcv/cuda/InterpolationVarShapeWrap.hpp b/src/nvcv_types/include/nvcv/cuda/InterpolationVarShapeWrap.hpp index 80f003223..fa2fe717b 100644 --- a/src/nvcv_types/include/nvcv/cuda/InterpolationVarShapeWrap.hpp +++ b/src/nvcv_types/include/nvcv/cuda/InterpolationVarShapeWrap.hpp @@ -421,28 +421,27 @@ class InterpolationVarShapeWrap std::is_same_v, float> && (NumElements == 3 || NumElements == 4)>> inline __host__ __device__ ValueType operator[](DimType c) const { - const int xmin = GetIndexForInterpolation(c.x - 2.f); - const int xmax = GetIndexForInterpolation(c.x + 2.f); - const int ymin = GetIndexForInterpolation(c.y - 2.f); - const int ymax = GetIndexForInterpolation(c.y + 2.f); + const int ix = GetIndexForInterpolation(c.x); + const int iy = GetIndexForInterpolation(c.y); using FT = ConvertBaseTypeTo>; auto sum = SetAll(0); - float w, wsum = 0.f; + float wx[4]; + GetCubicCoeffs(c.x - ix, wx[0], wx[1], wx[2], wx[3]); + float wy[4]; + GetCubicCoeffs(c.y - iy, wy[0], wy[1], wy[2], wy[3]); - for (int cy = ymin; cy <= ymax; cy++) +#pragma unroll + for (int cy = -1; cy <= 2; cy++) { - for (int cx = xmin; cx <= xmax; cx++) +#pragma unroll + for (int cx = -1; cx <= 2; cx++) { - w = GetCubicCoeff(c.x - cx) * GetCubicCoeff(c.y - cy); - sum += w * Base::doGetValue(c, cx, cy); - wsum += w; + sum += Base::doGetValue(c, ix + cx, iy + cy) * (wx[cx + 1] * wy[cy + 1]); } } - sum = (wsum == 0.f) ? SetAll(0) : sum / wsum; - return SaturateCast(sum); } }; diff --git a/src/nvcv_types/include/nvcv/cuda/InterpolationWrap.hpp b/src/nvcv_types/include/nvcv/cuda/InterpolationWrap.hpp index 39908fe55..bb9cebd5b 100644 --- a/src/nvcv_types/include/nvcv/cuda/InterpolationWrap.hpp +++ b/src/nvcv_types/include/nvcv/cuda/InterpolationWrap.hpp @@ -43,13 +43,14 @@ namespace nvcv::cuda { * * @tparam I Interpolation type, one of \ref NVCVInterpolationType. * @tparam Position Interpolation position, 1 for the first index and 2 for the second index. + * @tparam IndexType Type of the returned value * * @param[in] c Coordinate in floating-point to convert to index in integer. * * @return Index in integer suitable for interpolation computation. 
*/ -template -constexpr inline int __host__ __device__ GetIndexForInterpolation(float c) +template +constexpr inline IndexType __host__ __device__ GetIndexForInterpolation(float c) { static_assert( I == NVCV_INTERP_NEAREST || I == NVCV_INTERP_LINEAR || I == NVCV_INTERP_CUBIC || I == NVCV_INTERP_AREA, @@ -58,38 +59,45 @@ constexpr inline int __host__ __device__ GetIndexForInterpolation(float c) if constexpr (I == NVCV_INTERP_NEAREST) { - return cuda::round(c); + return cuda::round(c); } else if constexpr (I == NVCV_INTERP_LINEAR) { - return cuda::round(c); + return cuda::round(c); } - else if constexpr (I == NVCV_INTERP_CUBIC || I == NVCV_INTERP_AREA) + else if constexpr (I == NVCV_INTERP_CUBIC) + { + return cuda::round(c); + } + else if constexpr (I == NVCV_INTERP_AREA) { if constexpr (Position == 1) - return cuda::round(c); + return cuda::round(c); else if constexpr (Position == 2) - return cuda::round(c); + return cuda::round(c); } - return static_cast(c); + return static_cast(c); } -inline float __host__ __device__ GetCubicCoeff(float c) +inline void __host__ __device__ GetCubicCoeffs(float delta, float &w0, float &w1, float &w2, float &w3) { - c = cuda::abs(c); - if (c <= 1.0f) - { - return c * c * (1.5f * c - 2.5f) + 1.0f; - } - else if (c < 2.0f) - { - return c * (c * (-0.5f * c + 2.5f) - 4.0f) + 2.0f; - } - else - { - return 0.0f; - } + w0 = -.5f; + w0 = w0 * delta + 1.f; + w0 = w0 * delta - .5f; + w0 = w0 * delta; + + w1 = 1.5f; + w1 = w1 * delta - 2.5f; + w1 = w1 * delta; + w1 = w1 * delta + 1.f; + + w2 = -1.5f; + w2 = w2 * delta + 2.f; + w2 = w2 * delta + .5f; + w2 = w2 * delta; + + w3 = 1 - w0 - w1 - w2; } /**@}*/ @@ -103,6 +111,7 @@ class InterpolationWrapImpl using BorderWrapper = BW; using TensorWrapper = typename BorderWrapper::TensorWrapper; using ValueType = typename BorderWrapper::ValueType; + using StrideType = typename BorderWrapper::StrideType; static constexpr int kNumDimensions = BorderWrapper::kNumDimensions; static constexpr NVCVInterpolationType kInterpolationType = I; @@ -180,15 +189,15 @@ class InterpolationWrapImpl protected: template - inline const __host__ __device__ ValueType &doGetValue(DimType c, int x, int y) const + inline const __host__ __device__ ValueType &doGetValue(DimType c, StrideType x, StrideType y) const { cuda::ConvertBaseTypeTo ic; - GetElement(ic, kCoordMap.id[0]) = x; - GetElement(ic, kCoordMap.id[1]) = y; + GetElement(ic) = x; + GetElement(ic) = y; if constexpr (NumElements >= 3) - GetElement(ic, kCoordMap.id[2]) = static_cast(GetElement(c, kCoordMap.id[2])); + GetElement(ic) = static_cast(GetElement(c)); if constexpr (NumElements == 4) - GetElement(ic, kCoordMap.id[3]) = static_cast(GetElement(c, kCoordMap.id[3])); + GetElement(ic) = static_cast(GetElement(c)); return m_borderWrap[ic]; } @@ -246,6 +255,7 @@ class InterpolationWrap : public detail::InterpolationW public: using typename Base::BorderWrapper; + using typename Base::StrideType; using typename Base::TensorWrapper; using typename Base::ValueType; @@ -321,8 +331,10 @@ class InterpolationWrap : public detail::InterpolationW float> && 2 <= NumElements && NumElements <= kNumDimensions>> inline __host__ __device__ ValueType operator[](DimType c) const { - const int x = GetIndexForInterpolation(GetElement(c, kCoordMap.id[0]) + .5f); - const int y = GetIndexForInterpolation(GetElement(c, kCoordMap.id[1]) + .5f); + const StrideType x + = GetIndexForInterpolation(GetElement(c) + .5f); + const StrideType y + = GetIndexForInterpolation(GetElement(c) + .5f); return Base::doGetValue(c, x, 
y); } @@ -340,6 +352,7 @@ class InterpolationWrap : public detail::InterpolationWr public: using typename Base::BorderWrapper; + using typename Base::StrideType; using typename Base::TensorWrapper; using typename Base::ValueType; @@ -415,12 +428,12 @@ class InterpolationWrap : public detail::InterpolationWr float> && 2 <= NumElements && NumElements <= kNumDimensions>> inline __host__ __device__ ValueType operator[](DimType c) const { - const float x = GetElement(c, kCoordMap.id[0]); - const float y = GetElement(c, kCoordMap.id[1]); - const int x1 = GetIndexForInterpolation(x); - const int x2 = x1 + 1; - const int y1 = GetIndexForInterpolation(y); - const int y2 = y1 + 1; + const float x = GetElement(c); + const float y = GetElement(c); + const StrideType x1 = GetIndexForInterpolation(x); + const StrideType x2 = x1 + 1; + const StrideType y1 = GetIndexForInterpolation(y); + const StrideType y2 = y1 + 1; auto out = SetAll>>(0); @@ -445,6 +458,7 @@ class InterpolationWrap : public detail::InterpolationWra public: using typename Base::BorderWrapper; + using typename Base::StrideType; using typename Base::TensorWrapper; using typename Base::ValueType; @@ -520,30 +534,29 @@ class InterpolationWrap : public detail::InterpolationWra float> && 2 <= NumElements && NumElements <= kNumDimensions>> inline __host__ __device__ ValueType operator[](DimType c) const { - const float x = GetElement(c, kCoordMap.id[0]); - const float y = GetElement(c, kCoordMap.id[1]); - const int xmin = GetIndexForInterpolation(x - 2.f); - const int xmax = GetIndexForInterpolation(x + 2.f); - const int ymin = GetIndexForInterpolation(y - 2.f); - const int ymax = GetIndexForInterpolation(y + 2.f); + const float x = GetElement(c); + const float y = GetElement(c); + const StrideType ix = GetIndexForInterpolation(x); + const StrideType iy = GetIndexForInterpolation(y); + + float wx[4]; + GetCubicCoeffs(x - ix, wx[0], wx[1], wx[2], wx[3]); + float wy[4]; + GetCubicCoeffs(y - iy, wy[0], wy[1], wy[2], wy[3]); using FT = ConvertBaseTypeTo>; auto sum = SetAll(0); - float w, wsum = 0.f; - - for (int cy = ymin; cy <= ymax; cy++) +#pragma unroll + for (StrideType cy = -1; cy <= 2; cy++) { - for (int cx = xmin; cx <= xmax; cx++) +#pragma unroll + for (StrideType cx = -1; cx <= 2; cx++) { - w = GetCubicCoeff(x - cx) * GetCubicCoeff(y - cy); - sum += w * Base::doGetValue(c, cx, cy); - wsum += w; + sum += Base::doGetValue(c, ix + cx, iy + cy) * (wx[cx + 1] * wy[cy + 1]); } } - sum = (wsum == 0.f) ? 
SetAll(0) : sum / wsum; - return SaturateCast(sum); } }; @@ -560,6 +573,7 @@ class InterpolationWrap : public detail::InterpolationWrap public: using typename Base::BorderWrapper; + using typename Base::StrideType; using typename Base::TensorWrapper; using typename Base::ValueType; @@ -653,14 +667,14 @@ class InterpolationWrap : public detail::InterpolationWrap float> && 2 <= NumElements && NumElements <= kNumDimensions>> inline __host__ __device__ ValueType operator[](DimType c) const { - const float fsx1 = GetElement(c, kCoordMap.id[0]) * m_scaleX; - const float fsy1 = GetElement(c, kCoordMap.id[1]) * m_scaleY; - const float fsx2 = fsx1 + m_scaleX; - const float fsy2 = fsy1 + m_scaleY; - const int xmin = GetIndexForInterpolation(fsx1); - const int xmax = GetIndexForInterpolation(fsx2); - const int ymin = GetIndexForInterpolation(fsy1); - const int ymax = GetIndexForInterpolation(fsy2); + const float fsx1 = GetElement(c) * m_scaleX; + const float fsy1 = GetElement(c) * m_scaleY; + const float fsx2 = fsx1 + m_scaleX; + const float fsy2 = fsy1 + m_scaleY; + const StrideType xmin = GetIndexForInterpolation(fsx1); + const StrideType xmax = GetIndexForInterpolation(fsx2); + const StrideType ymin = GetIndexForInterpolation(fsy1); + const StrideType ymax = GetIndexForInterpolation(fsy2); auto out = SetAll>>(0); @@ -668,9 +682,9 @@ class InterpolationWrap : public detail::InterpolationWrap { const float scale = 1.f / (m_scaleX * m_scaleY); - for (int cy = ymin; cy < ymax; ++cy) + for (StrideType cy = ymin; cy < ymax; ++cy) { - for (int cx = xmin; cx < xmax; ++cx) + for (StrideType cx = xmin; cx < xmax; ++cx) { out += Base::doGetValue(c, cx, cy) * scale; } @@ -680,14 +694,14 @@ class InterpolationWrap : public detail::InterpolationWrap { // There are 2 active dimensions (0, 1) and the coordinates are inverted (y, x) // so y corresponds to dimension 0 and x corresponds to dimension 1 - const int w = Base::m_borderWrap.tensorShape()[1]; - const int h = Base::m_borderWrap.tensorShape()[0]; + const StrideType w = Base::m_borderWrap.tensorShape()[1]; + const StrideType h = Base::m_borderWrap.tensorShape()[0]; const float scale = 1.f / (min(m_scaleX, w - fsx1) * min(m_scaleY, h - fsy1)); - for (int cy = ymin; cy < ymax; ++cy) + for (StrideType cy = ymin; cy < ymax; ++cy) { - for (int cx = xmin; cx < xmax; ++cx) + for (StrideType cx = xmin; cx < xmax; ++cx) { out += Base::doGetValue(c, cx, cy) * scale; } @@ -705,7 +719,7 @@ class InterpolationWrap : public detail::InterpolationWrap if (ymin > fsy1) { - for (int cx = xmin; cx < xmax; ++cx) + for (StrideType cx = xmin; cx < xmax; ++cx) { out += Base::doGetValue(c, cx, (ymin - 1)) * ((ymin - fsy1) * scale); } @@ -723,7 +737,7 @@ class InterpolationWrap : public detail::InterpolationWrap if (ymax < fsy2) { - for (int cx = xmin; cx < xmax; ++cx) + for (StrideType cx = xmin; cx < xmax; ++cx) { out += Base::doGetValue(c, cx, ymax) * ((fsy2 - ymax) * scale); } @@ -769,6 +783,7 @@ class InterpolationWrap : public detail::InterpolationWrap * @tparam T Type of the values to be accessed in the interpolation wrap. * @tparam B Border extension to be used when accessing H and W, one of \ref NVCVBorderType * @tparam I Interpolation to be used when accessing H and W, one of \ref NVCVInterpolationType + * @tparam StrideType Stride type used when accessing underlying tensor data * * @param[in] tensor Reference to the tensor that will be wrapped. 
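// The cubic-interpolation rework above replaces the per-tap GetCubicCoeff/wsum loop with
// GetCubicCoeffs: the fractional part delta = c - floor(c) fed to it lies in [0, 1), and
// the four weights are the Catmull-Rom (a = -0.5) kernel values for the taps at offsets
// -1..+2 around the base index. They sum to one, which is why the old wsum normalization
// could be dropped. A standalone restatement of the same polynomials, checked against the
// kernel (sketch only, not part of the patch):
#include <cassert>
#include <cmath>

// Catmull-Rom cubic kernel with a = -0.5.
inline float catmullRom(float x)
{
    x = std::fabs(x);
    if (x <= 1.f)
        return (1.5f * x - 2.5f) * x * x + 1.f;
    if (x < 2.f)
        return ((-0.5f * x + 2.5f) * x - 4.f) * x + 2.f;
    return 0.f;
}

inline void checkCubicWeights(float delta) // delta in [0, 1)
{
    float w0 = ((-0.5f * delta + 1.f) * delta - 0.5f) * delta; // same polynomial as w0 above
    float w1 = (1.5f * delta - 2.5f) * delta * delta + 1.f;    // same polynomial as w1 above
    float w2 = ((-1.5f * delta + 2.f) * delta + 0.5f) * delta; // same polynomial as w2 above
    float w3 = 1.f - w0 - w1 - w2;                             // same definition as w3 above

    assert(std::fabs(w0 - catmullRom(delta + 1.f)) < 1e-5f); // tap at offset -1
    assert(std::fabs(w1 - catmullRom(delta)) < 1e-5f);       // tap at offset  0
    assert(std::fabs(w2 - catmullRom(1.f - delta)) < 1e-5f); // tap at offset +1
    assert(std::fabs(w3 - catmullRom(2.f - delta)) < 1e-5f); // tap at offset +2
    assert(std::fabs(w0 + w1 + w2 + w3 - 1.f) < 1e-5f);      // partition of unity
}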
* @param[in] borderValue Border value to be used when accessing outside elements in constant border type @@ -777,11 +792,12 @@ class InterpolationWrap : public detail::InterpolationWrap * * @return Interpolation wrap useful to access tensor data interpolation-border aware in H and W in CUDA kernels. */ -template>> +template>> __host__ auto CreateInterpolationWrapNHW(const TensorDataStridedCuda &tensor, T borderValue = {}, float scaleX = {}, float scaleY = {}) { - auto borderWrap = CreateBorderWrapNHW(tensor, borderValue); + auto borderWrap = CreateBorderWrapNHW(tensor, borderValue); return InterpolationWrap(borderWrap, scaleX, scaleY); } @@ -800,6 +816,7 @@ __host__ auto CreateInterpolationWrapNHW(const TensorDataStridedCuda &tensor, T * @tparam T Type of the values to be accessed in the interpolation wrap. * @tparam B Border extension to be used when accessing H and W, one of \ref NVCVBorderType * @tparam I Interpolation to be used when accessing H and W, one of \ref NVCVInterpolationType + * @tparam StrideType Stride type used when accessing underlying tensor data * * @param[in] tensor Reference to the tensor that will be wrapped. * @param[in] borderValue Border value to be used when accessing outside elements in constant border type @@ -808,11 +825,12 @@ __host__ auto CreateInterpolationWrapNHW(const TensorDataStridedCuda &tensor, T * * @return Interpolation wrap useful to access tensor data interpolation-border aware in H and W in CUDA kernels. */ -template>> +template>> __host__ auto CreateInterpolationWrapNHWC(const TensorDataStridedCuda &tensor, T borderValue = {}, float scaleX = {}, float scaleY = {}) { - auto borderWrap = CreateBorderWrapNHWC(tensor, borderValue); + auto borderWrap = CreateBorderWrapNHWC(tensor, borderValue); return InterpolationWrap(borderWrap, scaleX, scaleY); } diff --git a/src/nvcv_types/include/nvcv/cuda/MathWrappers.hpp b/src/nvcv_types/include/nvcv/cuda/MathWrappers.hpp index 3b4a364a6..46d66da28 100644 --- a/src/nvcv_types/include/nvcv/cuda/MathWrappers.hpp +++ b/src/nvcv_types/include/nvcv/cuda/MathWrappers.hpp @@ -60,11 +60,13 @@ inline __host__ __device__ RT RoundImpl(U u) { RT out{}; -#pragma unroll - for (int e = 0; e < nvcv::cuda::NumElements; ++e) - { - GetElement(out, e) = RoundImpl, static_cast(RM)>(GetElement(u, e)); - } + GetElement<0>(out) = RoundImpl, static_cast(RM)>(GetElement<0>(u)); + if constexpr (nvcv::cuda::NumElements >= 2) + GetElement<1>(out) = RoundImpl, static_cast(RM)>(GetElement<1>(u)); + if constexpr (nvcv::cuda::NumElements >= 3) + GetElement<2>(out) = RoundImpl, static_cast(RM)>(GetElement<2>(u)); + if constexpr (nvcv::cuda::NumElements == 4) + GetElement<3>(out) = RoundImpl, static_cast(RM)>(GetElement<3>(u)); return out; } @@ -184,11 +186,15 @@ inline __host__ __device__ U min(U a, U b) #endif { U out{}; -#pragma unroll - for (int e = 0; e < nvcv::cuda::NumElements; ++e) - { - GetElement(out, e) = detail::MinImpl(GetElement(a, e), GetElement(b, e)); - } + + GetElement<0>(out) = detail::MinImpl(GetElement<0>(a), GetElement<0>(b)); + if constexpr (nvcv::cuda::NumElements >= 2) + GetElement<1>(out) = detail::MinImpl(GetElement<1>(a), GetElement<1>(b)); + if constexpr (nvcv::cuda::NumElements >= 3) + GetElement<2>(out) = detail::MinImpl(GetElement<2>(a), GetElement<2>(b)); + if constexpr (nvcv::cuda::NumElements == 4) + GetElement<3>(out) = detail::MinImpl(GetElement<3>(a), GetElement<3>(b)); + return out; } // clang-format on @@ -227,11 +233,15 @@ inline __host__ __device__ U max(U a, U b) #endif { U out{}; -#pragma unroll - 
for (int e = 0; e < nvcv::cuda::NumElements; ++e) - { - GetElement(out, e) = detail::MaxImpl(GetElement(a, e), GetElement(b, e)); - } + + GetElement<0>(out) = detail::MaxImpl(GetElement<0>(a), GetElement<0>(b)); + if constexpr (nvcv::cuda::NumElements >= 2) + GetElement<1>(out) = detail::MaxImpl(GetElement<1>(a), GetElement<1>(b)); + if constexpr (nvcv::cuda::NumElements >= 3) + GetElement<2>(out) = detail::MaxImpl(GetElement<2>(a), GetElement<2>(b)); + if constexpr (nvcv::cuda::NumElements == 4) + GetElement<3>(out) = detail::MaxImpl(GetElement<3>(a), GetElement<3>(b)); + return out; } // clang-format on @@ -260,11 +270,13 @@ inline __host__ __device__ U pow(U x, S y) { U out{}; -#pragma unroll - for (int e = 0; e < nvcv::cuda::NumElements; ++e) - { - GetElement(out, e) = detail::PowImpl(GetElement(x, e), GetElement(y, e)); - } + GetElement<0>(out) = detail::PowImpl(GetElement<0>(x), GetElement<0>(y)); + if constexpr (nvcv::cuda::NumElements >= 2) + GetElement<1>(out) = detail::PowImpl(GetElement<1>(x), GetElement<1>(y)); + if constexpr (nvcv::cuda::NumElements >= 3) + GetElement<2>(out) = detail::PowImpl(GetElement<2>(x), GetElement<2>(y)); + if constexpr (nvcv::cuda::NumElements == 4) + GetElement<3>(out) = detail::PowImpl(GetElement<3>(x), GetElement<3>(y)); return out; } @@ -286,11 +298,13 @@ inline __host__ __device__ U exp(U u) { U out{}; -#pragma unroll - for (int e = 0; e < nvcv::cuda::NumElements; ++e) - { - GetElement(out, e) = detail::ExpImpl(GetElement(u, e)); - } + GetElement<0>(out) = detail::ExpImpl(GetElement<0>(u)); + if constexpr (nvcv::cuda::NumElements >= 2) + GetElement<1>(out) = detail::ExpImpl(GetElement<1>(u)); + if constexpr (nvcv::cuda::NumElements >= 3) + GetElement<2>(out) = detail::ExpImpl(GetElement<2>(u)); + if constexpr (nvcv::cuda::NumElements == 4) + GetElement<3>(out) = detail::ExpImpl(GetElement<3>(u)); return out; } @@ -312,11 +326,13 @@ inline __host__ __device__ U sqrt(U u) { U out{}; -#pragma unroll - for (int e = 0; e < nvcv::cuda::NumElements; ++e) - { - GetElement(out, e) = detail::SqrtImpl(GetElement(u, e)); - } + GetElement<0>(out) = detail::SqrtImpl(GetElement<0>(u)); + if constexpr (nvcv::cuda::NumElements >= 2) + GetElement<1>(out) = detail::SqrtImpl(GetElement<1>(u)); + if constexpr (nvcv::cuda::NumElements >= 3) + GetElement<2>(out) = detail::SqrtImpl(GetElement<2>(u)); + if constexpr (nvcv::cuda::NumElements == 4) + GetElement<3>(out) = detail::SqrtImpl(GetElement<3>(u)); return out; } @@ -359,11 +375,15 @@ inline __host__ __device__ U abs(U u) #endif { U out{}; -#pragma unroll - for (int e = 0; e < nvcv::cuda::NumElements; ++e) - { - GetElement(out, e) = detail::AbsImpl(GetElement(u, e)); - } + + GetElement<0>(out) = detail::AbsImpl(GetElement<0>(u)); + if constexpr (nvcv::cuda::NumElements >= 2) + GetElement<1>(out) = detail::AbsImpl(GetElement<1>(u)); + if constexpr (nvcv::cuda::NumElements >= 3) + GetElement<2>(out) = detail::AbsImpl(GetElement<2>(u)); + if constexpr (nvcv::cuda::NumElements == 4) + GetElement<3>(out) = detail::AbsImpl(GetElement<3>(u)); + return out; } // clang-format on @@ -393,11 +413,13 @@ inline __host__ __device__ U clamp(U u, S lo, S hi) { U out{}; -#pragma unroll - for (int e = 0; e < nvcv::cuda::NumElements; ++e) - { - GetElement(out, e) = detail::ClampImpl(GetElement(u, e), GetElement(lo, e), GetElement(hi, e)); - } + GetElement<0>(out) = detail::ClampImpl(GetElement<0>(u), GetElement<0>(lo), GetElement<0>(hi)); + if constexpr (nvcv::cuda::NumElements >= 2) + GetElement<1>(out) = 
detail::ClampImpl(GetElement<1>(u), GetElement<1>(lo), GetElement<1>(hi)); + if constexpr (nvcv::cuda::NumElements >= 3) + GetElement<2>(out) = detail::ClampImpl(GetElement<2>(u), GetElement<2>(lo), GetElement<2>(hi)); + if constexpr (nvcv::cuda::NumElements == 4) + GetElement<3>(out) = detail::ClampImpl(GetElement<3>(u), GetElement<3>(lo), GetElement<3>(hi)); return out; } diff --git a/src/nvcv_types/include/nvcv/cuda/Printer.hpp b/src/nvcv_types/include/nvcv/cuda/Printer.hpp new file mode 100644 index 000000000..fa93583d0 --- /dev/null +++ b/src/nvcv_types/include/nvcv/cuda/Printer.hpp @@ -0,0 +1,71 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file Printer.hpp + * + * @brief Defines printer operator to print CUDA compound types. + */ + +#ifndef NVCV_CUDA_PRINTER_HPP +#define NVCV_CUDA_PRINTER_HPP + +#include "TypeTraits.hpp" // for Require, etc. + +#include // for std::ostream, etc. + +/** + * Metaoperator to insert a pixel into an output stream. + * + * The pixel may be a CUDA compound type with 1 to 4 components. This operator returns the output stream + * changed by an additional string with the name of the type followed by each component value in between + * parentheses. + * + * @code + * DataType pix = ...; + * std::cout << pix; + * @endcode + * + * @tparam T Type of the pixel to be inserted in the output stream. + * + * @param[in, out] out Output stream to be changed and returned. + * @param[in] v Pixel value to be inserted formatted in the output stream. + * + * @return Output stream with the data type and values. 
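// Minimal usage sketch for the new Printer.hpp introduced above (not part of the patch).
// The printed form follows the implementation: the type name, then the components in
// parentheses, with char-sized components widened so they print as numbers.
#include <cuda_runtime.h> // float3/uchar4 vector types

#include <nvcv/cuda/Printer.hpp>

#include <iostream>

int main()
{
    float3 color{0.1f, 0.2f, 0.3f};
    uchar4 pixel{255, 128, 0, 255};

    std::cout << color << "\n"; // e.g. "float3(0.1, 0.2, 0.3)"
    std::cout << pixel << "\n"; // e.g. "uchar4(255, 128, 0, 255)"
    return 0;
}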
+ */ +template>> +__host__ std::ostream &operator<<(std::ostream &out, const T &v) +{ + using BT = nvcv::cuda::BaseType; + using OutType = std::conditional_t; + + out << nvcv::cuda::GetTypeName() << "("; + + out << static_cast(nvcv::cuda::GetElement<0>(v)); + if constexpr (nvcv::cuda::NumComponents >= 2) + out << ", " << static_cast(nvcv::cuda::GetElement<1>(v)); + if constexpr (nvcv::cuda::NumComponents >= 3) + out << ", " << static_cast(nvcv::cuda::GetElement<2>(v)); + if constexpr (nvcv::cuda::NumComponents == 4) + out << ", " << static_cast(nvcv::cuda::GetElement<3>(v)); + + out << ")"; + + return out; +} + +#endif // NVCV_CUDA_PRINTER_HPP diff --git a/src/nvcv_types/include/nvcv/cuda/RangeCast.hpp b/src/nvcv_types/include/nvcv/cuda/RangeCast.hpp index a423d495f..510825fbf 100644 --- a/src/nvcv_types/include/nvcv/cuda/RangeCast.hpp +++ b/src/nvcv_types/include/nvcv/cuda/RangeCast.hpp @@ -77,11 +77,13 @@ __host__ __device__ auto RangeCast(U u) { RT out{}; -#pragma unroll - for (int e = 0; e < NumElements; ++e) - { - GetElement(out, e) = detail::RangeCastImpl>(GetElement(u, e)); - } + GetElement<0>(out) = detail::RangeCastImpl>(GetElement<0>(u)); + if constexpr (nvcv::cuda::NumElements >= 2) + GetElement<1>(out) = detail::RangeCastImpl>(GetElement<1>(u)); + if constexpr (nvcv::cuda::NumElements >= 3) + GetElement<2>(out) = detail::RangeCastImpl>(GetElement<2>(u)); + if constexpr (nvcv::cuda::NumElements == 4) + GetElement<3>(out) = detail::RangeCastImpl>(GetElement<3>(u)); return out; } diff --git a/src/nvcv_types/include/nvcv/cuda/SaturateCast.hpp b/src/nvcv_types/include/nvcv/cuda/SaturateCast.hpp index 41dace2a2..b0cde23ad 100644 --- a/src/nvcv_types/include/nvcv/cuda/SaturateCast.hpp +++ b/src/nvcv_types/include/nvcv/cuda/SaturateCast.hpp @@ -71,11 +71,13 @@ __host__ __device__ auto SaturateCast(U u) { RT out{}; -#pragma unroll - for (int e = 0; e < NumElements; ++e) - { - GetElement(out, e) = detail::SaturateCastImpl(GetElement(u, e)); - } + GetElement<0>(out) = detail::SaturateCastImpl(GetElement<0>(u)); + if constexpr (nvcv::cuda::NumElements >= 2) + GetElement<1>(out) = detail::SaturateCastImpl(GetElement<1>(u)); + if constexpr (nvcv::cuda::NumElements >= 3) + GetElement<2>(out) = detail::SaturateCastImpl(GetElement<2>(u)); + if constexpr (nvcv::cuda::NumElements == 4) + GetElement<3>(out) = detail::SaturateCastImpl(GetElement<3>(u)); return out; } diff --git a/src/nvcv_types/include/nvcv/cuda/StaticCast.hpp b/src/nvcv_types/include/nvcv/cuda/StaticCast.hpp index 72f2929f9..a960cce0e 100644 --- a/src/nvcv_types/include/nvcv/cuda/StaticCast.hpp +++ b/src/nvcv_types/include/nvcv/cuda/StaticCast.hpp @@ -65,11 +65,13 @@ __host__ __device__ auto StaticCast(U u) { RT out{}; -#pragma unroll - for (int e = 0; e < NumElements; ++e) - { - GetElement(out, e) = static_cast(GetElement(u, e)); - } + GetElement<0>(out) = static_cast(GetElement<0>(u)); + if constexpr (nvcv::cuda::NumElements >= 2) + GetElement<1>(out) = static_cast(GetElement<1>(u)); + if constexpr (nvcv::cuda::NumElements >= 3) + GetElement<2>(out) = static_cast(GetElement<2>(u)); + if constexpr (nvcv::cuda::NumElements == 4) + GetElement<3>(out) = static_cast(GetElement<3>(u)); return out; } diff --git a/src/nvcv_types/include/nvcv/cuda/TensorBatchWrap.hpp b/src/nvcv_types/include/nvcv/cuda/TensorBatchWrap.hpp index ebc838658..7023221cc 100644 --- a/src/nvcv_types/include/nvcv/cuda/TensorBatchWrap.hpp +++ b/src/nvcv_types/include/nvcv/cuda/TensorBatchWrap.hpp @@ -73,31 +73,33 @@ namespace nvcv::cuda { * @tparam T Type (it can 
be const) of each element inside the tensor wrapper. * @tparam Strides Each compile-time (use -1 for run-time) pitch in bytes from first to last dimension. */ -template -class TensorBatchWrap; +template +class TensorBatchWrapT; -template -class TensorBatchWrap +template +class TensorBatchWrapT { static_assert(HasTypeTraits, "TensorBatchWrap can only be used if T has type traits"); + static_assert(IsStrideType, "StrideT must be a 64 or 32 bit signed integer type"); public: // The type provided as template parameter is the value type, i.e. the type of each element inside this wrapper. - using ValueType = const T; + using ValueType = const T; + using StrideType = StrideT; static constexpr int kNumDimensions = sizeof...(Strides); static constexpr int kVariableStrides = ((Strides == -1) + ...); static constexpr int kConstantStrides = kNumDimensions - kVariableStrides; - TensorBatchWrap() = default; + TensorBatchWrapT() = default; /** * Constructs a constant TensorBatchWrap by wrapping a \p data argument. * * @param[in] data Tensor batch data to wrap. */ - __host__ TensorBatchWrap(const TensorBatchDataStridedCuda &data) - : TensorBatchWrap(data.cdata()) + __host__ TensorBatchWrapT(const TensorBatchDataStridedCuda &data) + : TensorBatchWrapT(data.cdata()) { } @@ -106,7 +108,7 @@ class TensorBatchWrap * * @param[in] data Tensor batch data to wrap. */ - __host__ __device__ TensorBatchWrap(const NVCVTensorBatchData &data) + __host__ __device__ TensorBatchWrapT(const NVCVTensorBatchData &data) : m_numTensors(data.numTensors) , m_tensors(data.buffer.strided.tensors) { @@ -135,7 +137,7 @@ class TensorBatchWrap * * @return Accessed reference. */ - template>>> + template, StrideType>>> inline const __host__ __device__ T &operator[](DimType c) const { static_assert(NumElements == kNumDimensions + 1, @@ -169,7 +171,7 @@ class TensorBatchWrap */ inline const __host__ __device__ auto tensor(int t) const { - return TensorWrap(doGetPtr(t), strides(t)); + return TensorWrapT(doGetPtr(t), strides(t)); } /** @@ -206,19 +208,19 @@ class TensorBatchWrap template inline __host__ __device__ T *doGetPtr(int t, Args... c) const { - static_assert(std::conjunction_v...>); + static_assert((IsIndexType && ...)); static_assert(sizeof...(Args) <= kNumDimensions); - constexpr int kArgSize = sizeof...(Args); - constexpr int kVarSize = kArgSize < kVariableStrides ? kArgSize : kVariableStrides; - constexpr int kDimSize = kArgSize < kNumDimensions ? kArgSize : kNumDimensions; - constexpr int kStride[] = {std::forward(Strides)...}; + constexpr int kArgSize = sizeof...(Args); + constexpr int kVarSize = kArgSize < kVariableStrides ? kArgSize : kVariableStrides; + constexpr int kDimSize = kArgSize < kNumDimensions ? kArgSize : kNumDimensions; + constexpr StrideT kStride[] = {std::forward(Strides)...}; // Computing offset first potentially postpones or avoids 64-bit math during addressing - int offset = 0; + StrideType offset = 0; if constexpr (kArgSize > 0) { - int coords[] = {std::forward(c)...}; + StrideType coords[] = {std::forward(c)...}; const int64_t *strides = m_tensors[t].stride; #pragma unroll @@ -247,10 +249,10 @@ class TensorBatchWrap * @tparam T Type (non-const) of each element inside the tensor batch wrapper. * @tparam Strides Each compile-time (use -1 for run-time) pitch in bytes from first to last dimension. 
*/ -template -class TensorBatchWrap : public TensorBatchWrap +template +class TensorBatchWrapT : public TensorBatchWrapT { - using Base = TensorBatchWrap; + using Base = TensorBatchWrapT; public: using ValueType = T; @@ -258,13 +260,14 @@ class TensorBatchWrap : public TensorBatchWrap using Base::kNumDimensions; using Base::m_tensors; using Base::strides; + using typename Base::StrideType; /** * Constructs a TensorBatchWrap by wrapping a \p data argument. * * @param[in] data Tensor batch data to wrap. */ - __host__ TensorBatchWrap(const TensorBatchDataStridedCuda &data) + __host__ TensorBatchWrapT(const TensorBatchDataStridedCuda &data) : Base(data) { } @@ -274,7 +277,7 @@ class TensorBatchWrap : public TensorBatchWrap * * @param[in] data Tensor batch data to wrap. */ - __host__ __device__ TensorBatchWrap(NVCVTensorBatchData &data) + __host__ __device__ TensorBatchWrapT(NVCVTensorBatchData &data) : Base(data) { } @@ -303,7 +306,7 @@ class TensorBatchWrap : public TensorBatchWrap */ inline __host__ __device__ auto tensor(int t) const { - return TensorWrap(doGetPtr(t), strides(t)); + return TensorWrapT(doGetPtr(t), strides(t)); } /** @@ -315,7 +318,7 @@ class TensorBatchWrap : public TensorBatchWrap * * @return Accessed reference. */ - template>>> + template, StrideType>>> inline __host__ __device__ T &operator[](DimType c) const { static_assert(NumElements == kNumDimensions + 1, @@ -340,6 +343,12 @@ class TensorBatchWrap : public TensorBatchWrap } }; +template +using TensorBatchWrap = TensorBatchWrapT; + +template +using TensorBatchWrap32 = TensorBatchWrapT; + /**@}*/ /** @@ -357,28 +366,29 @@ class TensorBatchWrap : public TensorBatchWrap * @{ */ -template -using TensorBatch1DWrap = TensorBatchWrap; +template +using TensorBatch1DWrap = TensorBatchWrapT; -template -using TensorBatch2DWrap = TensorBatchWrap; +template +using TensorBatch2DWrap = TensorBatchWrapT; -template -using TensorBatch3DWrap = TensorBatchWrap; +template +using TensorBatch3DWrap = TensorBatchWrapT; -template -using TensorBatch4DWrap = TensorBatchWrap; +template +using TensorBatch4DWrap = TensorBatchWrapT; -template -using TensorBatch5DWrap = TensorBatchWrap; +template +using TensorBatch5DWrap = TensorBatchWrapT; -template +template using TensorBatchNDWrap = std::conditional_t< - N == 1, TensorBatch1DWrap, - std::conditional_t, - std::conditional_t, - std::conditional_t, - std::conditional_t, void>>>>>; + N == 1, TensorBatch1DWrap, + std::conditional_t< + N == 2, TensorBatch2DWrap, + std::conditional_t, + std::conditional_t, + std::conditional_t, void>>>>>; /**@}*/ } // namespace nvcv::cuda diff --git a/src/nvcv_types/include/nvcv/cuda/TensorWrap.hpp b/src/nvcv_types/include/nvcv/cuda/TensorWrap.hpp index e4afd3ca0..6e768f9f9 100644 --- a/src/nvcv_types/include/nvcv/cuda/TensorWrap.hpp +++ b/src/nvcv_types/include/nvcv/cuda/TensorWrap.hpp @@ -51,6 +51,7 @@ namespace nvcv::cuda { * * Template arguments: * - T type of the values inside the tensor + * - StrideT type of the stride used in the byte offset calculation * - Strides sequence of compile- or run-time pitches (-1 indicates run-time) * - Y compile-time pitches * - X run-time pitches @@ -78,22 +79,24 @@ namespace nvcv::cuda { * @tparam T Type (it can be const) of each element inside the tensor wrapper. * @tparam Strides Each compile-time (use -1 for run-time) pitch in bytes from first to last dimension. 
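// Sketch of the aliases introduced above (not part of the patch). Judging by the names
// and the new StrideT parameter, TensorBatchWrap now defaults to 64-bit byte offsets
// while TensorBatchWrap32 keeps the previous int32_t arithmetic; TensorWrap below follows
// the same pattern. The exact defaults behind the aliases are assumptions.
#include <nvcv/cuda/TensorBatchWrap.hpp>

// Three run-time strides per tensor with 64-bit offsets: safe for batches whose tensors
// exceed INT32_MAX bytes.
using LargeBatchWrap = nvcv::cuda::TensorBatchWrap<float, -1, -1, -1>;

// The 32-bit flavour keeps the old, cheaper index arithmetic for small tensors.
using SmallBatchWrap = nvcv::cuda::TensorBatchWrap32<float, -1, -1, -1>;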
*/ -template -class TensorWrap; +template +class TensorWrapT; -template -class TensorWrap +template +class TensorWrapT { static_assert(HasTypeTraits, "TensorWrap can only be used if T has type traits"); + static_assert(IsStrideType, "StrideT must be a 64 or 32 bit signed integer type"); public: - using ValueType = const T; + using ValueType = const T; + using StrideType = StrideT; static constexpr int kNumDimensions = sizeof...(Strides); static constexpr int kVariableStrides = ((Strides == -1) + ...); static constexpr int kConstantStrides = kNumDimensions - kVariableStrides; - TensorWrap() = default; + TensorWrapT() = default; /** * Constructs a constant TensorWrap by wrapping a const \p data pointer argument. @@ -102,11 +105,11 @@ class TensorWrap * @param[in] strides0..D Each run-time pitch in bytes from first to last dimension. */ template - explicit __host__ __device__ TensorWrap(const DataType *data, Args... strides) + explicit __host__ __device__ TensorWrapT(const DataType *data, Args... strides) : m_data(reinterpret_cast(data)) - , m_strides{std::forward(strides)...} + , m_strides{std::forward(strides)...} { - static_assert(std::conjunction_v...>); + static_assert((IsIndexType && ...)); static_assert(sizeof...(Args) == kVariableStrides); } @@ -118,7 +121,7 @@ class TensorWrap * @param[in] strides Pointer to stride data */ template - explicit __host__ __device__ TensorWrap(const DataType *data, StrideType *strides) + explicit __host__ __device__ TensorWrapT(const DataType *data, StrideType *strides) : m_data(reinterpret_cast(data)) { for (int i = 0; i < kVariableStrides; ++i) @@ -132,7 +135,7 @@ class TensorWrap * * @param[in] image Image reference to the image that will be wrapped. */ - __host__ TensorWrap(const ImageDataStridedCuda &image) + __host__ TensorWrapT(const ImageDataStridedCuda &image) { static_assert(kVariableStrides == 1 && kNumDimensions == 2); @@ -146,9 +149,9 @@ class TensorWrap * * @param[in] tensor Tensor reference to the tensor that will be wrapped. */ - __host__ TensorWrap(const TensorDataStridedCuda &tensor) + __host__ TensorWrapT(const TensorDataStridedCuda &tensor) { - constexpr int kStride[] = {std::forward(Strides)...}; + constexpr StrideT kStride[] = {std::forward(Strides)...}; assert(tensor.rank() >= kNumDimensions); @@ -163,7 +166,7 @@ class TensorWrap } else if (i < kVariableStrides) { - assert(tensor.stride(i) <= TypeTraits::max); + assert(tensor.stride(i) <= TypeTraits::max); m_strides[i] = tensor.stride(i); } @@ -175,7 +178,7 @@ class TensorWrap * * @return The const array (as a pointer) containing run-time pitches in bytes. */ - const __host__ __device__ int *strides() const + const __host__ __device__ StrideT *strides() const { return m_strides; } @@ -232,18 +235,18 @@ class TensorWrap template inline const __host__ __device__ T *doGetPtr(Args... c) const { - static_assert(std::conjunction_v...>); + static_assert((IsIndexType && ...)); static_assert(sizeof...(Args) <= kNumDimensions); - constexpr int kArgSize = sizeof...(Args); - constexpr int kVarSize = kArgSize < kVariableStrides ? kArgSize : kVariableStrides; - constexpr int kDimSize = kArgSize < kNumDimensions ? kArgSize : kNumDimensions; - constexpr int kStride[] = {std::forward(Strides)...}; + constexpr int kArgSize = sizeof...(Args); + constexpr int kVarSize = kArgSize < kVariableStrides ? kArgSize : kVariableStrides; + constexpr int kDimSize = kArgSize < kNumDimensions ? 
kArgSize : kNumDimensions; + constexpr StrideT kStride[] = {std::forward(Strides)...}; - int coords[] = {std::forward(c)...}; + StrideType coords[] = {std::forward(c)...}; // Computing offset first potentially postpones or avoids 64-bit math during addressing - int offset = 0; + StrideT offset = 0; #pragma unroll for (int i = 0; i < kVarSize; ++i) { @@ -260,7 +263,7 @@ class TensorWrap private: const std::byte *m_data = nullptr; - int m_strides[kVariableStrides] = {}; + StrideT m_strides[kVariableStrides] = {}; }; /** @@ -269,19 +272,20 @@ class TensorWrap * @tparam T Type (non-const) of each element inside the tensor wrapper. * @tparam Strides Each compile-time (use -1 for run-time) pitch in bytes from first to last dimension. */ -template -class TensorWrap : public TensorWrap +template +class TensorWrapT : public TensorWrapT { - using Base = TensorWrap; + using Base = TensorWrapT; public: - using ValueType = T; + using ValueType = T; + using StrideType = StrideT; using Base::kConstantStrides; using Base::kNumDimensions; using Base::kVariableStrides; - TensorWrap() = default; + TensorWrapT() = default; /** * Constructs a TensorWrap by wrapping a \p data pointer argument. @@ -290,7 +294,7 @@ class TensorWrap : public TensorWrap * @param[in] strides0..N Each run-time pitch in bytes from first to last dimension. */ template - explicit __host__ __device__ TensorWrap(DataType *data, Args... strides) + explicit __host__ __device__ TensorWrapT(DataType *data, Args... strides) : Base(data, strides...) { } @@ -303,7 +307,7 @@ class TensorWrap : public TensorWrap * @param[in] strides Pointer to stride data */ template - explicit __host__ __device__ TensorWrap(DataType *data, StrideType *strides) + explicit __host__ __device__ TensorWrapT(DataType *data, StrideType *strides) : Base(data, strides) { } @@ -313,7 +317,7 @@ class TensorWrap : public TensorWrap * * @param[in] image Image reference to the image that will be wrapped. */ - __host__ TensorWrap(const ImageDataStridedCuda &image) + __host__ TensorWrapT(const ImageDataStridedCuda &image) : Base(image) { } @@ -323,7 +327,7 @@ class TensorWrap : public TensorWrap * * @param[in] tensor Tensor reference to the tensor that will be wrapped. 
*/ - __host__ TensorWrap(const TensorDataStridedCuda &tensor) + __host__ TensorWrapT(const TensorDataStridedCuda &tensor) : Base(tensor) { } @@ -385,6 +389,12 @@ class TensorWrap : public TensorWrap } }; +template +using TensorWrap = TensorWrapT; + +template +using TensorWrap32 = TensorWrapT; + /**@}*/ /** @@ -395,6 +405,7 @@ class TensorWrap : public TensorWrap * * Template arguments: * - T data type of each element in \ref TensorWrap + * - StrideType stride type used in the TensorWrap * - N (optional) number of dimensions * * @sa NVCV_CPP_CUDATOOLS_TENSORWRAP @@ -403,28 +414,29 @@ class TensorWrap : public TensorWrap * @{ */ -template -using Tensor1DWrap = TensorWrap; +template +using Tensor1DWrap = TensorWrapT; -template -using Tensor2DWrap = TensorWrap; +template +using Tensor2DWrap = TensorWrapT; -template -using Tensor3DWrap = TensorWrap; +template +using Tensor3DWrap = TensorWrapT; -template -using Tensor4DWrap = TensorWrap; +template +using Tensor4DWrap = TensorWrapT; -template -using Tensor5DWrap = TensorWrap; +template +using Tensor5DWrap = TensorWrapT; -template +template using TensorNDWrap = std::conditional_t< - N == 1, Tensor1DWrap, - std::conditional_t, - std::conditional_t, - std::conditional_t, - std::conditional_t, void>>>>>; + N == 1, Tensor1DWrap, + std::conditional_t< + N == 2, Tensor2DWrap, + std::conditional_t, + std::conditional_t, + std::conditional_t, void>>>>>; /**@}*/ @@ -438,21 +450,23 @@ using TensorNDWrap = std::conditional_t< * @sa NVCV_CPP_CUDATOOLS_TENSORWRAP * * @tparam T Type of the values to be accessed in the tensor wrap. + * @tparam StrideType Type of the stride used in the tensor wrap. * * @param[in] tensor Reference to the tensor that will be wrapped. * * @return Tensor wrap useful to access tensor data in CUDA kernels. */ -template>> + +template && IsStrideType>> __host__ auto CreateTensorWrapNHW(const TensorDataStridedCuda &tensor) { auto tensorAccess = TensorDataAccessStridedImagePlanar::Create(tensor); assert(tensorAccess); - assert(tensorAccess->sampleStride() <= TypeTraits::max); - assert(tensorAccess->rowStride() <= TypeTraits::max); + assert(tensorAccess->sampleStride() <= TypeTraits::max); + assert(tensorAccess->rowStride() <= TypeTraits::max); - return Tensor3DWrap(tensor.basePtr(), static_cast(tensorAccess->sampleStride()), - static_cast(tensorAccess->rowStride())); + return Tensor3DWrap(tensor.basePtr(), static_cast(tensorAccess->sampleStride()), + static_cast(tensorAccess->rowStride())); } /** @@ -465,22 +479,24 @@ __host__ auto CreateTensorWrapNHW(const TensorDataStridedCuda &tensor) * @sa NVCV_CPP_CUDATOOLS_TENSORWRAP * * @tparam T Type of the values to be accessed in the tensor wrap. + * @tparam StrideType Type of the stride used in the tensor wrap. * * @param[in] tensor Reference to the tensor that will be wrapped. * * @return Tensor wrap useful to access tensor data in CUDA kernels. 
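A kernel-side sketch of indexing through a 32-bit-stride wrap built from the aliases above; the coordinate order (x fastest, then y, then sample) is an assumption carried over from the existing wraps, not something this patch states.

```cpp
#include <cstdint>
#include <nvcv/cuda/TensorWrap.hpp>

// Illustrative only: per-pixel threshold over an NHW uint8 tensor, with all
// byte offsets computed in 32-bit arithmetic.
__global__ void Threshold(nvcv::cuda::Tensor3DWrap<std::uint8_t, std::int32_t> img, int2 size, int numSamples,
                          std::uint8_t thr)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int z = blockIdx.z; // sample index
    if (x >= size.x || y >= size.y || z >= numSamples)
        return;

    std::uint8_t &px = img[int3{x, y, z}];
    px = (px > thr) ? 255 : 0;
}
```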
*/ -template>> +template && IsStrideType>> __host__ auto CreateTensorWrapNHWC(const TensorDataStridedCuda &tensor) { auto tensorAccess = TensorDataAccessStridedImagePlanar::Create(tensor); assert(tensorAccess); - assert(tensorAccess->sampleStride() <= TypeTraits::max); - assert(tensorAccess->rowStride() <= TypeTraits::max); - assert(tensorAccess->colStride() <= TypeTraits::max); + assert(tensorAccess->sampleStride() <= TypeTraits::max); + assert(tensorAccess->rowStride() <= TypeTraits::max); + assert(tensorAccess->colStride() <= TypeTraits::max); - return Tensor4DWrap(tensor.basePtr(), static_cast(tensorAccess->sampleStride()), - static_cast(tensorAccess->rowStride()), static_cast(tensorAccess->colStride())); + return Tensor4DWrap(tensor.basePtr(), static_cast(tensorAccess->sampleStride()), + static_cast(tensorAccess->rowStride()), + static_cast(tensorAccess->colStride())); } /** @@ -498,17 +514,18 @@ __host__ auto CreateTensorWrapNHWC(const TensorDataStridedCuda &tensor) * * @return Tensor wrap useful to access tensor data in CUDA kernels. */ -template>> +template && IsStrideType>> __host__ auto CreateTensorWrapNCHW(const TensorDataStridedCuda &tensor) { auto tensorAccess = TensorDataAccessStridedImagePlanar::Create(tensor); assert(tensorAccess); - assert(tensorAccess->sampleStride() <= TypeTraits::max); - assert(tensorAccess->chStride() <= TypeTraits::max); - assert(tensorAccess->rowStride() <= TypeTraits::max); + assert(tensorAccess->sampleStride() <= TypeTraits::max); + assert(tensorAccess->chStride() <= TypeTraits::max); + assert(tensorAccess->rowStride() <= TypeTraits::max); - return Tensor4DWrap(tensor.basePtr(), static_cast(tensorAccess->sampleStride()), - static_cast(tensorAccess->chStride()), static_cast(tensorAccess->rowStride())); + return Tensor4DWrap(tensor.basePtr(), static_cast(tensorAccess->sampleStride()), + static_cast(tensorAccess->chStride()), + static_cast(tensorAccess->rowStride())); } } // namespace nvcv::cuda diff --git a/src/nvcv_types/include/nvcv/cuda/TypeTraits.hpp b/src/nvcv_types/include/nvcv/cuda/TypeTraits.hpp index fd0e6f681..500bd9b08 100644 --- a/src/nvcv_types/include/nvcv/cuda/TypeTraits.hpp +++ b/src/nvcv_types/include/nvcv/cuda/TypeTraits.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -27,7 +27,7 @@ #include "detail/Metaprogramming.hpp" // for detail::TypeTraits, etc. #include // for assert, etc. -#include // for std::ostream, etc. +#include // for int32_t, int64_t, etc. namespace nvcv::cuda { @@ -73,6 +73,12 @@ constexpr bool IsCompound = TypeTraits::components >= 1; template>> constexpr bool HasEnoughComponents = N <= TypeTraits::components; +template +constexpr bool IsStrideType = std::is_same_v || std::is_same_v; + +template +constexpr bool IsIndexType = std::is_integral_v && (TypeTraits::max <= TypeTraits::max); + /** * Metatype to get the base type of a CUDA compound types. 
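On the host side, a sketch of the stride-typed factory functions shown above; this is illustrative, and the stride type is spelled out explicitly rather than relying on any default.

```cpp
#include <cstdint>
#include <nvcv/TensorData.hpp>
#include <nvcv/cuda/TensorWrap.hpp>

// Illustrative only: the factories assert (as in the diff) that every wrapped
// stride fits into the requested integer type, so the 32-bit variant is only
// valid for tensors whose sample/row strides stay below 2^31 bytes.
void MakeWraps(const nvcv::TensorDataStridedCuda &tensorData)
{
    auto wrapWide   = nvcv::cuda::CreateTensorWrapNHW<std::uint8_t, std::int64_t>(tensorData); // 64-bit offsets
    auto wrapNarrow = nvcv::cuda::CreateTensorWrapNHW<std::uint8_t, std::int32_t>(tensorData); // 32-bit offsets
    (void)wrapWide;
    (void)wrapNarrow;
}
```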
* @@ -195,6 +201,29 @@ __host__ __device__ RT &GetElement(T &v, int eidx) } } +template, BaseType, T>>, + class = Require>> +__host__ __device__ RT &GetElement(T &v) +{ + if constexpr (IsCompound) + { + static_assert(EIDX < NumElements); + if constexpr (EIDX == 0) + return v.x; + else if constexpr (EIDX == 1) + return v.y; + else if constexpr (EIDX == 2) + return v.z; + else if constexpr (EIDX == 3) + return v.w; + } + else + { + return v; + } +} + /** * Metafunction to set all elements to the same value. * @@ -221,11 +250,13 @@ __host__ __device__ T SetAll(BaseType x) { T out{}; -#pragma unroll - for (int e = 0; e < NumElements; ++e) - { - GetElement(out, e) = x; - } + GetElement<0>(out) = x; + if constexpr (nvcv::cuda::NumElements >= 2) + GetElement<1>(out) = x; + if constexpr (nvcv::cuda::NumElements >= 3) + GetElement<2>(out) = x; + if constexpr (nvcv::cuda::NumElements == 4) + GetElement<3>(out) = x; return out; } @@ -260,46 +291,4 @@ __host__ const char *GetTypeName() } // namespace nvcv::cuda -/** - * Metaoperator to insert a pixel into an output stream. - * - * The pixel may be a CUDA compound type with 1 to 4 components. This operator returns the output stream - * changed by an additional string with the name of the type followed by each component value in between - * parentheses. - * - * @code - * DataType pix = ...; - * std::cout << pix; - * @endcode - * - * @tparam T Type of the pixel to be inserted in the output stream. - * - * @param[in, out] out Output stream to be changed and returned. - * @param[in] v Pixel value to be inserted formatted in the output stream. - * - * @return Output stream with the data type and values. - */ -template>> -__host__ std::ostream &operator<<(std::ostream &out, const T &v) -{ - using BT = nvcv::cuda::BaseType; - using OutType = std::conditional_t; - constexpr int NC = nvcv::cuda::NumComponents; - - out << nvcv::cuda::GetTypeName() << "("; - - for (int c = 0; c < NC; ++c) - { - if (c > 0) - { - out << ", "; - } - out << static_cast(nvcv::cuda::GetElement(v, c)); - } - - out << ")"; - - return out; -} - #endif // NVCV_CUDA_TYPE_TRAITS_HPP diff --git a/src/util/Assert.h b/src/util/Assert.h index a741b7eeb..c2628673d 100644 --- a/src/util/Assert.h +++ b/src/util/Assert.h @@ -43,7 +43,7 @@ NVCV_ASSERT_NORETURN void NvCVAssert(const char *file, int line, const char *con # define NVCV_SOURCE_FILE_LINENO __LINE__ # define NVCV_OPTIONAL_STRINGIFY(X) # X #else -# define NVCV_SOURCE_FILE_NAME NULL +# define NVCV_SOURCE_FILE_NAME "" # define NVCV_SOURCE_FILE_LINENO 0 # define NVCV_OPTIONAL_STRINGIFY(X) "" #endif diff --git a/tests/common/InterpUtils.hpp b/tests/common/InterpUtils.hpp index 7ce5592bb..956956e13 100644 --- a/tests/common/InterpUtils.hpp +++ b/tests/common/InterpUtils.hpp @@ -99,21 +99,24 @@ inline RT GetCoord(int x, int y, int z = 0, int k = 0) return RT{k, x, y, z}; } -inline float GetBicubicCoeff(float c) +inline void GetBicubicCoeffs(float delta, float &w0, float &w1, float &w2, float &w3) { - c = std::fabs(c); - if (c <= 1.0f) - { - return c * c * (1.5f * c - 2.5f) + 1.0f; - } - else if (c < 2.0f) - { - return c * (c * (-0.5f * c + 2.5f) - 4.0f) + 2.0f; - } - else - { - return 0.0f; - } + w0 = -.5f; + w0 = w0 * delta + 1.f; + w0 = w0 * delta - .5f; + w0 = w0 * delta; + + w1 = 1.5f; + w1 = w1 * delta - 2.5f; + w1 = w1 * delta; + w1 = w1 * delta + 1.f; + + w2 = -1.5f; + w2 = w2 * delta + 2.f; + w2 = w2 * delta + .5f; + w2 = w2 * delta; + + w3 = 1 - w0 - w1 - w2; } template @@ -149,28 +152,26 @@ inline ValueType GoldInterp(const std::vector 
&vec, const StridesType & } else if constexpr (I == NVCV_INTERP_CUBIC) { - int xmin = cuda::round(coord.x - 2.f); - int ymin = cuda::round(coord.y - 2.f); - int xmax = cuda::round(coord.x + 2.f); - int ymax = cuda::round(coord.y + 2.f); + int ix = cuda::round(coord.x); + int iy = cuda::round(coord.y); using FT = cuda::ConvertBaseTypeTo; auto sum = cuda::SetAll(0); - float w, wsum = 0.f; + float wx[4]; + test::GetBicubicCoeffs(coord.x - ix, wx[0], wx[1], wx[2], wx[3]); + float wy[4]; + test::GetBicubicCoeffs(coord.y - iy, wy[0], wy[1], wy[2], wy[3]); - for (int cy = ymin; cy <= ymax; cy++) + for (int cy = -1; cy <= 2; cy++) { - for (int cx = xmin; cx <= xmax; cx++) + for (int cx = -1; cx <= 2; cx++) { - w = GetBicubicCoeff(coord.x - cx) * GetBicubicCoeff(coord.y - cy); - sum += w * ValueAt(vec, strides, size, bValue, GetCoord(cx, cy, z, k)); - wsum += w; + sum += (wx[cx + 1] * wy[cy + 1]) + * ValueAt(vec, strides, size, bValue, GetCoord(ix + cx, iy + cy, z, k)); } } - sum = (wsum == 0.f) ? cuda::SetAll(0) : sum / wsum; - return cuda::SaturateCast(sum); } else if constexpr (I == NVCV_INTERP_AREA) diff --git a/tests/cvcuda/python/cvcuda_test_python.in b/tests/cvcuda/python/cvcuda_test_python.in index 94cc6651e..8b3e1bcd9 100755 --- a/tests/cvcuda/python/cvcuda_test_python.in +++ b/tests/cvcuda/python/cvcuda_test_python.in @@ -37,16 +37,16 @@ if [ ! -f "$tests_dir/cvcuda_util.py" ]; then fi # Verify if correct package dependencies are installed -------- -pip_depends="pytest torch" +pip_depends="pytest torch numpy==1.26" # Collect all python versions that are indeed installed and have proper dependencies installed # Two behaviors: # - default: skip Python versions that are not installed or don't have pytest and torch installed # - if NVCV_FORCE_PYTHON is set: exit with error for ver in $python_versions_tentative; do - if ! python$ver -c "import pytest, torch" > /dev/null 2>&1; then + if ! 
python$ver -c "import pytest, torch, numpy" > /dev/null 2>&1; then echo "WARNING: Python version $ver not installed or missing proper dependencies" - echo "Please install Python version $ver and run the following commands before running tests: sudo python$ver -m pip install $pip_depends" + echo "Please install Python version $ver and run the following commands before running tests: python$ver -m pip install $pip_depends" if [[ "$NVCV_FORCE_PYTHON" == 1 || "$NVCV_FORCE_PYTHON" == yes ]]; then echo "Exiting with FAILURE, as NVCV_FORCE_PYTHON=$NVCV_FORCE_PYTHON" exit 2 #hard exit diff --git a/tests/cvcuda/python/test_opresizecropconvertreformat.py b/tests/cvcuda/python/test_opresizecropconvertreformat.py index 0f9f2a796..16f0456e6 100644 --- a/tests/cvcuda/python/test_opresizecropconvertreformat.py +++ b/tests/cvcuda/python/test_opresizecropconvertreformat.py @@ -27,7 +27,8 @@ @t.mark.parametrize( "tensor_params, resize_dim, resize_interpolation, crop_rect_params, " - "out_layout, out_dtype, manip, out_expected_shape, is_positive_test", + "out_layout, out_dtype, manip, out_expected_shape, scale_norm, offset_norm, " + "is_positive_test", [ ( ((4, 512, 512, 3), np.uint8, "NHWC"), # Basic test @@ -38,6 +39,8 @@ nvcv.Type.F32, cvcuda.ChannelManip.REVERSE, (4, 3, 224, 224), + 1, + 0, True, ), ( @@ -49,6 +52,8 @@ nvcv.Type.F32, cvcuda.ChannelManip.REVERSE, (4, 3, 224, 224), + 1, + 0, True, ), ( @@ -60,6 +65,8 @@ nvcv.Type.F32, cvcuda.ChannelManip.REVERSE, (4, 224, 224, 3), + 1, + 0, True, ), ( @@ -71,6 +78,8 @@ 0, # Zero means keep the same dtype as input cvcuda.ChannelManip.REVERSE, (4, 224, 224, 3), + 1, + 0, True, ), ( @@ -82,6 +91,8 @@ nvcv.Type.F32, cvcuda.ChannelManip.REVERSE, (17, 3, 22, 200), + 1, + 0, True, ), ( @@ -93,6 +104,8 @@ nvcv.Type.F32, cvcuda.ChannelManip.REVERSE, (17, 22, 200, 3), + 1, + 0, True, ), ( @@ -104,6 +117,8 @@ nvcv.Type.U8, # Same dtype as the input tensor cvcuda.ChannelManip.NO_OP, # No op here (3, 3, 35, 20), + 1, + 0, True, ), ( @@ -115,6 +130,8 @@ nvcv.Type.F32, cvcuda.ChannelManip.REVERSE, (3, 224, 224), + 1, + 0, True, ), ( @@ -126,6 +143,8 @@ nvcv.Type.F32, cvcuda.ChannelManip.REVERSE, (3, 224, 224), + 1, + 0, False, # Negative test ), ( @@ -137,6 +156,21 @@ nvcv.Type.F32, cvcuda.ChannelManip.NO_OP, (3, 224, 1024), + 1, + 0, + True, + ), + ( + ((4, 678, 1027, 3), np.uint8, "NHWC"), + (251, 256), + cvcuda.Interp.LINEAR, + (0, 0, 200, 22), + "NCHW", + nvcv.Type.F32, + cvcuda.ChannelManip.REVERSE, + (4, 3, 22, 200), + 127.5, # Normalize output to [-1:1] + -1, True, ), ( @@ -148,6 +182,8 @@ nvcv.Type.F32, cvcuda.ChannelManip.REVERSE, (3, 224, 1024), + 1, + 0, False, # Negative test ), ( @@ -159,6 +195,8 @@ nvcv.Type.F32, cvcuda.ChannelManip.REVERSE, (3, 5, 59), + 1, + 0, True, ), ( @@ -170,6 +208,8 @@ nvcv.Type.F32, cvcuda.ChannelManip.REVERSE, (3, 5, 59), + 1, + 0, False, # Negative test ), ( @@ -181,6 +221,8 @@ nvcv.Type.F32, cvcuda.ChannelManip.REVERSE, (4, 3, 224, 224), + 1, + 0, False, # Negative test ), ], @@ -194,6 +236,8 @@ def test_op_resize_crop_convert_reformat( out_dtype, manip, out_expected_shape, + scale_norm, + offset_norm, is_positive_test, ): @@ -210,6 +254,8 @@ def test_op_resize_crop_convert_reformat( layout=out_layout, data_type=out_dtype, manip=manip, + scale=scale_norm, + offset=offset_norm, ) except Exception as e: if is_positive_test: @@ -233,6 +279,8 @@ def test_op_resize_crop_convert_reformat( resize_interpolation, [crop_rect_params[1], crop_rect_params[0]], manip=manip, + scale=scale_norm, + offset=offset_norm, ) except Exception as e: if 
is_positive_test: @@ -256,7 +304,7 @@ def test_op_resize_crop_convert_reformat( @t.mark.parametrize( "num_images, min_size, max_size, resize_dim, resize_interpolation, crop_rect_params, " - "out_layout, out_dtype, manip, out_expected_shape, is_positive_test", + "out_layout, out_dtype, manip, out_expected_shape, scale_norm, offset_norm, is_positive_test", [ ( 10, # Basic test @@ -269,6 +317,8 @@ def test_op_resize_crop_convert_reformat( nvcv.Type.F32, cvcuda.ChannelManip.REVERSE, (10, 3, 224, 224), + 1, + 0, True, ), ( @@ -282,6 +332,8 @@ def test_op_resize_crop_convert_reformat( nvcv.Type.F32, cvcuda.ChannelManip.REVERSE, (1, 190, 224, 3), + 1, + 0, True, ), ( @@ -295,6 +347,8 @@ def test_op_resize_crop_convert_reformat( nvcv.Type.F32, cvcuda.ChannelManip.NO_OP, # No channels swapping (50, 190, 224, 3), + 1, + 0, True, ), ( @@ -308,6 +362,8 @@ def test_op_resize_crop_convert_reformat( nvcv.Type.U8, # Same uint8 dtype as the input cvcuda.ChannelManip.REVERSE, (50, 3, 190, 224), + 1, + 0, True, ), ( @@ -321,6 +377,8 @@ def test_op_resize_crop_convert_reformat( nvcv.Type.U8, cvcuda.ChannelManip.NO_OP, # NO_OP (50, 3, 190, 224), + 1, + 0, True, ), ( @@ -334,6 +392,8 @@ def test_op_resize_crop_convert_reformat( 0, # Same uint8 dtype as the input cvcuda.ChannelManip.REVERSE, (50, 190, 224, 3), + 1, + 0, True, ), ], @@ -349,6 +409,8 @@ def test_op_resize_crop_convert_reformat_varshape( out_dtype, manip, out_expected_shape, + scale_norm, + offset_norm, is_positive_test, ): @@ -378,6 +440,8 @@ def test_op_resize_crop_convert_reformat_varshape( layout=out_layout, data_type=out_dtype, manip=manip, + scale=scale_norm, + offset=offset_norm, ) except Exception as e: if is_positive_test: diff --git a/tests/cvcuda/system/ResizeUtils.cpp b/tests/cvcuda/system/ResizeUtils.cpp index 98bfa2792..45c04ece5 100644 --- a/tests/cvcuda/system/ResizeUtils.cpp +++ b/tests/cvcuda/system/ResizeUtils.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -29,18 +29,18 @@ namespace nvcv::test { template -void _Resize(std::vector &hDst, int dstStep, nvcv::Size2D dstSize, const std::vector &hSrc, int srcStep, - nvcv::Size2D srcSize, nvcv::ImageFormat fmt, NVCVInterpolationType interp, bool isVarshape) +void resize(std::vector &dst, int dstStep, nvcv::Size2D dstSize, const std::vector &src, int srcStep, + nvcv::Size2D srcSize, nvcv::ImageFormat frmt, NVCVInterpolationType interp, bool isVarshape) { double scaleH = static_cast(srcSize.h) / dstSize.h; double scaleW = static_cast(srcSize.w) / dstSize.w; - assert(fmt.numPlanes() == 1); + assert(frmt.numPlanes() == 1); - int channels = fmt.numChannels(); + int channels = frmt.numChannels(); - T *dstPtr = hDst.data(); - const T *srcPtr = hSrc.data(); + T *dstPtr = dst.data(); + const T *srcPtr = src.data(); for (int dy = 0; dy < dstSize.h; dy++) { @@ -178,7 +178,10 @@ void _Resize(std::vector &hDst, int dstStep, nvcv::Size2D dstSize, const std: } } - out = std::rint(std::abs(out)); + if (std::numeric_limits::is_integer) + { + out = std::rint(std::numeric_limits::is_signed ? out : std::abs(out)); + } dstPtr[dy * dstStep + dx * channels + c] = out < MinVal ? MinVal : (out > MaxVal ? 
MaxVal : out); } @@ -188,19 +191,19 @@ void _Resize(std::vector &hDst, int dstStep, nvcv::Size2D dstSize, const std: } template -void _ResizedCrop(std::vector &hDst, int dstStep, nvcv::Size2D dstSize, const std::vector &hSrc, int srcStep, - nvcv::Size2D srcSize, int top, int left, int crop_rows, int crop_cols, nvcv::ImageFormat fmt, - NVCVInterpolationType interp) +void resizedCrop(std::vector &dst, int dstStep, nvcv::Size2D dstSize, const std::vector &src, int srcStep, + nvcv::Size2D srcSize, int top, int left, int crop_rows, int crop_cols, nvcv::ImageFormat frmt, + NVCVInterpolationType interp) { - double scaleH = static_cast(crop_rows) / dstSize.h; - double scaleW = static_cast(crop_cols) / dstSize.w; + float scaleH = static_cast(crop_rows) / dstSize.h; + float scaleW = static_cast(crop_cols) / dstSize.w; - assert(fmt.numPlanes() == 1); + assert(frmt.numPlanes() == 1); - int channels = fmt.numChannels(); + int channels = frmt.numChannels(); - T *dstPtr = hDst.data(); - const T *srcPtr = hSrc.data(); + T *dstPtr = dst.data(); + const T *srcPtr = src.data(); for (int dy = 0; dy < dstSize.h; dy++) { @@ -208,8 +211,8 @@ void _ResizedCrop(std::vector &hDst, int dstStep, nvcv::Size2D dstSize, const { if (interp == NVCV_INTERP_NEAREST) { - double fy = scaleH * dy + top; - double fx = scaleW * dx + left; + float fy = scaleH * (dy + 0.5f) + top; + float fx = scaleW * (dx + 0.5f) + left; int sy = std::floor(fy); int sx = std::floor(fx); @@ -308,68 +311,63 @@ void _ResizedCrop(std::vector &hDst, int dstStep, nvcv::Size2D dstSize, const } } -void Resize(std::vector &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector &hSrc, - int srcRowStride, nvcv::Size2D srcSize, nvcv::ImageFormat fmt, NVCVInterpolationType interp, - bool isVarShape) +template +void _Resize(std::vector &dst, int dstStride, nvcv::Size2D dstSize, const std::vector &src, int srcStride, + nvcv::Size2D srcSize, nvcv::ImageFormat frmt, NVCVInterpolationType interp, bool isVarShape) { - int dstStep = dstRowStride / sizeof(uint8_t); - int srcStep = srcRowStride / sizeof(uint8_t); + int dstStep = dstStride / sizeof(T); + int srcStep = srcStride / sizeof(T); if (interp == NVCV_INTERP_NEAREST || interp == NVCV_INTERP_LINEAR || interp == NVCV_INTERP_CUBIC) { - _ResizedCrop(hDst, dstStep, dstSize, hSrc, srcStep, srcSize, 0, 0, srcSize.h, srcSize.w, fmt, - interp); + resizedCrop(dst, dstStep, dstSize, src, srcStep, srcSize, 0, 0, srcSize.h, srcSize.w, frmt, + interp); } else if (interp == NVCV_INTERP_AREA) { - _Resize(hDst, dstStep, dstSize, hSrc, srcStep, srcSize, fmt, interp, isVarShape); + resize(dst, dstStep, dstSize, src, srcStep, srcSize, frmt, interp, isVarShape); } } -void ResizedCrop(std::vector &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector &hSrc, - int srcRowStride, nvcv::Size2D srcSize, int top, int left, int crop_rows, int crop_cols, - nvcv::ImageFormat fmt, NVCVInterpolationType interp) +void Resize(std::vector &dst, int dstStride, nvcv::Size2D dstSize, const std::vector &src, + int srcStride, nvcv::Size2D srcSize, nvcv::ImageFormat frmt, NVCVInterpolationType interp, bool isVarShape) { - int dstStep = dstRowStride / sizeof(uint8_t); - int srcStep = srcRowStride / sizeof(uint8_t); + _Resize(dst, dstStride, dstSize, src, srcStride, srcSize, frmt, interp, isVarShape); +} - if (interp == NVCV_INTERP_NEAREST || interp == NVCV_INTERP_LINEAR || interp == NVCV_INTERP_CUBIC) - { - _ResizedCrop(hDst, dstStep, dstSize, hSrc, srcStep, srcSize, top, left, crop_rows, crop_cols, - fmt, interp); - } +void 
Resize(std::vector &dst, int dstStride, nvcv::Size2D dstSize, const std::vector &src, int srcStride, + nvcv::Size2D srcSize, nvcv::ImageFormat frmt, NVCVInterpolationType interp, bool isVarShape) +{ + _Resize(dst, dstStride, dstSize, src, srcStride, srcSize, frmt, interp, isVarShape); } -void Resize(std::vector &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector &hSrc, - int srcRowStride, nvcv::Size2D srcSize, nvcv::ImageFormat fmt, NVCVInterpolationType interp, - bool isVarShape) +template +void _ResizedCrop(std::vector &dst, int dstStride, nvcv::Size2D dstSize, const std::vector &src, int srcStride, + nvcv::Size2D srcSize, int top, int left, int crop_rows, int crop_cols, nvcv::ImageFormat frmt, + NVCVInterpolationType interp) { - int dstStep = dstRowStride / sizeof(float); - int srcStep = srcRowStride / sizeof(float); + int dstStep = dstStride / sizeof(T); + int srcStep = srcStride / sizeof(T); if (interp == NVCV_INTERP_NEAREST || interp == NVCV_INTERP_LINEAR || interp == NVCV_INTERP_CUBIC) { - _ResizedCrop(hDst, dstStep, dstSize, hSrc, srcStep, srcSize, 0, 0, srcSize.h, srcSize.w, - fmt, interp); - } - else if (interp == NVCV_INTERP_AREA) - { - _Resize(hDst, dstStep, dstSize, hSrc, srcStep, srcSize, fmt, interp, isVarShape); + resizedCrop(dst, dstStep, dstSize, src, srcStep, srcSize, top, left, crop_rows, crop_cols, + frmt, interp); } } -void ResizedCrop(std::vector &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector &hSrc, - int srcRowStride, nvcv::Size2D srcSize, int top, int left, int crop_rows, int crop_cols, - nvcv::ImageFormat fmt, NVCVInterpolationType interp) +void ResizedCrop(std::vector &dst, int dstStride, nvcv::Size2D dstSize, const std::vector &src, + int srcStride, nvcv::Size2D srcSize, int top, int left, int crop_rows, int crop_cols, + nvcv::ImageFormat frmt, NVCVInterpolationType interp) { - int dstStep = dstRowStride / sizeof(float); - int srcStep = srcRowStride / sizeof(float); + _ResizedCrop(dst, dstStride, dstSize, src, srcStride, srcSize, top, left, crop_rows, crop_cols, frmt, interp); +} - if (interp == NVCV_INTERP_NEAREST || interp == NVCV_INTERP_LINEAR || interp == NVCV_INTERP_CUBIC) - { - _ResizedCrop(hDst, dstStep, dstSize, hSrc, srcStep, srcSize, top, left, crop_rows, - crop_cols, fmt, interp); - } +void ResizedCrop(std::vector &dst, int dstStride, nvcv::Size2D dstSize, const std::vector &src, + int srcStride, nvcv::Size2D srcSize, int top, int left, int crop_rows, int crop_cols, + nvcv::ImageFormat frmt, NVCVInterpolationType interp) +{ + _ResizedCrop(dst, dstStride, dstSize, src, srcStride, srcSize, top, left, crop_rows, crop_cols, frmt, interp); } } // namespace nvcv::test diff --git a/tests/cvcuda/system/ResizeUtils.hpp b/tests/cvcuda/system/ResizeUtils.hpp index ad1ffd482..960830481 100644 --- a/tests/cvcuda/system/ResizeUtils.hpp +++ b/tests/cvcuda/system/ResizeUtils.hpp @@ -28,7 +28,7 @@ namespace nvcv::test { -// support NVCV_INTERP_NEAREST/NVCV_INTERP_LINEAR/NVCV_INTERP_CUBIC/NVCV_INTERP_AREA +// Supports NVCV_INTERP_NEAREST / NVCV_INTERP_LINEAR / NVCV_INTERP_CUBIC / NVCV_INTERP_AREA void Resize(std::vector &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector &hSrc, int srcRowStride, nvcv::Size2D srcSize, nvcv::ImageFormat fmt, NVCVInterpolationType interpolation, bool isVarShape); @@ -37,7 +37,7 @@ void Resize(std::vector &hDst, int dstRowStride, nvcv::Size2D dstSize, co int srcRowStride, nvcv::Size2D srcSize, nvcv::ImageFormat fmt, NVCVInterpolationType interpolation, bool isVarShape); -// only support 
NVCV_INTERP_NEAREST/NVCV_INTERP_LINEAR/NVCV_INTERP_CUBIC +// Only supports NVCV_INTERP_NEAREST / NVCV_INTERP_LINEAR / NVCV_INTERP_CUBIC void ResizedCrop(std::vector &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector &hSrc, int srcRowStride, nvcv::Size2D srcSize, int top, int left, int crop_rows, int crop_cols, nvcv::ImageFormat fmt, NVCVInterpolationType interpolation); diff --git a/tests/cvcuda/system/TestOpErase.cpp b/tests/cvcuda/system/TestOpErase.cpp index a7126dcbf..ca249ea0d 100644 --- a/tests/cvcuda/system/TestOpErase.cpp +++ b/tests/cvcuda/system/TestOpErase.cpp @@ -25,27 +25,35 @@ #include #include +#include #include -NVCV_TEST_SUITE_P(OpErase, nvcv::test::ValueList{ - {1, false}, - {2, false}, - {1, true}, - {2, true} +NVCV_TEST_SUITE_P(OpErase, nvcv::test::ValueList{ + // N, random, isInplace + {1, false, false}, + {2, false, false}, + {1, true, false}, + {2, true, false}, + {1, false, true}, + {2, false, true}, + {1, true, true}, + {2, true, true} }); TEST_P(OpErase, correct_output) { int N = GetParamValue<0>(); bool random = GetParamValue<1>(); + bool isInplace = GetParamValue<2>(); int max_num_erasing_area = 2; unsigned int seed = 0; cudaStream_t stream; EXPECT_EQ(cudaSuccess, cudaStreamCreate(&stream)); - nvcv::Tensor imgIn = nvcv::util::CreateTensor(N, 640, 480, nvcv::FMT_U8); - nvcv::Tensor imgOut = nvcv::util::CreateTensor(N, 640, 480, nvcv::FMT_U8); + nvcv::Tensor imgIn = nvcv::util::CreateTensor(N, 640, 480, nvcv::FMT_U8); + nvcv::Tensor _imgOut = nvcv::util::CreateTensor(N, 640, 480, nvcv::FMT_U8); + nvcv::Tensor &imgOut = isInplace ? imgIn : _imgOut; auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(imgIn.exportData()); ASSERT_TRUE(inAccess); @@ -72,7 +80,10 @@ TEST_P(OpErase, correct_output) int64_t outBufferSize = outSampleStride * outAccess->numSamples(); // Set output buffer to dummy value - EXPECT_EQ(cudaSuccess, cudaMemset(outAccess->sampleData(0), 0xFA, outBufferSize)); + if (!isInplace) + { + EXPECT_EQ(cudaSuccess, cudaMemset(outAccess->sampleData(0), 0xFA, outBufferSize)); + } //parameters int num_erasing_area = 2; @@ -153,109 +164,261 @@ TEST(OpErase, OpErase_Varshape) { cudaStream_t stream; EXPECT_EQ(cudaSuccess, cudaStreamCreate(&stream)); - - std::vector imgSrc; - imgSrc.emplace_back(nvcv::Size2D{640, 480}, nvcv::FMT_U8); - - nvcv::ImageBatchVarShape batchSrc(1); - batchSrc.pushBack(imgSrc.begin(), imgSrc.end()); - - for (int i = 0; i < 1; ++i) + std::deque isInplaces{true, false}; + for (bool isInplace : isInplaces) { - const auto srcData = imgSrc[i].exportData(); - assert(srcData->numPlanes() == 1); + std::vector imgSrc, imgDst; + imgSrc.emplace_back(nvcv::Size2D{640, 480}, nvcv::FMT_U8); + imgDst.emplace_back(nvcv::Size2D{640, 480}, nvcv::FMT_U8); + + nvcv::ImageBatchVarShape batchSrc(1); + nvcv::ImageBatchVarShape _batchDst(1); + nvcv::ImageBatchVarShape &batchDst = isInplace ? 
batchSrc : _batchDst; + batchSrc.pushBack(imgSrc.begin(), imgSrc.end()); + _batchDst.pushBack(imgDst.begin(), imgDst.end()); + + for (int i = 0; i < 1; ++i) + { + const auto srcData = imgSrc[i].exportData(); + assert(srcData->numPlanes() == 1); + + int srcWidth = srcData->plane(0).width; + int srcHeight = srcData->plane(0).height; + + int srcRowStride = srcWidth * nvcv::FMT_U8.planePixelStrideBytes(0); + + EXPECT_EQ(cudaSuccess, cudaMemset2D(srcData->plane(0).basePtr, srcRowStride, 0, srcRowStride, srcHeight)); + } + + if (!isInplace) + { + for (int i = 0; i < 1; ++i) + { + const auto dstData = imgSrc[i].exportData(); + int dstWidth = dstData->plane(0).width; + int dstHeight = dstData->plane(0).height; + int dstRowStride = dstWidth * nvcv::FMT_U8.planePixelStrideBytes(0); + EXPECT_EQ(cudaSuccess, + cudaMemset2D(dstData->plane(0).basePtr, dstRowStride, 0, dstRowStride, dstHeight)); + } + } + + //parameters + int num_erasing_area = 2; + nvcv::Tensor anchor({{num_erasing_area}, "N"}, nvcv::TYPE_2S32); + nvcv::Tensor erasing({{num_erasing_area}, "N"}, nvcv::TYPE_3S32); + nvcv::Tensor values({{num_erasing_area}, "N"}, nvcv::TYPE_F32); + nvcv::Tensor imgIdx({{num_erasing_area}, "N"}, nvcv::TYPE_S32); + + auto anchorData = anchor.exportData(); + auto erasingData = erasing.exportData(); + auto valuesData = values.exportData(); + auto imgIdxData = imgIdx.exportData(); + + ASSERT_NE(nullptr, anchorData); + ASSERT_NE(nullptr, erasingData); + ASSERT_NE(nullptr, valuesData); + ASSERT_NE(nullptr, imgIdxData); + + std::vector anchorVec(num_erasing_area); + std::vector erasingVec(num_erasing_area); + std::vector imgIdxVec(num_erasing_area); + std::vector valuesVec(num_erasing_area); + + anchorVec[0].x = 0; + anchorVec[0].y = 0; + erasingVec[0].x = 10; + erasingVec[0].y = 10; + erasingVec[0].z = 0x1; + imgIdxVec[0] = 0; + valuesVec[0] = 1.f; + + anchorVec[1].x = 10; + anchorVec[1].y = 10; + erasingVec[1].x = 20; + erasingVec[1].y = 20; + erasingVec[1].z = 0x1; + imgIdxVec[1] = 0; + valuesVec[1] = 1.f; + + // Copy vectors to the GPU + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(anchorData->basePtr(), anchorVec.data(), anchorVec.size() * sizeof(int2), + cudaMemcpyHostToDevice, stream)); + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(erasingData->basePtr(), erasingVec.data(), + erasingVec.size() * sizeof(int3), cudaMemcpyHostToDevice, stream)); + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(imgIdxData->basePtr(), imgIdxVec.data(), imgIdxVec.size() * sizeof(int), + cudaMemcpyHostToDevice, stream)); + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(valuesData->basePtr(), valuesVec.data(), + valuesVec.size() * sizeof(float), cudaMemcpyHostToDevice, stream)); + + // Call operator + unsigned int seed = 0; + bool random = false; + int max_num_erasing_area = 2; + cvcuda::Erase eraseOp(max_num_erasing_area); + EXPECT_NO_THROW(eraseOp(stream, batchSrc, batchDst, anchor, erasing, values, imgIdx, random, seed)); + + EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + + const auto dstData = isInplace ? 
imgSrc[0].exportData() + : imgDst[0].exportData(); + assert(dstData->numPlanes() == 1); + + int dstWidth = dstData->plane(0).width; + int dstHeight = dstData->plane(0).height; + + int dstRowStride = dstWidth * nvcv::FMT_U8.planePixelStrideBytes(0); + + std::vector test(dstHeight * dstRowStride, 0xFF); + + // Copy output data to Host + if (!random) + { + ASSERT_EQ(cudaSuccess, + cudaMemcpy2D(test.data(), dstRowStride, dstData->plane(0).basePtr, dstData->plane(0).rowStride, + dstRowStride, dstHeight, cudaMemcpyDeviceToHost)); + + EXPECT_EQ(test[0], 1); + EXPECT_EQ(test[9], 1); + EXPECT_EQ(test[10], 0); + EXPECT_EQ(test[9 * 640], 1); + EXPECT_EQ(test[9 * 640 + 9], 1); + EXPECT_EQ(test[9 * 640 + 10], 0); + EXPECT_EQ(test[10 * 640], 0); + EXPECT_EQ(test[10 * 640 + 10], 1); + } + } - int srcWidth = srcData->plane(0).width; - int srcHeight = srcData->plane(0).height; + EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); +} - int srcRowStride = srcWidth * nvcv::FMT_U8.planePixelStrideBytes(0); +// clang-format off +NVCV_TEST_SUITE_P(OpErase_Negative, nvcv::test::ValueList +{ + // in_layout, in_data_type, out_layout, out_data_type, anchor_layout, anchor_datatype, erasingData_layout, erasingData_datatype, imgIdxData_layout, imgIdxData_datatype, valuesData_layout, valuesData_type, num_erasing_area expectedReturnCode + { "CHW", nvcv::TYPE_U8, "HWC", nvcv::TYPE_U8, "N", nvcv::TYPE_2S32, "N", nvcv::TYPE_3S32, "N", nvcv::TYPE_S32, "N", nvcv::TYPE_F32, 2, NVCV_ERROR_INVALID_ARGUMENT}, // invalid in layout + { "HWC", nvcv::TYPE_F16, "HWC", nvcv::TYPE_U8, "N", nvcv::TYPE_2S32, "N", nvcv::TYPE_3S32, "N", nvcv::TYPE_S32, "N", nvcv::TYPE_F32, 2, NVCV_ERROR_INVALID_ARGUMENT}, // invalid in datatype + { "HWC", nvcv::TYPE_U8, "CHW", nvcv::TYPE_U8, "N", nvcv::TYPE_2S32, "N", nvcv::TYPE_3S32, "N", nvcv::TYPE_S32, "N", nvcv::TYPE_F32, 2, NVCV_ERROR_INVALID_ARGUMENT}, // invalid out layout + { "HWC", nvcv::TYPE_U8, "HWC", nvcv::TYPE_F16, "N", nvcv::TYPE_2S32, "N", nvcv::TYPE_3S32, "N", nvcv::TYPE_S32, "N", nvcv::TYPE_F32, 2, NVCV_ERROR_INVALID_ARGUMENT}, // invalid out datatype + { "HWC", nvcv::TYPE_U8, "HWC", nvcv::TYPE_U8, "N", nvcv::TYPE_2F32, "N", nvcv::TYPE_3S32, "N", nvcv::TYPE_S32, "N", nvcv::TYPE_F32, 2, NVCV_ERROR_INVALID_ARGUMENT}, // invalid anchor datatype + { "HWC", nvcv::TYPE_U8, "HWC", nvcv::TYPE_U8, "NHW", nvcv::TYPE_2S32, "N", nvcv::TYPE_3S32, "N", nvcv::TYPE_S32, "N", nvcv::TYPE_F32, 2, NVCV_ERROR_INVALID_ARGUMENT}, // invalid anchor dim + { "HWC", nvcv::TYPE_U8, "HWC", nvcv::TYPE_U8, "N", nvcv::TYPE_2S32, "N", nvcv::TYPE_3S32, "N", nvcv::TYPE_S32, "N", nvcv::TYPE_F32, 3, NVCV_ERROR_INVALID_ARGUMENT}, // Invalid num of erasing area 3 (> max) + { "HWC", nvcv::TYPE_U8, "HWC", nvcv::TYPE_U8, "N", nvcv::TYPE_2S32, "N", nvcv::TYPE_3F32, "N", nvcv::TYPE_S32, "N", nvcv::TYPE_F32, 2, NVCV_ERROR_INVALID_ARGUMENT}, // invalid erasing datatype + { "HWC", nvcv::TYPE_U8, "HWC", nvcv::TYPE_U8, "N", nvcv::TYPE_2S32, "NHW", nvcv::TYPE_3S32, "N", nvcv::TYPE_S32, "N", nvcv::TYPE_F32, 2, NVCV_ERROR_INVALID_ARGUMENT}, // invalid erasing dim + { "HWC", nvcv::TYPE_U8, "HWC", nvcv::TYPE_U8, "N", nvcv::TYPE_2S32, "N", nvcv::TYPE_3S32, "N", nvcv::TYPE_F32, "N", nvcv::TYPE_F32, 2, NVCV_ERROR_INVALID_ARGUMENT}, // invalid imgIdx datatype + { "HWC", nvcv::TYPE_U8, "HWC", nvcv::TYPE_U8, "N", nvcv::TYPE_2S32, "N", nvcv::TYPE_3S32, "NHW", nvcv::TYPE_S32, "N", nvcv::TYPE_F32, 2, NVCV_ERROR_INVALID_ARGUMENT}, // invalid imgIdx datatype + { "HWC", nvcv::TYPE_U8, "HWC", nvcv::TYPE_U8, "N", nvcv::TYPE_2S32, "N", nvcv::TYPE_3S32, "N", 
nvcv::TYPE_S32, "N", nvcv::TYPE_S32, 2, NVCV_ERROR_INVALID_ARGUMENT}, // invalid values datatype + { "HWC", nvcv::TYPE_U8, "HWC", nvcv::TYPE_U8, "N", nvcv::TYPE_2S32, "N", nvcv::TYPE_3S32, "N", nvcv::TYPE_S32, "NHW", nvcv::TYPE_F32, 2, NVCV_ERROR_INVALID_ARGUMENT}, // invalid values datatype +}); - EXPECT_EQ(cudaSuccess, cudaMemset2D(srcData->plane(0).basePtr, srcRowStride, 0, srcRowStride, srcHeight)); - } +// clang-format on - //parameters - int num_erasing_area = 2; - nvcv::Tensor anchor({{num_erasing_area}, "N"}, nvcv::TYPE_2S32); - nvcv::Tensor erasing({{num_erasing_area}, "N"}, nvcv::TYPE_3S32); - nvcv::Tensor values({{num_erasing_area}, "N"}, nvcv::TYPE_F32); - nvcv::Tensor imgIdx({{num_erasing_area}, "N"}, nvcv::TYPE_S32); +TEST(OpErase_Negative, create_null_handle) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, cvcudaEraseCreate(nullptr, 1)); +} - auto anchorData = anchor.exportData(); - auto erasingData = erasing.exportData(); - auto valuesData = values.exportData(); - auto imgIdxData = imgIdx.exportData(); +TEST(OpErase_Negative, create_negative_area) +{ + NVCVOperatorHandle handle; + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, cvcudaEraseCreate(&handle, -1)); +} - ASSERT_NE(nullptr, anchorData); - ASSERT_NE(nullptr, erasingData); - ASSERT_NE(nullptr, valuesData); - ASSERT_NE(nullptr, imgIdxData); +TEST_P(OpErase_Negative, infer_negative_parameter) +{ + std::string in_layout = GetParamValue<0>(); + nvcv::DataType in_data_type = GetParamValue<1>(); + std::string out_layout = GetParamValue<2>(); + nvcv::DataType out_data_type = GetParamValue<3>(); + std::string anchor_layout = GetParamValue<4>(); + nvcv::DataType anchor_datatype = GetParamValue<5>(); + std::string erasingData_layout = GetParamValue<6>(); + nvcv::DataType erasingData_datatype = GetParamValue<7>(); + std::string imgIdxData_layout = GetParamValue<8>(); + nvcv::DataType imgIdxData_datatype = GetParamValue<9>(); + std::string valuesData_layout = GetParamValue<10>(); + nvcv::DataType valuesData_datatype = GetParamValue<11>(); + int num_erasing_area = GetParamValue<12>(); + NVCVStatus expectedReturnCode = GetParamValue<13>(); - std::vector anchorVec(num_erasing_area); - std::vector erasingVec(num_erasing_area); - std::vector imgIdxVec(num_erasing_area); - std::vector valuesVec(num_erasing_area); + int max_num_erasing_area = 2; + unsigned int seed = 0; - anchorVec[0].x = 0; - anchorVec[0].y = 0; - erasingVec[0].x = 10; - erasingVec[0].y = 10; - erasingVec[0].z = 0x1; - imgIdxVec[0] = 0; - valuesVec[0] = 1.f; + nvcv::Tensor imgIn( + { + {24, 24, 2}, + in_layout.c_str() + }, + in_data_type); - anchorVec[1].x = 10; - anchorVec[1].y = 10; - erasingVec[1].x = 20; - erasingVec[1].y = 20; - erasingVec[1].z = 0x1; - imgIdxVec[1] = 0; - valuesVec[1] = 1.f; + nvcv::Tensor imgOut( + { + {24, 24, 2}, + out_layout.c_str() + }, + out_data_type); - // Copy vectors to the GPU - ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(anchorData->basePtr(), anchorVec.data(), anchorVec.size() * sizeof(int2), - cudaMemcpyHostToDevice, stream)); - ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(erasingData->basePtr(), erasingVec.data(), erasingVec.size() * sizeof(int3), - cudaMemcpyHostToDevice, stream)); - ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(imgIdxData->basePtr(), imgIdxVec.data(), imgIdxVec.size() * sizeof(int), - cudaMemcpyHostToDevice, stream)); - ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(valuesData->basePtr(), valuesVec.data(), valuesVec.size() * sizeof(float), - cudaMemcpyHostToDevice, stream)); + //parameters + nvcv::TensorShape anchorShape = anchor_layout.size() == 3 
? nvcv::TensorShape{{num_erasing_area, num_erasing_area, num_erasing_area}, anchor_layout.c_str()} : nvcv::TensorShape{{num_erasing_area}, anchor_layout.c_str()}; + nvcv::TensorShape erasingShape = erasingData_layout.size() == 3 ? nvcv::TensorShape{{num_erasing_area, num_erasing_area, num_erasing_area}, erasingData_layout.c_str()} : nvcv::TensorShape{{num_erasing_area}, erasingData_layout.c_str()}; + nvcv::TensorShape imgIdxShape = imgIdxData_layout.size() == 3 ? nvcv::TensorShape{{num_erasing_area, num_erasing_area, num_erasing_area}, imgIdxData_layout.c_str()} : nvcv::TensorShape{{num_erasing_area}, imgIdxData_layout.c_str()}; + nvcv::TensorShape valuesShape = valuesData_layout.size() == 3 ? nvcv::TensorShape{{num_erasing_area, num_erasing_area, num_erasing_area}, valuesData_layout.c_str()} : nvcv::TensorShape{{num_erasing_area}, valuesData_layout.c_str()}; + nvcv::Tensor anchor(anchorShape, anchor_datatype); + nvcv::Tensor erasing(erasingShape, erasingData_datatype); + nvcv::Tensor values(valuesShape, valuesData_datatype); + nvcv::Tensor imgIdx(imgIdxShape, imgIdxData_datatype); // Call operator - unsigned int seed = 0; - bool random = false; - int max_num_erasing_area = 2; cvcuda::Erase eraseOp(max_num_erasing_area); - EXPECT_NO_THROW(eraseOp(stream, batchSrc, batchSrc, anchor, erasing, values, imgIdx, random, seed)); - - EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + EXPECT_EQ( + expectedReturnCode, + nvcv::ProtectCall([&] { eraseOp(nullptr, imgIn, imgOut, anchor, erasing, values, imgIdx, false, seed); })); +} - const auto dstData = imgSrc[0].exportData(); - assert(dstData->numPlanes() == 1); +TEST_P(OpErase_Negative, varshape_infer_negative_parameter) +{ + std::string in_layout = GetParamValue<0>(); + nvcv::DataType in_data_type = GetParamValue<1>(); + std::string out_layout = GetParamValue<2>(); + nvcv::DataType out_data_type = GetParamValue<3>(); + std::string anchor_layout = GetParamValue<4>(); + nvcv::DataType anchor_datatype = GetParamValue<5>(); + std::string erasingData_layout = GetParamValue<6>(); + nvcv::DataType erasingData_datatype = GetParamValue<7>(); + std::string imgIdxData_layout = GetParamValue<8>(); + nvcv::DataType imgIdxData_datatype = GetParamValue<9>(); + std::string valuesData_layout = GetParamValue<10>(); + nvcv::DataType valuesData_datatype = GetParamValue<11>(); + int num_erasing_area = GetParamValue<12>(); + NVCVStatus expectedReturnCode = GetParamValue<13>(); - int dstWidth = dstData->plane(0).width; - int dstHeight = dstData->plane(0).height; + int max_num_erasing_area = 2; + unsigned int seed = 0; - int dstRowStride = dstWidth * nvcv::FMT_U8.planePixelStrideBytes(0); + if (in_layout == "CHW" || in_data_type == nvcv::TYPE_F16 || out_layout == "CHW" || out_data_type == nvcv::TYPE_F16) + { + GTEST_SKIP(); + } - std::vector test(dstHeight * dstRowStride, 0xFF); + std::vector imgSrc, imgDst; + imgSrc.emplace_back(nvcv::Size2D{32, 32}, nvcv::FMT_U8); - // Copy output data to Host - if (!random) - { - ASSERT_EQ(cudaSuccess, - cudaMemcpy2D(test.data(), dstRowStride, dstData->plane(0).basePtr, dstData->plane(0).rowStride, - dstRowStride, dstHeight, cudaMemcpyDeviceToHost)); + nvcv::ImageBatchVarShape batchSrc(1); + batchSrc.pushBack(imgSrc.begin(), imgSrc.end()); - EXPECT_EQ(test[0], 1); - EXPECT_EQ(test[9], 1); - EXPECT_EQ(test[10], 0); - EXPECT_EQ(test[9 * 640], 1); - EXPECT_EQ(test[9 * 640 + 9], 1); - EXPECT_EQ(test[9 * 640 + 10], 0); - EXPECT_EQ(test[10 * 640], 0); - EXPECT_EQ(test[10 * 640 + 10], 1); - } + //parameters + nvcv::TensorShape 
anchorShape = anchor_layout.size() == 3 ? nvcv::TensorShape{{num_erasing_area, num_erasing_area, num_erasing_area}, anchor_layout.c_str()} : nvcv::TensorShape{{num_erasing_area}, anchor_layout.c_str()}; + nvcv::TensorShape erasingShape = erasingData_layout.size() == 3 ? nvcv::TensorShape{{num_erasing_area, num_erasing_area, num_erasing_area}, erasingData_layout.c_str()} : nvcv::TensorShape{{num_erasing_area}, erasingData_layout.c_str()}; + nvcv::TensorShape imgIdxShape = imgIdxData_layout.size() == 3 ? nvcv::TensorShape{{num_erasing_area, num_erasing_area, num_erasing_area}, imgIdxData_layout.c_str()} : nvcv::TensorShape{{num_erasing_area}, imgIdxData_layout.c_str()}; + nvcv::TensorShape valuesShape = valuesData_layout.size() == 3 ? nvcv::TensorShape{{num_erasing_area, num_erasing_area, num_erasing_area}, valuesData_layout.c_str()} : nvcv::TensorShape{{num_erasing_area}, valuesData_layout.c_str()}; + nvcv::Tensor anchor(anchorShape, anchor_datatype); + nvcv::Tensor erasing(erasingShape, erasingData_datatype); + nvcv::Tensor values(valuesShape, valuesData_datatype); + nvcv::Tensor imgIdx(imgIdxShape, imgIdxData_datatype); - EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); + // Call operator + cvcuda::Erase eraseOp(max_num_erasing_area); + EXPECT_EQ( + expectedReturnCode, + nvcv::ProtectCall([&] { eraseOp(nullptr, batchSrc, batchSrc, anchor, erasing, values, imgIdx, false, seed); })); } diff --git a/tests/cvcuda/system/TestOpHQResize.cpp b/tests/cvcuda/system/TestOpHQResize.cpp index f9ce474af..b78d4ab22 100644 --- a/tests/cvcuda/system/TestOpHQResize.cpp +++ b/tests/cvcuda/system/TestOpHQResize.cpp @@ -117,12 +117,12 @@ struct CpuSample } private: - int offset(int sampleIdx, int2 idx) + int64_t offset(int sampleIdx, int2 idx) { return sampleIdx * m_strides.z + idx.y * m_strides.y + idx.x * m_strides.x; } - int offset(int sampleIdx, int3 idx) + int64_t offset(int sampleIdx, int3 idx) { return sampleIdx * m_strides.w + idx.z * m_strides.z + idx.y * m_strides.y + idx.x * m_strides.x; } @@ -643,7 +643,8 @@ NVCV_TYPED_TEST_SUITE( NVCV_TEST_ROW(4, NVCV_SHAPE2D(1024, 101), NVCV_SHAPE2D(105, 512), 2, float, float, NVCV_INTERP_LINEAR), NVCV_TEST_ROW(3, NVCV_SHAPE2D(31, 244), NVCV_SHAPE2D(311, 122), 3, float, float, NVCV_INTERP_CUBIC), NVCV_TEST_ROW(4, NVCV_SHAPE2D(41, 41), NVCV_SHAPE2D(244, 244), 4, float, float, NVCV_INTERP_GAUSSIAN), - NVCV_TEST_ROW(3, NVCV_SHAPE2D(769, 211), NVCV_SHAPE2D(40, 40), 7, float, float, NVCV_INTERP_LANCZOS)>); + NVCV_TEST_ROW(3, NVCV_SHAPE2D(769, 211), NVCV_SHAPE2D(40, 40), 7, float, float, NVCV_INTERP_LANCZOS), + NVCV_TEST_ROW(1, NVCV_SHAPE2D(1 << 14, 1 << 13), NVCV_SHAPE2D(512, 256), 7, float, float, NVCV_INTERP_LINEAR)>); template void TestTensor(bool antialias) @@ -750,7 +751,9 @@ NVCV_TYPED_TEST_SUITE( NVCV_TEST_ROW(4, NVCV_SHAPE3D(100, 100, 100), NVCV_SHAPE3D(100, 50, 100), 4, ushort, float, NVCV_INTERP_LINEAR), NVCV_TEST_ROW(3, NVCV_SHAPE3D(100, 100, 100), NVCV_SHAPE3D(100, 100, 50), 3, float, float, NVCV_INTERP_CUBIC), NVCV_TEST_ROW(4, NVCV_SHAPE3D(40, 40, 40), NVCV_SHAPE3D(100, 40, 40), 5, uchar, float, NVCV_INTERP_LANCZOS), - NVCV_TEST_ROW(7, NVCV_SHAPE3D(40, 40, 40), NVCV_SHAPE3D(50, 150, 100), 3, uchar, uchar, NVCV_INTERP_CUBIC)>); + NVCV_TEST_ROW(7, NVCV_SHAPE3D(40, 40, 40), NVCV_SHAPE3D(50, 150, 100), 3, uchar, uchar, NVCV_INTERP_CUBIC), + NVCV_TEST_ROW(3, NVCV_SHAPE3D(1 << 10, 1 << 9, 1 << 9), NVCV_SHAPE3D(50, 150, 100), 3, uchar, uchar, + NVCV_INTERP_CUBIC)>); TYPED_TEST(OpHQResizeTensor3D, correct_output_with_antialias) { @@ -843,20 +846,22 @@ 
TYPED_TEST(OpHQResizeTensor3D, correct_output_with_antialias) ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); } -#define NVCV_TEST_ROW_TB(NumChannels, InT, OutT, Antialias, MinInterpolation, MagInterpolation) \ +#define NVCV_TEST_ROW_TB(NumChannels, InT, OutT, Antialias, MinInterpolation, MagInterpolation, LargeSample) \ ttype::Types, InT, OutT, ttype::Value, ttype::Value, \ - ttype::Value> - -NVCV_TYPED_TEST_SUITE(OpHQResizeBatch, - ttype::Types); + ttype::Value, ttype::Value> + +NVCV_TYPED_TEST_SUITE( + OpHQResizeBatch, + ttype::Types); TYPED_TEST(OpHQResizeBatch, tensor_batch_2d_correct_output) { @@ -867,20 +872,34 @@ TYPED_TEST(OpHQResizeBatch, tensor_batch_2d_correct_output) const nvcv::DataType outDtype = TypeAsFormat(); const bool antialias = ttype::GetValue; const NVCVInterpolationType minInterpolation = ttype::GetValue; - const NVCVInterpolationType magInterpolation = ttype::GetValue; + const NVCVInterpolationType magInterpolation = ttype::GetValue; + const bool largeSample = ttype::GetValue; constexpr int numSamples = 5; const int varChannels[numSamples] = {4, 1, 7, 3, 5}; + int inShape1[] = {1 << 14, 1 << 13}; + if (sizeof(InBT) == 1) + { + inShape1[0] *= 2; + inShape1[1] *= 2; + } + + auto sample1 + = largeSample + ? HQResizeTensorShapeI({inShape1[0], inShape1[1]}, 2, numChannels > 0 ? numChannels : varChannels[0]) + : HQResizeTensorShapeI({728, 1024, 0}, 2, numChannels > 0 ? numChannels : varChannels[0]); + std::vector inShapes = { - {{728, 1024}, 2, numChannels > 0 ? numChannels : varChannels[0]}, - { {512, 512}, 2, numChannels > 0 ? numChannels : varChannels[1]}, - { {128, 256}, 2, numChannels > 0 ? numChannels : varChannels[2]}, - { {256, 128}, 2, numChannels > 0 ? numChannels : varChannels[3]}, - { {40, 40}, 2, numChannels > 0 ? numChannels : varChannels[4]} + sample1, + {{512, 512}, 2, numChannels > 0 ? numChannels : varChannels[1]}, + {{128, 256}, 2, numChannels > 0 ? numChannels : varChannels[2]}, + {{256, 128}, 2, numChannels > 0 ? numChannels : varChannels[3]}, + { {40, 40}, 2, numChannels > 0 ? 
numChannels : varChannels[4]} }; + std::vector outShapes = { - {{245, 245}, 2, inShapes[0].numChannels}, + {{512, 245}, 2, inShapes[0].numChannels}, { {250, 51}, 2, inShapes[1].numChannels}, {{243, 128}, 2, inShapes[2].numChannels}, {{128, 256}, 2, inShapes[3].numChannels}, diff --git a/tests/cvcuda/system/TestOpResizeCropConvertReformat.cpp b/tests/cvcuda/system/TestOpResizeCropConvertReformat.cpp index 81208d38a..f2fe9ccee 100644 --- a/tests/cvcuda/system/TestOpResizeCropConvertReformat.cpp +++ b/tests/cvcuda/system/TestOpResizeCropConvertReformat.cpp @@ -16,11 +16,9 @@ */ #include "Definitions.hpp" -#include "ResizeUtils.hpp" #include #include -// #include #include #include #include @@ -73,79 +71,132 @@ inline NVCVChannelManip ChannelManip(nvcv::ImageFormat srcFormat, nvcv::ImageFor return manip; } -template -void CropConvert(DstT *dst, const nvcv::Size2D dstSize, const nvcv::ImageFormat dstFormat, const SrcT *src, - const nvcv::Size2D srcSize, const nvcv::ImageFormat srcFormat, const int numImages, const int2 cropPos, - const NVCVChannelManip manip, const double scale = 1.0, const double offst = 0.0) +// clang-format off + +template +void ResizeCropConvert( DstT *dst, NVCVSize2D dstSize, nvcv::ImageFormat dstFrmt, + const SrcT *src, NVCVSize2D srcSize, nvcv::ImageFormat srcFrmt, + int numImages, NVCVSize2D newSize, int2 crop, NVCVInterpolationType interp, + const NVCVChannelManip manip, float scale, float offset, bool srcCast = true) { - int srcPlanes = srcFormat.numPlanes(); - int dstPlanes = dstFormat.numPlanes(); - int srcChannels = srcFormat.numChannels(); - int dstChannels = dstFormat.numChannels(); + int channels = dstFrmt.numChannels(); + int srcPlanes = srcFrmt.numPlanes(); + int dstPlanes = dstFrmt.numPlanes(); - size_t srcIncrX = srcChannels / srcPlanes; // 1 if planar; srcChannels if not. - size_t dstIncrX = dstChannels / dstPlanes; // 1 if planar; dstChannels if not. + size_t srcIncrX = channels / srcPlanes; // 1 if planar; channels if not. + size_t dstIncrX = channels / dstPlanes; // 1 if planar; channels if not. size_t srcIncrY = srcIncrX * srcSize.w; size_t dstIncrY = dstIncrX * dstSize.w; size_t srcIncrC = (srcPlanes > 1 ? srcSize.w * srcSize.h : 1); size_t dstIncrC = (dstPlanes > 1 ? dstSize.w * dstSize.h : 1); - size_t srcIncrN = srcSize.w * srcSize.h * srcChannels; - size_t dstIncrN = dstSize.w * dstSize.h * dstChannels; - size_t srcOffst = cropPos.y * srcIncrY + cropPos.x * srcIncrX; + size_t srcIncrN = srcSize.w * srcSize.h * channels; + size_t dstIncrN = dstSize.w * dstSize.h * channels; - int channelMap[4] = {0, 1, 2, 3}; - - int channels = (srcChannels < dstChannels ? 
srcChannels : dstChannels); + int mapC[4] = {0, 1, 2, 3}; if (manip == NVCV_CHANNEL_REVERSE) { - for (int c = 0; c < channels; ++c) channelMap[c] = channels - c - 1; + for (int c = 0; c < channels; ++c) mapC[c] = channels - c - 1; } + float scaleW = static_cast(srcSize.w) / newSize.w; + float scaleH = static_cast(srcSize.h) / newSize.h; + for (int i = 0; i < numImages; i++) { - const SrcT *srcBase = src + i * srcIncrN + srcOffst; + const SrcT *srcBase = src + i * srcIncrN; DstT *dstBase = dst + i * dstIncrN; - for (int y = 0; y < dstSize.h; y++) + for (int dy = 0; dy < dstSize.h; dy++) { - const SrcT *srcRow = srcBase + y * srcIncrY; - DstT *dstRow = dstBase + y * dstIncrY; + DstT *dstRow = dstBase + dy * dstIncrY; - for (int x = 0; x < dstSize.w; x++) + for (int dx = 0; dx < dstSize.w; dx++) { - const SrcT *srcPtr = srcRow + x * srcIncrX; - DstT *dstPtr = dstRow + x * dstIncrX; + DstT *dstPtr = dstRow + dx * dstIncrX; - for (int c = 0; c < channels; c++) + if (interp == NVCV_INTERP_NEAREST) { - dstPtr[channelMap[c] * dstIncrC] = static_cast(srcPtr[c * srcIncrC] * scale + offst); + int sx = std::floor(scaleW * (dx + crop.x + 0.5f)); + int sy = std::floor(scaleH * (dy + crop.y + 0.5f)); + + const SrcT *src0 = srcBase + sy * srcIncrY + sx * srcIncrX; + + for (int c = 0; c < channels; c++) + { + dstPtr[mapC[c] * dstIncrC] = cuda::SaturateCast(scale * src0[c * srcIncrC] + offset); + } + } + else if (interp == NVCV_INTERP_LINEAR) + { + float fx = scaleW * (dx + crop.x + 0.5f) - 0.5f; + float fy = scaleH * (dy + crop.y + 0.5f) - 0.5f; + + int sx0 = std::floor(fx); + int sy0 = std::floor(fy); + int sx1 = std::min(sx0 + 1, srcSize.w - 1); + int sy1 = std::min(sy0 + 1, srcSize.h - 1); + + fx -= sx0; + fy -= sy0; + + sx0 = std::max(0, sx0); + sy0 = std::max(0, sy0); + + float wghtX[2] = {1 - fx, fx}; + float wghtY[2] = {1 - fy, fy}; + + const size_t x0 = sx0 * srcIncrX; + const size_t x1 = sx1 * srcIncrX; + + const SrcT *src0 = srcBase + sy0 * srcIncrY; + const SrcT *src1 = srcBase + sy1 * srcIncrY; + + for (int c = 0; c < channels; c++) + { + const size_t xc = c * srcIncrC; + + float val = src0[x0 + xc] * wghtY[0] * wghtX[0] + + src0[x1 + xc] * wghtY[0] * wghtX[1] + + src1[x0 + xc] * wghtY[1] * wghtX[0] + + src1[x1 + xc] * wghtY[1] * wghtX[1]; + + val = scale * (srcCast ? 
cuda::SaturateCast(val) : val) + offset; + + dstPtr[mapC[c] * dstIncrC] = cuda::SaturateCast(val); + } } } } } } -template -void CropConvert(std::vector &dst, const nvcv::Size2D dstSize, const nvcv::ImageFormat dstFormat, - const std::vector src, const nvcv::Size2D srcSize, const nvcv::ImageFormat srcFormat, - const int numImages, const int2 cropPos, const NVCVChannelManip manip, const double scale = 1.0, - const double offst = 0.0) +template +void ResizeCropConvert( std::vector &dst, NVCVSize2D dstSize, nvcv::ImageFormat dstFrmt, + const std::vector &src, NVCVSize2D srcSize, nvcv::ImageFormat srcFrmt, + int numImages, NVCVSize2D newSize, int2 crop, NVCVInterpolationType interp, + const NVCVChannelManip manip, float scale, float offset, bool srcCast = true) { - CropConvert(dst.data(), dstSize, dstFormat, src.data(), srcSize, srcFormat, numImages, cropPos, manip, scale, - offst); + ResizeCropConvert(dst.data(), dstSize, dstFrmt, src.data(), srcSize, srcFrmt, + numImages, newSize, crop, interp, manip, scale, offset, srcCast); + } -template -void CropConvert(DstT *dst, const nvcv::Size2D dstSize, const nvcv::ImageFormat dstFormat, const std::vector src, - const nvcv::Size2D srcSize, const nvcv::ImageFormat srcFormat, const int numImages, const int2 cropPos, - const NVCVChannelManip manip, const double scale = 1.0, const double offst = 0.0) +template +void ResizeCropConvert( DstT *dst, NVCVSize2D dstSize, nvcv::ImageFormat dstFrmt, + const std::vector &src, NVCVSize2D srcSize, nvcv::ImageFormat srcFrmt, + int numImages, NVCVSize2D newSize, int2 crop, NVCVInterpolationType interp, + const NVCVChannelManip manip, float scale, float offset, bool srcCast = true) { - CropConvert(dst, dstSize, dstFormat, src.data(), srcSize, srcFormat, numImages, cropPos, manip, scale, offst); + ResizeCropConvert(dst, dstSize, dstFrmt, src.data(), srcSize, srcFrmt, + numImages, newSize, crop, interp, manip, scale, offset, srcCast); + } +// clang-format on + template -void fillVec(std::vector &vec, const nvcv::Size2D size, const nvcv::ImageFormat frmt, size_t offst = 0) +void fillVec(std::vector &vec, const NVCVSize2D size, const nvcv::ImageFormat frmt, size_t offst = 0) { int planes = frmt.numPlanes(); int channels = frmt.numChannels(); @@ -171,84 +222,110 @@ void fillVec(std::vector &vec, const nvcv::Size2D size, const nvcv::ImageForm #define _SHAPE(w, h, n) (int3{w, h, n}) -#define _TEST_ROW(SrcShape, ResizeDim, Interp, DstSize, CropPos, SrcFrmt, DstFrmt, SrcType, DstType) \ - ttype::Types, ttype::Value, ttype::Value, ttype::Value, \ - ttype::Value, ttype::Value, ttype::Value, SrcType, DstType> - // clang-format off +#define _TEST_ROW(SrcShape, ResizeDim, Interp, DstSize, CropPos, Scale, Offset, SrcFrmt, DstFrmt, SrcType, DstType, SrcCast) \ + ttype::Types, ttype::Value, ttype::Value, ttype::Value, \ + ttype::Value, ttype::Value, ttype::Value, \ + ttype::Value, ttype::Value, SrcType, DstType, ttype::Value > + NVCV_TYPED_TEST_SUITE( OpResizeCropConvertReformat, ttype::Types< // Test cases: RGB (interleaved) -> BGR (planar); linear interpolation; float and uchar output. 
- _TEST_ROW(_SHAPE( 8, 8, 1), int2( 8, 8), NVCV_INTERP_LINEAR, int2( 6, 6), int2( 1, 1), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 0 - _TEST_ROW(_SHAPE( 8, 8, 1), int2( 16, 16), NVCV_INTERP_LINEAR, int2( 12, 12), int2( 2, 2), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 1 - _TEST_ROW(_SHAPE( 42, 48, 1), int2( 23, 24), NVCV_INTERP_LINEAR, int2( 15, 13), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 2 - _TEST_ROW(_SHAPE( 42, 40, 3), int2( 21, 20), NVCV_INTERP_LINEAR, int2( 17, 13), int2( 1, 1), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 3 - _TEST_ROW(_SHAPE( 21, 21, 5), int2( 42, 42), NVCV_INTERP_LINEAR, int2( 32, 32), int2( 10, 10), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 4 - _TEST_ROW(_SHAPE( 113, 12, 7), int2( 12, 36), NVCV_INTERP_LINEAR, int2( 7, 13), int2( 3, 11), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 5 - _TEST_ROW(_SHAPE( 17, 151, 7), int2( 48, 16), NVCV_INTERP_LINEAR, int2( 32, 16), int2( 4, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 6 - _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 7 - _TEST_ROW(_SHAPE(1080, 1920, 13), int2(800, 600), NVCV_INTERP_LINEAR, int2( 640, 480), int2(101, 64), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 8 - _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 9 - _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8p, uchar3, uint8_t), // 10 - _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8p, uchar3, uint8_t), // 11 + // source(w, h, n) , resize(w, h) , interpolation , dest.(w, h) , crop(x, y), scale, offst, source format , destination format , src type, dst type, src cast + _TEST_ROW(_SHAPE( 8, 8, 1), int2( 8, 8), NVCV_INTERP_LINEAR, int2( 6, 6), int2( 1, 1), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float , false), // 0 + _TEST_ROW(_SHAPE( 8, 8, 1), int2( 16, 16), NVCV_INTERP_LINEAR, int2( 12, 12), int2( 2, 2), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float , false), // 1 + _TEST_ROW(_SHAPE( 42, 48, 1), int2( 23, 24), NVCV_INTERP_LINEAR, int2( 15, 13), int2( 0, 0), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float , false), // 2 + _TEST_ROW(_SHAPE( 42, 40, 3), int2( 21, 20), NVCV_INTERP_LINEAR, int2( 17, 13), int2( 1, 1), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float , false), // 3 + _TEST_ROW(_SHAPE( 21, 21, 5), int2( 42, 42), NVCV_INTERP_LINEAR, int2( 32, 32), int2( 10, 10), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float , false), // 4 + _TEST_ROW(_SHAPE( 113, 12, 7), int2( 12, 36), NVCV_INTERP_LINEAR, int2( 7, 13), int2( 3, 11), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float , false), // 5 + _TEST_ROW(_SHAPE( 17, 151, 7), int2( 48, 16), NVCV_INTERP_LINEAR, int2( 32, 16), int2( 4, 0), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float , false), // 6 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 
0), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float , false), // 7 + _TEST_ROW(_SHAPE(1080, 1920, 13), int2(800, 600), NVCV_INTERP_LINEAR, int2( 640, 480), int2(101, 64), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float , false), // 8 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float , false), // 9 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8p, uchar3, uint8_t, false), // 10 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8p, uchar3, uint8_t, false), // 11 // Test cases: RGB (interleaved) -> RGB (planar); linear interpolation; float and uchar output. - _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float), // 12 - _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float), // 13 - _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t), // 14 - _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t), // 15 + // source(w, h, n) , resize(w, h) , interpolation , dest.(w, h) , crop(x, y), scale, offst, source format , destination format , src type, dst type, src cast + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float , false), // 12 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float , false), // 13 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t, false), // 14 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t, false), // 15 // Test cases: BGR (interleaved) -> RGB (planar); linear interpolation; float and uchar output. 
- _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float), // 16 - _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float), // 17 - _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t), // 18 - _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t), // 19 + // source(w, h, n) , resize(w, h) , interpolation , dest.(w, h) , crop(x, y), scale, offst, source format , destination format , src type, dst type, src cast + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), 1, 0, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float , false), // 16 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), 1, 0, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float , false), // 17 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), 1, 0, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t, false), // 18 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), 1, 0, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t, false), // 19 // Test cases: BGR (interleaved) -> BGR (planar); linear interpolation; float and uchar output. - _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 20 - _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 21 - _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGR8p, uchar3, uint8_t), // 22 - _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGR8p, uchar3, uint8_t), // 23 + // source(w, h, n) , resize(w, h) , interpolation , dest.(w, h) , crop(x, y), scale, offst, source format , destination format , src type, dst type, src cast + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), 1, 0, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float , false), // 20 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), 1, 0, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float , false), // 21 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), 1, 0, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGR8p, uchar3, uint8_t, false), // 22 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), 1, 0, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGR8p, uchar3, uint8_t, false), // 23 // Test cases: RGB (interleaved) -> BGR (interleaved); linear interpolation; float and uchar output. 
- _TEST_ROW(_SHAPE( 8, 8, 1), int2( 8, 8), NVCV_INTERP_LINEAR, int2( 6, 6), int2( 1, 1), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32, uchar3, float3), // 24 - _TEST_ROW(_SHAPE( 8, 8, 1), int2( 16, 16), NVCV_INTERP_LINEAR, int2( 12, 12), int2( 2, 2), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32, uchar3, float3), // 25 - _TEST_ROW(_SHAPE( 113, 12, 7), int2( 12, 36), NVCV_INTERP_LINEAR, int2( 7, 13), int2( 3, 11), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32, uchar3, float3), // 26 - _TEST_ROW(_SHAPE( 17, 151, 7), int2( 48, 16), NVCV_INTERP_LINEAR, int2( 32, 16), int2( 4, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32, uchar3, float3), // 27 - _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32, uchar3, float3), // 28 - _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32, uchar3, float3), // 29 - _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8, uchar3, uchar3), // 30 - _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8, uchar3, uchar3), // 31 + // source(w, h, n) , resize(w, h) , interpolation , dest.(w, h) , crop(x, y), scale, offst, source format , destination format , src type, dst type, src cast + _TEST_ROW(_SHAPE( 8, 8, 1), int2( 8, 8), NVCV_INTERP_LINEAR, int2( 6, 6), int2( 1, 1), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32, uchar3, float3 , false), // 24 + _TEST_ROW(_SHAPE( 8, 8, 1), int2( 16, 16), NVCV_INTERP_LINEAR, int2( 12, 12), int2( 2, 2), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32, uchar3, float3 , false), // 25 + _TEST_ROW(_SHAPE( 113, 12, 7), int2( 12, 36), NVCV_INTERP_LINEAR, int2( 7, 13), int2( 3, 11), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32, uchar3, float3 , false), // 26 + _TEST_ROW(_SHAPE( 17, 151, 7), int2( 48, 16), NVCV_INTERP_LINEAR, int2( 32, 16), int2( 4, 0), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32, uchar3, float3 , false), // 27 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32, uchar3, float3 , false), // 28 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32, uchar3, float3 , false), // 29 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8, uchar3, uchar3 , false), // 30 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8, uchar3, uchar3 , false), // 31 // Test cases: RGB (interleaved) -> RGB (interleaved); linear interpolation; float and uchar output. 
- _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBf32, uchar3, float3), // 32 - _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBf32, uchar3, float3), // 33 - _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGB8, uchar3, uchar3), // 34 - _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGB8, uchar3, uchar3), // 35 + // source(w, h, n) , resize(w, h) , interpolation , dest.(w, h) , crop(x, y), scale, offst, source format , destination format , src type, dst type, src cast + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBf32, uchar3, float3 , false), // 32 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBf32, uchar3, float3 , false), // 33 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGB8, uchar3, uchar3 , false), // 34 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGB8, uchar3, uchar3 , false), // 35 // Test cases: BGR (interleaved) -> RGB (interleaved); linear interpolation; float and uchar output. - _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBf32, uchar3, float3), // 36 - _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBf32, uchar3, float3), // 37 - _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGB8, uchar3, uchar3), // 38 - _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGB8, uchar3, uchar3), // 39 + // source(w, h, n) , resize(w, h) , interpolation , dest.(w, h) , crop(x, y), scale, offst, source format , destination format , src type, dst type, src cast + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBf32, uchar3, float3 , false), // 36 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBf32, uchar3, float3 , false), // 37 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGB8, uchar3, uchar3 , false), // 38 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_LINEAR, int2( 250, 200), int2( 15, 16), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGB8, uchar3, uchar3 , false), // 39 // Test cases: RGB (interleaved) -> BGR (planar); nearest-neighbor interpolation; float and uchar output. 
- _TEST_ROW(_SHAPE( 8, 8, 1), int2( 8, 8), NVCV_INTERP_NEAREST, int2( 6, 6), int2( 1, 1), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 40 - _TEST_ROW(_SHAPE( 8, 8, 5), int2( 16, 16), NVCV_INTERP_NEAREST, int2( 12, 12), int2( 2, 2), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 41 - _TEST_ROW(_SHAPE( 42, 48, 1), int2( 23, 24), NVCV_INTERP_NEAREST, int2( 15, 13), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 42 - _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_NEAREST, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 43 - _TEST_ROW(_SHAPE(1080, 1920, 13), int2(800, 600), NVCV_INTERP_NEAREST, int2( 640, 480), int2(101, 64), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 44 - _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_NEAREST, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float), // 45 - _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_NEAREST, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8p, uchar3, uint8_t), // 46 - _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_NEAREST, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8p, uchar3, uint8_t), // 47 + // source(w, h, n) , resize(w, h) , interpolation , dest.(w, h) , crop(x, y), scale, offst, source format , destination format , src type, dst type, src cast + _TEST_ROW(_SHAPE( 8, 8, 1), int2( 8, 8), NVCV_INTERP_NEAREST, int2( 6, 6), int2( 1, 1), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float , false), // 40 + _TEST_ROW(_SHAPE( 8, 8, 5), int2( 16, 16), NVCV_INTERP_NEAREST, int2( 12, 12), int2( 2, 2), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float , false), // 41 + _TEST_ROW(_SHAPE( 42, 48, 1), int2( 23, 24), NVCV_INTERP_NEAREST, int2( 15, 13), int2( 0, 0), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float , false), // 42 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_NEAREST, int2( 412, 336), int2( 0, 0), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float , false), // 43 + _TEST_ROW(_SHAPE(1080, 1920, 13), int2(800, 600), NVCV_INTERP_NEAREST, int2( 640, 480), int2(101, 64), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float , false), // 44 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_NEAREST, int2( 250, 200), int2( 15, 16), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf32p, uchar3, float , false), // 45 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_NEAREST, int2( 412, 336), int2( 0, 0), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8p, uchar3, uint8_t, false), // 46 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_NEAREST, int2( 250, 200), int2( 15, 16), 1, 0, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8p, uchar3, uint8_t, false), // 47 // Test cases: BGR (interleaved) -> RGB (planar); nearest-neighbor interpolation; float and uchar output. 
- _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_NEAREST, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float), // 48 - _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_NEAREST, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float), // 49 - _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_NEAREST, int2( 412, 336), int2( 0, 0), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t), // 50 - _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_NEAREST, int2( 250, 200), int2( 15, 16), NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t) // 51 - + // source(w, h, n) , resize(w, h) , interpolation , dest.(w, h) , crop(x, y), scale, offst, source format , destination format , src type, dst type, src cast + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_NEAREST, int2( 412, 336), int2( 0, 0), 1, 0, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float , false), // 48 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_NEAREST, int2( 250, 200), int2( 15, 16), 1, 0, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float , false), // 49 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_NEAREST, int2( 412, 336), int2( 0, 0), 1, 0, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t, false), // 50 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_NEAREST, int2( 250, 200), int2( 15, 16), 1, 0, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t, false), // 51 + + // Test cases: Rescaling. + // source(w, h, n) , resize(w, h) , interpolation , dest.(w, h) , crop(x, y) , scale, offst, source format , destination format , src type, dst type, src cast + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), 1/127.5, -1, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float , false), // 52 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_NEAREST, int2( 250, 200), int2( 15, 16), 2, -255, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float , false), // 53 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_NEAREST, int2( 250, 200), int2( 15, 16), -1, 255, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGB8, uchar3, uchar3 , false), // 54 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), 0.5, 0, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t, false), // 55 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_NEAREST, int2( 250, 200), int2( 15, 16), 2, 0, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t, false), // 56 + + // Test cases: Source cast true (with and w/o rescaling). 
+ // source(w, h, n) , resize(w, h) , interpolation , dest.(w, h) , crop(x, y) , scale, offst, source format , destination format , src type, dst type, src cast + _TEST_ROW(_SHAPE( 353, 450, 3), int2(256, 256), NVCV_INTERP_LINEAR, int2( 224, 224), int2( 16, 16), 1, 0, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float , true), // 57 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), 1/127.5, -1, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float , true), // 58 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_NEAREST, int2( 250, 200), int2( 15, 16), 2, -255, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBf32p, uchar3, float , true), // 59 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_NEAREST, int2( 250, 200), int2( 15, 16), -1, 255, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGB8, uchar3, uchar3 , true), // 60 + _TEST_ROW(_SHAPE( 313, 212, 4), int2(412, 336), NVCV_INTERP_LINEAR, int2( 412, 336), int2( 0, 0), 1, 0, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t, true), // 61 + _TEST_ROW(_SHAPE(1280, 960, 3), int2(300, 225), NVCV_INTERP_NEAREST, int2( 250, 200), int2( 15, 16), 1, 0, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGB8p, uchar3, uint8_t, true) // 62 >); #undef _TEST_ROW @@ -264,20 +341,23 @@ TYPED_TEST(OpResizeCropConvertReformat, tensor_correct_output) int2 cropDim = ttype::GetValue; int2 cropPos = ttype::GetValue; - nvcv::ImageFormat srcFormat{ttype::GetValue}; - nvcv::ImageFormat dstFormat{ttype::GetValue}; + float scale = ttype::GetValue; + float offset = ttype::GetValue; - using SrcVT = typename ttype::GetType; - using DstVT = typename ttype::GetType; + nvcv::ImageFormat srcFormat{ttype::GetValue}; + nvcv::ImageFormat dstFormat{ttype::GetValue}; + + using SrcVT = typename ttype::GetType; + using DstVT = typename ttype::GetType; using SrcBT = typename cuda::BaseType; using DstBT = typename cuda::BaseType; + bool srcCast = ttype::GetValue; + int srcW = srcShape.x; int srcH = srcShape.y; int dstW = cropDim.x; int dstH = cropDim.y; - int tmpW = resize.x; - int tmpH = resize.y; int numImages = srcShape.z; int srcChannels = srcFormat.numChannels(); @@ -290,8 +370,6 @@ TYPED_TEST(OpResizeCropConvertReformat, tensor_correct_output) ASSERT_LE(srcChannels, 4); ASSERT_EQ(srcChannels, dstChannels); - NVCVSize2D resizeDim{resize.x, resize.y}; - NVCVChannelManip manip = ChannelManip(srcFormat, dstFormat); // Create input and output tensors. @@ -311,42 +389,41 @@ TYPED_TEST(OpResizeCropConvertReformat, tensor_correct_output) ASSERT_TRUE(dstAccess); int srcRowElems = srcPixElems * srcW; - int tmpRowElems = srcPixElems * tmpW; int dstRowElems = dstPixElems * dstW; size_t srcElems = (size_t)srcRowElems * (size_t)srcH * (size_t)srcPlanes * (size_t)numImages; - size_t tmpElems = (size_t)tmpRowElems * (size_t)tmpH * (size_t)srcPlanes * (size_t)numImages; size_t dstElems = (size_t)dstRowElems * (size_t)dstH * (size_t)dstPlanes * (size_t)numImages; - nvcv::Size2D srcSize{srcW, srcH}; - nvcv::Size2D tmpSize{tmpW, tmpH}; - nvcv::Size2D dstSize{dstW, dstH}; + NVCVSize2D srcSize{srcW, srcH}; + NVCVSize2D newSize{resize.x, resize.y}; + NVCVSize2D dstSize{dstW, dstH}; size_t srcPitch = srcW * sizeof(SrcVT); - size_t tmpPitch = tmpW * sizeof(SrcVT); size_t dstPitch = dstW * sizeof(DstVT); std::vector srcVec(srcElems); - std::vector tmpVec(tmpElems); std::vector refVec(dstElems); // Populate source tensor. 
- fillVec(srcVec, srcSize, srcFormat); - - // Generate "gold" result for image and place in reference vector. - test::Resize(tmpVec, tmpPitch, tmpSize, srcVec, srcPitch, srcSize, srcFormat, interp, false); - CropConvert(refVec, dstSize, dstFormat, tmpVec, tmpSize, srcFormat, numImages, cropPos, manip); + for (int n = 0; n < numImages; n++) + { + fillVec(srcVec, srcSize, srcFormat, n * (size_t)srcRowElems * (size_t)srcH * (size_t)srcPlanes); + } // Copy source tensor to device. ASSERT_EQ(cudaSuccess, cudaMemcpy2D(src->basePtr(), srcAccess->rowStride(), srcVec.data(), srcPitch, srcPitch, - srcH * srcPlanes, cudaMemcpyHostToDevice)); + srcH * srcPlanes * numImages, cudaMemcpyHostToDevice)); + + // Generate "gold" result for image and place in reference vector. + ResizeCropConvert(refVec, dstSize, dstFormat, srcVec, srcSize, srcFormat, numImages, newSize, cropPos, interp, + manip, scale, offset, srcCast); // Run fused ResizeCropConvertReformat operator. cudaStream_t stream; ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); cvcuda::ResizeCropConvertReformat resizeCrop; - EXPECT_NO_THROW(resizeCrop(stream, srcTensor, dstTensor, resizeDim, interp, cropPos, manip)); + EXPECT_NO_THROW(resizeCrop(stream, srcTensor, dstTensor, newSize, interp, cropPos, manip, scale, offset, srcCast)); ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); @@ -354,7 +431,7 @@ TYPED_TEST(OpResizeCropConvertReformat, tensor_correct_output) // Copy destination tensor back to host. std::vector dstVec(dstElems); ASSERT_EQ(cudaSuccess, cudaMemcpy2D(dstVec.data(), dstPitch, dst->basePtr(), dstAccess->rowStride(), dstPitch, - dstH * dstPlanes, cudaMemcpyDeviceToHost)); + dstH * dstPlanes * numImages, cudaMemcpyDeviceToHost)); // Compare "gold" reference to computed output. 
VEC_EXPECT_NEAR(refVec, dstVec, 1); @@ -370,20 +447,23 @@ TYPED_TEST(OpResizeCropConvertReformat, varshape_correct_output) int2 cropDim = ttype::GetValue; int2 cropPos = ttype::GetValue; - nvcv::ImageFormat srcFormat{ttype::GetValue}; - nvcv::ImageFormat dstFormat{ttype::GetValue}; + float scale = ttype::GetValue; + float offset = ttype::GetValue; - using SrcVT = typename ttype::GetType; - using DstVT = typename ttype::GetType; + nvcv::ImageFormat srcFormat{ttype::GetValue}; + nvcv::ImageFormat dstFormat{ttype::GetValue}; + + using SrcVT = typename ttype::GetType; + using DstVT = typename ttype::GetType; using SrcBT = typename cuda::BaseType; using DstBT = typename cuda::BaseType; + bool srcCast = ttype::GetValue; + int srcW = srcShape.x; int srcH = srcShape.y; int dstW = cropDim.x; int dstH = cropDim.y; - int tmpW = resize.x; - int tmpH = resize.y; int numImages = srcShape.z; int srcChannels = srcFormat.numChannels(); @@ -396,8 +476,6 @@ TYPED_TEST(OpResizeCropConvertReformat, varshape_correct_output) ASSERT_LE(srcChannels, 4); ASSERT_EQ(srcChannels, dstChannels); - NVCVSize2D resizeDim{resize.x, resize.y}; - NVCVChannelManip manip = ChannelManip(srcFormat, dstFormat); std::vector srcImg; @@ -408,20 +486,16 @@ TYPED_TEST(OpResizeCropConvertReformat, varshape_correct_output) std::uniform_int_distribution randW(srcW * 0.8, srcW * 1.2); std::uniform_int_distribution randH(srcH * 0.8, srcH * 1.2); - int tmpRowElems = srcPixElems * tmpW; int dstRowElems = dstPixElems * dstW; - size_t tmpElems = (size_t)tmpRowElems * (size_t)tmpH * (size_t)srcPlanes; size_t refIncr = (size_t)dstRowElems * (size_t)dstH * (size_t)dstPlanes; size_t dstElems = refIncr * (size_t)numImages; - nvcv::Size2D tmpSize{tmpW, tmpH}; - nvcv::Size2D dstSize{dstW, dstH}; + NVCVSize2D newSize{resize.x, resize.y}; + NVCVSize2D dstSize{dstW, dstH}; - std::vector tmpVec(tmpElems); std::vector refVec(dstElems); - size_t tmpPitch = tmpW * sizeof(SrcVT); size_t dstPitch = dstW * sizeof(DstVT); for (int i = 0; i < numImages; ++i) @@ -429,7 +503,7 @@ TYPED_TEST(OpResizeCropConvertReformat, varshape_correct_output) int imgW = (interp ? randW(randEng) : srcW); int imgH = (interp ? randH(randEng) : srcH); - srcImg.emplace_back(nvcv::Size2D{imgW, imgH}, srcFormat); + srcImg.emplace_back(NVCVSize2D{imgW, imgH}, srcFormat); auto srcData = srcImg[i].exportData(); ASSERT_TRUE(srcData); @@ -439,7 +513,7 @@ TYPED_TEST(OpResizeCropConvertReformat, varshape_correct_output) size_t imgPitch = imgW * sizeof(SrcVT); size_t imgElems = (size_t)imgRowElems * (size_t)imgH * (size_t)srcPlanes; - nvcv::Size2D imgSize{imgW, imgH}; + NVCVSize2D imgSize{imgW, imgH}; std::vector imgVec(imgElems); @@ -449,8 +523,8 @@ TYPED_TEST(OpResizeCropConvertReformat, varshape_correct_output) // Generate "gold" result for image and place in reference image plane. DstBT *refPlane = refVec.data() + i * refIncr; - test::Resize(tmpVec, tmpPitch, tmpSize, imgVec, imgPitch, imgSize, srcFormat, interp, true); - CropConvert(refPlane, dstSize, dstFormat, tmpVec, tmpSize, srcFormat, 1, cropPos, manip); + ResizeCropConvert(refPlane, dstSize, dstFormat, imgVec, imgSize, srcFormat, 1, newSize, cropPos, interp, manip, + scale, offset, srcCast); // Copy source tensor to device. 
ASSERT_EQ(cudaSuccess, cudaMemcpy2D(srcData->plane(0).basePtr, srcData->plane(0).rowStride, imgVec.data(), @@ -476,7 +550,7 @@ TYPED_TEST(OpResizeCropConvertReformat, varshape_correct_output) ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); cvcuda::ResizeCropConvertReformat resizeCrop; - EXPECT_NO_THROW(resizeCrop(stream, src, dstTensor, resizeDim, interp, cropPos, manip)); + EXPECT_NO_THROW(resizeCrop(stream, src, dstTensor, newSize, interp, cropPos, manip, scale, offset, srcCast)); ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); @@ -489,3 +563,136 @@ TYPED_TEST(OpResizeCropConvertReformat, varshape_correct_output) // Compare "gold" reference to computed output. VEC_EXPECT_NEAR(refVec, dstVec, 1); } + +#define _TEST_ROW(Interp, inputBatch, outputBatch, srcFmt, dstFmt, DstSize, CropPos, SrcType, DstType, returnCode) \ + ttype::Types, ttype::Value, ttype::Value, ttype::Value, \ + ttype::Value, ttype::Value, ttype::Value, SrcType, DstType, \ + ttype::Value> + +// clang-format off +NVCV_TYPED_TEST_SUITE(OpResizeCropConvertReformat_Negative, +ttype::Types< + // Interpolation, input batch size, output batch size, src fmt, dst fmt, crop dim, crop pos + // invalid Interpolation + _TEST_ROW(NVCV_INTERP_CUBIC, 2, 2, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8, int2(4, 4), int2(0, 0), uchar3, uint8_t, NVCV_ERROR_INVALID_ARGUMENT), + _TEST_ROW(NVCV_INTERP_AREA, 2, 2, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8, int2(4, 4), int2(0, 0), uchar3, uint8_t, NVCV_ERROR_INVALID_ARGUMENT), + _TEST_ROW(NVCV_INTERP_LANCZOS, 2, 2, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8, int2(4, 4), int2(0, 0), uchar3, uint8_t, NVCV_ERROR_INVALID_ARGUMENT), + // different input/output batch size + _TEST_ROW(NVCV_INTERP_LINEAR, 1, 2, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8, int2(4, 4), int2(0, 0), uchar3, uint8_t, NVCV_ERROR_INVALID_ARGUMENT), + _TEST_ROW(NVCV_INTERP_LINEAR, 2, 1, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8, int2(4, 4), int2(0, 0), uchar3, uint8_t, NVCV_ERROR_INVALID_ARGUMENT), + // different channels + _TEST_ROW(NVCV_INTERP_LINEAR, 2, 2, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRA8, int2(4, 4), int2(0, 0), uchar3, uint8_t, NVCV_ERROR_NOT_COMPATIBLE), + _TEST_ROW(NVCV_INTERP_LINEAR, 2, 2, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRA8, int2(4, 4), int2(0, 0), uchar3, uint8_t, NVCV_ERROR_NOT_COMPATIBLE), + // not equal to 3 channels + _TEST_ROW(NVCV_INTERP_LINEAR, 2, 2, NVCV_IMAGE_FORMAT_RGBA8, NVCV_IMAGE_FORMAT_BGRA8, int2(4, 4), int2(0, 0), uchar3, uint8_t, NVCV_ERROR_NOT_COMPATIBLE), + _TEST_ROW(NVCV_INTERP_LINEAR, 2, 2, NVCV_IMAGE_FORMAT_RGBA8, NVCV_IMAGE_FORMAT_BGRA8, int2(4, 4), int2(0, 0), uchar3, uint8_t, NVCV_ERROR_NOT_COMPATIBLE), + // input is not uchar + _TEST_ROW(NVCV_INTERP_LINEAR, 2, 2, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_BGR8, int2(4, 4), int2(0, 0), uchar3, uint8_t, NVCV_ERROR_NOT_COMPATIBLE), + // output is not uchar/float + _TEST_ROW(NVCV_INTERP_LINEAR, 2, 2, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRf16, int2(4, 4), int2(0, 0), uchar3, uint8_t, NVCV_ERROR_NOT_COMPATIBLE), + // invalid Crop Range + _TEST_ROW(NVCV_INTERP_LINEAR, 2, 2, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8, int2(4, 4), int2(-1, 0), uchar3, uint8_t, NVCV_ERROR_INVALID_ARGUMENT), + _TEST_ROW(NVCV_INTERP_LINEAR, 2, 2, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8, int2(4, 4), int2(0, -1), uchar3, uint8_t, NVCV_ERROR_INVALID_ARGUMENT), + _TEST_ROW(NVCV_INTERP_LINEAR, 2, 2, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8, int2(32, 
4), int2(0, 0), uchar3, uint8_t, NVCV_ERROR_INVALID_ARGUMENT), + _TEST_ROW(NVCV_INTERP_LINEAR, 2, 2, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8, int2(4, 32), int2(0, 0), uchar3, uint8_t, NVCV_ERROR_INVALID_ARGUMENT) +>); +// clang-format on + +#undef _TEST_ROW + +TEST(OpResizeCropConvertReformat_Negative, createWillNullPtr) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, cvcudaResizeCropConvertReformatCreate(nullptr)); +} + +TYPED_TEST(OpResizeCropConvertReformat_Negative, infer_negative_parameter) +{ + NVCVInterpolationType interp = ttype::GetValue; + + int inputBatchSize = ttype::GetValue; + int outputBatchSize = ttype::GetValue; + + nvcv::ImageFormat srcFormat{ttype::GetValue}; + nvcv::ImageFormat dstFormat{ttype::GetValue}; + + int2 cropDim = ttype::GetValue; + int2 cropPos = ttype::GetValue; + + using SrcVT = typename ttype::GetType; + using DstVT = typename ttype::GetType; + using SrcBT = typename cuda::BaseType; + using DstBT = typename cuda::BaseType; + + NVCVStatus expectedReturnCode = ttype::GetValue; + + // Resize to 16 * 16 then crop + int srcW = 32; + int srcH = 32; + int dstW = cropDim.x; + int dstH = cropDim.y; + NVCVSize2D resizeDim{16, 16}; + + NVCVChannelManip manip = ChannelManip(srcFormat, dstFormat); + + // Create input and output tensors. + nvcv::Tensor srcTensor = nvcv::util::CreateTensor(inputBatchSize, srcW, srcH, srcFormat); + nvcv::Tensor dstTensor = nvcv::util::CreateTensor(outputBatchSize, dstW, dstH, dstFormat); + + cvcuda::ResizeCropConvertReformat resizeCrop; + EXPECT_EQ(expectedReturnCode, + nvcv::ProtectCall([&] { resizeCrop(nullptr, srcTensor, dstTensor, resizeDim, interp, cropPos, manip); })); +} + +TYPED_TEST(OpResizeCropConvertReformat_Negative, varshape_infer_negative_parameter) +{ + NVCVInterpolationType interp = ttype::GetValue; + + int inputBatchSize = ttype::GetValue; + int outputBatchSize = ttype::GetValue; + + nvcv::ImageFormat srcFormat{ttype::GetValue}; + nvcv::ImageFormat dstFormat{ttype::GetValue}; + + int2 cropDim = ttype::GetValue; + int2 cropPos = ttype::GetValue; + + using SrcVT = typename ttype::GetType; + using DstVT = typename ttype::GetType; + using SrcBT = typename cuda::BaseType; + using DstBT = typename cuda::BaseType; + + NVCVStatus expectedReturnCode = ttype::GetValue; + + std::vector srcImg; + + int srcW = 32; + int srcH = 32; + int dstW = cropDim.x; + int dstH = cropDim.y; + NVCVSize2D resizeDim{16, 16}; + + NVCVChannelManip manip = ChannelManip(srcFormat, dstFormat); + + uniform_dist randVal(std::is_integral_v ? cuda::TypeTraits::min : SrcBT{0}, + std::is_integral_v ? cuda::TypeTraits::max : SrcBT{1}); + + std::uniform_int_distribution randW(srcW * 0.8, srcW * 1.2); + std::uniform_int_distribution randH(srcH * 0.8, srcH * 1.2); + + for (int i = 0; i < inputBatchSize; ++i) + { + int imgW = (interp ? randW(randEng) : srcW); + int imgH = (interp ? 
randH(randEng) : srcH); + + srcImg.emplace_back(nvcv::Size2D{imgW, imgH}, srcFormat); + } + + nvcv::ImageBatchVarShape src(inputBatchSize); + src.pushBack(srcImg.begin(), srcImg.end()); + + nvcv::Tensor dstTensor = nvcv::util::CreateTensor(outputBatchSize, dstW, dstH, dstFormat); + + cvcuda::ResizeCropConvertReformat resizeCrop; + EXPECT_EQ(expectedReturnCode, + nvcv::ProtectCall([&] { resizeCrop(nullptr, src, dstTensor, resizeDim, interp, cropPos, manip); })); +} diff --git a/tests/nvcv_types/cudatools_system/TestInterpolationWrap.cpp b/tests/nvcv_types/cudatools_system/TestInterpolationWrap.cpp index de654ed8e..3da35b1c5 100644 --- a/tests/nvcv_types/cudatools_system/TestInterpolationWrap.cpp +++ b/tests/nvcv_types/cudatools_system/TestInterpolationWrap.cpp @@ -54,7 +54,7 @@ NVCV_TYPED_TEST_SUITE( ttype::Types< NVCV_TEST_ROW(NVCV_INTERP_NEAREST, 1, 1234.567f, 1234), NVCV_TEST_ROW(NVCV_INTERP_NEAREST, 2, 1234.567f, 1234), NVCV_TEST_ROW(NVCV_INTERP_NEAREST, 1, -3.6f, -4), NVCV_TEST_ROW(NVCV_INTERP_LINEAR, 1, 5.678f, 5), - NVCV_TEST_ROW(NVCV_INTERP_LINEAR, 2, 5.678f, 5), NVCV_TEST_ROW(NVCV_INTERP_CUBIC, 1, -1234.567f, -1234), + NVCV_TEST_ROW(NVCV_INTERP_LINEAR, 2, 5.678f, 5), NVCV_TEST_ROW(NVCV_INTERP_CUBIC, 1, -1234.567f, -1235), NVCV_TEST_ROW(NVCV_INTERP_CUBIC, 2, -1234.567f, -1235), NVCV_TEST_ROW(NVCV_INTERP_AREA, 1, 4.567f, 5), NVCV_TEST_ROW(NVCV_INTERP_AREA, 2, 4.567f, 4)>); @@ -233,27 +233,25 @@ TYPED_TEST(InterpolationWrap2DTest, correct_grid_unaligned_values_in_host) } else if (kInterpType == NVCV_INTERP_CUBIC) { - int xmin = cuda::round(floatCoord.x - 2.f); - int ymin = cuda::round(floatCoord.y - 2.f); - int xmax = cuda::round(floatCoord.x + 2.f); - int ymax = cuda::round(floatCoord.y + 2.f); + int ix = cuda::round(floatCoord.x); + int iy = cuda::round(floatCoord.y); + using FT = cuda::ConvertBaseTypeTo; auto sum = cuda::SetAll(0); - float w, wsum = 0.f; + float wx[4]; + test::GetBicubicCoeffs(floatCoord.x - ix, wx[0], wx[1], wx[2], wx[3]); + float wy[4]; + test::GetBicubicCoeffs(floatCoord.y - iy, wy[0], wy[1], wy[2], wy[3]); - for (int cy = ymin; cy <= ymax; cy++) + for (int cy = -1; cy <= 2; cy++) { - for (int cx = xmin; cx <= xmax; cx++) + for (int cx = -1; cx <= 2; cx++) { - w = test::GetBicubicCoeff(floatCoord.x - cx) * test::GetBicubicCoeff(floatCoord.y - cy); - sum += w * borderWrap[int2{cx, cy}]; - wsum += w; + sum += borderWrap[int2{ix + cx, iy + cy}] * (wx[cx + 1] * wy[cy + 1]); } } - sum = (wsum == 0.f) ? 
cuda::SetAll(0) : sum / wsum; - gold = cuda::SaturateCast(sum); } else if (kInterpType == NVCV_INTERP_AREA) @@ -583,27 +581,25 @@ TYPED_TEST(InterpolationWrap3DTest, correct_grid_unaligned_values_in_host) } else if (kInterpType == NVCV_INTERP_CUBIC) { - int xmin = cuda::round(floatCoord.x - 2.f); - int ymin = cuda::round(floatCoord.y - 2.f); - int xmax = cuda::round(floatCoord.x + 2.f); - int ymax = cuda::round(floatCoord.y + 2.f); + int ix = cuda::round(floatCoord.x); + int iy = cuda::round(floatCoord.y); + using FT = cuda::ConvertBaseTypeTo; auto sum = cuda::SetAll(0); - float w, wsum = 0.f; + float wx[4]; + test::GetBicubicCoeffs(floatCoord.x - ix, wx[0], wx[1], wx[2], wx[3]); + float wy[4]; + test::GetBicubicCoeffs(floatCoord.y - iy, wy[0], wy[1], wy[2], wy[3]); - for (int cy = ymin; cy <= ymax; cy++) + for (int cy = -1; cy <= 2; cy++) { - for (int cx = xmin; cx <= xmax; cx++) + for (int cx = -1; cx <= 2; cx++) { - w = test::GetBicubicCoeff(floatCoord.x - cx) * test::GetBicubicCoeff(floatCoord.y - cy); - sum += w * borderWrap[int3{cx, cy, z}]; - wsum += w; + sum += borderWrap[int3{ix + cx, iy + cy, z}] * (wx[cx + 1] * wy[cy + 1]); } } - sum = (wsum == 0.f) ? cuda::SetAll(0) : sum / wsum; - gold = cuda::SaturateCast(sum); } else if (kInterpType == NVCV_INTERP_AREA) @@ -938,27 +934,25 @@ TYPED_TEST(InterpolationWrap4DTest, correct_grid_unaligned_values_in_host) } else if (kInterpType == NVCV_INTERP_CUBIC) { - int xmin = cuda::round(floatCoord.x - 2.f); - int ymin = cuda::round(floatCoord.y - 2.f); - int xmax = cuda::round(floatCoord.x + 2.f); - int ymax = cuda::round(floatCoord.y + 2.f); + int ix = cuda::round(floatCoord.x); + int iy = cuda::round(floatCoord.y); + using FT = cuda::ConvertBaseTypeTo; auto sum = cuda::SetAll(0); - float w, wsum = 0.f; + float wx[4]; + test::GetBicubicCoeffs(floatCoord.x - ix, wx[0], wx[1], wx[2], wx[3]); + float wy[4]; + test::GetBicubicCoeffs(floatCoord.y - iy, wy[0], wy[1], wy[2], wy[3]); - for (int cy = ymin; cy <= ymax; cy++) + for (int cy = -1; cy <= 2; cy++) { - for (int cx = xmin; cx <= xmax; cx++) + for (int cx = -1; cx <= 2; cx++) { - w = test::GetBicubicCoeff(floatCoord.x - cx) * test::GetBicubicCoeff(floatCoord.y - cy); - sum += w * borderWrap[int4{k, cx, cy, z}]; - wsum += w; + sum += borderWrap[int4{k, ix + cx, iy + cy, z}] * (wx[cx + 1] * wy[cy + 1]); } } - sum = (wsum == 0.f) ? cuda::SetAll(0) : sum / wsum; - gold = cuda::SaturateCast(sum); } else if (kInterpType == NVCV_INTERP_AREA) diff --git a/tests/nvcv_types/cudatools_system/TestTensorWrap.cpp b/tests/nvcv_types/cudatools_system/TestTensorWrap.cpp index 427cf6db0..511f43a6a 100644 --- a/tests/nvcv_types/cudatools_system/TestTensorWrap.cpp +++ b/tests/nvcv_types/cudatools_system/TestTensorWrap.cpp @@ -630,7 +630,8 @@ TEST(Tensor3DWrapBigPitchDeathTest, it_dies) auto dev = tensor.exportData(); ASSERT_NE(dev, nullptr); - EXPECT_DEATH({ cuda::Tensor3DWrap wrap(*dev); }, ""); + using TensorWrap = cuda::Tensor3DWrap; + EXPECT_DEATH({ TensorWrap wrap(*dev); }, ""); } #endif diff --git a/tests/nvcv_types/cudatools_system/TestTypeTraits.cpp b/tests/nvcv_types/cudatools_system/TestTypeTraits.cpp index bac6f4bf6..047955016 100644 --- a/tests/nvcv_types/cudatools_system/TestTypeTraits.cpp +++ b/tests/nvcv_types/cudatools_system/TestTypeTraits.cpp @@ -17,6 +17,7 @@ #include // for NVCV_TYPED_TEST_SUITE_F, etc. #include // for StringLiteral +#include // also object of this test #include // the object of this test #include // for std::numeric_limits, etc. 
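Note on the bicubic reference above: the updated TestInterpolationWrap.cpp computes the gold value as a weighted sum over a fixed 4x4 neighborhood, with the per-axis weights coming from test::GetBicubicCoeffs, instead of summing ad-hoc weights and normalizing by wsum. The following is a minimal, self-contained sketch of that weighting scheme. It assumes the widely used Keys cubic kernel with A = -0.75 as a stand-in for GetBicubicCoeffs, anchors on floor(coord) rather than cuda::round, and uses simple clamp-to-edge borders instead of BorderWrap, so it only illustrates the math and is not the repository helper.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for test::GetBicubicCoeffs: Keys cubic kernel with A = -0.75.
static void getBicubicCoeffs(float d, float &w0, float &w1, float &w2, float &w3)
{
    constexpr float A = -0.75f;
    w0 = ((A * (d + 1.f) - 5.f * A) * (d + 1.f) + 8.f * A) * (d + 1.f) - 4.f * A;
    w1 = ((A + 2.f) * d - (A + 3.f)) * d * d + 1.f;
    w2 = ((A + 2.f) * (1.f - d) - (A + 3.f)) * (1.f - d) * (1.f - d) + 1.f;
    w3 = 1.f - w0 - w1 - w2; // the four weights sum to 1, so no normalization pass is needed
}

// Bicubic sample of a single-channel float image at (x, y), clamping at the border.
static float sampleBicubic(const std::vector<float> &img, int w, int h, float x, float y)
{
    int ix = static_cast<int>(std::floor(x)); // conventional floor-based anchor
    int iy = static_cast<int>(std::floor(y));

    float wx[4], wy[4];
    getBicubicCoeffs(x - ix, wx[0], wx[1], wx[2], wx[3]);
    getBicubicCoeffs(y - iy, wy[0], wy[1], wy[2], wy[3]);

    float sum = 0.f;
    for (int cy = -1; cy <= 2; ++cy)
    {
        int sy = std::clamp(iy + cy, 0, h - 1);
        for (int cx = -1; cx <= 2; ++cx)
        {
            int sx = std::clamp(ix + cx, 0, w - 1);
            sum += img[sy * w + sx] * wx[cx + 1] * wy[cy + 1];
        }
    }
    return sum;
}

int main()
{
    std::vector<float> img(8 * 8);
    for (int i = 0; i < 64; ++i) img[i] = static_cast<float>(i);
    std::printf("bicubic(3.3, 4.7) = %f\n", sampleBicubic(img, 8, 8, 3.3f, 4.7f));
    return 0;
}
```

Because the per-axis weights sum to one, the old wsum accumulation and final divide are unnecessary, which is what the removed lines in the hunks above reflect.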
diff --git a/tests/run_tests.sh.in b/tests/run_tests.sh.in index af178ca4b..c50b66a08 100755 --- a/tests/run_tests.sh.in +++ b/tests/run_tests.sh.in @@ -20,6 +20,7 @@ shopt -s extglob
 # Defaults
 test_set="all"
 curdir=$(dirname "$(readlink -f "$0")")
+failure_sets=""
 
 if [[ $# -ge 1 ]]; then
     test_set=$1
@@ -30,6 +31,10 @@ IFS="," read -r -a test_set <<< "$test_set"
 function on_exit()
 {
     set +e
+    if [ -n "$failure_sets" ]; then
+        echo Tests FAILED: ${failure_sets:1}
+        exit 1
+    fi
 }
 
 function on_error()
@@ -57,7 +62,12 @@ function run()
     do
         if [ "$testgroup" == "$test" ] || [ "$test" == "all" ];then
            echo "Running $testexec test suite..."
+           set +e
            NVCV_LEAK_DETECTION=abort "$curdir/$testexec"
+           if [ $? -ne 0 ]; then
+               failure_sets=$failure_sets,$testexec
+           fi
+           set -e
            return
        fi
    done
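For completeness, the per-pixel math exercised by the updated ResizeCropConvertReformat tests above is: map the destination pixel (plus the crop offset) back into the source image with center-aligned scaling, interpolate, optionally cast the interpolated value back to the source type (srcCast), then apply dst = scale * value + offset. Below is a small standalone sketch of that reference computation, restricted to a single channel with linear interpolation; cuda::SaturateCast is approximated by rounding, and channel reordering, batching, and planar layouts are omitted. The helper name is local to the sketch and is not part of the operator's API.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Map one destination pixel of the fused resize -> crop -> convert chain back to the
// source image and return the (optionally rescaled) value. Single channel, uint8 -> float,
// linear interpolation, borders clamped; a sketch of the reference math, not the operator.
static float resizeCropSample(const std::vector<uint8_t> &src, int srcW, int srcH,
                              int newW, int newH,   // resize target
                              int cropX, int cropY, // crop origin in the resized image
                              int dx, int dy,       // destination (cropped) pixel
                              float scale, float offset, bool srcCast)
{
    float scaleW = static_cast<float>(srcW) / newW;
    float scaleH = static_cast<float>(srcH) / newH;

    // Center-aligned coordinate of the destination pixel in source space.
    float fx = scaleW * (dx + cropX + 0.5f) - 0.5f;
    float fy = scaleH * (dy + cropY + 0.5f) - 0.5f;

    int sx0 = static_cast<int>(std::floor(fx));
    int sy0 = static_cast<int>(std::floor(fy));
    int sx1 = std::min(sx0 + 1, srcW - 1);
    int sy1 = std::min(sy0 + 1, srcH - 1);

    fx -= sx0;
    fy -= sy0;
    sx0 = std::max(0, sx0);
    sy0 = std::max(0, sy0);

    float val = src[sy0 * srcW + sx0] * (1 - fy) * (1 - fx)
              + src[sy0 * srcW + sx1] * (1 - fy) * fx
              + src[sy1 * srcW + sx0] * fy * (1 - fx)
              + src[sy1 * srcW + sx1] * fy * fx;

    // With srcCast == true the interpolated value is rounded back toward the source type
    // before rescaling (approximating a saturating cast for in-range uint8 data); with
    // false, the full-precision interpolated value is rescaled directly.
    if (srcCast)
        val = std::round(val);

    return scale * val + offset;
}

int main()
{
    std::vector<uint8_t> src(32 * 32);
    for (size_t i = 0; i < src.size(); ++i) src[i] = static_cast<uint8_t>(i % 251);

    // Resize 32x32 -> 16x16, crop 8x8 at (4, 4), normalize roughly into [-1, 1].
    float v = resizeCropSample(src, 32, 32, 16, 16, 4, 4, 0, 0, 1.f / 127.5f, -1.f, false);
    std::printf("dst(0,0) = %f\n", v);
    return 0;
}
```

With scale = 1/127.5 and offset = -1, as in test row 52 above, an 8-bit input lands roughly in [-1, 1], which is the normalization the new rescaling rows verify.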