diff --git a/.github/workflows/CI-Linux-x64-Clang.yml b/.github/workflows/CI-Linux-x64-Clang.yml index e33716e..a2a5265 100644 --- a/.github/workflows/CI-Linux-x64-Clang.yml +++ b/.github/workflows/CI-Linux-x64-Clang.yml @@ -13,6 +13,9 @@ on: - LICENSE workflow_dispatch: +env: + GITHUB_ACTIONS: true + jobs: Linux-x64-Clang: strategy: @@ -66,9 +69,14 @@ jobs: cmake -DOpenMP_CXX_FLAGS="-fexceptions -frtti" .. cmake --build . -j 4 - - name: dist + - name: Test run: | cp src/build/realesrgan_ncnn_vulkan_wrapper.*.so src/realesrgan_ncnn_py + pdm install + pdm run test + + - name: dist + run: | mkdir dist cp -r src/realesrgan_ncnn_py dist diff --git a/.github/workflows/CI-Linux-x64-GCC.yml b/.github/workflows/CI-Linux-x64-GCC.yml index cf38ea5..fac431a 100644 --- a/.github/workflows/CI-Linux-x64-GCC.yml +++ b/.github/workflows/CI-Linux-x64-GCC.yml @@ -13,6 +13,9 @@ on: - LICENSE workflow_dispatch: +env: + GITHUB_ACTIONS: true + jobs: Linux-x64-GCC: strategy: @@ -63,9 +66,14 @@ jobs: cmake -DOpenMP_CXX_FLAGS="-fexceptions -frtti" .. cmake --build . -j 4 - - name: dist + - name: Test run: | cp src/build/realesrgan_ncnn_vulkan_wrapper.*.so src/realesrgan_ncnn_py + pdm install + pdm run test + + - name: dist + run: | mkdir dist cp -r src/realesrgan_ncnn_py dist diff --git a/.github/workflows/CI-MacOS-Universal-Clang.yml b/.github/workflows/CI-MacOS-Universal-Clang.yml index 927b31b..e526b7c 100644 --- a/.github/workflows/CI-MacOS-Universal-Clang.yml +++ b/.github/workflows/CI-MacOS-Universal-Clang.yml @@ -14,6 +14,7 @@ on: workflow_dispatch: env: + GITHUB_ACTIONS: true DEVELOPER_DIR: /Applications/Xcode_14.2.app/Contents/Developer jobs: @@ -102,6 +103,11 @@ jobs: if: matrix.python-version == '3.11' run: lipo -create src/build-arm64/realesrgan_ncnn_vulkan_wrapper.cpython-311-darwin.so src/build-x86_64/realesrgan_ncnn_vulkan_wrapper.cpython-311-darwin.so -o src/realesrgan_ncnn_py/realesrgan_ncnn_vulkan_wrapper.cpython-311-darwin.so + - name: Test + run: | + pdm install + pdm run test + - name: dist run: | mkdir dist diff --git a/.github/workflows/CI-Windows-x64-MSVC.yml b/.github/workflows/CI-Windows-x64-MSVC.yml index 62fbac6..8ef7fa0 100644 --- a/.github/workflows/CI-Windows-x64-MSVC.yml +++ b/.github/workflows/CI-Windows-x64-MSVC.yml @@ -13,6 +13,9 @@ on: - LICENSE workflow_dispatch: +env: + GITHUB_ACTIONS: true + jobs: windows: strategy: @@ -63,11 +66,16 @@ jobs: cmake -A x64 -DCMAKE_CXX_FLAGS="-frtti -fexceptions" .. cmake --build . 
--config Release -j 4 - - name: dist + - name: Test run: | - mkdir dist echo F | xcopy .\src\build\Release\realesrgan_ncnn_vulkan_wrapper.*.pyd .\src\realesrgan_ncnn_py echo F | xcopy .\tests\vulkan-1.dll .\src\realesrgan_ncnn_py + pdm install + pdm run test + + - name: dist + run: | + mkdir dist echo D | xcopy .\src\realesrgan_ncnn_py dist - name: upload diff --git a/.github/workflows/Release.yml b/.github/workflows/Release.yml index cc0f1f6..3137b1c 100644 --- a/.github/workflows/Release.yml +++ b/.github/workflows/Release.yml @@ -4,6 +4,7 @@ on: workflow_dispatch: env: + GITHUB_ACTIONS: true DEVELOPER_DIR: /Applications/Xcode_14.2.app/Contents/Developer jobs: diff --git a/.github/workflows/test_pip.yml b/.github/workflows/test_pip.yml new file mode 100644 index 0000000..136a25c --- /dev/null +++ b/.github/workflows/test_pip.yml @@ -0,0 +1,41 @@ +name: test_pip + +on: + schedule: + - cron: "0 19 * 1 *" + workflow_dispatch: + +env: + GITHUB_ACTIONS: true + +jobs: + test: + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + os-version: ["macos-latest", "windows-latest", "ubuntu-20.04"] + + runs-on: ${{ matrix.os-version }} + steps: + - uses: actions/checkout@v3 + with: + submodules: recursive + + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + architecture: x64 + + - name: vulkan dll + if: matrix.os-version == 'windows-latest' + run: echo F | xcopy .\tests\vulkan-1.dll C:\Windows\System32 + + - name: Check Python version + run: | + python --version + + - name: Test + run: | + pip install --upgrade pip chardet + pip install pathlib opencv-python scikit-image Pillow pytest pytest-cov realesrgan-ncnn-py + python -m pytest tests diff --git a/.gitmodules b/.gitmodules index 2d85a92..0fedaa6 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ -[submodule "src/Real-ESRGAN-ncnn-vulkan"] - path = src/Real-ESRGAN-ncnn-vulkan - url = https://github.com/xinntao/Real-ESRGAN-ncnn-vulkan [submodule "src/pybind11"] path = src/pybind11 url = https://github.com/pybind/pybind11 +[submodule "src/Real-ESRGAN-ncnn-vulkan/src/ncnn"] + path = src/Real-ESRGAN-ncnn-vulkan/src/ncnn + url = https://github.com/Tencent/ncnn diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 600f80a..1bfa8c4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,7 +38,7 @@ repos: rev: v1.7.1 hooks: - id: mypy - args: [src, tests] + args: [src/realesrgan_ncnn_py, tests] pass_filenames: false additional_dependencies: - types-requests diff --git a/README.md b/README.md index 5bdbfb9..0956867 100644 --- a/README.md +++ b/README.md @@ -7,18 +7,19 @@ Python Binding for realesrgan-ncnn-py with PyBind11 ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/realesrgan-ncnn-py) Real-ESRGAN aims at developing Practical Algorithms for General Image/Video Restoration. -We extend the powerful ESRGAN to a practical restoration application (namely, Real-ESRGAN), which is trained with pure synthetic data. +We extend the powerful ESRGAN to a practical restoration application (namely, Real-ESRGAN), which is trained with pure +synthetic data. This wrapper provides an easy-to-use interface for running the pre-trained Real-ESRGAN model. 
### Current building status matrix -| System | Status | CPU (32bit) | CPU (64bit) | GPU (32bit) | GPU (64bit) | -| :-----------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------: | :---------: | :---------: | :----------------: | -| Linux (Clang) | [![CI-Linux-x64-Clang](https://github.com/Tohrusky/realesrgan-ncnn-py/actions/workflows/CI-Linux-x64-Clang.yml/badge.svg)](https://github.com/Tohrusky/realesrgan-ncnn-py/actions/workflows/CI-Linux-x64-Clang.yml) | — | — | — | :white_check_mark: | -| Linux (GCC) | [![CI-Linux-x64-GCC](https://github.com/Tohrusky/realesrgan-ncnn-py/actions/workflows/CI-Linux-x64-GCC.yml/badge.svg)](https://github.com/Tohrusky/realesrgan-ncnn-py/actions/workflows/CI-Linux-x64-GCC.yml) | — | — | — | :white_check_mark: | -| Windows | [![CI-Windows-x64-MSVC](https://github.com/Tohrusky/realesrgan-ncnn-py/actions/workflows/CI-Windows-x64-MSVC.yml/badge.svg)](https://github.com/Tohrusky/realesrgan-ncnn-py/actions/workflows/CI-Windows-x64-MSVC.yml) | — | — | — | :white_check_mark: | -| MacOS | [![CI-MacOS-Universal-Clang](https://github.com/Tohrusky/realcugan-ncnn-py/actions/workflows/CI-MacOS-Universal-Clang.yml/badge.svg)](https://github.com/Tohrusky/realcugan-ncnn-py/actions/workflows/CI-MacOS-Universal-Clang.yml) | — | — | — | :white_check_mark: | -| MacOS (ARM) | [![CI-MacOS-Universal-Clang](https://github.com/Tohrusky/realcugan-ncnn-py/actions/workflows/CI-MacOS-Universal-Clang.yml/badge.svg)](https://github.com/Tohrusky/realcugan-ncnn-py/actions/workflows/CI-MacOS-Universal-Clang.yml) | — | — | — | :white_check_mark: | +| System | Status | CPU (32bit) | CPU (64bit) | GPU (32bit) | GPU (64bit) | +| :-----------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------: | :----------------: | :---------: | :----------------: | +| Linux (Clang) | [![CI-Linux-x64-Clang](https://github.com/Tohrusky/realesrgan-ncnn-py/actions/workflows/CI-Linux-x64-Clang.yml/badge.svg)](https://github.com/Tohrusky/realesrgan-ncnn-py/actions/workflows/CI-Linux-x64-Clang.yml) | — | :white_check_mark: | — | :white_check_mark: | +| Linux (GCC) | [![CI-Linux-x64-GCC](https://github.com/Tohrusky/realesrgan-ncnn-py/actions/workflows/CI-Linux-x64-GCC.yml/badge.svg)](https://github.com/Tohrusky/realesrgan-ncnn-py/actions/workflows/CI-Linux-x64-GCC.yml) | — | :white_check_mark: | — | :white_check_mark: | +| Windows | [![CI-Windows-x64-MSVC](https://github.com/Tohrusky/realesrgan-ncnn-py/actions/workflows/CI-Windows-x64-MSVC.yml/badge.svg)](https://github.com/Tohrusky/realesrgan-ncnn-py/actions/workflows/CI-Windows-x64-MSVC.yml) | — | :white_check_mark: | — | :white_check_mark: | +| MacOS | [![CI-MacOS-Universal-Clang](https://github.com/Tohrusky/realcugan-ncnn-py/actions/workflows/CI-MacOS-Universal-Clang.yml/badge.svg)](https://github.com/Tohrusky/realcugan-ncnn-py/actions/workflows/CI-MacOS-Universal-Clang.yml) | — | :white_check_mark: | — | :white_check_mark: | +| MacOS (ARM) | 
[![CI-MacOS-Universal-Clang](https://github.com/Tohrusky/realcugan-ncnn-py/actions/workflows/CI-MacOS-Universal-Clang.yml/badge.svg)](https://github.com/Tohrusky/realcugan-ncnn-py/actions/workflows/CI-MacOS-Universal-Clang.yml) | — | :white_check_mark: | — | :white_check_mark: | # Usage @@ -56,7 +57,8 @@ realesrgan = Realesrgan(gpuid: int = 0, tta_mode: bool = False, tilesize: int = ``` -Here, gpuid specifies the GPU device to use, tta_mode enables test-time augmentation, tilesize specifies the tile size for processing (0 or >= 32), and model specifies the num of the pre-trained model to use. +Here, gpuid specifies the GPU device to use, tta_mode enables test-time augmentation, tilesize specifies the tile size +for processing (0 or >= 32), and model specifies the num of the pre-trained model to use. Once the model is initialized, you can use the upscale method to super-resolve your images: @@ -64,6 +66,7 @@ Once the model is initialized, you can use the upscale method to super-resolve y ```python from PIL import Image + realesrgan = Realesrgan(gpuid=0) with Image.open("input.jpg") as image: image = realesrgan.process_pil(image) @@ -74,6 +77,7 @@ with Image.open("input.jpg") as image: ```python import cv2 + realesrgan = Realesrgan(gpuid=0) image = cv2.imdecode(np.fromfile("input.jpg", dtype=np.uint8), cv2.IMREAD_COLOR) image = realesrgan.process_cv2(image) @@ -84,9 +88,10 @@ cv2.imencode(".jpg", image)[1].tofile("output_cv2.jpg") ```python import subprocess as sp + # your ffmpeg parameters -command_out = [FFMPEG_BIN,........] -command_in = [FFMPEG_BIN,........] +command_out = [FFMPEG_BIN, ........] +command_in = [FFMPEG_BIN, ........] pipe_out = sp.Popen(command_out, stdout=sp.PIPE, bufsize=10 ** 8) pipe_in = sp.Popen(command_in, stdin=sp.PIPE) realesrgan = Realesrgan(gpuid=0) @@ -102,20 +107,27 @@ while True: [here](https://github.com/Tohrusky/realesrgan-ncnn-py/blob/main/.github/workflows/Release.yml) -_The project just only been tested in Ubuntu 18+ and Debian 9+ environments on Linux, so if the project does not work on your system, please try building it._ +_The project just only been tested in Ubuntu 18+ and Debian 9+ environments on Linux, so if the project does not work on +your system, please try building it._ # References The following references were used in the development of this project: -[xinntao/Real-ESRGAN-ncnn-vulkan](https://github.com/xinntao/Real-ESRGAN-ncnn-vulkan) - This project was the main inspiration for our work. It provided the core implementation of the Real-ESRGAN algorithm using the ncnn and Vulkan libraries. +[xinntao/Real-ESRGAN-ncnn-vulkan](https://github.com/xinntao/Real-ESRGAN-ncnn-vulkan) - This project was the main +inspiration for our work. It provided the core implementation of the Real-ESRGAN algorithm using the ncnn and Vulkan +libraries. -[Real-ESRGAN](https://github.com/xinntao/Real-ESRGAN) - Real-ESRGAN is an AI super resolution model, aims at developing Practical Algorithms for General Image/Video Restoration. +[Real-ESRGAN](https://github.com/xinntao/Real-ESRGAN) - Real-ESRGAN is an AI super resolution model, aims at developing +Practical Algorithms for General Image/Video Restoration. -[media2x/realsr-ncnn-vulkan-python](https://github.com/media2x/realsr-ncnn-vulkan-python) - This project was used as a reference for implementing the wrapper. _Special thanks_ to the original author for sharing the code. 
+[media2x/realsr-ncnn-vulkan-python](https://github.com/media2x/realsr-ncnn-vulkan-python) - This project was used as a +reference for implementing the wrapper. _Special thanks_ to the original author for sharing the code. -[ncnn](https://github.com/Tencent/ncnn) - ncnn is a high-performance neural network inference framework developed by Tencent AI Lab. +[ncnn](https://github.com/Tencent/ncnn) - ncnn is a high-performance neural network inference framework developed by +Tencent AI Lab. # License -This project is licensed under the BSD 3-Clause - see the [LICENSE file](https://github.com/Tohrusky/realesrgan-ncnn-py/blob/main/LICENSE) for details. +This project is licensed under the BSD 3-Clause - see +the [LICENSE file](https://github.com/Tohrusky/realesrgan-ncnn-py/blob/main/LICENSE) for details. diff --git a/src/Real-ESRGAN-ncnn-vulkan b/src/Real-ESRGAN-ncnn-vulkan deleted file mode 160000 index 37026f4..0000000 --- a/src/Real-ESRGAN-ncnn-vulkan +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 37026f49824c5cf84062e7c6a5dd71445dcf610f diff --git a/src/Real-ESRGAN-ncnn-vulkan/LICENSE b/src/Real-ESRGAN-ncnn-vulkan/LICENSE new file mode 100644 index 0000000..e8ea6d7 --- /dev/null +++ b/src/Real-ESRGAN-ncnn-vulkan/LICENSE @@ -0,0 +1,46 @@ +The MIT License (MIT) + +Copyright (c) 2021 Xintao Wang + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +------------------------ +The following is the License of realsr-ncnn-vulkan + +The MIT License (MIT) + +Copyright (c) 2019 nihui + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/src/Real-ESRGAN-ncnn-vulkan/src/ncnn b/src/Real-ESRGAN-ncnn-vulkan/src/ncnn new file mode 160000 index 0000000..6125c9f --- /dev/null +++ b/src/Real-ESRGAN-ncnn-vulkan/src/ncnn @@ -0,0 +1 @@ +Subproject commit 6125c9f47cd14b589de0521350668cf9d3d37e3c diff --git a/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan.cpp b/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan.cpp new file mode 100644 index 0000000..ea882c7 --- /dev/null +++ b/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan.cpp @@ -0,0 +1,874 @@ +// realesrgan implemented with ncnn library + +#include "realesrgan.h" + +#include +#include +#include + +static const uint32_t realesrgan_preproc_spv_data[] = { +#include "realesrgan_preproc.spv.hex.h" +}; +static const uint32_t realesrgan_preproc_fp16s_spv_data[] = { +#include "realesrgan_preproc_fp16s.spv.hex.h" +}; +static const uint32_t realesrgan_preproc_int8s_spv_data[] = { +#include "realesrgan_preproc_int8s.spv.hex.h" +}; +static const uint32_t realesrgan_postproc_spv_data[] = { +#include "realesrgan_postproc.spv.hex.h" +}; +static const uint32_t realesrgan_postproc_fp16s_spv_data[] = { +#include "realesrgan_postproc_fp16s.spv.hex.h" +}; +static const uint32_t realesrgan_postproc_int8s_spv_data[] = { +#include "realesrgan_postproc_int8s.spv.hex.h" +}; + +static const uint32_t realesrgan_preproc_tta_spv_data[] = { +#include "realesrgan_preproc_tta.spv.hex.h" +}; +static const uint32_t realesrgan_preproc_tta_fp16s_spv_data[] = { +#include "realesrgan_preproc_tta_fp16s.spv.hex.h" +}; +static const uint32_t realesrgan_preproc_tta_int8s_spv_data[] = { +#include "realesrgan_preproc_tta_int8s.spv.hex.h" +}; +static const uint32_t realesrgan_postproc_tta_spv_data[] = { +#include "realesrgan_postproc_tta.spv.hex.h" +}; +static const uint32_t realesrgan_postproc_tta_fp16s_spv_data[] = { +#include "realesrgan_postproc_tta_fp16s.spv.hex.h" +}; +static const uint32_t realesrgan_postproc_tta_int8s_spv_data[] = { +#include "realesrgan_postproc_tta_int8s.spv.hex.h" +}; + +RealESRGAN::RealESRGAN(int gpuid, bool _tta_mode) { + vkdev = gpuid == -1 ? 0 : ncnn::get_gpu_device(gpuid); + + realesrgan_preproc = 0; + realesrgan_postproc = 0; + bicubic_2x = 0; + bicubic_3x = 0; + bicubic_4x = 0; + tta_mode = _tta_mode; +} + +RealESRGAN::~RealESRGAN() { + // cleanup preprocess and postprocess pipeline + { + delete realesrgan_preproc; + delete realesrgan_postproc; + } + + bicubic_2x->destroy_pipeline(net.opt); + delete bicubic_2x; + + bicubic_3x->destroy_pipeline(net.opt); + delete bicubic_3x; + + bicubic_4x->destroy_pipeline(net.opt); + delete bicubic_4x; +} + +#if _WIN32 +int RealESRGAN::load(const std::wstring& parampath, const std::wstring& modelpath) +#else + +int RealESRGAN::load(const std::string ¶mpath, const std::string &modelpath) +#endif +{ + net.opt.use_vulkan_compute = vkdev ? true : false; + net.opt.use_fp16_packed = true; + net.opt.use_fp16_storage = vkdev ? 
true : false; + net.opt.use_fp16_arithmetic = false; + net.opt.use_int8_storage = true; + net.opt.use_int8_arithmetic = false; + + net.set_vulkan_device(vkdev); + +#if _WIN32 + { + FILE* fp = _wfopen(parampath.c_str(), L"rb"); + if (!fp) + { + fwprintf(stderr, L"_wfopen %ls failed\n", parampath.c_str()); + } + + net.load_param(fp); + + fclose(fp); + } + { + FILE* fp = _wfopen(modelpath.c_str(), L"rb"); + if (!fp) + { + fwprintf(stderr, L"_wfopen %ls failed\n", modelpath.c_str()); + } + + net.load_model(fp); + + fclose(fp); + } +#else + net.load_param(parampath.c_str()); + net.load_model(modelpath.c_str()); +#endif + + if (vkdev) + // initialize preprocess and postprocess pipeline + { + std::vector specializations(1); +#if _WIN32 + specializations[0].i = 1; +#else + specializations[0].i = 0; +#endif + + realesrgan_preproc = new ncnn::Pipeline(vkdev); + realesrgan_preproc->set_optimal_local_size_xyz(32, 32, 3); + + realesrgan_postproc = new ncnn::Pipeline(vkdev); + realesrgan_postproc->set_optimal_local_size_xyz(32, 32, 3); + + if (tta_mode) { + if (net.opt.use_fp16_storage && net.opt.use_int8_storage) + realesrgan_preproc->create(realesrgan_preproc_tta_int8s_spv_data, + sizeof(realesrgan_preproc_tta_int8s_spv_data), specializations); + else if (net.opt.use_fp16_storage) + realesrgan_preproc->create(realesrgan_preproc_tta_fp16s_spv_data, + sizeof(realesrgan_preproc_tta_fp16s_spv_data), specializations); + else + realesrgan_preproc->create(realesrgan_preproc_tta_spv_data, sizeof(realesrgan_preproc_tta_spv_data), + specializations); + + if (net.opt.use_fp16_storage && net.opt.use_int8_storage) + realesrgan_postproc->create(realesrgan_postproc_tta_int8s_spv_data, + sizeof(realesrgan_postproc_tta_int8s_spv_data), specializations); + else if (net.opt.use_fp16_storage) + realesrgan_postproc->create(realesrgan_postproc_tta_fp16s_spv_data, + sizeof(realesrgan_postproc_tta_fp16s_spv_data), specializations); + else + realesrgan_postproc->create(realesrgan_postproc_tta_spv_data, sizeof(realesrgan_postproc_tta_spv_data), + specializations); + } else { + if (net.opt.use_fp16_storage && net.opt.use_int8_storage) + realesrgan_preproc->create(realesrgan_preproc_int8s_spv_data, sizeof(realesrgan_preproc_int8s_spv_data), + specializations); + else if (net.opt.use_fp16_storage) + realesrgan_preproc->create(realesrgan_preproc_fp16s_spv_data, sizeof(realesrgan_preproc_fp16s_spv_data), + specializations); + else + realesrgan_preproc->create(realesrgan_preproc_spv_data, sizeof(realesrgan_preproc_spv_data), + specializations); + + if (net.opt.use_fp16_storage && net.opt.use_int8_storage) + realesrgan_postproc->create(realesrgan_postproc_int8s_spv_data, + sizeof(realesrgan_postproc_int8s_spv_data), specializations); + else if (net.opt.use_fp16_storage) + realesrgan_postproc->create(realesrgan_postproc_fp16s_spv_data, + sizeof(realesrgan_postproc_fp16s_spv_data), specializations); + else + realesrgan_postproc->create(realesrgan_postproc_spv_data, sizeof(realesrgan_postproc_spv_data), + specializations); + } + } + + // bicubic 2x/3x/4x for alpha channel + { + bicubic_2x = ncnn::create_layer("Interp"); + bicubic_2x->vkdev = vkdev; + + ncnn::ParamDict pd; + pd.set(0, 3);// bicubic + pd.set(1, 2.f); + pd.set(2, 2.f); + bicubic_2x->load_param(pd); + + bicubic_2x->create_pipeline(net.opt); + } + { + bicubic_3x = ncnn::create_layer("Interp"); + bicubic_3x->vkdev = vkdev; + + ncnn::ParamDict pd; + pd.set(0, 3);// bicubic + pd.set(1, 3.f); + pd.set(2, 3.f); + bicubic_3x->load_param(pd); + + 
bicubic_3x->create_pipeline(net.opt); + } + { + bicubic_4x = ncnn::create_layer("Interp"); + bicubic_4x->vkdev = vkdev; + + ncnn::ParamDict pd; + pd.set(0, 3);// bicubic + pd.set(1, 4.f); + pd.set(2, 4.f); + bicubic_4x->load_param(pd); + + bicubic_4x->create_pipeline(net.opt); + } + + return 0; +} + +int RealESRGAN::process(const ncnn::Mat &inimage, ncnn::Mat &outimage) const { + if (!vkdev) { + // cpu only + return process_cpu(inimage, outimage); + } + + + const unsigned char *pixeldata = (const unsigned char *) inimage.data; + const int w = inimage.w; + const int h = inimage.h; + const int channels = inimage.elempack; + + const int TILE_SIZE_X = tilesize; + const int TILE_SIZE_Y = tilesize; + + ncnn::VkAllocator *blob_vkallocator = net.vulkan_device()->acquire_blob_allocator(); + ncnn::VkAllocator *staging_vkallocator = net.vulkan_device()->acquire_staging_allocator(); + + ncnn::Option opt = net.opt; + opt.blob_vkallocator = blob_vkallocator; + opt.workspace_vkallocator = blob_vkallocator; + opt.staging_vkallocator = staging_vkallocator; + + // each tile 100x100 + const int xtiles = (w + TILE_SIZE_X - 1) / TILE_SIZE_X; + const int ytiles = (h + TILE_SIZE_Y - 1) / TILE_SIZE_Y; + + const size_t in_out_tile_elemsize = opt.use_fp16_storage ? 2u : 4u; + + //#pragma omp parallel for num_threads(2) + for (int yi = 0; yi < ytiles; yi++) { + const int tile_h_nopad = std::min((yi + 1) * TILE_SIZE_Y, h) - yi * TILE_SIZE_Y; + + int in_tile_y0 = std::max(yi * TILE_SIZE_Y - prepadding, 0); + int in_tile_y1 = std::min((yi + 1) * TILE_SIZE_Y + prepadding, h); + + ncnn::Mat in; + if (opt.use_fp16_storage && opt.use_int8_storage) { + in = ncnn::Mat(w, (in_tile_y1 - in_tile_y0), (unsigned char *) pixeldata + in_tile_y0 * w * channels, + (size_t) channels, 1); + } else { + if (channels == 3) { + in = ncnn::Mat::from_pixels(pixeldata + in_tile_y0 * w * channels, ncnn::Mat::PIXEL_RGB, w, + (in_tile_y1 - in_tile_y0)); + } + if (channels == 4) { + in = ncnn::Mat::from_pixels(pixeldata + in_tile_y0 * w * channels, ncnn::Mat::PIXEL_RGBA, w, + (in_tile_y1 - in_tile_y0)); + } + } + + ncnn::VkCompute cmd(net.vulkan_device()); + + // upload + ncnn::VkMat in_gpu; + { + cmd.record_clone(in, in_gpu, opt); + + if (xtiles > 1) { + cmd.submit_and_wait(); + cmd.reset(); + } + } + + int out_tile_y0 = std::max(yi * TILE_SIZE_Y, 0); + int out_tile_y1 = std::min((yi + 1) * TILE_SIZE_Y, h); + + ncnn::VkMat out_gpu; + if (opt.use_fp16_storage && opt.use_int8_storage) { + out_gpu.create(w * scale, (out_tile_y1 - out_tile_y0) * scale, (size_t) channels, 1, blob_vkallocator); + } else { + out_gpu.create(w * scale, (out_tile_y1 - out_tile_y0) * scale, channels, (size_t) 4u, 1, blob_vkallocator); + } + + for (int xi = 0; xi < xtiles; xi++) { + const int tile_w_nopad = std::min((xi + 1) * TILE_SIZE_X, w) - xi * TILE_SIZE_X; + + if (tta_mode) { + // preproc + ncnn::VkMat in_tile_gpu[8]; + ncnn::VkMat in_alpha_tile_gpu; + { + // crop tile + int tile_x0 = xi * TILE_SIZE_X - prepadding; + int tile_x1 = std::min((xi + 1) * TILE_SIZE_X, w) + prepadding; + int tile_y0 = yi * TILE_SIZE_Y - prepadding; + int tile_y1 = std::min((yi + 1) * TILE_SIZE_Y, h) + prepadding; + + in_tile_gpu[0].create(tile_x1 - tile_x0, tile_y1 - tile_y0, 3, in_out_tile_elemsize, 1, + blob_vkallocator); + in_tile_gpu[1].create(tile_x1 - tile_x0, tile_y1 - tile_y0, 3, in_out_tile_elemsize, 1, + blob_vkallocator); + in_tile_gpu[2].create(tile_x1 - tile_x0, tile_y1 - tile_y0, 3, in_out_tile_elemsize, 1, + blob_vkallocator); + in_tile_gpu[3].create(tile_x1 - tile_x0, 
tile_y1 - tile_y0, 3, in_out_tile_elemsize, 1, + blob_vkallocator); + in_tile_gpu[4].create(tile_y1 - tile_y0, tile_x1 - tile_x0, 3, in_out_tile_elemsize, 1, + blob_vkallocator); + in_tile_gpu[5].create(tile_y1 - tile_y0, tile_x1 - tile_x0, 3, in_out_tile_elemsize, 1, + blob_vkallocator); + in_tile_gpu[6].create(tile_y1 - tile_y0, tile_x1 - tile_x0, 3, in_out_tile_elemsize, 1, + blob_vkallocator); + in_tile_gpu[7].create(tile_y1 - tile_y0, tile_x1 - tile_x0, 3, in_out_tile_elemsize, 1, + blob_vkallocator); + + if (channels == 4) { + in_alpha_tile_gpu.create(tile_w_nopad, tile_h_nopad, 1, in_out_tile_elemsize, 1, + blob_vkallocator); + } + + std::vector bindings(10); + bindings[0] = in_gpu; + bindings[1] = in_tile_gpu[0]; + bindings[2] = in_tile_gpu[1]; + bindings[3] = in_tile_gpu[2]; + bindings[4] = in_tile_gpu[3]; + bindings[5] = in_tile_gpu[4]; + bindings[6] = in_tile_gpu[5]; + bindings[7] = in_tile_gpu[6]; + bindings[8] = in_tile_gpu[7]; + bindings[9] = in_alpha_tile_gpu; + + std::vector constants(13); + constants[0].i = in_gpu.w; + constants[1].i = in_gpu.h; + constants[2].i = in_gpu.cstep; + constants[3].i = in_tile_gpu[0].w; + constants[4].i = in_tile_gpu[0].h; + constants[5].i = in_tile_gpu[0].cstep; + constants[6].i = prepadding; + constants[7].i = prepadding; + constants[8].i = xi * TILE_SIZE_X; + constants[9].i = std::min(yi * TILE_SIZE_Y, prepadding); + constants[10].i = channels; + constants[11].i = in_alpha_tile_gpu.w; + constants[12].i = in_alpha_tile_gpu.h; + + ncnn::VkMat dispatcher; + dispatcher.w = in_tile_gpu[0].w; + dispatcher.h = in_tile_gpu[0].h; + dispatcher.c = channels; + + cmd.record_pipeline(realesrgan_preproc, bindings, constants, dispatcher); + } + + // realesrgan + ncnn::VkMat out_tile_gpu[8]; + for (int ti = 0; ti < 8; ti++) { + ncnn::Extractor ex = net.create_extractor(); + + ex.set_blob_vkallocator(blob_vkallocator); + ex.set_workspace_vkallocator(blob_vkallocator); + ex.set_staging_vkallocator(staging_vkallocator); + + ex.input("data", in_tile_gpu[ti]); + + ex.extract("output", out_tile_gpu[ti], cmd); + + { + cmd.submit_and_wait(); + cmd.reset(); + } + } + + ncnn::VkMat out_alpha_tile_gpu; + if (channels == 4) { + if (scale == 1) { + out_alpha_tile_gpu = in_alpha_tile_gpu; + } + if (scale == 2) { + bicubic_2x->forward(in_alpha_tile_gpu, out_alpha_tile_gpu, cmd, opt); + } + if (scale == 3) { + bicubic_3x->forward(in_alpha_tile_gpu, out_alpha_tile_gpu, cmd, opt); + } + if (scale == 4) { + bicubic_4x->forward(in_alpha_tile_gpu, out_alpha_tile_gpu, cmd, opt); + } + } + + // postproc + { + std::vector bindings(10); + bindings[0] = out_tile_gpu[0]; + bindings[1] = out_tile_gpu[1]; + bindings[2] = out_tile_gpu[2]; + bindings[3] = out_tile_gpu[3]; + bindings[4] = out_tile_gpu[4]; + bindings[5] = out_tile_gpu[5]; + bindings[6] = out_tile_gpu[6]; + bindings[7] = out_tile_gpu[7]; + bindings[8] = out_alpha_tile_gpu; + bindings[9] = out_gpu; + + std::vector constants(13); + constants[0].i = out_tile_gpu[0].w; + constants[1].i = out_tile_gpu[0].h; + constants[2].i = out_tile_gpu[0].cstep; + constants[3].i = out_gpu.w; + constants[4].i = out_gpu.h; + constants[5].i = out_gpu.cstep; + constants[6].i = xi * TILE_SIZE_X * scale; + constants[7].i = std::min(TILE_SIZE_X * scale, out_gpu.w - xi * TILE_SIZE_X * scale); + constants[8].i = prepadding * scale; + constants[9].i = prepadding * scale; + constants[10].i = channels; + constants[11].i = out_alpha_tile_gpu.w; + constants[12].i = out_alpha_tile_gpu.h; + + ncnn::VkMat dispatcher; + dispatcher.w = std::min(TILE_SIZE_X * 
scale, out_gpu.w - xi * TILE_SIZE_X * scale); + dispatcher.h = out_gpu.h; + dispatcher.c = channels; + + cmd.record_pipeline(realesrgan_postproc, bindings, constants, dispatcher); + } + } else { + // preproc + ncnn::VkMat in_tile_gpu; + ncnn::VkMat in_alpha_tile_gpu; + { + // crop tile + int tile_x0 = xi * TILE_SIZE_X - prepadding; + int tile_x1 = std::min((xi + 1) * TILE_SIZE_X, w) + prepadding; + int tile_y0 = yi * TILE_SIZE_Y - prepadding; + int tile_y1 = std::min((yi + 1) * TILE_SIZE_Y, h) + prepadding; + + in_tile_gpu.create(tile_x1 - tile_x0, tile_y1 - tile_y0, 3, in_out_tile_elemsize, 1, + blob_vkallocator); + + if (channels == 4) { + in_alpha_tile_gpu.create(tile_w_nopad, tile_h_nopad, 1, in_out_tile_elemsize, 1, + blob_vkallocator); + } + + std::vector bindings(3); + bindings[0] = in_gpu; + bindings[1] = in_tile_gpu; + bindings[2] = in_alpha_tile_gpu; + + std::vector constants(13); + constants[0].i = in_gpu.w; + constants[1].i = in_gpu.h; + constants[2].i = in_gpu.cstep; + constants[3].i = in_tile_gpu.w; + constants[4].i = in_tile_gpu.h; + constants[5].i = in_tile_gpu.cstep; + constants[6].i = prepadding; + constants[7].i = prepadding; + constants[8].i = xi * TILE_SIZE_X; + constants[9].i = std::min(yi * TILE_SIZE_Y, prepadding); + constants[10].i = channels; + constants[11].i = in_alpha_tile_gpu.w; + constants[12].i = in_alpha_tile_gpu.h; + + ncnn::VkMat dispatcher; + dispatcher.w = in_tile_gpu.w; + dispatcher.h = in_tile_gpu.h; + dispatcher.c = channels; + + cmd.record_pipeline(realesrgan_preproc, bindings, constants, dispatcher); + } + + // realesrgan + ncnn::VkMat out_tile_gpu; + { + ncnn::Extractor ex = net.create_extractor(); + + ex.set_blob_vkallocator(blob_vkallocator); + ex.set_workspace_vkallocator(blob_vkallocator); + ex.set_staging_vkallocator(staging_vkallocator); + + ex.input("data", in_tile_gpu); + + ex.extract("output", out_tile_gpu, cmd); + } + + ncnn::VkMat out_alpha_tile_gpu; + if (channels == 4) { + if (scale == 1) { + out_alpha_tile_gpu = in_alpha_tile_gpu; + } + if (scale == 2) { + bicubic_2x->forward(in_alpha_tile_gpu, out_alpha_tile_gpu, cmd, opt); + } + if (scale == 3) { + bicubic_3x->forward(in_alpha_tile_gpu, out_alpha_tile_gpu, cmd, opt); + } + if (scale == 4) { + bicubic_4x->forward(in_alpha_tile_gpu, out_alpha_tile_gpu, cmd, opt); + } + } + + // postproc + { + std::vector bindings(3); + bindings[0] = out_tile_gpu; + bindings[1] = out_alpha_tile_gpu; + bindings[2] = out_gpu; + + std::vector constants(13); + constants[0].i = out_tile_gpu.w; + constants[1].i = out_tile_gpu.h; + constants[2].i = out_tile_gpu.cstep; + constants[3].i = out_gpu.w; + constants[4].i = out_gpu.h; + constants[5].i = out_gpu.cstep; + constants[6].i = xi * TILE_SIZE_X * scale; + constants[7].i = std::min(TILE_SIZE_X * scale, out_gpu.w - xi * TILE_SIZE_X * scale); + constants[8].i = prepadding * scale; + constants[9].i = prepadding * scale; + constants[10].i = channels; + constants[11].i = out_alpha_tile_gpu.w; + constants[12].i = out_alpha_tile_gpu.h; + + ncnn::VkMat dispatcher; + dispatcher.w = std::min(TILE_SIZE_X * scale, out_gpu.w - xi * TILE_SIZE_X * scale); + dispatcher.h = out_gpu.h; + dispatcher.c = channels; + + cmd.record_pipeline(realesrgan_postproc, bindings, constants, dispatcher); + } + } + + if (xtiles > 1) { + cmd.submit_and_wait(); + cmd.reset(); + } + + fprintf(stderr, "%.2f%%\n", (float) (yi * xtiles + xi) / (ytiles * xtiles) * 100); + } + + // download + { + ncnn::Mat out; + + if (opt.use_fp16_storage && opt.use_int8_storage) { + out = ncnn::Mat(out_gpu.w, 
out_gpu.h, + (unsigned char *) outimage.data + yi * scale * TILE_SIZE_Y * w * scale * channels, + (size_t) channels, 1); + } + + cmd.record_clone(out_gpu, out, opt); + + cmd.submit_and_wait(); + + if (!(opt.use_fp16_storage && opt.use_int8_storage)) { + if (channels == 3) { + out.to_pixels((unsigned char *) outimage.data + yi * scale * TILE_SIZE_Y * w * scale * channels, + ncnn::Mat::PIXEL_RGB); + } + if (channels == 4) { + out.to_pixels((unsigned char *) outimage.data + yi * scale * TILE_SIZE_Y * w * scale * channels, + ncnn::Mat::PIXEL_RGBA); + } + } + } + } + + net.vulkan_device()->reclaim_blob_allocator(blob_vkallocator); + net.vulkan_device()->reclaim_staging_allocator(staging_vkallocator); + + return 0; +} + +int RealESRGAN::process_cpu(const ncnn::Mat &inimage, ncnn::Mat &outimage) const { + const unsigned char *pixeldata = (const unsigned char *) inimage.data; + const int w = inimage.w; + const int h = inimage.h; + const int channels = inimage.elempack; + + const int TILE_SIZE_X = tilesize; + const int TILE_SIZE_Y = tilesize; + + ncnn::Option opt = net.opt; + + // each tile 100x100 + const int xtiles = (w + TILE_SIZE_X - 1) / TILE_SIZE_X; + const int ytiles = (h + TILE_SIZE_Y - 1) / TILE_SIZE_Y; + + for (int yi = 0; yi < ytiles; yi++) { + const int tile_h_nopad = std::min((yi + 1) * TILE_SIZE_Y, h) - yi * TILE_SIZE_Y; + + int in_tile_y0 = std::max(yi * TILE_SIZE_Y - prepadding, 0); + int in_tile_y1 = std::min((yi + 1) * TILE_SIZE_Y + prepadding, h); + + for (int xi = 0; xi < xtiles; xi++) { + const int tile_w_nopad = std::min((xi + 1) * TILE_SIZE_X, w) - xi * TILE_SIZE_X; + + int in_tile_x0 = std::max(xi * TILE_SIZE_X - prepadding, 0); + int in_tile_x1 = std::min((xi + 1) * TILE_SIZE_X + prepadding, w); + + // crop tile + ncnn::Mat in; + { + if (channels == 3) { + in = ncnn::Mat::from_pixels_roi(pixeldata, ncnn::Mat::PIXEL_RGB, w, h, in_tile_x0, in_tile_y0, + in_tile_x1 - in_tile_x0, in_tile_y1 - in_tile_y0); + } + if (channels == 4) { + in = ncnn::Mat::from_pixels_roi(pixeldata, ncnn::Mat::PIXEL_RGBA, w, h, in_tile_x0, in_tile_y0, + in_tile_x1 - in_tile_x0, in_tile_y1 - in_tile_y0); + } + } + + ncnn::Mat out; + + if (tta_mode) { + // split alpha and preproc + ncnn::Mat in_tile[8]; + ncnn::Mat in_alpha_tile; + { + in_tile[0].create(in.w, in.h, 3); + for (int q = 0; q < 3; q++) { + const float *ptr = in.channel(q); + float *outptr0 = in_tile[0].channel(q); + + for (int i = 0; i < in.h; i++) { + for (int j = 0; j < in.w; j++) { + *outptr0++ = *ptr++ * (1 / 255.f); + } + } + } + + if (channels == 4) { + in_alpha_tile = in.channel_range(3, 1).clone(); + } + } + + // border padding + { + int pad_top = std::max(prepadding - yi * TILE_SIZE_Y, 0); + int pad_bottom = std::max(std::min((yi + 1) * TILE_SIZE_Y + prepadding - h, prepadding), 0); + int pad_left = std::max(prepadding - xi * TILE_SIZE_X, 0); + int pad_right = std::max(std::min((xi + 1) * TILE_SIZE_X + prepadding - w, prepadding), 0); + + ncnn::Mat in_tile_padded; + ncnn::copy_make_border(in_tile[0], in_tile_padded, pad_top, pad_bottom, pad_left, pad_right, 2, 0.f, + net.opt); + in_tile[0] = in_tile_padded; + } + + // the other 7 directions + { + in_tile[1].create(in_tile[0].w, in_tile[0].h, 3); + in_tile[2].create(in_tile[0].w, in_tile[0].h, 3); + in_tile[3].create(in_tile[0].w, in_tile[0].h, 3); + in_tile[4].create(in_tile[0].h, in_tile[0].w, 3); + in_tile[5].create(in_tile[0].h, in_tile[0].w, 3); + in_tile[6].create(in_tile[0].h, in_tile[0].w, 3); + in_tile[7].create(in_tile[0].h, in_tile[0].w, 3); + + for (int q = 0; q < 3; 
q++) { + const ncnn::Mat in_tile_0 = in_tile[0].channel(q); + ncnn::Mat in_tile_1 = in_tile[1].channel(q); + ncnn::Mat in_tile_2 = in_tile[2].channel(q); + ncnn::Mat in_tile_3 = in_tile[3].channel(q); + ncnn::Mat in_tile_4 = in_tile[4].channel(q); + ncnn::Mat in_tile_5 = in_tile[5].channel(q); + ncnn::Mat in_tile_6 = in_tile[6].channel(q); + ncnn::Mat in_tile_7 = in_tile[7].channel(q); + + for (int i = 0; i < in_tile[0].h; i++) { + const float *outptr0 = in_tile_0.row(i); + float *outptr1 = in_tile_1.row(in_tile[0].h - 1 - i); + float *outptr2 = in_tile_2.row(i) + in_tile[0].w - 1; + float *outptr3 = in_tile_3.row(in_tile[0].h - 1 - i) + in_tile[0].w - 1; + + for (int j = 0; j < in_tile[0].w; j++) { + float *outptr4 = in_tile_4.row(j) + i; + float *outptr5 = in_tile_5.row(in_tile[0].w - 1 - j) + i; + float *outptr6 = in_tile_6.row(j) + in_tile[0].h - 1 - i; + float *outptr7 = in_tile_7.row(in_tile[0].w - 1 - j) + in_tile[0].h - 1 - i; + + float v = *outptr0++; + + *outptr1++ = v; + *outptr2-- = v; + *outptr3-- = v; + *outptr4 = v; + *outptr5 = v; + *outptr6 = v; + *outptr7 = v; + } + } + } + } + + // realsr + ncnn::Mat out_tile[8]; + for (int ti = 0; ti < 8; ti++) { + ncnn::Extractor ex = net.create_extractor(); + + ex.input("data", in_tile[ti]); + + ex.extract("output", out_tile[ti]); + } + + ncnn::Mat out_alpha_tile; + if (channels == 4) { + if (scale == 1) { + out_alpha_tile = in_alpha_tile; + } + if (scale == 2) { + bicubic_2x->forward(in_alpha_tile, out_alpha_tile, opt); + } + if (scale == 3) { + bicubic_3x->forward(in_alpha_tile, out_alpha_tile, opt); + } + if (scale == 4) { + bicubic_4x->forward(in_alpha_tile, out_alpha_tile, opt); + } + } + + // postproc and merge alpha + { + out.create(tile_w_nopad * scale, tile_h_nopad * scale, channels); + for (int q = 0; q < 3; q++) { + const ncnn::Mat out_tile_0 = out_tile[0].channel(q); + const ncnn::Mat out_tile_1 = out_tile[1].channel(q); + const ncnn::Mat out_tile_2 = out_tile[2].channel(q); + const ncnn::Mat out_tile_3 = out_tile[3].channel(q); + const ncnn::Mat out_tile_4 = out_tile[4].channel(q); + const ncnn::Mat out_tile_5 = out_tile[5].channel(q); + const ncnn::Mat out_tile_6 = out_tile[6].channel(q); + const ncnn::Mat out_tile_7 = out_tile[7].channel(q); + float *outptr = out.channel(q); + + for (int i = 0; i < out.h; i++) { + const float *ptr0 = out_tile_0.row(i + prepadding * scale) + prepadding * scale; + const float *ptr1 = + out_tile_1.row(out_tile[0].h - 1 - i - prepadding * scale) + prepadding * scale; + const float *ptr2 = + out_tile_2.row(i + prepadding * scale) + out_tile[0].w - 1 - prepadding * scale; + const float *ptr3 = + out_tile_3.row(out_tile[0].h - 1 - i - prepadding * scale) + out_tile[0].w - 1 - + prepadding * scale; + + for (int j = 0; j < out.w; j++) { + const float *ptr4 = out_tile_4.row(j + prepadding * scale) + i + prepadding * scale; + const float *ptr5 = out_tile_5.row(out_tile[0].w - 1 - j - prepadding * scale) + i + + prepadding * scale; + const float *ptr6 = out_tile_6.row(j + prepadding * scale) + out_tile[0].h - 1 - i - + prepadding * scale; + const float *ptr7 = + out_tile_7.row(out_tile[0].w - 1 - j - prepadding * scale) + out_tile[0].h - 1 - + i - prepadding * scale; + + float v = (*ptr0++ + *ptr1++ + *ptr2-- + *ptr3-- + *ptr4 + *ptr5 + *ptr6 + *ptr7) / 8; + + *outptr++ = v * 255.f + 0.5f; + } + } + } + + if (channels == 4) { + memcpy(out.channel_range(3, 1), out_alpha_tile, out_alpha_tile.total() * sizeof(float)); + } + } + } else { + // split alpha and preproc + ncnn::Mat in_tile; + ncnn::Mat 
in_alpha_tile; + { + in_tile.create(in.w, in.h, 3); + for (int q = 0; q < 3; q++) { + const float *ptr = in.channel(q); + float *outptr = in_tile.channel(q); + + for (int i = 0; i < in.w * in.h; i++) { + *outptr++ = *ptr++ * (1 / 255.f); + } + } + + if (channels == 4) { + in_alpha_tile = in.channel_range(3, 1).clone(); + } + } + + // border padding + { + int pad_top = std::max(prepadding - yi * TILE_SIZE_Y, 0); + int pad_bottom = std::max(std::min((yi + 1) * TILE_SIZE_Y + prepadding - h, prepadding), 0); + int pad_left = std::max(prepadding - xi * TILE_SIZE_X, 0); + int pad_right = std::max(std::min((xi + 1) * TILE_SIZE_X + prepadding - w, prepadding), 0); + + ncnn::Mat in_tile_padded; + ncnn::copy_make_border(in_tile, in_tile_padded, pad_top, pad_bottom, pad_left, pad_right, 2, 0.f, + net.opt); + in_tile = in_tile_padded; + } + + // realsr + ncnn::Mat out_tile; + { + ncnn::Extractor ex = net.create_extractor(); + + ex.input("data", in_tile); + + ex.extract("output", out_tile); + } + + ncnn::Mat out_alpha_tile; + if (channels == 4) { + if (scale == 1) { + out_alpha_tile = in_alpha_tile; + } + if (scale == 2) { + bicubic_2x->forward(in_alpha_tile, out_alpha_tile, opt); + } + if (scale == 3) { + bicubic_3x->forward(in_alpha_tile, out_alpha_tile, opt); + } + if (scale == 4) { + bicubic_4x->forward(in_alpha_tile, out_alpha_tile, opt); + } + } + + // postproc and merge alpha + { + out.create(tile_w_nopad * scale, tile_h_nopad * scale, channels); + for (int q = 0; q < 3; q++) { + float *outptr = out.channel(q); + + for (int i = 0; i < out.h; i++) { + const float *ptr = out_tile.channel(q).row(i + prepadding * scale) + prepadding * scale; + + for (int j = 0; j < out.w; j++) { + *outptr++ = *ptr++ * 255.f + 0.5f; + } + } + } + + if (channels == 4) { + memcpy(out.channel_range(3, 1), out_alpha_tile, out_alpha_tile.total() * sizeof(float)); + } + } + } + + { + if (channels == 3) { + out.to_pixels((unsigned char *) outimage.data + yi * scale * TILE_SIZE_Y * w * scale * channels + + xi * scale * TILE_SIZE_X * channels, ncnn::Mat::PIXEL_RGB, w * scale * channels); + } + if (channels == 4) { + out.to_pixels((unsigned char *) outimage.data + yi * scale * TILE_SIZE_Y * w * scale * channels + + xi * scale * TILE_SIZE_X * channels, ncnn::Mat::PIXEL_RGBA, w * scale * channels); + } + } + + fprintf(stderr, "%.2f%%\n", (float) (yi * xtiles + xi) / (ytiles * xtiles) * 100); + } + } + + return 0; +} diff --git a/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan.h b/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan.h new file mode 100644 index 0000000..dc39048 --- /dev/null +++ b/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan.h @@ -0,0 +1,48 @@ +// realesrgan implemented with ncnn library + +#ifndef REALESRGAN_H +#define REALESRGAN_H + +#include + +// ncnn +#include "net.h" +#include "gpu.h" +#include "layer.h" + +class RealESRGAN { +public: + RealESRGAN(int gpuid, bool tta_mode = false); + + ~RealESRGAN(); + +#if _WIN32 + int load(const std::wstring& parampath, const std::wstring& modelpath); +#else + + int load(const std::string ¶mpath, const std::string &modelpath); + +#endif + + int process(const ncnn::Mat &inimage, ncnn::Mat &outimage) const; + + int process_cpu(const ncnn::Mat &inimage, ncnn::Mat &outimage) const; + +public: + // realesrgan parameters + int scale; + int tilesize; + int prepadding; + +private: + ncnn::VulkanDevice *vkdev; + ncnn::Net net; + ncnn::Pipeline *realesrgan_preproc; + ncnn::Pipeline *realesrgan_postproc; + ncnn::Layer *bicubic_2x; + ncnn::Layer *bicubic_3x; + ncnn::Layer *bicubic_4x; + bool 
tta_mode; +}; + +#endif // REALESRGAN_H diff --git a/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan_postproc.comp b/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan_postproc.comp new file mode 100644 index 0000000..39eda23 --- /dev/null +++ b/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan_postproc.comp @@ -0,0 +1,89 @@ + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#define sfp float16_t +#else +#define sfp float +#endif + +#if NCNN_int8_storage +#extension GL_EXT_shader_8bit_storage: require +#endif + +layout (constant_id = 0) const int bgr = 0; + +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) readonly buffer alpha_blob { sfp alpha_blob_data[]; }; +#if NCNN_int8_storage +layout (binding = 2) writeonly buffer top_blob { uint8_t top_blob_data[]; }; +#else +layout (binding = 2) writeonly buffer top_blob { float top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int w; + int h; + int cstep; + + int outw; + int outh; + int outcstep; + + int offset_x; + int gx_max; + + int crop_x; + int crop_y; + + int channels; + + int alphaw; + int alphah; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.gx_max || gy >= p.outh || gz >= p.channels) + return; + + float v; + + if (gz == 3) + { + v = float(alpha_blob_data[gy * p.alphaw + gx]); + } + else + { + v = float(bottom_blob_data[gz * p.cstep + (gy + p.crop_y) * p.w + gx + p.crop_x]); + + const float denorm_val = 255.f; + + v = v * denorm_val; + } + + const float clip_eps = 0.5f; + + v = v + clip_eps; + +#if NCNN_int8_storage + int v_offset = gy * p.outw + gx + p.offset_x; + + uint v32 = clamp(uint(floor(v)), 0, 255); + + if (bgr == 1 && gz != 3) + top_blob_data[v_offset * p.channels + 2 - gz] = uint8_t(v32); + else + top_blob_data[v_offset * p.channels + gz] = uint8_t(v32); +#else + int v_offset = gz * p.outcstep + gy * p.outw + gx + p.offset_x; + + top_blob_data[v_offset] = v; +#endif +} diff --git a/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan_postproc_tta.comp b/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan_postproc_tta.comp new file mode 100644 index 0000000..22a65ba --- /dev/null +++ b/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan_postproc_tta.comp @@ -0,0 +1,110 @@ + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#define sfp float16_t +#else +#define sfp float +#endif + +#if NCNN_int8_storage +#extension GL_EXT_shader_8bit_storage: require +#endif + +layout (constant_id = 0) const int bgr = 0; + +layout (binding = 0) readonly buffer bottom_blob0 { sfp bottom_blob0_data[]; }; +layout (binding = 1) readonly buffer bottom_blob1 { sfp bottom_blob1_data[]; }; +layout (binding = 2) readonly buffer bottom_blob2 { sfp bottom_blob2_data[]; }; +layout (binding = 3) readonly buffer bottom_blob3 { sfp bottom_blob3_data[]; }; +layout (binding = 4) readonly buffer bottom_blob4 { sfp bottom_blob4_data[]; }; +layout (binding = 5) readonly buffer bottom_blob5 { sfp bottom_blob5_data[]; }; +layout (binding = 6) readonly buffer bottom_blob6 { sfp bottom_blob6_data[]; }; +layout (binding = 7) readonly buffer bottom_blob7 { sfp bottom_blob7_data[]; }; +layout (binding = 8) readonly buffer alpha_blob { sfp alpha_blob_data[]; }; +#if NCNN_int8_storage +layout (binding = 9) writeonly buffer top_blob { uint8_t top_blob_data[]; }; +#else +layout (binding = 9) writeonly buffer top_blob { float top_blob_data[]; }; +#endif + 
+layout (push_constant) uniform parameter +{ + int w; + int h; + int cstep; + + int outw; + int outh; + int outcstep; + + int offset_x; + int gx_max; + + int crop_x; + int crop_y; + + int channels; + + int alphaw; + int alphah; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.gx_max || gy >= p.outh || gz >= p.channels) + return; + + float v; + + if (gz == 3) + { + v = float(alpha_blob_data[gy * p.alphaw + gx]); + } + else + { + int gzi = gz * p.cstep; + + int sy = gy + p.crop_y; + int sx = gx + p.crop_x; + + float v0 = float(bottom_blob0_data[gzi + sy * p.w + sx]); + float v1 = float(bottom_blob1_data[gzi + sy * p.w + (p.w - 1 - sx)]); + float v2 = float(bottom_blob2_data[gzi + (p.h - 1 - sy) * p.w + (p.w - 1 - sx)]); + float v3 = float(bottom_blob3_data[gzi + (p.h - 1 - sy) * p.w + sx]); + float v4 = float(bottom_blob4_data[gzi + sx * p.h + sy]); + float v5 = float(bottom_blob5_data[gzi + sx * p.h + (p.h - 1 - sy)]); + float v6 = float(bottom_blob6_data[gzi + (p.w - 1 - sx) * p.h + (p.h - 1 - sy)]); + float v7 = float(bottom_blob7_data[gzi + (p.w - 1 - sx) * p.h + sy]); + + v = (v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7) * 0.125f; + + const float denorm_val = 255.f; + + v = v * denorm_val; + } + + const float clip_eps = 0.5f; + + v = v + clip_eps; + +#if NCNN_int8_storage + int v_offset = gy * p.outw + gx + p.offset_x; + + uint v32 = clamp(uint(floor(v)), 0, 255); + + if (bgr == 1 && gz != 3) + top_blob_data[v_offset * p.channels + 2 - gz] = uint8_t(v32); + else + top_blob_data[v_offset * p.channels + gz] = uint8_t(v32); +#else + int v_offset = gz * p.outcstep + gy * p.outw + gx + p.offset_x; + + top_blob_data[v_offset] = v; +#endif +} diff --git a/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan_preproc.comp b/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan_preproc.comp new file mode 100644 index 0000000..b9e7f71 --- /dev/null +++ b/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan_preproc.comp @@ -0,0 +1,95 @@ + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#define sfp float16_t +#else +#define sfp float +#endif + +#if NCNN_int8_storage +#extension GL_EXT_shader_8bit_storage: require +#endif + +layout (constant_id = 0) const int bgr = 0; + +#if NCNN_int8_storage +layout (binding = 0) readonly buffer bottom_blob { uint8_t bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) writeonly buffer alpha_blob { sfp alpha_blob_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + int cstep; + + int outw; + int outh; + int outcstep; + + int pad_top; + int pad_left; + + int crop_x; + int crop_y; + + int channels; + + int alphaw; + int alphah; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.outw || gy >= p.outh || gz >= p.channels) + return; + + int x = gx + p.crop_x - p.pad_left; + int y = gy + p.crop_y - p.pad_top; + + x = abs(x); + y = abs(y); + x = (p.w - 1) - abs(x - (p.w - 1)); + y = (p.h - 1) - abs(y - (p.h - 1)); + +#if NCNN_int8_storage + int v_offset = y * p.w + x; + + float v; + + if (bgr == 1 && gz != 3) + v = float(uint(bottom_blob_data[v_offset * p.channels + 2 - gz])); + else + v = float(uint(bottom_blob_data[v_offset * p.channels + gz])); +#else + int 
v_offset = gz * p.cstep + y * p.w + x; + + float v = bottom_blob_data[v_offset]; +#endif + + if (gz == 3) + { + gx -= p.pad_left; + gy -= p.pad_top; + + if (gx >= 0 && gx < p.alphaw && gy >= 0 && gy < p.alphah) + { + alpha_blob_data[gy * p.alphaw + gx] = sfp(v); + } + } + else + { + const float norm_val = 1 / 255.f; + + top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(v * norm_val); + } +} diff --git a/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan_preproc_tta.comp b/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan_preproc_tta.comp new file mode 100644 index 0000000..b3af689 --- /dev/null +++ b/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan_preproc_tta.comp @@ -0,0 +1,113 @@ + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#define sfp float16_t +#else +#define sfp float +#endif + +#if NCNN_int8_storage +#extension GL_EXT_shader_8bit_storage: require +#endif + +layout (constant_id = 0) const int bgr = 0; + +#if NCNN_int8_storage +layout (binding = 0) readonly buffer bottom_blob { uint8_t bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob0 { sfp top_blob0_data[]; }; +layout (binding = 2) writeonly buffer top_blob1 { sfp top_blob1_data[]; }; +layout (binding = 3) writeonly buffer top_blob2 { sfp top_blob2_data[]; }; +layout (binding = 4) writeonly buffer top_blob3 { sfp top_blob3_data[]; }; +layout (binding = 5) writeonly buffer top_blob4 { sfp top_blob4_data[]; }; +layout (binding = 6) writeonly buffer top_blob5 { sfp top_blob5_data[]; }; +layout (binding = 7) writeonly buffer top_blob6 { sfp top_blob6_data[]; }; +layout (binding = 8) writeonly buffer top_blob7 { sfp top_blob7_data[]; }; +layout (binding = 9) writeonly buffer alpha_blob { sfp alpha_blob_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + int cstep; + + int outw; + int outh; + int outcstep; + + int pad_top; + int pad_left; + + int crop_x; + int crop_y; + + int channels; + + int alphaw; + int alphah; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.outw || gy >= p.outh || gz >= p.channels) + return; + + int x = gx + p.crop_x - p.pad_left; + int y = gy + p.crop_y - p.pad_top; + + x = abs(x); + y = abs(y); + x = (p.w - 1) - abs(x - (p.w - 1)); + y = (p.h - 1) - abs(y - (p.h - 1)); + +#if NCNN_int8_storage + int v_offset = y * p.w + x; + + float v; + + if (bgr == 1 && gz != 3) + v = float(uint(bottom_blob_data[v_offset * p.channels + 2 - gz])); + else + v = float(uint(bottom_blob_data[v_offset * p.channels + gz])); +#else + int v_offset = gz * p.cstep + y * p.w + x; + + float v = bottom_blob_data[v_offset]; +#endif + + if (gz == 3) + { + gx -= p.pad_left; + gy -= p.pad_top; + + if (gx >= 0 && gx < p.alphaw && gy >= 0 && gy < p.alphah) + { + alpha_blob_data[gy * p.alphaw + gx] = sfp(v); + } + } + else + { + const float norm_val = 1 / 255.f; + + v = v * norm_val; + + int gzi = gz * p.outcstep; + + top_blob0_data[gzi + gy * p.outw + gx] = sfp(v); + top_blob1_data[gzi + gy * p.outw + (p.outw - 1 - gx)] = sfp(v); + top_blob2_data[gzi + (p.outh - 1 - gy) * p.outw + (p.outw - 1 - gx)] = sfp(v); + top_blob3_data[gzi + (p.outh - 1 - gy) * p.outw + gx] = sfp(v); + top_blob4_data[gzi + gx * p.outh + gy] = sfp(v); + top_blob5_data[gzi + gx * p.outh + (p.outh - 1 - gy)] = sfp(v); + top_blob6_data[gzi + (p.outw - 1 - gx) * p.outh + (p.outh - 1 - gy)] = 
sfp(v); + top_blob7_data[gzi + (p.outw - 1 - gx) * p.outh + gy] = sfp(v); + } +} diff --git a/src/realesrgan_ncnn_py/realesrgan_ncnn_vulkan.py b/src/realesrgan_ncnn_py/realesrgan_ncnn_vulkan.py index d222f1d..d4383ee 100644 --- a/src/realesrgan_ncnn_py/realesrgan_ncnn_vulkan.py +++ b/src/realesrgan_ncnn_py/realesrgan_ncnn_vulkan.py @@ -41,14 +41,14 @@ def __init__(self, gpuid: int = 0, tta_mode: bool = False, tilesize: int = 0, mo """ RealESRGAN class for Super Resolution - :param gpuid: gpu device to use, cpu is not supported yet + :param gpuid: gpu device to use, -1 for cpu :param tta_mode: enable test time argumentation :param tilesize: tile size, 0 for auto, must >= 32 :param model: realesrgan model, 0 for default, -1 for custom load """ # check arguments' validity - assert gpuid >= 0, "gpuid must >= 0" + assert gpuid >= -1, "gpuid must >= -1" assert tilesize == 0 or tilesize >= 32, "tilesize must >= 32 or be 0" assert model >= -1, "model must > 0 or -1" diff --git a/tests/test_realesrgan.py b/tests/test_realesrgan.py index c44a422..eeffa12 100644 --- a/tests/test_realesrgan.py +++ b/tests/test_realesrgan.py @@ -1,8 +1,10 @@ +import os import sys from pathlib import Path import cv2 import numpy as np +import pytest from realesrgan_ncnn_py import Realesrgan from skimage.metrics import structural_similarity @@ -28,12 +30,25 @@ def calculate_image_similarity(image1: np.ndarray, image2: np.ndarray) -> bool: _gpuid = 0 +# gpuid = -1 when in GitHub Actions +if os.environ.get("GITHUB_ACTIONS") == "true": + _gpuid = -1 + TEST_IMG = cv2.imread(str(filePATH.parent / "test.png")) class Test_Realesrgan: - def test_cv2(self) -> None: - _realesrgan = Realesrgan(gpuid=_gpuid, model=0) - outimg = _realesrgan.process_cv2(TEST_IMG) + def test_animevideov3(self) -> None: + for _model in [0, 1, 2]: + _realesrgan = Realesrgan(gpuid=_gpuid, model=_model) + outimg = _realesrgan.process_cv2(TEST_IMG) + + assert calculate_image_similarity(TEST_IMG, outimg) + + @pytest.mark.skipif(_gpuid == -1, reason="skip when in GitHub Actions") + def test_x4plus(self) -> None: + for _model in [3, 4]: + _realesrgan = Realesrgan(gpuid=_gpuid, model=_model) + outimg = _realesrgan.process_cv2(TEST_IMG) - assert calculate_image_similarity(TEST_IMG, outimg) + assert calculate_image_similarity(TEST_IMG, outimg)
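
The test changes above pick the ncnn CPU path (`gpuid=-1`) whenever the `GITHUB_ACTIONS` environment variable is set, because the hosted runners expose no usable Vulkan device. Below is a minimal sketch of the same selection pattern for local use; only `Realesrgan`, `process_cv2`, and the `gpuid`/`model` parameters come from the patch above, while the input and output file names are hypothetical placeholders.

```python
import os

import cv2
from realesrgan_ncnn_py import Realesrgan

# Fall back to the CPU path (gpuid=-1) when no Vulkan device is available,
# mirroring the GITHUB_ACTIONS check added in tests/test_realesrgan.py.
gpuid = -1 if os.environ.get("GITHUB_ACTIONS") == "true" else 0

# model=0 selects the first bundled pre-trained model, as in the tests.
realesrgan = Realesrgan(gpuid=gpuid, model=0)

image = cv2.imread("input.jpg")           # hypothetical input path
upscaled = realesrgan.process_cv2(image)  # super-resolve the BGR ndarray
cv2.imwrite("output.jpg", upscaled)       # hypothetical output path
```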