diff --git a/.github/workflows/CI-Linux-x64-Clang.yml b/.github/workflows/CI-Linux-x64-Clang.yml index e33716e..a2a5265 100644 --- a/.github/workflows/CI-Linux-x64-Clang.yml +++ b/.github/workflows/CI-Linux-x64-Clang.yml @@ -13,6 +13,9 @@ on: - LICENSE workflow_dispatch: +env: + GITHUB_ACTIONS: true + jobs: Linux-x64-Clang: strategy: @@ -66,9 +69,14 @@ jobs: cmake -DOpenMP_CXX_FLAGS="-fexceptions -frtti" .. cmake --build . -j 4 - - name: dist + - name: Test run: | cp src/build/realesrgan_ncnn_vulkan_wrapper.*.so src/realesrgan_ncnn_py + pdm install + pdm run test + + - name: dist + run: | mkdir dist cp -r src/realesrgan_ncnn_py dist diff --git a/.github/workflows/CI-Linux-x64-GCC.yml b/.github/workflows/CI-Linux-x64-GCC.yml index cf38ea5..fac431a 100644 --- a/.github/workflows/CI-Linux-x64-GCC.yml +++ b/.github/workflows/CI-Linux-x64-GCC.yml @@ -13,6 +13,9 @@ on: - LICENSE workflow_dispatch: +env: + GITHUB_ACTIONS: true + jobs: Linux-x64-GCC: strategy: @@ -63,9 +66,14 @@ jobs: cmake -DOpenMP_CXX_FLAGS="-fexceptions -frtti" .. cmake --build . -j 4 - - name: dist + - name: Test run: | cp src/build/realesrgan_ncnn_vulkan_wrapper.*.so src/realesrgan_ncnn_py + pdm install + pdm run test + + - name: dist + run: | mkdir dist cp -r src/realesrgan_ncnn_py dist diff --git a/.github/workflows/CI-MacOS-Universal-Clang.yml b/.github/workflows/CI-MacOS-Universal-Clang.yml index 927b31b..e526b7c 100644 --- a/.github/workflows/CI-MacOS-Universal-Clang.yml +++ b/.github/workflows/CI-MacOS-Universal-Clang.yml @@ -14,6 +14,7 @@ on: workflow_dispatch: env: + GITHUB_ACTIONS: true DEVELOPER_DIR: /Applications/Xcode_14.2.app/Contents/Developer jobs: @@ -102,6 +103,11 @@ jobs: if: matrix.python-version == '3.11' run: lipo -create src/build-arm64/realesrgan_ncnn_vulkan_wrapper.cpython-311-darwin.so src/build-x86_64/realesrgan_ncnn_vulkan_wrapper.cpython-311-darwin.so -o src/realesrgan_ncnn_py/realesrgan_ncnn_vulkan_wrapper.cpython-311-darwin.so + - name: Test + run: | + pdm install + pdm run test + - name: dist run: | mkdir dist diff --git a/.github/workflows/CI-Windows-x64-MSVC.yml b/.github/workflows/CI-Windows-x64-MSVC.yml index 62fbac6..8ef7fa0 100644 --- a/.github/workflows/CI-Windows-x64-MSVC.yml +++ b/.github/workflows/CI-Windows-x64-MSVC.yml @@ -13,6 +13,9 @@ on: - LICENSE workflow_dispatch: +env: + GITHUB_ACTIONS: true + jobs: windows: strategy: @@ -63,11 +66,16 @@ jobs: cmake -A x64 -DCMAKE_CXX_FLAGS="-frtti -fexceptions" .. cmake --build . 
--config Release -j 4 - - name: dist + - name: Test run: | - mkdir dist echo F | xcopy .\src\build\Release\realesrgan_ncnn_vulkan_wrapper.*.pyd .\src\realesrgan_ncnn_py echo F | xcopy .\tests\vulkan-1.dll .\src\realesrgan_ncnn_py + pdm install + pdm run test + + - name: dist + run: | + mkdir dist echo D | xcopy .\src\realesrgan_ncnn_py dist - name: upload diff --git a/.github/workflows/Release.yml b/.github/workflows/Release.yml index cc0f1f6..3137b1c 100644 --- a/.github/workflows/Release.yml +++ b/.github/workflows/Release.yml @@ -4,6 +4,7 @@ on: workflow_dispatch: env: + GITHUB_ACTIONS: true DEVELOPER_DIR: /Applications/Xcode_14.2.app/Contents/Developer jobs: diff --git a/.github/workflows/test_pip.yml b/.github/workflows/test_pip.yml new file mode 100644 index 0000000..136a25c --- /dev/null +++ b/.github/workflows/test_pip.yml @@ -0,0 +1,41 @@ +name: test_pip + +on: + schedule: + - cron: "0 19 * 1 *" + workflow_dispatch: + +env: + GITHUB_ACTIONS: true + +jobs: + test: + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + os-version: ["macos-latest", "windows-latest", "ubuntu-20.04"] + + runs-on: ${{ matrix.os-version }} + steps: + - uses: actions/checkout@v3 + with: + submodules: recursive + + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + architecture: x64 + + - name: vulkan dll + if: matrix.os-version == 'windows-latest' + run: echo F | xcopy .\tests\vulkan-1.dll C:\Windows\System32 + + - name: Check Python version + run: | + python --version + + - name: Test + run: | + pip install --upgrade pip chardet + pip install pathlib opencv-python scikit-image Pillow pytest pytest-cov realesrgan-ncnn-py + python -m pytest tests diff --git a/.gitmodules b/.gitmodules index 2d85a92..0fedaa6 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ -[submodule "src/Real-ESRGAN-ncnn-vulkan"] - path = src/Real-ESRGAN-ncnn-vulkan - url = https://github.com/xinntao/Real-ESRGAN-ncnn-vulkan [submodule "src/pybind11"] path = src/pybind11 url = https://github.com/pybind/pybind11 +[submodule "src/Real-ESRGAN-ncnn-vulkan/src/ncnn"] + path = src/Real-ESRGAN-ncnn-vulkan/src/ncnn + url = https://github.com/Tencent/ncnn diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 600f80a..1bfa8c4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,7 +38,7 @@ repos: rev: v1.7.1 hooks: - id: mypy - args: [src, tests] + args: [src/realesrgan_ncnn_py, tests] pass_filenames: false additional_dependencies: - types-requests diff --git a/README.md b/README.md index 5bdbfb9..0956867 100644 --- a/README.md +++ b/README.md @@ -7,18 +7,19 @@ Python Binding for realesrgan-ncnn-py with PyBind11 ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/realesrgan-ncnn-py) Real-ESRGAN aims at developing Practical Algorithms for General Image/Video Restoration. -We extend the powerful ESRGAN to a practical restoration application (namely, Real-ESRGAN), which is trained with pure synthetic data. +We extend the powerful ESRGAN to a practical restoration application (namely, Real-ESRGAN), which is trained with pure +synthetic data. This wrapper provides an easy-to-use interface for running the pre-trained Real-ESRGAN model. 
### Current building status matrix -| System | Status | CPU (32bit) | CPU (64bit) | GPU (32bit) | GPU (64bit) | -| :-----------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------: | :---------: | :---------: | :----------------: | -| Linux (Clang) | [![CI-Linux-x64-Clang](https://github.com/Tohrusky/realesrgan-ncnn-py/actions/workflows/CI-Linux-x64-Clang.yml/badge.svg)](https://github.com/Tohrusky/realesrgan-ncnn-py/actions/workflows/CI-Linux-x64-Clang.yml) | — | — | — | :white_check_mark: | -| Linux (GCC) | [![CI-Linux-x64-GCC](https://github.com/Tohrusky/realesrgan-ncnn-py/actions/workflows/CI-Linux-x64-GCC.yml/badge.svg)](https://github.com/Tohrusky/realesrgan-ncnn-py/actions/workflows/CI-Linux-x64-GCC.yml) | — | — | — | :white_check_mark: | -| Windows | [![CI-Windows-x64-MSVC](https://github.com/Tohrusky/realesrgan-ncnn-py/actions/workflows/CI-Windows-x64-MSVC.yml/badge.svg)](https://github.com/Tohrusky/realesrgan-ncnn-py/actions/workflows/CI-Windows-x64-MSVC.yml) | — | — | — | :white_check_mark: | -| MacOS | [![CI-MacOS-Universal-Clang](https://github.com/Tohrusky/realcugan-ncnn-py/actions/workflows/CI-MacOS-Universal-Clang.yml/badge.svg)](https://github.com/Tohrusky/realcugan-ncnn-py/actions/workflows/CI-MacOS-Universal-Clang.yml) | — | — | — | :white_check_mark: | -| MacOS (ARM) | [![CI-MacOS-Universal-Clang](https://github.com/Tohrusky/realcugan-ncnn-py/actions/workflows/CI-MacOS-Universal-Clang.yml/badge.svg)](https://github.com/Tohrusky/realcugan-ncnn-py/actions/workflows/CI-MacOS-Universal-Clang.yml) | — | — | — | :white_check_mark: | +| System | Status | CPU (32bit) | CPU (64bit) | GPU (32bit) | GPU (64bit) | +| :-----------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------: | :----------------: | :---------: | :----------------: | +| Linux (Clang) | [![CI-Linux-x64-Clang](https://github.com/Tohrusky/realesrgan-ncnn-py/actions/workflows/CI-Linux-x64-Clang.yml/badge.svg)](https://github.com/Tohrusky/realesrgan-ncnn-py/actions/workflows/CI-Linux-x64-Clang.yml) | — | :white_check_mark: | — | :white_check_mark: | +| Linux (GCC) | [![CI-Linux-x64-GCC](https://github.com/Tohrusky/realesrgan-ncnn-py/actions/workflows/CI-Linux-x64-GCC.yml/badge.svg)](https://github.com/Tohrusky/realesrgan-ncnn-py/actions/workflows/CI-Linux-x64-GCC.yml) | — | :white_check_mark: | — | :white_check_mark: | +| Windows | [![CI-Windows-x64-MSVC](https://github.com/Tohrusky/realesrgan-ncnn-py/actions/workflows/CI-Windows-x64-MSVC.yml/badge.svg)](https://github.com/Tohrusky/realesrgan-ncnn-py/actions/workflows/CI-Windows-x64-MSVC.yml) | — | :white_check_mark: | — | :white_check_mark: | +| MacOS | [![CI-MacOS-Universal-Clang](https://github.com/Tohrusky/realcugan-ncnn-py/actions/workflows/CI-MacOS-Universal-Clang.yml/badge.svg)](https://github.com/Tohrusky/realcugan-ncnn-py/actions/workflows/CI-MacOS-Universal-Clang.yml) | — | :white_check_mark: | — | :white_check_mark: | +| MacOS (ARM) | 
[![CI-MacOS-Universal-Clang](https://github.com/Tohrusky/realcugan-ncnn-py/actions/workflows/CI-MacOS-Universal-Clang.yml/badge.svg)](https://github.com/Tohrusky/realcugan-ncnn-py/actions/workflows/CI-MacOS-Universal-Clang.yml) | — | :white_check_mark: | — | :white_check_mark: | # Usage @@ -56,7 +57,8 @@ realesrgan = Realesrgan(gpuid: int = 0, tta_mode: bool = False, tilesize: int = ``` -Here, gpuid specifies the GPU device to use, tta_mode enables test-time augmentation, tilesize specifies the tile size for processing (0 or >= 32), and model specifies the num of the pre-trained model to use. +Here, gpuid specifies the GPU device to use, tta_mode enables test-time augmentation, tilesize specifies the tile size +for processing (0 or >= 32), and model specifies the num of the pre-trained model to use. Once the model is initialized, you can use the upscale method to super-resolve your images: @@ -64,6 +66,7 @@ Once the model is initialized, you can use the upscale method to super-resolve y ```python from PIL import Image + realesrgan = Realesrgan(gpuid=0) with Image.open("input.jpg") as image: image = realesrgan.process_pil(image) @@ -74,6 +77,7 @@ with Image.open("input.jpg") as image: ```python import cv2 + realesrgan = Realesrgan(gpuid=0) image = cv2.imdecode(np.fromfile("input.jpg", dtype=np.uint8), cv2.IMREAD_COLOR) image = realesrgan.process_cv2(image) @@ -84,9 +88,10 @@ cv2.imencode(".jpg", image)[1].tofile("output_cv2.jpg") ```python import subprocess as sp + # your ffmpeg parameters -command_out = [FFMPEG_BIN,........] -command_in = [FFMPEG_BIN,........] +command_out = [FFMPEG_BIN, ........] +command_in = [FFMPEG_BIN, ........] pipe_out = sp.Popen(command_out, stdout=sp.PIPE, bufsize=10 ** 8) pipe_in = sp.Popen(command_in, stdin=sp.PIPE) realesrgan = Realesrgan(gpuid=0) @@ -102,20 +107,27 @@ while True: [here](https://github.com/Tohrusky/realesrgan-ncnn-py/blob/main/.github/workflows/Release.yml) -_The project just only been tested in Ubuntu 18+ and Debian 9+ environments on Linux, so if the project does not work on your system, please try building it._ +_The project just only been tested in Ubuntu 18+ and Debian 9+ environments on Linux, so if the project does not work on +your system, please try building it._ # References The following references were used in the development of this project: -[xinntao/Real-ESRGAN-ncnn-vulkan](https://github.com/xinntao/Real-ESRGAN-ncnn-vulkan) - This project was the main inspiration for our work. It provided the core implementation of the Real-ESRGAN algorithm using the ncnn and Vulkan libraries. +[xinntao/Real-ESRGAN-ncnn-vulkan](https://github.com/xinntao/Real-ESRGAN-ncnn-vulkan) - This project was the main +inspiration for our work. It provided the core implementation of the Real-ESRGAN algorithm using the ncnn and Vulkan +libraries. -[Real-ESRGAN](https://github.com/xinntao/Real-ESRGAN) - Real-ESRGAN is an AI super resolution model, aims at developing Practical Algorithms for General Image/Video Restoration. +[Real-ESRGAN](https://github.com/xinntao/Real-ESRGAN) - Real-ESRGAN is an AI super resolution model, aims at developing +Practical Algorithms for General Image/Video Restoration. -[media2x/realsr-ncnn-vulkan-python](https://github.com/media2x/realsr-ncnn-vulkan-python) - This project was used as a reference for implementing the wrapper. _Special thanks_ to the original author for sharing the code. 
+[media2x/realsr-ncnn-vulkan-python](https://github.com/media2x/realsr-ncnn-vulkan-python) - This project was used as a +reference for implementing the wrapper. _Special thanks_ to the original author for sharing the code. -[ncnn](https://github.com/Tencent/ncnn) - ncnn is a high-performance neural network inference framework developed by Tencent AI Lab. +[ncnn](https://github.com/Tencent/ncnn) - ncnn is a high-performance neural network inference framework developed by +Tencent AI Lab. # License -This project is licensed under the BSD 3-Clause - see the [LICENSE file](https://github.com/Tohrusky/realesrgan-ncnn-py/blob/main/LICENSE) for details. +This project is licensed under the BSD 3-Clause - see +the [LICENSE file](https://github.com/Tohrusky/realesrgan-ncnn-py/blob/main/LICENSE) for details. diff --git a/src/Real-ESRGAN-ncnn-vulkan b/src/Real-ESRGAN-ncnn-vulkan deleted file mode 160000 index 37026f4..0000000 --- a/src/Real-ESRGAN-ncnn-vulkan +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 37026f49824c5cf84062e7c6a5dd71445dcf610f diff --git a/src/Real-ESRGAN-ncnn-vulkan/LICENSE b/src/Real-ESRGAN-ncnn-vulkan/LICENSE new file mode 100644 index 0000000..e8ea6d7 --- /dev/null +++ b/src/Real-ESRGAN-ncnn-vulkan/LICENSE @@ -0,0 +1,46 @@ +The MIT License (MIT) + +Copyright (c) 2021 Xintao Wang + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +------------------------ +The following is the License of realsr-ncnn-vulkan + +The MIT License (MIT) + +Copyright (c) 2019 nihui + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/src/Real-ESRGAN-ncnn-vulkan/src/ncnn b/src/Real-ESRGAN-ncnn-vulkan/src/ncnn new file mode 160000 index 0000000..6125c9f --- /dev/null +++ b/src/Real-ESRGAN-ncnn-vulkan/src/ncnn @@ -0,0 +1 @@ +Subproject commit 6125c9f47cd14b589de0521350668cf9d3d37e3c diff --git a/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan.cpp b/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan.cpp new file mode 100644 index 0000000..ea882c7 --- /dev/null +++ b/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan.cpp @@ -0,0 +1,874 @@ +// realesrgan implemented with ncnn library + +#include "realesrgan.h" + +#include +#include +#include + +static const uint32_t realesrgan_preproc_spv_data[] = { +#include "realesrgan_preproc.spv.hex.h" +}; +static const uint32_t realesrgan_preproc_fp16s_spv_data[] = { +#include "realesrgan_preproc_fp16s.spv.hex.h" +}; +static const uint32_t realesrgan_preproc_int8s_spv_data[] = { +#include "realesrgan_preproc_int8s.spv.hex.h" +}; +static const uint32_t realesrgan_postproc_spv_data[] = { +#include "realesrgan_postproc.spv.hex.h" +}; +static const uint32_t realesrgan_postproc_fp16s_spv_data[] = { +#include "realesrgan_postproc_fp16s.spv.hex.h" +}; +static const uint32_t realesrgan_postproc_int8s_spv_data[] = { +#include "realesrgan_postproc_int8s.spv.hex.h" +}; + +static const uint32_t realesrgan_preproc_tta_spv_data[] = { +#include "realesrgan_preproc_tta.spv.hex.h" +}; +static const uint32_t realesrgan_preproc_tta_fp16s_spv_data[] = { +#include "realesrgan_preproc_tta_fp16s.spv.hex.h" +}; +static const uint32_t realesrgan_preproc_tta_int8s_spv_data[] = { +#include "realesrgan_preproc_tta_int8s.spv.hex.h" +}; +static const uint32_t realesrgan_postproc_tta_spv_data[] = { +#include "realesrgan_postproc_tta.spv.hex.h" +}; +static const uint32_t realesrgan_postproc_tta_fp16s_spv_data[] = { +#include "realesrgan_postproc_tta_fp16s.spv.hex.h" +}; +static const uint32_t realesrgan_postproc_tta_int8s_spv_data[] = { +#include "realesrgan_postproc_tta_int8s.spv.hex.h" +}; + +RealESRGAN::RealESRGAN(int gpuid, bool _tta_mode) { + vkdev = gpuid == -1 ? 0 : ncnn::get_gpu_device(gpuid); + + realesrgan_preproc = 0; + realesrgan_postproc = 0; + bicubic_2x = 0; + bicubic_3x = 0; + bicubic_4x = 0; + tta_mode = _tta_mode; +} + +RealESRGAN::~RealESRGAN() { + // cleanup preprocess and postprocess pipeline + { + delete realesrgan_preproc; + delete realesrgan_postproc; + } + + bicubic_2x->destroy_pipeline(net.opt); + delete bicubic_2x; + + bicubic_3x->destroy_pipeline(net.opt); + delete bicubic_3x; + + bicubic_4x->destroy_pipeline(net.opt); + delete bicubic_4x; +} + +#if _WIN32 +int RealESRGAN::load(const std::wstring& parampath, const std::wstring& modelpath) +#else + +int RealESRGAN::load(const std::string ¶mpath, const std::string &modelpath) +#endif +{ + net.opt.use_vulkan_compute = vkdev ? true : false; + net.opt.use_fp16_packed = true; + net.opt.use_fp16_storage = vkdev ? 
true : false; + net.opt.use_fp16_arithmetic = false; + net.opt.use_int8_storage = true; + net.opt.use_int8_arithmetic = false; + + net.set_vulkan_device(vkdev); + +#if _WIN32 + { + FILE* fp = _wfopen(parampath.c_str(), L"rb"); + if (!fp) + { + fwprintf(stderr, L"_wfopen %ls failed\n", parampath.c_str()); + } + + net.load_param(fp); + + fclose(fp); + } + { + FILE* fp = _wfopen(modelpath.c_str(), L"rb"); + if (!fp) + { + fwprintf(stderr, L"_wfopen %ls failed\n", modelpath.c_str()); + } + + net.load_model(fp); + + fclose(fp); + } +#else + net.load_param(parampath.c_str()); + net.load_model(modelpath.c_str()); +#endif + + if (vkdev) + // initialize preprocess and postprocess pipeline + { + std::vector specializations(1); +#if _WIN32 + specializations[0].i = 1; +#else + specializations[0].i = 0; +#endif + + realesrgan_preproc = new ncnn::Pipeline(vkdev); + realesrgan_preproc->set_optimal_local_size_xyz(32, 32, 3); + + realesrgan_postproc = new ncnn::Pipeline(vkdev); + realesrgan_postproc->set_optimal_local_size_xyz(32, 32, 3); + + if (tta_mode) { + if (net.opt.use_fp16_storage && net.opt.use_int8_storage) + realesrgan_preproc->create(realesrgan_preproc_tta_int8s_spv_data, + sizeof(realesrgan_preproc_tta_int8s_spv_data), specializations); + else if (net.opt.use_fp16_storage) + realesrgan_preproc->create(realesrgan_preproc_tta_fp16s_spv_data, + sizeof(realesrgan_preproc_tta_fp16s_spv_data), specializations); + else + realesrgan_preproc->create(realesrgan_preproc_tta_spv_data, sizeof(realesrgan_preproc_tta_spv_data), + specializations); + + if (net.opt.use_fp16_storage && net.opt.use_int8_storage) + realesrgan_postproc->create(realesrgan_postproc_tta_int8s_spv_data, + sizeof(realesrgan_postproc_tta_int8s_spv_data), specializations); + else if (net.opt.use_fp16_storage) + realesrgan_postproc->create(realesrgan_postproc_tta_fp16s_spv_data, + sizeof(realesrgan_postproc_tta_fp16s_spv_data), specializations); + else + realesrgan_postproc->create(realesrgan_postproc_tta_spv_data, sizeof(realesrgan_postproc_tta_spv_data), + specializations); + } else { + if (net.opt.use_fp16_storage && net.opt.use_int8_storage) + realesrgan_preproc->create(realesrgan_preproc_int8s_spv_data, sizeof(realesrgan_preproc_int8s_spv_data), + specializations); + else if (net.opt.use_fp16_storage) + realesrgan_preproc->create(realesrgan_preproc_fp16s_spv_data, sizeof(realesrgan_preproc_fp16s_spv_data), + specializations); + else + realesrgan_preproc->create(realesrgan_preproc_spv_data, sizeof(realesrgan_preproc_spv_data), + specializations); + + if (net.opt.use_fp16_storage && net.opt.use_int8_storage) + realesrgan_postproc->create(realesrgan_postproc_int8s_spv_data, + sizeof(realesrgan_postproc_int8s_spv_data), specializations); + else if (net.opt.use_fp16_storage) + realesrgan_postproc->create(realesrgan_postproc_fp16s_spv_data, + sizeof(realesrgan_postproc_fp16s_spv_data), specializations); + else + realesrgan_postproc->create(realesrgan_postproc_spv_data, sizeof(realesrgan_postproc_spv_data), + specializations); + } + } + + // bicubic 2x/3x/4x for alpha channel + { + bicubic_2x = ncnn::create_layer("Interp"); + bicubic_2x->vkdev = vkdev; + + ncnn::ParamDict pd; + pd.set(0, 3);// bicubic + pd.set(1, 2.f); + pd.set(2, 2.f); + bicubic_2x->load_param(pd); + + bicubic_2x->create_pipeline(net.opt); + } + { + bicubic_3x = ncnn::create_layer("Interp"); + bicubic_3x->vkdev = vkdev; + + ncnn::ParamDict pd; + pd.set(0, 3);// bicubic + pd.set(1, 3.f); + pd.set(2, 3.f); + bicubic_3x->load_param(pd); + + 
bicubic_3x->create_pipeline(net.opt); + } + { + bicubic_4x = ncnn::create_layer("Interp"); + bicubic_4x->vkdev = vkdev; + + ncnn::ParamDict pd; + pd.set(0, 3);// bicubic + pd.set(1, 4.f); + pd.set(2, 4.f); + bicubic_4x->load_param(pd); + + bicubic_4x->create_pipeline(net.opt); + } + + return 0; +} + +int RealESRGAN::process(const ncnn::Mat &inimage, ncnn::Mat &outimage) const { + if (!vkdev) { + // cpu only + return process_cpu(inimage, outimage); + } + + + const unsigned char *pixeldata = (const unsigned char *) inimage.data; + const int w = inimage.w; + const int h = inimage.h; + const int channels = inimage.elempack; + + const int TILE_SIZE_X = tilesize; + const int TILE_SIZE_Y = tilesize; + + ncnn::VkAllocator *blob_vkallocator = net.vulkan_device()->acquire_blob_allocator(); + ncnn::VkAllocator *staging_vkallocator = net.vulkan_device()->acquire_staging_allocator(); + + ncnn::Option opt = net.opt; + opt.blob_vkallocator = blob_vkallocator; + opt.workspace_vkallocator = blob_vkallocator; + opt.staging_vkallocator = staging_vkallocator; + + // each tile 100x100 + const int xtiles = (w + TILE_SIZE_X - 1) / TILE_SIZE_X; + const int ytiles = (h + TILE_SIZE_Y - 1) / TILE_SIZE_Y; + + const size_t in_out_tile_elemsize = opt.use_fp16_storage ? 2u : 4u; + + //#pragma omp parallel for num_threads(2) + for (int yi = 0; yi < ytiles; yi++) { + const int tile_h_nopad = std::min((yi + 1) * TILE_SIZE_Y, h) - yi * TILE_SIZE_Y; + + int in_tile_y0 = std::max(yi * TILE_SIZE_Y - prepadding, 0); + int in_tile_y1 = std::min((yi + 1) * TILE_SIZE_Y + prepadding, h); + + ncnn::Mat in; + if (opt.use_fp16_storage && opt.use_int8_storage) { + in = ncnn::Mat(w, (in_tile_y1 - in_tile_y0), (unsigned char *) pixeldata + in_tile_y0 * w * channels, + (size_t) channels, 1); + } else { + if (channels == 3) { + in = ncnn::Mat::from_pixels(pixeldata + in_tile_y0 * w * channels, ncnn::Mat::PIXEL_RGB, w, + (in_tile_y1 - in_tile_y0)); + } + if (channels == 4) { + in = ncnn::Mat::from_pixels(pixeldata + in_tile_y0 * w * channels, ncnn::Mat::PIXEL_RGBA, w, + (in_tile_y1 - in_tile_y0)); + } + } + + ncnn::VkCompute cmd(net.vulkan_device()); + + // upload + ncnn::VkMat in_gpu; + { + cmd.record_clone(in, in_gpu, opt); + + if (xtiles > 1) { + cmd.submit_and_wait(); + cmd.reset(); + } + } + + int out_tile_y0 = std::max(yi * TILE_SIZE_Y, 0); + int out_tile_y1 = std::min((yi + 1) * TILE_SIZE_Y, h); + + ncnn::VkMat out_gpu; + if (opt.use_fp16_storage && opt.use_int8_storage) { + out_gpu.create(w * scale, (out_tile_y1 - out_tile_y0) * scale, (size_t) channels, 1, blob_vkallocator); + } else { + out_gpu.create(w * scale, (out_tile_y1 - out_tile_y0) * scale, channels, (size_t) 4u, 1, blob_vkallocator); + } + + for (int xi = 0; xi < xtiles; xi++) { + const int tile_w_nopad = std::min((xi + 1) * TILE_SIZE_X, w) - xi * TILE_SIZE_X; + + if (tta_mode) { + // preproc + ncnn::VkMat in_tile_gpu[8]; + ncnn::VkMat in_alpha_tile_gpu; + { + // crop tile + int tile_x0 = xi * TILE_SIZE_X - prepadding; + int tile_x1 = std::min((xi + 1) * TILE_SIZE_X, w) + prepadding; + int tile_y0 = yi * TILE_SIZE_Y - prepadding; + int tile_y1 = std::min((yi + 1) * TILE_SIZE_Y, h) + prepadding; + + in_tile_gpu[0].create(tile_x1 - tile_x0, tile_y1 - tile_y0, 3, in_out_tile_elemsize, 1, + blob_vkallocator); + in_tile_gpu[1].create(tile_x1 - tile_x0, tile_y1 - tile_y0, 3, in_out_tile_elemsize, 1, + blob_vkallocator); + in_tile_gpu[2].create(tile_x1 - tile_x0, tile_y1 - tile_y0, 3, in_out_tile_elemsize, 1, + blob_vkallocator); + in_tile_gpu[3].create(tile_x1 - tile_x0, 
tile_y1 - tile_y0, 3, in_out_tile_elemsize, 1, + blob_vkallocator); + in_tile_gpu[4].create(tile_y1 - tile_y0, tile_x1 - tile_x0, 3, in_out_tile_elemsize, 1, + blob_vkallocator); + in_tile_gpu[5].create(tile_y1 - tile_y0, tile_x1 - tile_x0, 3, in_out_tile_elemsize, 1, + blob_vkallocator); + in_tile_gpu[6].create(tile_y1 - tile_y0, tile_x1 - tile_x0, 3, in_out_tile_elemsize, 1, + blob_vkallocator); + in_tile_gpu[7].create(tile_y1 - tile_y0, tile_x1 - tile_x0, 3, in_out_tile_elemsize, 1, + blob_vkallocator); + + if (channels == 4) { + in_alpha_tile_gpu.create(tile_w_nopad, tile_h_nopad, 1, in_out_tile_elemsize, 1, + blob_vkallocator); + } + + std::vector bindings(10); + bindings[0] = in_gpu; + bindings[1] = in_tile_gpu[0]; + bindings[2] = in_tile_gpu[1]; + bindings[3] = in_tile_gpu[2]; + bindings[4] = in_tile_gpu[3]; + bindings[5] = in_tile_gpu[4]; + bindings[6] = in_tile_gpu[5]; + bindings[7] = in_tile_gpu[6]; + bindings[8] = in_tile_gpu[7]; + bindings[9] = in_alpha_tile_gpu; + + std::vector constants(13); + constants[0].i = in_gpu.w; + constants[1].i = in_gpu.h; + constants[2].i = in_gpu.cstep; + constants[3].i = in_tile_gpu[0].w; + constants[4].i = in_tile_gpu[0].h; + constants[5].i = in_tile_gpu[0].cstep; + constants[6].i = prepadding; + constants[7].i = prepadding; + constants[8].i = xi * TILE_SIZE_X; + constants[9].i = std::min(yi * TILE_SIZE_Y, prepadding); + constants[10].i = channels; + constants[11].i = in_alpha_tile_gpu.w; + constants[12].i = in_alpha_tile_gpu.h; + + ncnn::VkMat dispatcher; + dispatcher.w = in_tile_gpu[0].w; + dispatcher.h = in_tile_gpu[0].h; + dispatcher.c = channels; + + cmd.record_pipeline(realesrgan_preproc, bindings, constants, dispatcher); + } + + // realesrgan + ncnn::VkMat out_tile_gpu[8]; + for (int ti = 0; ti < 8; ti++) { + ncnn::Extractor ex = net.create_extractor(); + + ex.set_blob_vkallocator(blob_vkallocator); + ex.set_workspace_vkallocator(blob_vkallocator); + ex.set_staging_vkallocator(staging_vkallocator); + + ex.input("data", in_tile_gpu[ti]); + + ex.extract("output", out_tile_gpu[ti], cmd); + + { + cmd.submit_and_wait(); + cmd.reset(); + } + } + + ncnn::VkMat out_alpha_tile_gpu; + if (channels == 4) { + if (scale == 1) { + out_alpha_tile_gpu = in_alpha_tile_gpu; + } + if (scale == 2) { + bicubic_2x->forward(in_alpha_tile_gpu, out_alpha_tile_gpu, cmd, opt); + } + if (scale == 3) { + bicubic_3x->forward(in_alpha_tile_gpu, out_alpha_tile_gpu, cmd, opt); + } + if (scale == 4) { + bicubic_4x->forward(in_alpha_tile_gpu, out_alpha_tile_gpu, cmd, opt); + } + } + + // postproc + { + std::vector bindings(10); + bindings[0] = out_tile_gpu[0]; + bindings[1] = out_tile_gpu[1]; + bindings[2] = out_tile_gpu[2]; + bindings[3] = out_tile_gpu[3]; + bindings[4] = out_tile_gpu[4]; + bindings[5] = out_tile_gpu[5]; + bindings[6] = out_tile_gpu[6]; + bindings[7] = out_tile_gpu[7]; + bindings[8] = out_alpha_tile_gpu; + bindings[9] = out_gpu; + + std::vector constants(13); + constants[0].i = out_tile_gpu[0].w; + constants[1].i = out_tile_gpu[0].h; + constants[2].i = out_tile_gpu[0].cstep; + constants[3].i = out_gpu.w; + constants[4].i = out_gpu.h; + constants[5].i = out_gpu.cstep; + constants[6].i = xi * TILE_SIZE_X * scale; + constants[7].i = std::min(TILE_SIZE_X * scale, out_gpu.w - xi * TILE_SIZE_X * scale); + constants[8].i = prepadding * scale; + constants[9].i = prepadding * scale; + constants[10].i = channels; + constants[11].i = out_alpha_tile_gpu.w; + constants[12].i = out_alpha_tile_gpu.h; + + ncnn::VkMat dispatcher; + dispatcher.w = std::min(TILE_SIZE_X * 
scale, out_gpu.w - xi * TILE_SIZE_X * scale); + dispatcher.h = out_gpu.h; + dispatcher.c = channels; + + cmd.record_pipeline(realesrgan_postproc, bindings, constants, dispatcher); + } + } else { + // preproc + ncnn::VkMat in_tile_gpu; + ncnn::VkMat in_alpha_tile_gpu; + { + // crop tile + int tile_x0 = xi * TILE_SIZE_X - prepadding; + int tile_x1 = std::min((xi + 1) * TILE_SIZE_X, w) + prepadding; + int tile_y0 = yi * TILE_SIZE_Y - prepadding; + int tile_y1 = std::min((yi + 1) * TILE_SIZE_Y, h) + prepadding; + + in_tile_gpu.create(tile_x1 - tile_x0, tile_y1 - tile_y0, 3, in_out_tile_elemsize, 1, + blob_vkallocator); + + if (channels == 4) { + in_alpha_tile_gpu.create(tile_w_nopad, tile_h_nopad, 1, in_out_tile_elemsize, 1, + blob_vkallocator); + } + + std::vector bindings(3); + bindings[0] = in_gpu; + bindings[1] = in_tile_gpu; + bindings[2] = in_alpha_tile_gpu; + + std::vector constants(13); + constants[0].i = in_gpu.w; + constants[1].i = in_gpu.h; + constants[2].i = in_gpu.cstep; + constants[3].i = in_tile_gpu.w; + constants[4].i = in_tile_gpu.h; + constants[5].i = in_tile_gpu.cstep; + constants[6].i = prepadding; + constants[7].i = prepadding; + constants[8].i = xi * TILE_SIZE_X; + constants[9].i = std::min(yi * TILE_SIZE_Y, prepadding); + constants[10].i = channels; + constants[11].i = in_alpha_tile_gpu.w; + constants[12].i = in_alpha_tile_gpu.h; + + ncnn::VkMat dispatcher; + dispatcher.w = in_tile_gpu.w; + dispatcher.h = in_tile_gpu.h; + dispatcher.c = channels; + + cmd.record_pipeline(realesrgan_preproc, bindings, constants, dispatcher); + } + + // realesrgan + ncnn::VkMat out_tile_gpu; + { + ncnn::Extractor ex = net.create_extractor(); + + ex.set_blob_vkallocator(blob_vkallocator); + ex.set_workspace_vkallocator(blob_vkallocator); + ex.set_staging_vkallocator(staging_vkallocator); + + ex.input("data", in_tile_gpu); + + ex.extract("output", out_tile_gpu, cmd); + } + + ncnn::VkMat out_alpha_tile_gpu; + if (channels == 4) { + if (scale == 1) { + out_alpha_tile_gpu = in_alpha_tile_gpu; + } + if (scale == 2) { + bicubic_2x->forward(in_alpha_tile_gpu, out_alpha_tile_gpu, cmd, opt); + } + if (scale == 3) { + bicubic_3x->forward(in_alpha_tile_gpu, out_alpha_tile_gpu, cmd, opt); + } + if (scale == 4) { + bicubic_4x->forward(in_alpha_tile_gpu, out_alpha_tile_gpu, cmd, opt); + } + } + + // postproc + { + std::vector bindings(3); + bindings[0] = out_tile_gpu; + bindings[1] = out_alpha_tile_gpu; + bindings[2] = out_gpu; + + std::vector constants(13); + constants[0].i = out_tile_gpu.w; + constants[1].i = out_tile_gpu.h; + constants[2].i = out_tile_gpu.cstep; + constants[3].i = out_gpu.w; + constants[4].i = out_gpu.h; + constants[5].i = out_gpu.cstep; + constants[6].i = xi * TILE_SIZE_X * scale; + constants[7].i = std::min(TILE_SIZE_X * scale, out_gpu.w - xi * TILE_SIZE_X * scale); + constants[8].i = prepadding * scale; + constants[9].i = prepadding * scale; + constants[10].i = channels; + constants[11].i = out_alpha_tile_gpu.w; + constants[12].i = out_alpha_tile_gpu.h; + + ncnn::VkMat dispatcher; + dispatcher.w = std::min(TILE_SIZE_X * scale, out_gpu.w - xi * TILE_SIZE_X * scale); + dispatcher.h = out_gpu.h; + dispatcher.c = channels; + + cmd.record_pipeline(realesrgan_postproc, bindings, constants, dispatcher); + } + } + + if (xtiles > 1) { + cmd.submit_and_wait(); + cmd.reset(); + } + + fprintf(stderr, "%.2f%%\n", (float) (yi * xtiles + xi) / (ytiles * xtiles) * 100); + } + + // download + { + ncnn::Mat out; + + if (opt.use_fp16_storage && opt.use_int8_storage) { + out = ncnn::Mat(out_gpu.w, 
out_gpu.h, + (unsigned char *) outimage.data + yi * scale * TILE_SIZE_Y * w * scale * channels, + (size_t) channels, 1); + } + + cmd.record_clone(out_gpu, out, opt); + + cmd.submit_and_wait(); + + if (!(opt.use_fp16_storage && opt.use_int8_storage)) { + if (channels == 3) { + out.to_pixels((unsigned char *) outimage.data + yi * scale * TILE_SIZE_Y * w * scale * channels, + ncnn::Mat::PIXEL_RGB); + } + if (channels == 4) { + out.to_pixels((unsigned char *) outimage.data + yi * scale * TILE_SIZE_Y * w * scale * channels, + ncnn::Mat::PIXEL_RGBA); + } + } + } + } + + net.vulkan_device()->reclaim_blob_allocator(blob_vkallocator); + net.vulkan_device()->reclaim_staging_allocator(staging_vkallocator); + + return 0; +} + +int RealESRGAN::process_cpu(const ncnn::Mat &inimage, ncnn::Mat &outimage) const { + const unsigned char *pixeldata = (const unsigned char *) inimage.data; + const int w = inimage.w; + const int h = inimage.h; + const int channels = inimage.elempack; + + const int TILE_SIZE_X = tilesize; + const int TILE_SIZE_Y = tilesize; + + ncnn::Option opt = net.opt; + + // each tile 100x100 + const int xtiles = (w + TILE_SIZE_X - 1) / TILE_SIZE_X; + const int ytiles = (h + TILE_SIZE_Y - 1) / TILE_SIZE_Y; + + for (int yi = 0; yi < ytiles; yi++) { + const int tile_h_nopad = std::min((yi + 1) * TILE_SIZE_Y, h) - yi * TILE_SIZE_Y; + + int in_tile_y0 = std::max(yi * TILE_SIZE_Y - prepadding, 0); + int in_tile_y1 = std::min((yi + 1) * TILE_SIZE_Y + prepadding, h); + + for (int xi = 0; xi < xtiles; xi++) { + const int tile_w_nopad = std::min((xi + 1) * TILE_SIZE_X, w) - xi * TILE_SIZE_X; + + int in_tile_x0 = std::max(xi * TILE_SIZE_X - prepadding, 0); + int in_tile_x1 = std::min((xi + 1) * TILE_SIZE_X + prepadding, w); + + // crop tile + ncnn::Mat in; + { + if (channels == 3) { + in = ncnn::Mat::from_pixels_roi(pixeldata, ncnn::Mat::PIXEL_RGB, w, h, in_tile_x0, in_tile_y0, + in_tile_x1 - in_tile_x0, in_tile_y1 - in_tile_y0); + } + if (channels == 4) { + in = ncnn::Mat::from_pixels_roi(pixeldata, ncnn::Mat::PIXEL_RGBA, w, h, in_tile_x0, in_tile_y0, + in_tile_x1 - in_tile_x0, in_tile_y1 - in_tile_y0); + } + } + + ncnn::Mat out; + + if (tta_mode) { + // split alpha and preproc + ncnn::Mat in_tile[8]; + ncnn::Mat in_alpha_tile; + { + in_tile[0].create(in.w, in.h, 3); + for (int q = 0; q < 3; q++) { + const float *ptr = in.channel(q); + float *outptr0 = in_tile[0].channel(q); + + for (int i = 0; i < in.h; i++) { + for (int j = 0; j < in.w; j++) { + *outptr0++ = *ptr++ * (1 / 255.f); + } + } + } + + if (channels == 4) { + in_alpha_tile = in.channel_range(3, 1).clone(); + } + } + + // border padding + { + int pad_top = std::max(prepadding - yi * TILE_SIZE_Y, 0); + int pad_bottom = std::max(std::min((yi + 1) * TILE_SIZE_Y + prepadding - h, prepadding), 0); + int pad_left = std::max(prepadding - xi * TILE_SIZE_X, 0); + int pad_right = std::max(std::min((xi + 1) * TILE_SIZE_X + prepadding - w, prepadding), 0); + + ncnn::Mat in_tile_padded; + ncnn::copy_make_border(in_tile[0], in_tile_padded, pad_top, pad_bottom, pad_left, pad_right, 2, 0.f, + net.opt); + in_tile[0] = in_tile_padded; + } + + // the other 7 directions + { + in_tile[1].create(in_tile[0].w, in_tile[0].h, 3); + in_tile[2].create(in_tile[0].w, in_tile[0].h, 3); + in_tile[3].create(in_tile[0].w, in_tile[0].h, 3); + in_tile[4].create(in_tile[0].h, in_tile[0].w, 3); + in_tile[5].create(in_tile[0].h, in_tile[0].w, 3); + in_tile[6].create(in_tile[0].h, in_tile[0].w, 3); + in_tile[7].create(in_tile[0].h, in_tile[0].w, 3); + + for (int q = 0; q < 3; 
q++) { + const ncnn::Mat in_tile_0 = in_tile[0].channel(q); + ncnn::Mat in_tile_1 = in_tile[1].channel(q); + ncnn::Mat in_tile_2 = in_tile[2].channel(q); + ncnn::Mat in_tile_3 = in_tile[3].channel(q); + ncnn::Mat in_tile_4 = in_tile[4].channel(q); + ncnn::Mat in_tile_5 = in_tile[5].channel(q); + ncnn::Mat in_tile_6 = in_tile[6].channel(q); + ncnn::Mat in_tile_7 = in_tile[7].channel(q); + + for (int i = 0; i < in_tile[0].h; i++) { + const float *outptr0 = in_tile_0.row(i); + float *outptr1 = in_tile_1.row(in_tile[0].h - 1 - i); + float *outptr2 = in_tile_2.row(i) + in_tile[0].w - 1; + float *outptr3 = in_tile_3.row(in_tile[0].h - 1 - i) + in_tile[0].w - 1; + + for (int j = 0; j < in_tile[0].w; j++) { + float *outptr4 = in_tile_4.row(j) + i; + float *outptr5 = in_tile_5.row(in_tile[0].w - 1 - j) + i; + float *outptr6 = in_tile_6.row(j) + in_tile[0].h - 1 - i; + float *outptr7 = in_tile_7.row(in_tile[0].w - 1 - j) + in_tile[0].h - 1 - i; + + float v = *outptr0++; + + *outptr1++ = v; + *outptr2-- = v; + *outptr3-- = v; + *outptr4 = v; + *outptr5 = v; + *outptr6 = v; + *outptr7 = v; + } + } + } + } + + // realsr + ncnn::Mat out_tile[8]; + for (int ti = 0; ti < 8; ti++) { + ncnn::Extractor ex = net.create_extractor(); + + ex.input("data", in_tile[ti]); + + ex.extract("output", out_tile[ti]); + } + + ncnn::Mat out_alpha_tile; + if (channels == 4) { + if (scale == 1) { + out_alpha_tile = in_alpha_tile; + } + if (scale == 2) { + bicubic_2x->forward(in_alpha_tile, out_alpha_tile, opt); + } + if (scale == 3) { + bicubic_3x->forward(in_alpha_tile, out_alpha_tile, opt); + } + if (scale == 4) { + bicubic_4x->forward(in_alpha_tile, out_alpha_tile, opt); + } + } + + // postproc and merge alpha + { + out.create(tile_w_nopad * scale, tile_h_nopad * scale, channels); + for (int q = 0; q < 3; q++) { + const ncnn::Mat out_tile_0 = out_tile[0].channel(q); + const ncnn::Mat out_tile_1 = out_tile[1].channel(q); + const ncnn::Mat out_tile_2 = out_tile[2].channel(q); + const ncnn::Mat out_tile_3 = out_tile[3].channel(q); + const ncnn::Mat out_tile_4 = out_tile[4].channel(q); + const ncnn::Mat out_tile_5 = out_tile[5].channel(q); + const ncnn::Mat out_tile_6 = out_tile[6].channel(q); + const ncnn::Mat out_tile_7 = out_tile[7].channel(q); + float *outptr = out.channel(q); + + for (int i = 0; i < out.h; i++) { + const float *ptr0 = out_tile_0.row(i + prepadding * scale) + prepadding * scale; + const float *ptr1 = + out_tile_1.row(out_tile[0].h - 1 - i - prepadding * scale) + prepadding * scale; + const float *ptr2 = + out_tile_2.row(i + prepadding * scale) + out_tile[0].w - 1 - prepadding * scale; + const float *ptr3 = + out_tile_3.row(out_tile[0].h - 1 - i - prepadding * scale) + out_tile[0].w - 1 - + prepadding * scale; + + for (int j = 0; j < out.w; j++) { + const float *ptr4 = out_tile_4.row(j + prepadding * scale) + i + prepadding * scale; + const float *ptr5 = out_tile_5.row(out_tile[0].w - 1 - j - prepadding * scale) + i + + prepadding * scale; + const float *ptr6 = out_tile_6.row(j + prepadding * scale) + out_tile[0].h - 1 - i - + prepadding * scale; + const float *ptr7 = + out_tile_7.row(out_tile[0].w - 1 - j - prepadding * scale) + out_tile[0].h - 1 - + i - prepadding * scale; + + float v = (*ptr0++ + *ptr1++ + *ptr2-- + *ptr3-- + *ptr4 + *ptr5 + *ptr6 + *ptr7) / 8; + + *outptr++ = v * 255.f + 0.5f; + } + } + } + + if (channels == 4) { + memcpy(out.channel_range(3, 1), out_alpha_tile, out_alpha_tile.total() * sizeof(float)); + } + } + } else { + // split alpha and preproc + ncnn::Mat in_tile; + ncnn::Mat 
in_alpha_tile; + { + in_tile.create(in.w, in.h, 3); + for (int q = 0; q < 3; q++) { + const float *ptr = in.channel(q); + float *outptr = in_tile.channel(q); + + for (int i = 0; i < in.w * in.h; i++) { + *outptr++ = *ptr++ * (1 / 255.f); + } + } + + if (channels == 4) { + in_alpha_tile = in.channel_range(3, 1).clone(); + } + } + + // border padding + { + int pad_top = std::max(prepadding - yi * TILE_SIZE_Y, 0); + int pad_bottom = std::max(std::min((yi + 1) * TILE_SIZE_Y + prepadding - h, prepadding), 0); + int pad_left = std::max(prepadding - xi * TILE_SIZE_X, 0); + int pad_right = std::max(std::min((xi + 1) * TILE_SIZE_X + prepadding - w, prepadding), 0); + + ncnn::Mat in_tile_padded; + ncnn::copy_make_border(in_tile, in_tile_padded, pad_top, pad_bottom, pad_left, pad_right, 2, 0.f, + net.opt); + in_tile = in_tile_padded; + } + + // realsr + ncnn::Mat out_tile; + { + ncnn::Extractor ex = net.create_extractor(); + + ex.input("data", in_tile); + + ex.extract("output", out_tile); + } + + ncnn::Mat out_alpha_tile; + if (channels == 4) { + if (scale == 1) { + out_alpha_tile = in_alpha_tile; + } + if (scale == 2) { + bicubic_2x->forward(in_alpha_tile, out_alpha_tile, opt); + } + if (scale == 3) { + bicubic_3x->forward(in_alpha_tile, out_alpha_tile, opt); + } + if (scale == 4) { + bicubic_4x->forward(in_alpha_tile, out_alpha_tile, opt); + } + } + + // postproc and merge alpha + { + out.create(tile_w_nopad * scale, tile_h_nopad * scale, channels); + for (int q = 0; q < 3; q++) { + float *outptr = out.channel(q); + + for (int i = 0; i < out.h; i++) { + const float *ptr = out_tile.channel(q).row(i + prepadding * scale) + prepadding * scale; + + for (int j = 0; j < out.w; j++) { + *outptr++ = *ptr++ * 255.f + 0.5f; + } + } + } + + if (channels == 4) { + memcpy(out.channel_range(3, 1), out_alpha_tile, out_alpha_tile.total() * sizeof(float)); + } + } + } + + { + if (channels == 3) { + out.to_pixels((unsigned char *) outimage.data + yi * scale * TILE_SIZE_Y * w * scale * channels + + xi * scale * TILE_SIZE_X * channels, ncnn::Mat::PIXEL_RGB, w * scale * channels); + } + if (channels == 4) { + out.to_pixels((unsigned char *) outimage.data + yi * scale * TILE_SIZE_Y * w * scale * channels + + xi * scale * TILE_SIZE_X * channels, ncnn::Mat::PIXEL_RGBA, w * scale * channels); + } + } + + fprintf(stderr, "%.2f%%\n", (float) (yi * xtiles + xi) / (ytiles * xtiles) * 100); + } + } + + return 0; +} diff --git a/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan.h b/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan.h new file mode 100644 index 0000000..dc39048 --- /dev/null +++ b/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan.h @@ -0,0 +1,48 @@ +// realesrgan implemented with ncnn library + +#ifndef REALESRGAN_H +#define REALESRGAN_H + +#include + +// ncnn +#include "net.h" +#include "gpu.h" +#include "layer.h" + +class RealESRGAN { +public: + RealESRGAN(int gpuid, bool tta_mode = false); + + ~RealESRGAN(); + +#if _WIN32 + int load(const std::wstring& parampath, const std::wstring& modelpath); +#else + + int load(const std::string ¶mpath, const std::string &modelpath); + +#endif + + int process(const ncnn::Mat &inimage, ncnn::Mat &outimage) const; + + int process_cpu(const ncnn::Mat &inimage, ncnn::Mat &outimage) const; + +public: + // realesrgan parameters + int scale; + int tilesize; + int prepadding; + +private: + ncnn::VulkanDevice *vkdev; + ncnn::Net net; + ncnn::Pipeline *realesrgan_preproc; + ncnn::Pipeline *realesrgan_postproc; + ncnn::Layer *bicubic_2x; + ncnn::Layer *bicubic_3x; + ncnn::Layer *bicubic_4x; + bool 
tta_mode; +}; + +#endif // REALESRGAN_H diff --git a/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan_postproc.comp b/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan_postproc.comp new file mode 100644 index 0000000..39eda23 --- /dev/null +++ b/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan_postproc.comp @@ -0,0 +1,89 @@ + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#define sfp float16_t +#else +#define sfp float +#endif + +#if NCNN_int8_storage +#extension GL_EXT_shader_8bit_storage: require +#endif + +layout (constant_id = 0) const int bgr = 0; + +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) readonly buffer alpha_blob { sfp alpha_blob_data[]; }; +#if NCNN_int8_storage +layout (binding = 2) writeonly buffer top_blob { uint8_t top_blob_data[]; }; +#else +layout (binding = 2) writeonly buffer top_blob { float top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int w; + int h; + int cstep; + + int outw; + int outh; + int outcstep; + + int offset_x; + int gx_max; + + int crop_x; + int crop_y; + + int channels; + + int alphaw; + int alphah; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.gx_max || gy >= p.outh || gz >= p.channels) + return; + + float v; + + if (gz == 3) + { + v = float(alpha_blob_data[gy * p.alphaw + gx]); + } + else + { + v = float(bottom_blob_data[gz * p.cstep + (gy + p.crop_y) * p.w + gx + p.crop_x]); + + const float denorm_val = 255.f; + + v = v * denorm_val; + } + + const float clip_eps = 0.5f; + + v = v + clip_eps; + +#if NCNN_int8_storage + int v_offset = gy * p.outw + gx + p.offset_x; + + uint v32 = clamp(uint(floor(v)), 0, 255); + + if (bgr == 1 && gz != 3) + top_blob_data[v_offset * p.channels + 2 - gz] = uint8_t(v32); + else + top_blob_data[v_offset * p.channels + gz] = uint8_t(v32); +#else + int v_offset = gz * p.outcstep + gy * p.outw + gx + p.offset_x; + + top_blob_data[v_offset] = v; +#endif +} diff --git a/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan_postproc_tta.comp b/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan_postproc_tta.comp new file mode 100644 index 0000000..22a65ba --- /dev/null +++ b/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan_postproc_tta.comp @@ -0,0 +1,110 @@ + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#define sfp float16_t +#else +#define sfp float +#endif + +#if NCNN_int8_storage +#extension GL_EXT_shader_8bit_storage: require +#endif + +layout (constant_id = 0) const int bgr = 0; + +layout (binding = 0) readonly buffer bottom_blob0 { sfp bottom_blob0_data[]; }; +layout (binding = 1) readonly buffer bottom_blob1 { sfp bottom_blob1_data[]; }; +layout (binding = 2) readonly buffer bottom_blob2 { sfp bottom_blob2_data[]; }; +layout (binding = 3) readonly buffer bottom_blob3 { sfp bottom_blob3_data[]; }; +layout (binding = 4) readonly buffer bottom_blob4 { sfp bottom_blob4_data[]; }; +layout (binding = 5) readonly buffer bottom_blob5 { sfp bottom_blob5_data[]; }; +layout (binding = 6) readonly buffer bottom_blob6 { sfp bottom_blob6_data[]; }; +layout (binding = 7) readonly buffer bottom_blob7 { sfp bottom_blob7_data[]; }; +layout (binding = 8) readonly buffer alpha_blob { sfp alpha_blob_data[]; }; +#if NCNN_int8_storage +layout (binding = 9) writeonly buffer top_blob { uint8_t top_blob_data[]; }; +#else +layout (binding = 9) writeonly buffer top_blob { float top_blob_data[]; }; +#endif + 
+layout (push_constant) uniform parameter +{ + int w; + int h; + int cstep; + + int outw; + int outh; + int outcstep; + + int offset_x; + int gx_max; + + int crop_x; + int crop_y; + + int channels; + + int alphaw; + int alphah; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.gx_max || gy >= p.outh || gz >= p.channels) + return; + + float v; + + if (gz == 3) + { + v = float(alpha_blob_data[gy * p.alphaw + gx]); + } + else + { + int gzi = gz * p.cstep; + + int sy = gy + p.crop_y; + int sx = gx + p.crop_x; + + float v0 = float(bottom_blob0_data[gzi + sy * p.w + sx]); + float v1 = float(bottom_blob1_data[gzi + sy * p.w + (p.w - 1 - sx)]); + float v2 = float(bottom_blob2_data[gzi + (p.h - 1 - sy) * p.w + (p.w - 1 - sx)]); + float v3 = float(bottom_blob3_data[gzi + (p.h - 1 - sy) * p.w + sx]); + float v4 = float(bottom_blob4_data[gzi + sx * p.h + sy]); + float v5 = float(bottom_blob5_data[gzi + sx * p.h + (p.h - 1 - sy)]); + float v6 = float(bottom_blob6_data[gzi + (p.w - 1 - sx) * p.h + (p.h - 1 - sy)]); + float v7 = float(bottom_blob7_data[gzi + (p.w - 1 - sx) * p.h + sy]); + + v = (v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7) * 0.125f; + + const float denorm_val = 255.f; + + v = v * denorm_val; + } + + const float clip_eps = 0.5f; + + v = v + clip_eps; + +#if NCNN_int8_storage + int v_offset = gy * p.outw + gx + p.offset_x; + + uint v32 = clamp(uint(floor(v)), 0, 255); + + if (bgr == 1 && gz != 3) + top_blob_data[v_offset * p.channels + 2 - gz] = uint8_t(v32); + else + top_blob_data[v_offset * p.channels + gz] = uint8_t(v32); +#else + int v_offset = gz * p.outcstep + gy * p.outw + gx + p.offset_x; + + top_blob_data[v_offset] = v; +#endif +} diff --git a/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan_preproc.comp b/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan_preproc.comp new file mode 100644 index 0000000..b9e7f71 --- /dev/null +++ b/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan_preproc.comp @@ -0,0 +1,95 @@ + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#define sfp float16_t +#else +#define sfp float +#endif + +#if NCNN_int8_storage +#extension GL_EXT_shader_8bit_storage: require +#endif + +layout (constant_id = 0) const int bgr = 0; + +#if NCNN_int8_storage +layout (binding = 0) readonly buffer bottom_blob { uint8_t bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) writeonly buffer alpha_blob { sfp alpha_blob_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + int cstep; + + int outw; + int outh; + int outcstep; + + int pad_top; + int pad_left; + + int crop_x; + int crop_y; + + int channels; + + int alphaw; + int alphah; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.outw || gy >= p.outh || gz >= p.channels) + return; + + int x = gx + p.crop_x - p.pad_left; + int y = gy + p.crop_y - p.pad_top; + + x = abs(x); + y = abs(y); + x = (p.w - 1) - abs(x - (p.w - 1)); + y = (p.h - 1) - abs(y - (p.h - 1)); + +#if NCNN_int8_storage + int v_offset = y * p.w + x; + + float v; + + if (bgr == 1 && gz != 3) + v = float(uint(bottom_blob_data[v_offset * p.channels + 2 - gz])); + else + v = float(uint(bottom_blob_data[v_offset * p.channels + gz])); +#else + int 
v_offset = gz * p.cstep + y * p.w + x; + + float v = bottom_blob_data[v_offset]; +#endif + + if (gz == 3) + { + gx -= p.pad_left; + gy -= p.pad_top; + + if (gx >= 0 && gx < p.alphaw && gy >= 0 && gy < p.alphah) + { + alpha_blob_data[gy * p.alphaw + gx] = sfp(v); + } + } + else + { + const float norm_val = 1 / 255.f; + + top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(v * norm_val); + } +} diff --git a/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan_preproc_tta.comp b/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan_preproc_tta.comp new file mode 100644 index 0000000..b3af689 --- /dev/null +++ b/src/Real-ESRGAN-ncnn-vulkan/src/realesrgan_preproc_tta.comp @@ -0,0 +1,113 @@ + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#define sfp float16_t +#else +#define sfp float +#endif + +#if NCNN_int8_storage +#extension GL_EXT_shader_8bit_storage: require +#endif + +layout (constant_id = 0) const int bgr = 0; + +#if NCNN_int8_storage +layout (binding = 0) readonly buffer bottom_blob { uint8_t bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob0 { sfp top_blob0_data[]; }; +layout (binding = 2) writeonly buffer top_blob1 { sfp top_blob1_data[]; }; +layout (binding = 3) writeonly buffer top_blob2 { sfp top_blob2_data[]; }; +layout (binding = 4) writeonly buffer top_blob3 { sfp top_blob3_data[]; }; +layout (binding = 5) writeonly buffer top_blob4 { sfp top_blob4_data[]; }; +layout (binding = 6) writeonly buffer top_blob5 { sfp top_blob5_data[]; }; +layout (binding = 7) writeonly buffer top_blob6 { sfp top_blob6_data[]; }; +layout (binding = 8) writeonly buffer top_blob7 { sfp top_blob7_data[]; }; +layout (binding = 9) writeonly buffer alpha_blob { sfp alpha_blob_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + int cstep; + + int outw; + int outh; + int outcstep; + + int pad_top; + int pad_left; + + int crop_x; + int crop_y; + + int channels; + + int alphaw; + int alphah; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.outw || gy >= p.outh || gz >= p.channels) + return; + + int x = gx + p.crop_x - p.pad_left; + int y = gy + p.crop_y - p.pad_top; + + x = abs(x); + y = abs(y); + x = (p.w - 1) - abs(x - (p.w - 1)); + y = (p.h - 1) - abs(y - (p.h - 1)); + +#if NCNN_int8_storage + int v_offset = y * p.w + x; + + float v; + + if (bgr == 1 && gz != 3) + v = float(uint(bottom_blob_data[v_offset * p.channels + 2 - gz])); + else + v = float(uint(bottom_blob_data[v_offset * p.channels + gz])); +#else + int v_offset = gz * p.cstep + y * p.w + x; + + float v = bottom_blob_data[v_offset]; +#endif + + if (gz == 3) + { + gx -= p.pad_left; + gy -= p.pad_top; + + if (gx >= 0 && gx < p.alphaw && gy >= 0 && gy < p.alphah) + { + alpha_blob_data[gy * p.alphaw + gx] = sfp(v); + } + } + else + { + const float norm_val = 1 / 255.f; + + v = v * norm_val; + + int gzi = gz * p.outcstep; + + top_blob0_data[gzi + gy * p.outw + gx] = sfp(v); + top_blob1_data[gzi + gy * p.outw + (p.outw - 1 - gx)] = sfp(v); + top_blob2_data[gzi + (p.outh - 1 - gy) * p.outw + (p.outw - 1 - gx)] = sfp(v); + top_blob3_data[gzi + (p.outh - 1 - gy) * p.outw + gx] = sfp(v); + top_blob4_data[gzi + gx * p.outh + gy] = sfp(v); + top_blob5_data[gzi + gx * p.outh + (p.outh - 1 - gy)] = sfp(v); + top_blob6_data[gzi + (p.outw - 1 - gx) * p.outh + (p.outh - 1 - gy)] = 
sfp(v); + top_blob7_data[gzi + (p.outw - 1 - gx) * p.outh + gy] = sfp(v); + } +} diff --git a/src/realesrgan_ncnn_py/realesrgan_ncnn_vulkan.py b/src/realesrgan_ncnn_py/realesrgan_ncnn_vulkan.py index d222f1d..d4383ee 100644 --- a/src/realesrgan_ncnn_py/realesrgan_ncnn_vulkan.py +++ b/src/realesrgan_ncnn_py/realesrgan_ncnn_vulkan.py @@ -41,14 +41,14 @@ def __init__(self, gpuid: int = 0, tta_mode: bool = False, tilesize: int = 0, mo """ RealESRGAN class for Super Resolution - :param gpuid: gpu device to use, cpu is not supported yet + :param gpuid: gpu device to use, -1 for cpu :param tta_mode: enable test time argumentation :param tilesize: tile size, 0 for auto, must >= 32 :param model: realesrgan model, 0 for default, -1 for custom load """ # check arguments' validity - assert gpuid >= 0, "gpuid must >= 0" + assert gpuid >= -1, "gpuid must >= -1" assert tilesize == 0 or tilesize >= 32, "tilesize must >= 32 or be 0" assert model >= -1, "model must > 0 or -1" diff --git a/tests/test_realesrgan.py b/tests/test_realesrgan.py index c44a422..eeffa12 100644 --- a/tests/test_realesrgan.py +++ b/tests/test_realesrgan.py @@ -1,8 +1,10 @@ +import os import sys from pathlib import Path import cv2 import numpy as np +import pytest from realesrgan_ncnn_py import Realesrgan from skimage.metrics import structural_similarity @@ -28,12 +30,25 @@ def calculate_image_similarity(image1: np.ndarray, image2: np.ndarray) -> bool: _gpuid = 0 +# gpuid = -1 when in GitHub Actions +if os.environ.get("GITHUB_ACTIONS") == "true": + _gpuid = -1 + TEST_IMG = cv2.imread(str(filePATH.parent / "test.png")) class Test_Realesrgan: - def test_cv2(self) -> None: - _realesrgan = Realesrgan(gpuid=_gpuid, model=0) - outimg = _realesrgan.process_cv2(TEST_IMG) + def test_animevideov3(self) -> None: + for _model in [0, 1, 2]: + _realesrgan = Realesrgan(gpuid=_gpuid, model=_model) + outimg = _realesrgan.process_cv2(TEST_IMG) + + assert calculate_image_similarity(TEST_IMG, outimg) + + @pytest.mark.skipif(_gpuid == -1, reason="skip when in GitHub Actions") + def test_x4plus(self) -> None: + for _model in [3, 4]: + _realesrgan = Realesrgan(gpuid=_gpuid, model=_model) + outimg = _realesrgan.process_cv2(TEST_IMG) - assert calculate_image_similarity(TEST_IMG, outimg) + assert calculate_image_similarity(TEST_IMG, outimg)
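
The test changes above pick the ncnn CPU path (`gpuid=-1`) whenever the `GITHUB_ACTIONS` environment variable is set, because the hosted runners expose no usable Vulkan device. Below is a minimal sketch of the same selection pattern for local use; only `Realesrgan`, `process_cv2`, and the `gpuid`/`model` parameters come from the patch above, while the input and output file names are hypothetical placeholders.

```python
import os

import cv2
from realesrgan_ncnn_py import Realesrgan

# Fall back to the CPU path (gpuid=-1) when no Vulkan device is available,
# mirroring the GITHUB_ACTIONS check added in tests/test_realesrgan.py.
gpuid = -1 if os.environ.get("GITHUB_ACTIONS") == "true" else 0

# model=0 selects the first bundled pre-trained model, as in the tests.
realesrgan = Realesrgan(gpuid=gpuid, model=0)

image = cv2.imread("input.jpg")           # hypothetical input path
upscaled = realesrgan.process_cv2(image)  # super-resolve the BGR ndarray
cv2.imwrite("output.jpg", upscaled)       # hypothetical output path
```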