commit 5e048175d3db59202424abf2880c0c79dc6d8531 Author: YaoFANGUK Date: Fri Dec 8 17:12:26 2023 +0800 vsr v1.0.0 diff --git a/.condarc b/.condarc new file mode 100644 index 0000000..78a9061 --- /dev/null +++ b/.condarc @@ -0,0 +1,14 @@ +channels: + - defaults +show_channel_urls: true +default_channels: + - http://mirrors.aliyun.com/anaconda/pkgs/main + - http://mirrors.aliyun.com/anaconda/pkgs/r + - http://mirrors.aliyun.com/anaconda/pkgs/msys2 +custom_channels: + conda-forge: http://mirrors.aliyun.com/anaconda/cloud + msys2: http://mirrors.aliyun.com/anaconda/cloud + bioconda: http://mirrors.aliyun.com/anaconda/cloud + menpo: http://mirrors.aliyun.com/anaconda/cloud + pytorch: http://mirrors.aliyun.com/anaconda/cloud + simpleitk: http://mirrors.aliyun.com/anaconda/cloud \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2caabc1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,373 @@ + +# Created by https://www.toptal.com/developers/gitignore/api/intellij+all,python,pycharm+all,macos,windows +# Edit at https://www.toptal.com/developers/gitignore?templates=intellij+all,python,pycharm+all,macos,windows + +### Intellij+all ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. 
+# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +### Intellij+all Patch ### +# Ignores the whole .idea folder and all .iml files +# See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 + +.idea/ + +# Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 + +*.iml +modules.xml +.idea/misc.xml +*.ipr + +# Sonarlint plugin +.idea/sonarlint + +### macOS ### +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### PyCharm+all ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff + +# Generated files + +# Sensitive or high-churn files + +# Gradle + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. +# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake + +# Mongo Explorer plugin + +# File-based project format + +# IntelliJ + +# mpeltonen/sbt-idea plugin + +# JIRA plugin + +# Cursive Clojure plugin + +# Crashlytics plugin (for Android Studio and IntelliJ) + +# Editor-based Rest Client + +# Android studio 3.1+ serialized cache file + +### PyCharm+all Patch ### +# Ignores the whole .idea folder and all .iml files +# See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 + + +# Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 + + +# Sonarlint plugin + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +pytestdebug.log + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ +doc/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +#poetry.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +# .env +.env/ +.venv/ +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pythonenv* + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# operating system-related files +*.DS_Store #file properties cache/storage on macOS +Thumbs.db #thumbnail cache on Windows + +# profiling data +.prof + + +### Windows ### +# Windows thumbnail cache files +Thumbs.db +Thumbs.db:encryptable +ehthumbs.db +ehthumbs_vista.db + +# Dump file +*.stackdump + +# Folder config file +[Dd]esktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msix +*.msm +*.msp + +# Windows shortcuts +*.lnk + +# End of https://www.toptal.com/developers/gitignore/api/intellij+all,python,pycharm+all,macos,windows +/backend/models/V2/ch_rec/inference.pdiparams +/backend/models/V4/ch_det/inference.pdiparams +/backend/models/big-lama/models/best.ckpt +/backend/models/sam/sam.pth +/output/ +/backend/test.py +/dylib/ +/settings.ini +/test*.py +/subtitle.ini +*_no_sub.mp4 +/backend/ffmpeg/win_x64/ffmpeg.exe +/test/pic_test/ +demo*.mp4 +out*.mp4 +test*.py +test_*.mp4 +test*_no_sub*.mp4 +/test/coods/ +/local_test/ +/backend/models/video/ProPainter.pth +/backend/models/big-lama/big-lama.pt diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/README.md b/README.md new file mode 100755 index 0000000..fd78ff2 --- /dev/null +++ b/README.md @@ -0,0 +1,187 @@ +简体中文 | [English](README_en.md) + +## 项目简介 + +![License](https://img.shields.io/badge/License-Apache%202-red.svg) +![python version](https://img.shields.io/badge/Python-3.8+-blue.svg) +![support os](https://img.shields.io/badge/OS-Windows/macOS/Linux-green.svg) + +Video-subtitle-remover (VSR) 是一款基于AI技术,将视频中的硬字幕去除的软件。 +主要实现了以下功能: +- **无损分辨率**将视频中的硬字幕去除,生成去除字幕后的文件 +- 通过超强AI算法模型,对去除字幕文本的区域进行填充(非相邻像素填充与马赛克去除) +- 支持自定义字幕位置,仅去除定义位置中的字幕(传入位置) +- 支持全视频自动去除所有文本(不传入位置) + +

+（演示图片：demo.png）
+ +**使用说明:** + + - 有使用问题请加群讨论,QQ群:806152575 + - 直接下载压缩包解压运行,如果不能运行再按照下面的教程,尝试源码安装conda环境运行 + +**下载地址:** + +Windows GPU版本v1.0.0(GPU): + +- 百度网盘: vsr_windows_gpu_v1.0.0.7z 提取码:**vsr1** + +- Google Drive: vsr_windows_gpu_v1.0.0.7z + +> 仅供具有Nvidia显卡的用户使用(AMD的显卡不行) + +## 演示 + +- GUI版: + +

+（GUI 版演示：demo2.gif）
+ +- 点击查看演示视频👇 + +

+（演示视频：demo.gif）
+ +## 源码使用说明 + +> **无Nvidia显卡请勿使用本项目**,最低配置: +> +> **GPU**:GTX 1060或以上显卡 +> +> CPU: 支持AVX指令集 + +#### 1. 下载安装Miniconda + +- Windows: Miniconda3-py38_4.11.0-Windows-x86_64.exe + +- Linux: Miniconda3-py38_4.11.0-Linux-x86_64.sh + +#### 2. 创建并激活虚机环境 + +(1)切换到源码所在目录: +```shell +cd <源码所在目录> +``` +> 例如:如果你的源代码放在D盘的tools文件下,并且源代码的文件夹名为video-subtitle-remover,就输入 ```cd D:/tools/video-subtitle-remover-main``` + +(2)创建激活conda环境 +```shell +conda create -n videoEnv python=3.8 +``` + +```shell +conda activate videoEnv +``` + +#### 3. 安装依赖文件 + +请确保你已经安装 python 3.8+,使用conda创建项目虚拟环境并激活环境 (建议创建虚拟环境运行,以免后续出现问题) + +- 安装CUDA和cuDNN + +
+ Linux用户 +
(1) 下载CUDA 11.7
+
wget https://developer.download.nvidia.com/compute/cuda/11.7.0/local_installers/cuda_11.7.0_515.43.04_linux.run
+
(2) 安装CUDA 11.7
+
sudo sh cuda_11.7.0_515.43.04_linux.run
+

1. 输入accept

+ +

2. 选中CUDA Toolkit 11.7(如果你没有安装nvidia驱动则选中Driver,如果你已经安装了nvidia驱动请不要选中driver),之后选中install,回车

+ +

3. 添加环境变量

+

在 ~/.bashrc 加入以下内容

+
# CUDA
+  export PATH=/usr/local/cuda-11.7/bin${PATH:+:${PATH}}
+  export LD_LIBRARY_PATH=/usr/local/cuda-11.7/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+

使其生效

+
source ~/.bashrc
+
(3) 下载cuDNN 8.4.1
+

国内:cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive.tar.xz 提取码:57mg

+

国外:cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive.tar.xz

+
(4) 安装cuDNN 8.4.1
+
 tar -xf cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive.tar.xz
+   mv cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive cuda
+   sudo cp ./cuda/include/* /usr/local/cuda-11.7/include/
+   sudo cp ./cuda/lib/* /usr/local/cuda-11.7/lib64/
+   sudo chmod a+r /usr/local/cuda-11.7/lib64/*
+   sudo chmod a+r /usr/local/cuda-11.7/include/*
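+   # （可选）安装后的检查示例，仅是一个草案，假设 CUDA 安装在默认路径 /usr/local/cuda-11.7：
+   # 若 nvcc 能输出 release 11.7，且 cudnn_version.h 存在，说明 CUDA 与 cuDNN 已就位
+   /usr/local/cuda-11.7/bin/nvcc --version
+   ls /usr/local/cuda-11.7/include/cudnn_version.h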
+
+ +
+ Windows用户 +
(1) 下载CUDA 11.7
+ cuda_11.7.0_516.01_windows.exe +
(2) 安装CUDA 11.7
+
(3) 下载cuDNN 8.2.4
+

cudnn-windows-x64-v8.2.4.15.zip

+
(4) 安装cuDNN 8.2.4
+

+ 将cuDNN解压后的cuda文件夹中的bin, include, lib目录下的文件复制到C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\对应目录下 +
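+ 下面是一个仅供参考的示例（假设 cuDNN 解压到了 D:\cudnn\cuda，此路径仅为演示用的假设），在管理员命令提示符中执行：
+ xcopy /E /Y /I "D:\cudnn\cuda\bin" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin"
+ xcopy /E /Y /I "D:\cudnn\cuda\include" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\include"
+ xcopy /E /Y /I "D:\cudnn\cuda\lib" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\lib"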

+
+ + +- 安装GPU版本Paddlepaddle: + + - windows: + + ```shell + python -m pip install paddlepaddle-gpu==2.4.2.post117 -f https://www.paddlepaddle.org.cn/whl/windows/mkl/avx/stable.html + ``` + + - Linux: + + ```shell + python -m pip install paddlepaddle-gpu==2.4.2.post117 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html + ``` + +- 安装GPU版本Pytorch: + + ```shell + conda install pytorch==2.0.1 torchvision==0.15.2 pytorch-cuda=11.7 -c pytorch -c nvidia + ``` + 或者使用 + ```shell + pip install torch==2.0.1 torchvision==0.15.2 --index-url https://download.pytorch.org/whl/cu117 + ``` + +- 安装其他依赖: + + ```shell + pip install -r requirements.txt + ``` + + +#### 4. 运行程序 + +- 运行图形化界面 + +```shell +python gui.py +``` + +- 运行命令行版本(CLI) + +```shell +python ./backend/main.py +``` + +## 常见问题 +1. CondaHTTPError + +将项目中的.condarc放在用户目录下(C:/Users/<你的用户名>),如果用户目录已经存在该文件则覆盖 + +解决方案:https://zhuanlan.zhihu.com/p/260034241 + +2. 7z文件解压错误 + +解决方案:升级7-zip解压程序到最新版本 + +## 赞助 + + +| 捐赠者 | 累计捐赠金额 | 赞助席位 | +| --- | --- | --- | +| 很奇异| 15.00 RMB | 金牌赞助席位 | +| Leo| 1.00 RMB | 银牌赞助席位 | +| 暂无| 暂无 | 铜牌赞助席位 | + diff --git a/README_en.md b/README_en.md new file mode 100755 index 0000000..d74d155 --- /dev/null +++ b/README_en.md @@ -0,0 +1,177 @@ +[简体中文](README.md) | English + +## Project Introduction + +![License](https://img.shields.io/badge/License-Apache%202-red.svg) +![python version](https://img.shields.io/badge/Python-3.8+-blue.svg) +![support os](https://img.shields.io/badge/OS-Windows/macOS/Linux-green.svg) + +Video-subtitle-remover (VSR) is an AI-based software that removes hardcoded subtitles from videos. It mainly implements the following functionalities: + +- **Lossless resolution**: Removes hardcoded subtitles from videos and generates files without subtitles. +- Fills in the removed subtitle text area using a powerful AI algorithm model (non-adjacent pixel filling and mosaic removal). +- Supports custom subtitle positions by only removing subtitles in the defined location (input position). +- Supports automatic removal of all text throughout the entire video (without inputting a position). + +

+ (Demo image: demo.png)
+ +> Download the .7z package directly, extract, and run it. If it cannot run, follow the tutorial below to try installing the conda environment and running the source code. + +**Download Links:** + +Windows GPU Version v1.0.0 (GPU): + +- Baidu Cloud Disk: vsr_windows_gpu_v1.0.0.7z Extraction Code: **vsr1** + +- Google Drive: vsr_windows_gpu_v1.0.0.7z + +> For use only by users with Nvidia graphics cards (AMD graphics cards are not supported). + +## Demonstration + +- GUI: + +

+ (GUI demo: demo2.gif)
+ +- Click to view demo video👇 + +

+ (Demo video: demo.gif)
+ +## Source Code Usage Instructions + +> **Do not use this project without an Nvidia graphics card**. The minimum requirements are: +> +> **GPU**: GTX 1060 or higher graphics card +> +> CPU: Supports AVX instruction set + +#### 1. Download and install Miniconda + +- Windows: Miniconda3-py38_4.11.0-Windows-x86_64.exe + +- Linux: Miniconda3-py38_4.11.0-Linux-x86_64.sh + +#### 2. Create and activate a virtual environment + +(1) Switch to the source code directory: + +```shell +cd +``` + +> For example, if your source code is in the `tools` folder on drive D, and the source code folder name is `video-subtitle-remover`, enter `cd D:/tools/video-subtitle-remover-main`. + +(2) Create and activate the conda environment: + +```shell +conda create -n videoEnv python=3.8 +``` + +```shell +conda activate videoEnv +``` + +#### 3. Install dependencies + +Please make sure you have already installed Python 3.8+, use conda to create a project virtual environment and activate the environment (it is recommended to create a virtual environment to run to avoid subsequent problems). + + - Install **CUDA** and **cuDNN** + +
+ Linux +
(1) Download CUDA 11.7
+
wget https://developer.download.nvidia.com/compute/cuda/11.7.0/local_installers/cuda_11.7.0_515.43.04_linux.run
+
(2) Install CUDA 11.7
+
sudo sh cuda_11.7.0_515.43.04_linux.run
+

1. Input accept

+ +

2. Make sure CUDA Toolkit 11.7 is selected (select Driver only if the NVIDIA driver is not installed yet; if it is already installed, do not select it), then choose Install and press Enter

+ +

3. Add environment variables

+

Add the following lines to ~/.bashrc

+
# CUDA
+      export PATH=/usr/local/cuda-11.7/bin${PATH:+:${PATH}}
+      export LD_LIBRARY_PATH=/usr/local/cuda-11.7/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+

Apply the changes so they take effect

+
source ~/.bashrc
+
(3) Download cuDNN 8.4.1
+

cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive.tar.xz

+
(4) Install cuDNN 8.4.1
+
 tar -xf cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive.tar.xz
+     mv cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive cuda
+     sudo cp ./cuda/include/* /usr/local/cuda-11.7/include/
+     sudo cp ./cuda/lib/* /usr/local/cuda-11.7/lib64/
+     sudo chmod a+r /usr/local/cuda-11.7/lib64/*
+     sudo chmod a+r /usr/local/cuda-11.7/include/*
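+     # (Optional) post-install check, a minimal sketch assuming CUDA was installed to the default /usr/local/cuda-11.7:
+     # nvcc should report release 11.7 and cudnn_version.h should exist once CUDA and cuDNN are in place
+     /usr/local/cuda-11.7/bin/nvcc --version
+     ls /usr/local/cuda-11.7/include/cudnn_version.h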
+
+ +
+ Windows +
(1) Download CUDA 11.7
+ cuda_11.7.0_516.01_windows.exe +
(2) Install CUDA 11.7
+
(3) Download cuDNN 8.2.4
+

cudnn-windows-x64-v8.2.4.15.zip

+
(4) Install cuDNN 8.2.4
+

+ Unzip "cudnn-windows-x64-v8.2.4.15.zip", then copy the files from the bin, include, and lib folders inside the extracted cuda directory into the corresponding folders under C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\ +
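+ A minimal sketch, assuming cuDNN was extracted to D:\cudnn\cuda (an illustrative path, not part of the original instructions); run the following from an elevated Command Prompt:
+ xcopy /E /Y /I "D:\cudnn\cuda\bin" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin"
+ xcopy /E /Y /I "D:\cudnn\cuda\include" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\include"
+ xcopy /E /Y /I "D:\cudnn\cuda\lib" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\lib"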

+
+ + +- Install GPU version of Paddlepaddle: + - windows: + + ```shell + python -m pip install paddlepaddle-gpu==2.4.2.post117 -f https://www.paddlepaddle.org.cn/whl/windows/mkl/avx/stable.html + ``` + + - Linux: + + ```shell + python -m pip install paddlepaddle-gpu==2.4.2.post117 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html + ``` + +- Install GPU version of Pytorch: + + ```shell + conda install pytorch==2.0.1 torchvision==0.15.2 pytorch-cuda=11.7 -c pytorch -c nvidia + ``` + or use + + ```shell + pip install torch==2.0.1 torchvision==0.15.2 --index-url https://download.pytorch.org/whl/cu117 + ``` + +- Install other dependencies: + + ```shell + pip install -r requirements.txt + ``` + + +#### 4. Run the program + +- Run the graphical interface + +```shell +python gui.py +``` + +- Run the command line version (CLI) + +```shell +python ./backend/main.py +``` + +## Common Issues +1. CondaHTTPError + +Place the .condarc file from the project in the user directory (C:/Users/). If the file already exists in the user directory, overwrite it. + +Solution: https://zhuanlan.zhihu.com/p/260034241 + +2. 7z file extraction error + +Solution: Upgrade the 7-zip extraction program to the latest version. diff --git a/backend/__init__.py b/backend/__init__.py new file mode 100644 index 0000000..144e5fd --- /dev/null +++ b/backend/__init__.py @@ -0,0 +1,3 @@ +import warnings +# 忽略所有的 DeprecationWarning +warnings.filterwarnings("ignore", category=DeprecationWarning) \ No newline at end of file diff --git a/backend/config.py b/backend/config.py new file mode 100644 index 0000000..917c82b --- /dev/null +++ b/backend/config.py @@ -0,0 +1,79 @@ +import warnings +warnings.filterwarnings('ignore') +import os +import torch +import logging +import platform +import stat +from fsplit.filesplit import Filesplit +import paddle +paddle.disable_signal_handler() +logging.disable(logging.DEBUG) # 关闭DEBUG日志的打印 +logging.disable(logging.WARNING) # 关闭WARNING日志的打印 +device = "cuda" if torch.cuda.is_available() else "cpu" +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +LAMA_MODEL_PATH = os.path.join(BASE_DIR, 'models', 'big-lama') +VIDEO_INPAINT_MODEL_PATH = os.path.join(BASE_DIR, 'models', 'video') +MODEL_VERSION = 'V4' +DET_MODEL_BASE = os.path.join(BASE_DIR, 'models') +DET_MODEL_PATH = os.path.join(DET_MODEL_BASE, MODEL_VERSION, 'ch_det') + +# ×××××××××××××××××××× [可以改] start ×××××××××××××××××××× +# 容忍的像素点偏差 +PIXEL_TOLERANCE_Y = 20 # 允许检测框纵向偏差50个像素点 +PIXEL_TOLERANCE_X = 20 # 允许检测框横向偏差100个像素点 +# 字幕区域偏移量, 放大诗歌像素点,防止字幕偏移 +SUBTITLE_AREA_DEVIATION_PIXEL = 10 +# 20个像素点以内的差距认为是同一行 +TOLERANCE_Y = 20 +# 高度差阈值 +THRESHOLD_HEIGHT_DIFFERENCE = 20 +# 【根据自己的GPU显存大小设置】最大同时处理的图片数量,设置越大处理效果越好,但是要求显存越高 +# 1280x720p视频设置80需要25G显存,设置50需要19G显存 +# 720x480p视频设置80需要8G显存,设置50需要7G显存 +MAX_PROCESS_NUM = 70 +# 【根据自己内存大小设置,应该大于等于MAX_PROCESS_NUM】 +MAX_LOAD_NUM = 70 +# 是否开启精细模式,开启精细模式将消耗大量GPU显存,如果您的显卡显存较少,建议设置为False +ACCURATE_MODE = True +# 是否开启快速模型,不保证inpaint效果 +FAST_MODE = False +# ×××××××××××××××××××× [可以改] start ×××××××××××××××××××× + + +# ×××××××××××××××××××× [不要改] start ×××××××××××××××××××× +# 查看该路径下是否有模型完整文件,没有的话合并小文件生成完整文件 +if 'big-lama.pt' not in (os.listdir(LAMA_MODEL_PATH)): + fs = Filesplit() + fs.merge(input_dir=LAMA_MODEL_PATH) + +if 'inference.pdiparams' not in os.listdir(DET_MODEL_PATH): + fs = Filesplit() + fs.merge(input_dir=DET_MODEL_PATH) + +if 'ProPainter.pth' not in os.listdir(VIDEO_INPAINT_MODEL_PATH): + fs = Filesplit() + fs.merge(input_dir=VIDEO_INPAINT_MODEL_PATH) + +# 指定ffmpeg可执行程序路径 +sys_str = 
platform.system() +if sys_str == "Windows": + ffmpeg_bin = os.path.join('win_x64', 'ffmpeg.exe') +elif sys_str == "Linux": + ffmpeg_bin = os.path.join('linux_x64', 'ffmpeg') +else: + ffmpeg_bin = os.path.join('macos', 'ffmpeg') +FFMPEG_PATH = os.path.join(BASE_DIR, '', 'ffmpeg', ffmpeg_bin) + +if 'ffmpeg.exe' not in os.listdir(os.path.join(BASE_DIR, '', 'ffmpeg', 'win_x64')): + fs = Filesplit() + fs.merge(input_dir=os.path.join(BASE_DIR, '', 'ffmpeg', 'win_x64')) +# 将ffmpeg添加可执行权限 +os.chmod(FFMPEG_PATH, stat.S_IRWXU+stat.S_IRWXG+stat.S_IRWXO) + +os.environ['KMP_DUPLICATE_LIB_OK'] = 'True' + +# 如果开启了快速模式,则强制关闭ACCURATE_MODE +if FAST_MODE: + ACCURATE_MODE = False +# ×××××××××××××××××××× [不要改] end ×××××××××××××××××××× diff --git a/backend/ffmpeg/linux_x64/ffmpeg b/backend/ffmpeg/linux_x64/ffmpeg new file mode 100755 index 0000000..13e56c8 Binary files /dev/null and b/backend/ffmpeg/linux_x64/ffmpeg differ diff --git a/backend/ffmpeg/macos/ffmpeg b/backend/ffmpeg/macos/ffmpeg new file mode 100644 index 0000000..b8c4347 Binary files /dev/null and b/backend/ffmpeg/macos/ffmpeg differ diff --git a/backend/ffmpeg/win_x64/ffmpeg_1.exe b/backend/ffmpeg/win_x64/ffmpeg_1.exe new file mode 100644 index 0000000..5b75d5f Binary files /dev/null and b/backend/ffmpeg/win_x64/ffmpeg_1.exe differ diff --git a/backend/ffmpeg/win_x64/ffmpeg_2.exe b/backend/ffmpeg/win_x64/ffmpeg_2.exe new file mode 100644 index 0000000..902df3f Binary files /dev/null and b/backend/ffmpeg/win_x64/ffmpeg_2.exe differ diff --git a/backend/ffmpeg/win_x64/ffmpeg_3.exe b/backend/ffmpeg/win_x64/ffmpeg_3.exe new file mode 100644 index 0000000..c0ff50e Binary files /dev/null and b/backend/ffmpeg/win_x64/ffmpeg_3.exe differ diff --git a/backend/ffmpeg/win_x64/fs_manifest.csv b/backend/ffmpeg/win_x64/fs_manifest.csv new file mode 100644 index 0000000..501738d --- /dev/null +++ b/backend/ffmpeg/win_x64/fs_manifest.csv @@ -0,0 +1,4 @@ +filename,filesize,encoding,header +ffmpeg_1.exe,50000000,, +ffmpeg_2.exe,50000000,, +ffmpeg_3.exe,13721856,, diff --git a/backend/inpaint/__init__.py b/backend/inpaint/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/inpaint/lama_inpaint.py b/backend/inpaint/lama_inpaint.py new file mode 100644 index 0000000..e39a3bd --- /dev/null +++ b/backend/inpaint/lama_inpaint.py @@ -0,0 +1,27 @@ +import os +from typing import Union + +import torch +import numpy as np +from PIL import Image +from backend.inpaint.utils.lama_util import prepare_img_and_mask +from backend import config + + +class LamaInpaint: + def __init__(self, device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu"), model_path=None) -> None: + if model_path is None: + model_path = os.path.join(config.LAMA_MODEL_PATH, 'big-lama.pt') + self.model = torch.jit.load(model_path, map_location=device) + self.model.eval() + self.model.to(device) + self.device = device + + def __call__(self, image: Union[Image.Image, np.ndarray], mask: Union[Image.Image, np.ndarray]): + image, mask = prepare_img_and_mask(image, mask, self.device) + with torch.inference_mode(): + inpainted = self.model(image, mask) + cur_res = inpainted[0].permute(1, 2, 0).detach().cpu().numpy() + cur_res = np.clip(cur_res * 255, 0, 255).astype(np.uint8) + return cur_res + diff --git a/backend/inpaint/utils/__init__.py b/backend/inpaint/utils/__init__.py new file mode 100644 index 0000000..90f60fd --- /dev/null +++ b/backend/inpaint/utils/__init__.py @@ -0,0 +1 @@ +from .utils import * \ No newline at end of file diff --git 
a/backend/inpaint/utils/lama_util.py b/backend/inpaint/utils/lama_util.py new file mode 100644 index 0000000..7936da5 --- /dev/null +++ b/backend/inpaint/utils/lama_util.py @@ -0,0 +1,80 @@ +import os +import sys +import torch +import numpy as np +import cv2 +from PIL import Image +from torch.hub import download_url_to_file, get_dir +from urllib.parse import urlparse + + +# Source https://github.com/advimman/lama +def get_image(image): + if isinstance(image, Image.Image): + img = np.array(image) + elif isinstance(image, np.ndarray): + img = image.copy() + else: + raise Exception("Input image should be either PIL Image or numpy array!") + + if img.ndim == 3: + img = np.transpose(img, (2, 0, 1)) # chw + elif img.ndim == 2: + img = img[np.newaxis, ...] + + assert img.ndim == 3 + + img = img.astype(np.float32) / 255 + return img + + +def ceil_modulo(x, mod): + if x % mod == 0: + return x + return (x // mod + 1) * mod + + +def scale_image(img, factor, interpolation=cv2.INTER_AREA): + if img.shape[0] == 1: + img = img[0] + else: + img = np.transpose(img, (1, 2, 0)) + + img = cv2.resize(img, dsize=None, fx=factor, fy=factor, interpolation=interpolation) + + if img.ndim == 2: + img = img[None, ...] + else: + img = np.transpose(img, (2, 0, 1)) + return img + + +def pad_img_to_modulo(img, mod): + channels, height, width = img.shape + out_height = ceil_modulo(height, mod) + out_width = ceil_modulo(width, mod) + return np.pad( + img, + ((0, 0), (0, out_height - height), (0, out_width - width)), + mode="symmetric", + ) + + +def prepare_img_and_mask(image, mask, device, pad_out_to_modulo=8, scale_factor=None): + out_image = get_image(image) + out_mask = get_image(mask) + + if scale_factor is not None: + out_image = scale_image(out_image, scale_factor) + out_mask = scale_image(out_mask, scale_factor, interpolation=cv2.INTER_NEAREST) + + if pad_out_to_modulo is not None and pad_out_to_modulo > 1: + out_image = pad_img_to_modulo(out_image, pad_out_to_modulo) + out_mask = pad_img_to_modulo(out_mask, pad_out_to_modulo) + + out_image = torch.from_numpy(out_image).unsqueeze(0).to(device) + out_mask = torch.from_numpy(out_mask).unsqueeze(0).to(device) + + out_mask = (out_mask > 0) * 1 + + return out_image, out_mask diff --git a/backend/inpaint/utils/utils.py b/backend/inpaint/utils/utils.py new file mode 100644 index 0000000..ab1ef7f --- /dev/null +++ b/backend/inpaint/utils/utils.py @@ -0,0 +1,85 @@ +import cv2 +import numpy as np +from PIL import Image +from typing import Any, Dict, List + + +def load_img_to_array(img_p): + img = Image.open(img_p) + if img.mode == "RGBA": + img = img.convert("RGB") + return np.array(img) + + +def save_array_to_img(img_arr, img_p): + Image.fromarray(img_arr.astype(np.uint8)).save(img_p) + + +def dilate_mask(mask, dilate_factor=15): + mask = mask.astype(np.uint8) + mask = cv2.dilate( + mask, + np.ones((dilate_factor, dilate_factor), np.uint8), + iterations=1 + ) + return mask + +def erode_mask(mask, dilate_factor=15): + mask = mask.astype(np.uint8) + mask = cv2.erode( + mask, + np.ones((dilate_factor, dilate_factor), np.uint8), + iterations=1 + ) + return mask + +def show_mask(ax, mask: np.ndarray, random_color=False): + mask = mask.astype(np.uint8) + if np.max(mask) == 255: + mask = mask / 255 + if random_color: + color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0) + else: + color = np.array([30 / 255, 144 / 255, 255 / 255, 0.6]) + h, w = mask.shape[-2:] + mask_img = mask.reshape(h, w, 1) * color.reshape(1, 1, -1) + ax.imshow(mask_img) + + +def 
show_points(ax, coords: List[List[float]], labels: List[int], size=375): + coords = np.array(coords) + labels = np.array(labels) + color_table = {0: 'red', 1: 'green'} + for label_value, color in color_table.items(): + points = coords[labels == label_value] + ax.scatter(points[:, 0], points[:, 1], color=color, marker='*', + s=size, edgecolor='white', linewidth=1.25) + +def get_clicked_point(img_path): + img = cv2.imread(img_path) + cv2.namedWindow("image") + cv2.imshow("image", img) + + last_point = [] + keep_looping = True + + def mouse_callback(event, x, y, flags, param): + nonlocal last_point, keep_looping, img + + if event == cv2.EVENT_LBUTTONDOWN: + if last_point: + cv2.circle(img, tuple(last_point), 5, (0, 0, 0), -1) + last_point = [x, y] + cv2.circle(img, tuple(last_point), 5, (0, 0, 255), -1) + cv2.imshow("image", img) + elif event == cv2.EVENT_RBUTTONDOWN: + keep_looping = False + + cv2.setMouseCallback("image", mouse_callback) + + while keep_looping: + cv2.waitKey(1) + + cv2.destroyAllWindows() + + return last_point \ No newline at end of file diff --git a/backend/inpaint/video/core/dataset.py b/backend/inpaint/video/core/dataset.py new file mode 100644 index 0000000..27b135b --- /dev/null +++ b/backend/inpaint/video/core/dataset.py @@ -0,0 +1,232 @@ +import os +import json +import random + +import cv2 +from PIL import Image +import numpy as np + +import torch +import torchvision.transforms as transforms + +from utils.file_client import FileClient +from utils.img_util import imfrombytes +from utils.flow_util import resize_flow, flowread +from core.utils import (create_random_shape_with_random_motion, Stack, + ToTorchFormatTensor, GroupRandomHorizontalFlip,GroupRandomHorizontalFlowFlip) + + +class TrainDataset(torch.utils.data.Dataset): + def __init__(self, args: dict): + self.args = args + self.video_root = args['video_root'] + self.flow_root = args['flow_root'] + self.num_local_frames = args['num_local_frames'] + self.num_ref_frames = args['num_ref_frames'] + self.size = self.w, self.h = (args['w'], args['h']) + + self.load_flow = args['load_flow'] + if self.load_flow: + assert os.path.exists(self.flow_root) + + json_path = os.path.join('./datasets', args['name'], 'train.json') + + with open(json_path, 'r') as f: + self.video_train_dict = json.load(f) + self.video_names = sorted(list(self.video_train_dict.keys())) + + # self.video_names = sorted(os.listdir(self.video_root)) + self.video_dict = {} + self.frame_dict = {} + + for v in self.video_names: + frame_list = sorted(os.listdir(os.path.join(self.video_root, v))) + v_len = len(frame_list) + if v_len > self.num_local_frames + self.num_ref_frames: + self.video_dict[v] = v_len + self.frame_dict[v] = frame_list + + + self.video_names = list(self.video_dict.keys()) # update names + + self._to_tensors = transforms.Compose([ + Stack(), + ToTorchFormatTensor(), + ]) + self.file_client = FileClient('disk') + + def __len__(self): + return len(self.video_names) + + def _sample_index(self, length, sample_length, num_ref_frame=3): + complete_idx_set = list(range(length)) + pivot = random.randint(0, length - sample_length) + local_idx = complete_idx_set[pivot:pivot + sample_length] + remain_idx = list(set(complete_idx_set) - set(local_idx)) + ref_index = sorted(random.sample(remain_idx, num_ref_frame)) + + return local_idx + ref_index + + def __getitem__(self, index): + video_name = self.video_names[index] + # create masks + all_masks = create_random_shape_with_random_motion( + self.video_dict[video_name], imageHeight=self.h, 
imageWidth=self.w) + + # create sample index + selected_index = self._sample_index(self.video_dict[video_name], + self.num_local_frames, + self.num_ref_frames) + + # read video frames + frames = [] + masks = [] + flows_f, flows_b = [], [] + for idx in selected_index: + frame_list = self.frame_dict[video_name] + img_path = os.path.join(self.video_root, video_name, frame_list[idx]) + img_bytes = self.file_client.get(img_path, 'img') + img = imfrombytes(img_bytes, float32=False) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = cv2.resize(img, self.size, interpolation=cv2.INTER_LINEAR) + img = Image.fromarray(img) + + frames.append(img) + masks.append(all_masks[idx]) + + if len(frames) <= self.num_local_frames-1 and self.load_flow: + current_n = frame_list[idx][:-4] + next_n = frame_list[idx+1][:-4] + flow_f_path = os.path.join(self.flow_root, video_name, f'{current_n}_{next_n}_f.flo') + flow_b_path = os.path.join(self.flow_root, video_name, f'{next_n}_{current_n}_b.flo') + flow_f = flowread(flow_f_path, quantize=False) + flow_b = flowread(flow_b_path, quantize=False) + flow_f = resize_flow(flow_f, self.h, self.w) + flow_b = resize_flow(flow_b, self.h, self.w) + flows_f.append(flow_f) + flows_b.append(flow_b) + + if len(frames) == self.num_local_frames: # random reverse + if random.random() < 0.5: + frames.reverse() + masks.reverse() + if self.load_flow: + flows_f.reverse() + flows_b.reverse() + flows_ = flows_f + flows_f = flows_b + flows_b = flows_ + + if self.load_flow: + frames, flows_f, flows_b = GroupRandomHorizontalFlowFlip()(frames, flows_f, flows_b) + else: + frames = GroupRandomHorizontalFlip()(frames) + + # normalizate, to tensors + frame_tensors = self._to_tensors(frames) * 2.0 - 1.0 + mask_tensors = self._to_tensors(masks) + if self.load_flow: + flows_f = np.stack(flows_f, axis=-1) # H W 2 T-1 + flows_b = np.stack(flows_b, axis=-1) + flows_f = torch.from_numpy(flows_f).permute(3, 2, 0, 1).contiguous().float() + flows_b = torch.from_numpy(flows_b).permute(3, 2, 0, 1).contiguous().float() + + # img [-1,1] mask [0,1] + if self.load_flow: + return frame_tensors, mask_tensors, flows_f, flows_b, video_name + else: + return frame_tensors, mask_tensors, 'None', 'None', video_name + + +class TestDataset(torch.utils.data.Dataset): + def __init__(self, args): + self.args = args + self.size = self.w, self.h = args['size'] + + self.video_root = args['video_root'] + self.mask_root = args['mask_root'] + self.flow_root = args['flow_root'] + + self.load_flow = args['load_flow'] + if self.load_flow: + assert os.path.exists(self.flow_root) + self.video_names = sorted(os.listdir(self.mask_root)) + + self.video_dict = {} + self.frame_dict = {} + + for v in self.video_names: + frame_list = sorted(os.listdir(os.path.join(self.video_root, v))) + v_len = len(frame_list) + self.video_dict[v] = v_len + self.frame_dict[v] = frame_list + + self._to_tensors = transforms.Compose([ + Stack(), + ToTorchFormatTensor(), + ]) + self.file_client = FileClient('disk') + + def __len__(self): + return len(self.video_names) + + def __getitem__(self, index): + video_name = self.video_names[index] + selected_index = list(range(self.video_dict[video_name])) + + # read video frames + frames = [] + masks = [] + flows_f, flows_b = [], [] + for idx in selected_index: + frame_list = self.frame_dict[video_name] + frame_path = os.path.join(self.video_root, video_name, frame_list[idx]) + + img_bytes = self.file_client.get(frame_path, 'input') + img = imfrombytes(img_bytes, float32=False) + img = cv2.cvtColor(img, 
cv2.COLOR_BGR2RGB) + img = cv2.resize(img, self.size, interpolation=cv2.INTER_LINEAR) + img = Image.fromarray(img) + + frames.append(img) + + mask_path = os.path.join(self.mask_root, video_name, str(idx).zfill(5) + '.png') + mask = Image.open(mask_path).resize(self.size, Image.NEAREST).convert('L') + + # origin: 0 indicates missing. now: 1 indicates missing + mask = np.asarray(mask) + m = np.array(mask > 0).astype(np.uint8) + + m = cv2.dilate(m, + cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3)), + iterations=4) + mask = Image.fromarray(m * 255) + masks.append(mask) + + if len(frames) <= len(selected_index)-1 and self.load_flow: + current_n = frame_list[idx][:-4] + next_n = frame_list[idx+1][:-4] + flow_f_path = os.path.join(self.flow_root, video_name, f'{current_n}_{next_n}_f.flo') + flow_b_path = os.path.join(self.flow_root, video_name, f'{next_n}_{current_n}_b.flo') + flow_f = flowread(flow_f_path, quantize=False) + flow_b = flowread(flow_b_path, quantize=False) + flow_f = resize_flow(flow_f, self.h, self.w) + flow_b = resize_flow(flow_b, self.h, self.w) + flows_f.append(flow_f) + flows_b.append(flow_b) + + # normalizate, to tensors + frames_PIL = [np.array(f).astype(np.uint8) for f in frames] + frame_tensors = self._to_tensors(frames) * 2.0 - 1.0 + mask_tensors = self._to_tensors(masks) + + if self.load_flow: + flows_f = np.stack(flows_f, axis=-1) # H W 2 T-1 + flows_b = np.stack(flows_b, axis=-1) + flows_f = torch.from_numpy(flows_f).permute(3, 2, 0, 1).contiguous().float() + flows_b = torch.from_numpy(flows_b).permute(3, 2, 0, 1).contiguous().float() + + if self.load_flow: + return frame_tensors, mask_tensors, flows_f, flows_b, video_name, frames_PIL + else: + return frame_tensors, mask_tensors, 'None', 'None', video_name \ No newline at end of file diff --git a/backend/inpaint/video/core/dist.py b/backend/inpaint/video/core/dist.py new file mode 100644 index 0000000..4e4e9e6 --- /dev/null +++ b/backend/inpaint/video/core/dist.py @@ -0,0 +1,47 @@ +import os +import torch + + +def get_world_size(): + """Find OMPI world size without calling mpi functions + :rtype: int + """ + if os.environ.get('PMI_SIZE') is not None: + return int(os.environ.get('PMI_SIZE') or 1) + elif os.environ.get('OMPI_COMM_WORLD_SIZE') is not None: + return int(os.environ.get('OMPI_COMM_WORLD_SIZE') or 1) + else: + return torch.cuda.device_count() + + +def get_global_rank(): + """Find OMPI world rank without calling mpi functions + :rtype: int + """ + if os.environ.get('PMI_RANK') is not None: + return int(os.environ.get('PMI_RANK') or 0) + elif os.environ.get('OMPI_COMM_WORLD_RANK') is not None: + return int(os.environ.get('OMPI_COMM_WORLD_RANK') or 0) + else: + return 0 + + +def get_local_rank(): + """Find OMPI local rank without calling mpi functions + :rtype: int + """ + if os.environ.get('MPI_LOCALRANKID') is not None: + return int(os.environ.get('MPI_LOCALRANKID') or 0) + elif os.environ.get('OMPI_COMM_WORLD_LOCAL_RANK') is not None: + return int(os.environ.get('OMPI_COMM_WORLD_LOCAL_RANK') or 0) + else: + return 0 + + +def get_master_ip(): + if os.environ.get('AZ_BATCH_MASTER_NODE') is not None: + return os.environ.get('AZ_BATCH_MASTER_NODE').split(':')[0] + elif os.environ.get('AZ_BATCHAI_MPI_MASTER_NODE') is not None: + return os.environ.get('AZ_BATCHAI_MPI_MASTER_NODE') + else: + return "127.0.0.1" diff --git a/backend/inpaint/video/core/loss.py b/backend/inpaint/video/core/loss.py new file mode 100644 index 0000000..b1d94d0 --- /dev/null +++ b/backend/inpaint/video/core/loss.py @@ -0,0 +1,180 @@ 
+import torch +import torch.nn as nn +import lpips +from model.vgg_arch import VGGFeatureExtractor + +class PerceptualLoss(nn.Module): + """Perceptual loss with commonly used style loss. + + Args: + layer_weights (dict): The weight for each layer of vgg feature. + Here is an example: {'conv5_4': 1.}, which means the conv5_4 + feature layer (before relu5_4) will be extracted with weight + 1.0 in calculting losses. + vgg_type (str): The type of vgg network used as feature extractor. + Default: 'vgg19'. + use_input_norm (bool): If True, normalize the input image in vgg. + Default: True. + range_norm (bool): If True, norm images with range [-1, 1] to [0, 1]. + Default: False. + perceptual_weight (float): If `perceptual_weight > 0`, the perceptual + loss will be calculated and the loss will multiplied by the + weight. Default: 1.0. + style_weight (float): If `style_weight > 0`, the style loss will be + calculated and the loss will multiplied by the weight. + Default: 0. + criterion (str): Criterion used for perceptual loss. Default: 'l1'. + """ + + def __init__(self, + layer_weights, + vgg_type='vgg19', + use_input_norm=True, + range_norm=False, + perceptual_weight=1.0, + style_weight=0., + criterion='l1'): + super(PerceptualLoss, self).__init__() + self.perceptual_weight = perceptual_weight + self.style_weight = style_weight + self.layer_weights = layer_weights + self.vgg = VGGFeatureExtractor( + layer_name_list=list(layer_weights.keys()), + vgg_type=vgg_type, + use_input_norm=use_input_norm, + range_norm=range_norm) + + self.criterion_type = criterion + if self.criterion_type == 'l1': + self.criterion = torch.nn.L1Loss() + elif self.criterion_type == 'l2': + self.criterion = torch.nn.L2loss() + elif self.criterion_type == 'mse': + self.criterion = torch.nn.MSELoss(reduction='mean') + elif self.criterion_type == 'fro': + self.criterion = None + else: + raise NotImplementedError(f'{criterion} criterion has not been supported.') + + def forward(self, x, gt): + """Forward function. + + Args: + x (Tensor): Input tensor with shape (n, c, h, w). + gt (Tensor): Ground-truth tensor with shape (n, c, h, w). + + Returns: + Tensor: Forward results. + """ + # extract vgg features + x_features = self.vgg(x) + gt_features = self.vgg(gt.detach()) + + # calculate perceptual loss + if self.perceptual_weight > 0: + percep_loss = 0 + for k in x_features.keys(): + if self.criterion_type == 'fro': + percep_loss += torch.norm(x_features[k] - gt_features[k], p='fro') * self.layer_weights[k] + else: + percep_loss += self.criterion(x_features[k], gt_features[k]) * self.layer_weights[k] + percep_loss *= self.perceptual_weight + else: + percep_loss = None + + # calculate style loss + if self.style_weight > 0: + style_loss = 0 + for k in x_features.keys(): + if self.criterion_type == 'fro': + style_loss += torch.norm( + self._gram_mat(x_features[k]) - self._gram_mat(gt_features[k]), p='fro') * self.layer_weights[k] + else: + style_loss += self.criterion(self._gram_mat(x_features[k]), self._gram_mat( + gt_features[k])) * self.layer_weights[k] + style_loss *= self.style_weight + else: + style_loss = None + + return percep_loss, style_loss + + def _gram_mat(self, x): + """Calculate Gram matrix. + + Args: + x (torch.Tensor): Tensor with shape of (n, c, h, w). + + Returns: + torch.Tensor: Gram matrix. 
+ """ + n, c, h, w = x.size() + features = x.view(n, c, w * h) + features_t = features.transpose(1, 2) + gram = features.bmm(features_t) / (c * h * w) + return gram + +class LPIPSLoss(nn.Module): + def __init__(self, + loss_weight=1.0, + use_input_norm=True, + range_norm=False,): + super(LPIPSLoss, self).__init__() + self.perceptual = lpips.LPIPS(net="vgg", spatial=False).eval() + self.loss_weight = loss_weight + self.use_input_norm = use_input_norm + self.range_norm = range_norm + + if self.use_input_norm: + # the mean is for image with range [0, 1] + self.register_buffer('mean', torch.Tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)) + # the std is for image with range [0, 1] + self.register_buffer('std', torch.Tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)) + + def forward(self, pred, target): + if self.range_norm: + pred = (pred + 1) / 2 + target = (target + 1) / 2 + if self.use_input_norm: + pred = (pred - self.mean) / self.std + target = (target - self.mean) / self.std + lpips_loss = self.perceptual(target.contiguous(), pred.contiguous()) + return self.loss_weight * lpips_loss.mean(), None + + +class AdversarialLoss(nn.Module): + r""" + Adversarial loss + https://arxiv.org/abs/1711.10337 + """ + def __init__(self, + type='nsgan', + target_real_label=1.0, + target_fake_label=0.0): + r""" + type = nsgan | lsgan | hinge + """ + super(AdversarialLoss, self).__init__() + self.type = type + self.register_buffer('real_label', torch.tensor(target_real_label)) + self.register_buffer('fake_label', torch.tensor(target_fake_label)) + + if type == 'nsgan': + self.criterion = nn.BCELoss() + elif type == 'lsgan': + self.criterion = nn.MSELoss() + elif type == 'hinge': + self.criterion = nn.ReLU() + + def __call__(self, outputs, is_real, is_disc=None): + if self.type == 'hinge': + if is_disc: + if is_real: + outputs = -outputs + return self.criterion(1 + outputs).mean() + else: + return (-outputs).mean() + else: + labels = (self.real_label + if is_real else self.fake_label).expand_as(outputs) + loss = self.criterion(outputs, labels) + return loss diff --git a/backend/inpaint/video/core/lr_scheduler.py b/backend/inpaint/video/core/lr_scheduler.py new file mode 100644 index 0000000..1bd1341 --- /dev/null +++ b/backend/inpaint/video/core/lr_scheduler.py @@ -0,0 +1,112 @@ +""" + LR scheduler from BasicSR https://github.com/xinntao/BasicSR +""" +import math +from collections import Counter +from torch.optim.lr_scheduler import _LRScheduler + + +class MultiStepRestartLR(_LRScheduler): + """ MultiStep with restarts learning rate scheme. + Args: + optimizer (torch.nn.optimizer): Torch optimizer. + milestones (list): Iterations that will decrease learning rate. + gamma (float): Decrease ratio. Default: 0.1. + restarts (list): Restart iterations. Default: [0]. + restart_weights (list): Restart weights at each restart iteration. + Default: [1]. + last_epoch (int): Used in _LRScheduler. Default: -1. + """ + def __init__(self, + optimizer, + milestones, + gamma=0.1, + restarts=(0, ), + restart_weights=(1, ), + last_epoch=-1): + self.milestones = Counter(milestones) + self.gamma = gamma + self.restarts = restarts + self.restart_weights = restart_weights + assert len(self.restarts) == len( + self.restart_weights), 'restarts and their weights do not match.' 
+ super(MultiStepRestartLR, self).__init__(optimizer, last_epoch) + + def get_lr(self): + if self.last_epoch in self.restarts: + weight = self.restart_weights[self.restarts.index(self.last_epoch)] + return [ + group['initial_lr'] * weight + for group in self.optimizer.param_groups + ] + if self.last_epoch not in self.milestones: + return [group['lr'] for group in self.optimizer.param_groups] + return [ + group['lr'] * self.gamma**self.milestones[self.last_epoch] + for group in self.optimizer.param_groups + ] + + +def get_position_from_periods(iteration, cumulative_period): + """Get the position from a period list. + It will return the index of the right-closest number in the period list. + For example, the cumulative_period = [100, 200, 300, 400], + if iteration == 50, return 0; + if iteration == 210, return 2; + if iteration == 300, return 2. + Args: + iteration (int): Current iteration. + cumulative_period (list[int]): Cumulative period list. + Returns: + int: The position of the right-closest number in the period list. + """ + for i, period in enumerate(cumulative_period): + if iteration <= period: + return i + + +class CosineAnnealingRestartLR(_LRScheduler): + """ Cosine annealing with restarts learning rate scheme. + An example of config: + periods = [10, 10, 10, 10] + restart_weights = [1, 0.5, 0.5, 0.5] + eta_min=1e-7 + It has four cycles, each has 10 iterations. At 10th, 20th, 30th, the + scheduler will restart with the weights in restart_weights. + Args: + optimizer (torch.nn.optimizer): Torch optimizer. + periods (list): Period for each cosine anneling cycle. + restart_weights (list): Restart weights at each restart iteration. + Default: [1]. + eta_min (float): The mimimum lr. Default: 0. + last_epoch (int): Used in _LRScheduler. Default: -1. + """ + def __init__(self, + optimizer, + periods, + restart_weights=(1, ), + eta_min=1e-7, + last_epoch=-1): + self.periods = periods + self.restart_weights = restart_weights + self.eta_min = eta_min + assert (len(self.periods) == len(self.restart_weights) + ), 'periods and restart_weights should have the same length.' + self.cumulative_period = [ + sum(self.periods[0:i + 1]) for i in range(0, len(self.periods)) + ] + super(CosineAnnealingRestartLR, self).__init__(optimizer, last_epoch) + + def get_lr(self): + idx = get_position_from_periods(self.last_epoch, + self.cumulative_period) + current_weight = self.restart_weights[idx] + nearest_restart = 0 if idx == 0 else self.cumulative_period[idx - 1] + current_period = self.periods[idx] + + return [ + self.eta_min + current_weight * 0.5 * (base_lr - self.eta_min) * + (1 + math.cos(math.pi * ( + (self.last_epoch - nearest_restart) / current_period))) + for base_lr in self.base_lrs + ] diff --git a/backend/inpaint/video/core/metrics.py b/backend/inpaint/video/core/metrics.py new file mode 100644 index 0000000..d0dfb73 --- /dev/null +++ b/backend/inpaint/video/core/metrics.py @@ -0,0 +1,569 @@ +import numpy as np +from skimage import measure +from scipy import linalg + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from core.utils import to_tensors + + +def calculate_epe(flow1, flow2): + """Calculate End point errors.""" + + epe = torch.sum((flow1 - flow2)**2, dim=1).sqrt() + epe = epe.view(-1) + return epe.mean().item() + + +def calculate_psnr(img1, img2): + """Calculate PSNR (Peak Signal-to-Noise Ratio). + Ref: https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio + Args: + img1 (ndarray): Images with range [0, 255]. + img2 (ndarray): Images with range [0, 255]. 
+ Returns: + float: psnr result. + """ + + assert img1.shape == img2.shape, \ + (f'Image shapes are differnet: {img1.shape}, {img2.shape}.') + + mse = np.mean((img1 - img2)**2) + if mse == 0: + return float('inf') + return 20. * np.log10(255. / np.sqrt(mse)) + + +def calc_psnr_and_ssim(img1, img2): + """Calculate PSNR and SSIM for images. + img1: ndarray, range [0, 255] + img2: ndarray, range [0, 255] + """ + img1 = img1.astype(np.float64) + img2 = img2.astype(np.float64) + + psnr = calculate_psnr(img1, img2) + ssim = measure.compare_ssim(img1, + img2, + data_range=255, + multichannel=True, + win_size=65) + + return psnr, ssim + + +########################### +# I3D models +########################### + + +def init_i3d_model(i3d_model_path): + print(f"[Loading I3D model from {i3d_model_path} for FID score ..]") + i3d_model = InceptionI3d(400, in_channels=3, final_endpoint='Logits') + i3d_model.load_state_dict(torch.load(i3d_model_path)) + i3d_model.to(torch.device('cuda:0')) + return i3d_model + + +def calculate_i3d_activations(video1, video2, i3d_model, device): + """Calculate VFID metric. + video1: list[PIL.Image] + video2: list[PIL.Image] + """ + video1 = to_tensors()(video1).unsqueeze(0).to(device) + video2 = to_tensors()(video2).unsqueeze(0).to(device) + video1_activations = get_i3d_activations( + video1, i3d_model).cpu().numpy().flatten() + video2_activations = get_i3d_activations( + video2, i3d_model).cpu().numpy().flatten() + + return video1_activations, video2_activations + + +def calculate_vfid(real_activations, fake_activations): + """ + Given two distribution of features, compute the FID score between them + Params: + real_activations: list[ndarray] + fake_activations: list[ndarray] + """ + m1 = np.mean(real_activations, axis=0) + m2 = np.mean(fake_activations, axis=0) + s1 = np.cov(real_activations, rowvar=False) + s2 = np.cov(fake_activations, rowvar=False) + return calculate_frechet_distance(m1, s1, m2, s2) + + +def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6): + """Numpy implementation of the Frechet Distance. + The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1) + and X_2 ~ N(mu_2, C_2) is + d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)). + Stable version by Dougal J. Sutherland. + Params: + -- mu1 : Numpy array containing the activations of a layer of the + inception net (like returned by the function 'get_predictions') + for generated samples. + -- mu2 : The sample mean over activations, precalculated on an + representive data set. + -- sigma1: The covariance matrix over activations for generated samples. + -- sigma2: The covariance matrix over activations, precalculated on an + representive data set. + Returns: + -- : The Frechet Distance. 
+ """ + + mu1 = np.atleast_1d(mu1) + mu2 = np.atleast_1d(mu2) + + sigma1 = np.atleast_2d(sigma1) + sigma2 = np.atleast_2d(sigma2) + + assert mu1.shape == mu2.shape, \ + 'Training and test mean vectors have different lengths' + assert sigma1.shape == sigma2.shape, \ + 'Training and test covariances have different dimensions' + + diff = mu1 - mu2 + + # Product might be almost singular + covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False) + if not np.isfinite(covmean).all(): + msg = ('fid calculation produces singular product; ' + 'adding %s to diagonal of cov estimates') % eps + print(msg) + offset = np.eye(sigma1.shape[0]) * eps + covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset)) + + # Numerical error might give slight imaginary component + if np.iscomplexobj(covmean): + if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3): + m = np.max(np.abs(covmean.imag)) + raise ValueError('Imaginary component {}'.format(m)) + covmean = covmean.real + + tr_covmean = np.trace(covmean) + + return (diff.dot(diff) + np.trace(sigma1) + # NOQA + np.trace(sigma2) - 2 * tr_covmean) + + +def get_i3d_activations(batched_video, + i3d_model, + target_endpoint='Logits', + flatten=True, + grad_enabled=False): + """ + Get features from i3d model and flatten them to 1d feature, + valid target endpoints are defined in InceptionI3d.VALID_ENDPOINTS + VALID_ENDPOINTS = ( + 'Conv3d_1a_7x7', + 'MaxPool3d_2a_3x3', + 'Conv3d_2b_1x1', + 'Conv3d_2c_3x3', + 'MaxPool3d_3a_3x3', + 'Mixed_3b', + 'Mixed_3c', + 'MaxPool3d_4a_3x3', + 'Mixed_4b', + 'Mixed_4c', + 'Mixed_4d', + 'Mixed_4e', + 'Mixed_4f', + 'MaxPool3d_5a_2x2', + 'Mixed_5b', + 'Mixed_5c', + 'Logits', + 'Predictions', + ) + """ + with torch.set_grad_enabled(grad_enabled): + feat = i3d_model.extract_features(batched_video.transpose(1, 2), + target_endpoint) + if flatten: + feat = feat.view(feat.size(0), -1) + + return feat + + +# This code is from https://github.com/piergiaj/pytorch-i3d/blob/master/pytorch_i3d.py +# I only fix flake8 errors and do some cleaning here + + +class MaxPool3dSamePadding(nn.MaxPool3d): + def compute_pad(self, dim, s): + if s % self.stride[dim] == 0: + return max(self.kernel_size[dim] - self.stride[dim], 0) + else: + return max(self.kernel_size[dim] - (s % self.stride[dim]), 0) + + def forward(self, x): + # compute 'same' padding + (batch, channel, t, h, w) = x.size() + pad_t = self.compute_pad(0, t) + pad_h = self.compute_pad(1, h) + pad_w = self.compute_pad(2, w) + + pad_t_f = pad_t // 2 + pad_t_b = pad_t - pad_t_f + pad_h_f = pad_h // 2 + pad_h_b = pad_h - pad_h_f + pad_w_f = pad_w // 2 + pad_w_b = pad_w - pad_w_f + + pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b) + x = F.pad(x, pad) + return super(MaxPool3dSamePadding, self).forward(x) + + +class Unit3D(nn.Module): + def __init__(self, + in_channels, + output_channels, + kernel_shape=(1, 1, 1), + stride=(1, 1, 1), + padding=0, + activation_fn=F.relu, + use_batch_norm=True, + use_bias=False, + name='unit_3d'): + """Initializes Unit3D module.""" + super(Unit3D, self).__init__() + + self._output_channels = output_channels + self._kernel_shape = kernel_shape + self._stride = stride + self._use_batch_norm = use_batch_norm + self._activation_fn = activation_fn + self._use_bias = use_bias + self.name = name + self.padding = padding + + self.conv3d = nn.Conv3d( + in_channels=in_channels, + out_channels=self._output_channels, + kernel_size=self._kernel_shape, + stride=self._stride, + padding=0, # we always want padding to be 0 here. 
We will + # dynamically pad based on input size in forward function + bias=self._use_bias) + + if self._use_batch_norm: + self.bn = nn.BatchNorm3d(self._output_channels, + eps=0.001, + momentum=0.01) + + def compute_pad(self, dim, s): + if s % self._stride[dim] == 0: + return max(self._kernel_shape[dim] - self._stride[dim], 0) + else: + return max(self._kernel_shape[dim] - (s % self._stride[dim]), 0) + + def forward(self, x): + # compute 'same' padding + (batch, channel, t, h, w) = x.size() + pad_t = self.compute_pad(0, t) + pad_h = self.compute_pad(1, h) + pad_w = self.compute_pad(2, w) + + pad_t_f = pad_t // 2 + pad_t_b = pad_t - pad_t_f + pad_h_f = pad_h // 2 + pad_h_b = pad_h - pad_h_f + pad_w_f = pad_w // 2 + pad_w_b = pad_w - pad_w_f + + pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b) + x = F.pad(x, pad) + + x = self.conv3d(x) + if self._use_batch_norm: + x = self.bn(x) + if self._activation_fn is not None: + x = self._activation_fn(x) + return x + + +class InceptionModule(nn.Module): + def __init__(self, in_channels, out_channels, name): + super(InceptionModule, self).__init__() + + self.b0 = Unit3D(in_channels=in_channels, + output_channels=out_channels[0], + kernel_shape=[1, 1, 1], + padding=0, + name=name + '/Branch_0/Conv3d_0a_1x1') + self.b1a = Unit3D(in_channels=in_channels, + output_channels=out_channels[1], + kernel_shape=[1, 1, 1], + padding=0, + name=name + '/Branch_1/Conv3d_0a_1x1') + self.b1b = Unit3D(in_channels=out_channels[1], + output_channels=out_channels[2], + kernel_shape=[3, 3, 3], + name=name + '/Branch_1/Conv3d_0b_3x3') + self.b2a = Unit3D(in_channels=in_channels, + output_channels=out_channels[3], + kernel_shape=[1, 1, 1], + padding=0, + name=name + '/Branch_2/Conv3d_0a_1x1') + self.b2b = Unit3D(in_channels=out_channels[3], + output_channels=out_channels[4], + kernel_shape=[3, 3, 3], + name=name + '/Branch_2/Conv3d_0b_3x3') + self.b3a = MaxPool3dSamePadding(kernel_size=[3, 3, 3], + stride=(1, 1, 1), + padding=0) + self.b3b = Unit3D(in_channels=in_channels, + output_channels=out_channels[5], + kernel_shape=[1, 1, 1], + padding=0, + name=name + '/Branch_3/Conv3d_0b_1x1') + self.name = name + + def forward(self, x): + b0 = self.b0(x) + b1 = self.b1b(self.b1a(x)) + b2 = self.b2b(self.b2a(x)) + b3 = self.b3b(self.b3a(x)) + return torch.cat([b0, b1, b2, b3], dim=1) + + +class InceptionI3d(nn.Module): + """Inception-v1 I3D architecture. + The model is introduced in: + Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset + Joao Carreira, Andrew Zisserman + https://arxiv.org/pdf/1705.07750v1.pdf. + See also the Inception architecture, introduced in: + Going deeper with convolutions + Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, + Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich. + http://arxiv.org/pdf/1409.4842v1.pdf. + """ + + # Endpoints of the model in order. During construction, all the endpoints up + # to a designated `final_endpoint` are returned in a dictionary as the + # second return value. 
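# Note on the dynamic 'same' padding used by Unit3D and MaxPool3dSamePadding
# above: the padding is computed per input size so that the output length
# equals ceil(input / stride), matching TensorFlow's SAME convention. A
# standalone sketch of that computation (independent of the classes above):
import math

def same_pad(size, kernel, stride):
    # Same rule as compute_pad(): total padding so output = ceil(size / stride).
    if size % stride == 0:
        pad = max(kernel - stride, 0)
    else:
        pad = max(kernel - (size % stride), 0)
    return pad // 2, pad - pad // 2   # (front, back); back gets the extra pixel

# e.g. the 7x7x7 stride-2 stem conv on a 16-frame clip:
front, back = same_pad(16, kernel=7, stride=2)
out_len = (16 + front + back - 7) // 2 + 1
assert out_len == math.ceil(16 / 2)   # -> 8 frames, i.e. 'SAME' output size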
+ VALID_ENDPOINTS = ( + 'Conv3d_1a_7x7', + 'MaxPool3d_2a_3x3', + 'Conv3d_2b_1x1', + 'Conv3d_2c_3x3', + 'MaxPool3d_3a_3x3', + 'Mixed_3b', + 'Mixed_3c', + 'MaxPool3d_4a_3x3', + 'Mixed_4b', + 'Mixed_4c', + 'Mixed_4d', + 'Mixed_4e', + 'Mixed_4f', + 'MaxPool3d_5a_2x2', + 'Mixed_5b', + 'Mixed_5c', + 'Logits', + 'Predictions', + ) + + def __init__(self, + num_classes=400, + spatial_squeeze=True, + final_endpoint='Logits', + name='inception_i3d', + in_channels=3, + dropout_keep_prob=0.5): + """Initializes I3D model instance. + Args: + num_classes: The number of outputs in the logit layer (default 400, which + matches the Kinetics dataset). + spatial_squeeze: Whether to squeeze the spatial dimensions for the logits + before returning (default True). + final_endpoint: The model contains many possible endpoints. + `final_endpoint` specifies the last endpoint for the model to be built + up to. In addition to the output at `final_endpoint`, all the outputs + at endpoints up to `final_endpoint` will also be returned, in a + dictionary. `final_endpoint` must be one of + InceptionI3d.VALID_ENDPOINTS (default 'Logits'). + name: A string (optional). The name of this module. + Raises: + ValueError: if `final_endpoint` is not recognized. + """ + + if final_endpoint not in self.VALID_ENDPOINTS: + raise ValueError('Unknown final endpoint %s' % final_endpoint) + + super(InceptionI3d, self).__init__() + self._num_classes = num_classes + self._spatial_squeeze = spatial_squeeze + self._final_endpoint = final_endpoint + self.logits = None + + if self._final_endpoint not in self.VALID_ENDPOINTS: + raise ValueError('Unknown final endpoint %s' % + self._final_endpoint) + + self.end_points = {} + end_point = 'Conv3d_1a_7x7' + self.end_points[end_point] = Unit3D(in_channels=in_channels, + output_channels=64, + kernel_shape=[7, 7, 7], + stride=(2, 2, 2), + padding=(3, 3, 3), + name=name + end_point) + if self._final_endpoint == end_point: + return + + end_point = 'MaxPool3d_2a_3x3' + self.end_points[end_point] = MaxPool3dSamePadding( + kernel_size=[1, 3, 3], stride=(1, 2, 2), padding=0) + if self._final_endpoint == end_point: + return + + end_point = 'Conv3d_2b_1x1' + self.end_points[end_point] = Unit3D(in_channels=64, + output_channels=64, + kernel_shape=[1, 1, 1], + padding=0, + name=name + end_point) + if self._final_endpoint == end_point: + return + + end_point = 'Conv3d_2c_3x3' + self.end_points[end_point] = Unit3D(in_channels=64, + output_channels=192, + kernel_shape=[3, 3, 3], + padding=1, + name=name + end_point) + if self._final_endpoint == end_point: + return + + end_point = 'MaxPool3d_3a_3x3' + self.end_points[end_point] = MaxPool3dSamePadding( + kernel_size=[1, 3, 3], stride=(1, 2, 2), padding=0) + if self._final_endpoint == end_point: + return + + end_point = 'Mixed_3b' + self.end_points[end_point] = InceptionModule(192, + [64, 96, 128, 16, 32, 32], + name + end_point) + if self._final_endpoint == end_point: + return + + end_point = 'Mixed_3c' + self.end_points[end_point] = InceptionModule( + 256, [128, 128, 192, 32, 96, 64], name + end_point) + if self._final_endpoint == end_point: + return + + end_point = 'MaxPool3d_4a_3x3' + self.end_points[end_point] = MaxPool3dSamePadding( + kernel_size=[3, 3, 3], stride=(2, 2, 2), padding=0) + if self._final_endpoint == end_point: + return + + end_point = 'Mixed_4b' + self.end_points[end_point] = InceptionModule( + 128 + 192 + 96 + 64, [192, 96, 208, 16, 48, 64], name + end_point) + if self._final_endpoint == end_point: + return + + end_point = 'Mixed_4c' + 
self.end_points[end_point] = InceptionModule( + 192 + 208 + 48 + 64, [160, 112, 224, 24, 64, 64], name + end_point) + if self._final_endpoint == end_point: + return + + end_point = 'Mixed_4d' + self.end_points[end_point] = InceptionModule( + 160 + 224 + 64 + 64, [128, 128, 256, 24, 64, 64], name + end_point) + if self._final_endpoint == end_point: + return + + end_point = 'Mixed_4e' + self.end_points[end_point] = InceptionModule( + 128 + 256 + 64 + 64, [112, 144, 288, 32, 64, 64], name + end_point) + if self._final_endpoint == end_point: + return + + end_point = 'Mixed_4f' + self.end_points[end_point] = InceptionModule( + 112 + 288 + 64 + 64, [256, 160, 320, 32, 128, 128], + name + end_point) + if self._final_endpoint == end_point: + return + + end_point = 'MaxPool3d_5a_2x2' + self.end_points[end_point] = MaxPool3dSamePadding( + kernel_size=[2, 2, 2], stride=(2, 2, 2), padding=0) + if self._final_endpoint == end_point: + return + + end_point = 'Mixed_5b' + self.end_points[end_point] = InceptionModule( + 256 + 320 + 128 + 128, [256, 160, 320, 32, 128, 128], + name + end_point) + if self._final_endpoint == end_point: + return + + end_point = 'Mixed_5c' + self.end_points[end_point] = InceptionModule( + 256 + 320 + 128 + 128, [384, 192, 384, 48, 128, 128], + name + end_point) + if self._final_endpoint == end_point: + return + + end_point = 'Logits' + self.avg_pool = nn.AvgPool3d(kernel_size=[2, 7, 7], stride=(1, 1, 1)) + self.dropout = nn.Dropout(dropout_keep_prob) + self.logits = Unit3D(in_channels=384 + 384 + 128 + 128, + output_channels=self._num_classes, + kernel_shape=[1, 1, 1], + padding=0, + activation_fn=None, + use_batch_norm=False, + use_bias=True, + name='logits') + + self.build() + + def replace_logits(self, num_classes): + self._num_classes = num_classes + self.logits = Unit3D(in_channels=384 + 384 + 128 + 128, + output_channels=self._num_classes, + kernel_shape=[1, 1, 1], + padding=0, + activation_fn=None, + use_batch_norm=False, + use_bias=True, + name='logits') + + def build(self): + for k in self.end_points.keys(): + self.add_module(k, self.end_points[k]) + + def forward(self, x): + for end_point in self.VALID_ENDPOINTS: + if end_point in self.end_points: + x = self._modules[end_point]( + x) # use _modules to work with dataparallel + + x = self.logits(self.dropout(self.avg_pool(x))) + if self._spatial_squeeze: + logits = x.squeeze(3).squeeze(3) + # logits is batch X time X classes, which is what we want to work with + return logits + + def extract_features(self, x, target_endpoint='Logits'): + for end_point in self.VALID_ENDPOINTS: + if end_point in self.end_points: + x = self._modules[end_point](x) + if end_point == target_endpoint: + break + if target_endpoint == 'Logits': + return x.mean(4).mean(3).mean(2) + else: + return x diff --git a/backend/inpaint/video/core/prefetch_dataloader.py b/backend/inpaint/video/core/prefetch_dataloader.py new file mode 100644 index 0000000..5088425 --- /dev/null +++ b/backend/inpaint/video/core/prefetch_dataloader.py @@ -0,0 +1,125 @@ +import queue as Queue +import threading +import torch +from torch.utils.data import DataLoader + + +class PrefetchGenerator(threading.Thread): + """A general prefetch generator. + + Ref: + https://stackoverflow.com/questions/7323664/python-generator-pre-fetch + + Args: + generator: Python generator. + num_prefetch_queue (int): Number of prefetch queue. 
+ """ + + def __init__(self, generator, num_prefetch_queue): + threading.Thread.__init__(self) + self.queue = Queue.Queue(num_prefetch_queue) + self.generator = generator + self.daemon = True + self.start() + + def run(self): + for item in self.generator: + self.queue.put(item) + self.queue.put(None) + + def __next__(self): + next_item = self.queue.get() + if next_item is None: + raise StopIteration + return next_item + + def __iter__(self): + return self + + +class PrefetchDataLoader(DataLoader): + """Prefetch version of dataloader. + + Ref: + https://github.com/IgorSusmelj/pytorch-styleguide/issues/5# + + TODO: + Need to test on single gpu and ddp (multi-gpu). There is a known issue in + ddp. + + Args: + num_prefetch_queue (int): Number of prefetch queue. + kwargs (dict): Other arguments for dataloader. + """ + + def __init__(self, num_prefetch_queue, **kwargs): + self.num_prefetch_queue = num_prefetch_queue + super(PrefetchDataLoader, self).__init__(**kwargs) + + def __iter__(self): + return PrefetchGenerator(super().__iter__(), self.num_prefetch_queue) + + +class CPUPrefetcher(): + """CPU prefetcher. + + Args: + loader: Dataloader. + """ + + def __init__(self, loader): + self.ori_loader = loader + self.loader = iter(loader) + + def next(self): + try: + return next(self.loader) + except StopIteration: + return None + + def reset(self): + self.loader = iter(self.ori_loader) + + +class CUDAPrefetcher(): + """CUDA prefetcher. + + Ref: + https://github.com/NVIDIA/apex/issues/304# + + It may consums more GPU memory. + + Args: + loader: Dataloader. + opt (dict): Options. + """ + + def __init__(self, loader, opt): + self.ori_loader = loader + self.loader = iter(loader) + self.opt = opt + self.stream = torch.cuda.Stream() + self.device = torch.device('cuda' if opt['num_gpu'] != 0 else 'cpu') + self.preload() + + def preload(self): + try: + self.batch = next(self.loader) # self.batch is a dict + except StopIteration: + self.batch = None + return None + # put tensors to gpu + with torch.cuda.stream(self.stream): + for k, v in self.batch.items(): + if torch.is_tensor(v): + self.batch[k] = self.batch[k].to(device=self.device, non_blocking=True) + + def next(self): + torch.cuda.current_stream().wait_stream(self.stream) + batch = self.batch + self.preload() + return batch + + def reset(self): + self.loader = iter(self.ori_loader) + self.preload() diff --git a/backend/inpaint/video/core/trainer.py b/backend/inpaint/video/core/trainer.py new file mode 100644 index 0000000..e90ec8c --- /dev/null +++ b/backend/inpaint/video/core/trainer.py @@ -0,0 +1,509 @@ +import os +import glob +import logging +import importlib +from tqdm import tqdm + +import torch +import torch.nn as nn +import torch.nn.functional as F +from core.prefetch_dataloader import PrefetchDataLoader, CPUPrefetcher +from torch.utils.data.distributed import DistributedSampler +from torch.nn.parallel import DistributedDataParallel as DDP +import torchvision +from torch.utils.tensorboard import SummaryWriter + +from core.lr_scheduler import MultiStepRestartLR, CosineAnnealingRestartLR +from core.loss import AdversarialLoss, PerceptualLoss, LPIPSLoss +from core.dataset import TrainDataset + +from model.modules.flow_comp_raft import RAFT_bi, FlowLoss, EdgeLoss +from model.recurrent_flow_completion import RecurrentFlowCompleteNet + +from RAFT.utils.flow_viz_pt import flow_to_image + + +class Trainer: + def __init__(self, config): + self.config = config + self.epoch = 0 + self.iteration = 0 + self.num_local_frames = 
config['train_data_loader']['num_local_frames'] + self.num_ref_frames = config['train_data_loader']['num_ref_frames'] + + # setup data set and data loader + self.train_dataset = TrainDataset(config['train_data_loader']) + + self.train_sampler = None + self.train_args = config['trainer'] + if config['distributed']: + self.train_sampler = DistributedSampler( + self.train_dataset, + num_replicas=config['world_size'], + rank=config['global_rank']) + + dataloader_args = dict( + dataset=self.train_dataset, + batch_size=self.train_args['batch_size'] // config['world_size'], + shuffle=(self.train_sampler is None), + num_workers=self.train_args['num_workers'], + sampler=self.train_sampler, + drop_last=True) + + self.train_loader = PrefetchDataLoader(self.train_args['num_prefetch_queue'], **dataloader_args) + self.prefetcher = CPUPrefetcher(self.train_loader) + + # set loss functions + self.adversarial_loss = AdversarialLoss(type=self.config['losses']['GAN_LOSS']) + self.adversarial_loss = self.adversarial_loss.to(self.config['device']) + self.l1_loss = nn.L1Loss() + # self.perc_loss = PerceptualLoss( + # layer_weights={'conv3_4': 0.25, 'conv4_4': 0.25, 'conv5_4': 0.5}, + # use_input_norm=True, + # range_norm=True, + # criterion='l1' + # ).to(self.config['device']) + + if self.config['losses']['perceptual_weight'] > 0: + self.perc_loss = LPIPSLoss(use_input_norm=True, range_norm=True).to(self.config['device']) + + # self.flow_comp_loss = FlowCompletionLoss().to(self.config['device']) + # self.flow_comp_loss = FlowCompletionLoss(self.config['device']) + + # set raft + self.fix_raft = RAFT_bi(device = self.config['device']) + self.fix_flow_complete = RecurrentFlowCompleteNet('/mnt/lustre/sczhou/VQGANs/CodeMOVI/experiments_model/recurrent_flow_completion_v5_train_flowcomp_v5/gen_760000.pth') + for p in self.fix_flow_complete.parameters(): + p.requires_grad = False + self.fix_flow_complete.to(self.config['device']) + self.fix_flow_complete.eval() + + # self.flow_loss = FlowLoss() + + # setup models including generator and discriminator + net = importlib.import_module('model.' 
+ config['model']['net']) + self.netG = net.InpaintGenerator() + # print(self.netG) + self.netG = self.netG.to(self.config['device']) + if not self.config['model'].get('no_dis', False): + if self.config['model'].get('dis_2d', False): + self.netD = net.Discriminator_2D( + in_channels=3, + use_sigmoid=config['losses']['GAN_LOSS'] != 'hinge') + else: + self.netD = net.Discriminator( + in_channels=3, + use_sigmoid=config['losses']['GAN_LOSS'] != 'hinge') + self.netD = self.netD.to(self.config['device']) + + self.interp_mode = self.config['model']['interp_mode'] + # setup optimizers and schedulers + self.setup_optimizers() + self.setup_schedulers() + self.load() + + if config['distributed']: + self.netG = DDP(self.netG, + device_ids=[self.config['local_rank']], + output_device=self.config['local_rank'], + broadcast_buffers=True, + find_unused_parameters=True) + if not self.config['model']['no_dis']: + self.netD = DDP(self.netD, + device_ids=[self.config['local_rank']], + output_device=self.config['local_rank'], + broadcast_buffers=True, + find_unused_parameters=False) + + # set summary writer + self.dis_writer = None + self.gen_writer = None + self.summary = {} + if self.config['global_rank'] == 0 or (not config['distributed']): + if not self.config['model']['no_dis']: + self.dis_writer = SummaryWriter( + os.path.join(config['save_dir'], 'dis')) + self.gen_writer = SummaryWriter( + os.path.join(config['save_dir'], 'gen')) + + def setup_optimizers(self): + """Set up optimizers.""" + backbone_params = [] + for name, param in self.netG.named_parameters(): + if param.requires_grad: + backbone_params.append(param) + else: + print(f'Params {name} will not be optimized.') + + optim_params = [ + { + 'params': backbone_params, + 'lr': self.config['trainer']['lr'] + }, + ] + + self.optimG = torch.optim.Adam(optim_params, + betas=(self.config['trainer']['beta1'], + self.config['trainer']['beta2'])) + + if not self.config['model']['no_dis']: + self.optimD = torch.optim.Adam( + self.netD.parameters(), + lr=self.config['trainer']['lr'], + betas=(self.config['trainer']['beta1'], + self.config['trainer']['beta2'])) + + def setup_schedulers(self): + """Set up schedulers.""" + scheduler_opt = self.config['trainer']['scheduler'] + scheduler_type = scheduler_opt.pop('type') + + if scheduler_type in ['MultiStepLR', 'MultiStepRestartLR']: + self.scheG = MultiStepRestartLR( + self.optimG, + milestones=scheduler_opt['milestones'], + gamma=scheduler_opt['gamma']) + if not self.config['model']['no_dis']: + self.scheD = MultiStepRestartLR( + self.optimD, + milestones=scheduler_opt['milestones'], + gamma=scheduler_opt['gamma']) + elif scheduler_type == 'CosineAnnealingRestartLR': + self.scheG = CosineAnnealingRestartLR( + self.optimG, + periods=scheduler_opt['periods'], + restart_weights=scheduler_opt['restart_weights'], + eta_min=scheduler_opt['eta_min']) + if not self.config['model']['no_dis']: + self.scheD = CosineAnnealingRestartLR( + self.optimD, + periods=scheduler_opt['periods'], + restart_weights=scheduler_opt['restart_weights'], + eta_min=scheduler_opt['eta_min']) + else: + raise NotImplementedError( + f'Scheduler {scheduler_type} is not implemented yet.') + + def update_learning_rate(self): + """Update learning rate.""" + self.scheG.step() + if not self.config['model']['no_dis']: + self.scheD.step() + + def get_lr(self): + """Get current learning rate.""" + return self.optimG.param_groups[0]['lr'] + + def add_summary(self, writer, name, val): + """Add tensorboard summary.""" + if name not in self.summary: + 
self.summary[name] = 0 + self.summary[name] += val + n = self.train_args['log_freq'] + if writer is not None and self.iteration % n == 0: + writer.add_scalar(name, self.summary[name] / n, self.iteration) + self.summary[name] = 0 + + def load(self): + """Load netG (and netD).""" + # get the latest checkpoint + model_path = self.config['save_dir'] + # TODO: add resume name + if os.path.isfile(os.path.join(model_path, 'latest.ckpt')): + latest_epoch = open(os.path.join(model_path, 'latest.ckpt'), + 'r').read().splitlines()[-1] + else: + ckpts = [ + os.path.basename(i).split('.pth')[0] + for i in glob.glob(os.path.join(model_path, '*.pth')) + ] + ckpts.sort() + latest_epoch = ckpts[-1][4:] if len(ckpts) > 0 else None + + if latest_epoch is not None: + gen_path = os.path.join(model_path, + f'gen_{int(latest_epoch):06d}.pth') + dis_path = os.path.join(model_path, + f'dis_{int(latest_epoch):06d}.pth') + opt_path = os.path.join(model_path, + f'opt_{int(latest_epoch):06d}.pth') + + if self.config['global_rank'] == 0: + print(f'Loading model from {gen_path}...') + dataG = torch.load(gen_path, map_location=self.config['device']) + self.netG.load_state_dict(dataG) + if not self.config['model']['no_dis'] and self.config['model']['load_d']: + dataD = torch.load(dis_path, map_location=self.config['device']) + self.netD.load_state_dict(dataD) + + data_opt = torch.load(opt_path, map_location=self.config['device']) + self.optimG.load_state_dict(data_opt['optimG']) + # self.scheG.load_state_dict(data_opt['scheG']) + if not self.config['model']['no_dis'] and self.config['model']['load_d']: + self.optimD.load_state_dict(data_opt['optimD']) + # self.scheD.load_state_dict(data_opt['scheD']) + self.epoch = data_opt['epoch'] + self.iteration = data_opt['iteration'] + else: + gen_path = self.config['trainer'].get('gen_path', None) + dis_path = self.config['trainer'].get('dis_path', None) + opt_path = self.config['trainer'].get('opt_path', None) + if gen_path is not None: + if self.config['global_rank'] == 0: + print(f'Loading Gen-Net from {gen_path}...') + dataG = torch.load(gen_path, map_location=self.config['device']) + self.netG.load_state_dict(dataG) + + if dis_path is not None and not self.config['model']['no_dis'] and self.config['model']['load_d']: + if self.config['global_rank'] == 0: + print(f'Loading Dis-Net from {dis_path}...') + dataD = torch.load(dis_path, map_location=self.config['device']) + self.netD.load_state_dict(dataD) + if opt_path is not None: + data_opt = torch.load(opt_path, map_location=self.config['device']) + self.optimG.load_state_dict(data_opt['optimG']) + self.scheG.load_state_dict(data_opt['scheG']) + if not self.config['model']['no_dis'] and self.config['model']['load_d']: + self.optimD.load_state_dict(data_opt['optimD']) + self.scheD.load_state_dict(data_opt['scheD']) + else: + if self.config['global_rank'] == 0: + print('Warnning: There is no trained model found.' 
+ 'An initialized model will be used.') + + def save(self, it): + """Save parameters every eval_epoch""" + if self.config['global_rank'] == 0: + # configure path + gen_path = os.path.join(self.config['save_dir'], + f'gen_{it:06d}.pth') + dis_path = os.path.join(self.config['save_dir'], + f'dis_{it:06d}.pth') + opt_path = os.path.join(self.config['save_dir'], + f'opt_{it:06d}.pth') + print(f'\nsaving model to {gen_path} ...') + + # remove .module for saving + if isinstance(self.netG, torch.nn.DataParallel) or isinstance(self.netG, DDP): + netG = self.netG.module + if not self.config['model']['no_dis']: + netD = self.netD.module + else: + netG = self.netG + if not self.config['model']['no_dis']: + netD = self.netD + + # save checkpoints + torch.save(netG.state_dict(), gen_path) + if not self.config['model']['no_dis']: + torch.save(netD.state_dict(), dis_path) + torch.save( + { + 'epoch': self.epoch, + 'iteration': self.iteration, + 'optimG': self.optimG.state_dict(), + 'optimD': self.optimD.state_dict(), + 'scheG': self.scheG.state_dict(), + 'scheD': self.scheD.state_dict() + }, opt_path) + else: + torch.save( + { + 'epoch': self.epoch, + 'iteration': self.iteration, + 'optimG': self.optimG.state_dict(), + 'scheG': self.scheG.state_dict() + }, opt_path) + + latest_path = os.path.join(self.config['save_dir'], 'latest.ckpt') + os.system(f"echo {it:06d} > {latest_path}") + + def train(self): + """training entry""" + pbar = range(int(self.train_args['iterations'])) + if self.config['global_rank'] == 0: + pbar = tqdm(pbar, + initial=self.iteration, + dynamic_ncols=True, + smoothing=0.01) + + os.makedirs('logs', exist_ok=True) + + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(filename)s[line:%(lineno)d]" + "%(levelname)s %(message)s", + datefmt="%a, %d %b %Y %H:%M:%S", + filename=f"logs/{self.config['save_dir'].split('/')[-1]}.log", + filemode='w') + + while True: + self.epoch += 1 + self.prefetcher.reset() + if self.config['distributed']: + self.train_sampler.set_epoch(self.epoch) + self._train_epoch(pbar) + if self.iteration > self.train_args['iterations']: + break + print('\nEnd training....') + + def _train_epoch(self, pbar): + """Process input and calculate loss every training epoch""" + device = self.config['device'] + train_data = self.prefetcher.next() + while train_data is not None: + self.iteration += 1 + frames, masks, flows_f, flows_b, _ = train_data + frames, masks = frames.to(device), masks.to(device).float() + l_t = self.num_local_frames + b, t, c, h, w = frames.size() + gt_local_frames = frames[:, :l_t, ...] + local_masks = masks[:, :l_t, ...].contiguous() + + masked_frames = frames * (1 - masks) + masked_local_frames = masked_frames[:, :l_t, ...] + # get gt optical flow + if flows_f[0] == 'None' or flows_b[0] == 'None': + gt_flows_bi = self.fix_raft(gt_local_frames) + else: + gt_flows_bi = (flows_f.to(device), flows_b.to(device)) + + # ---- complete flow ---- + pred_flows_bi, _ = self.fix_flow_complete.forward_bidirect_flow(gt_flows_bi, local_masks) + pred_flows_bi = self.fix_flow_complete.combine_flow(gt_flows_bi, pred_flows_bi, local_masks) + # pred_flows_bi = gt_flows_bi + + # ---- image propagation ---- + prop_imgs, updated_local_masks = self.netG.module.img_propagation(masked_local_frames, pred_flows_bi, local_masks, interpolation=self.interp_mode) + updated_masks = masks.clone() + updated_masks[:, :l_t, ...] 
= updated_local_masks.view(b, l_t, 1, h, w) + updated_frames = masked_frames.clone() + prop_local_frames = gt_local_frames * (1-local_masks) + prop_imgs.view(b, l_t, 3, h, w) * local_masks # merge + updated_frames[:, :l_t, ...] = prop_local_frames + + # ---- feature propagation + Transformer ---- + pred_imgs = self.netG(updated_frames, pred_flows_bi, masks, updated_masks, l_t) + pred_imgs = pred_imgs.view(b, -1, c, h, w) + + # get the local frames + pred_local_frames = pred_imgs[:, :l_t, ...] + comp_local_frames = gt_local_frames * (1. - local_masks) + pred_local_frames * local_masks + comp_imgs = frames * (1. - masks) + pred_imgs * masks + + gen_loss = 0 + dis_loss = 0 + # optimize net_g + if not self.config['model']['no_dis']: + for p in self.netD.parameters(): + p.requires_grad = False + + self.optimG.zero_grad() + + # generator l1 loss + hole_loss = self.l1_loss(pred_imgs * masks, frames * masks) + hole_loss = hole_loss / torch.mean(masks) * self.config['losses']['hole_weight'] + gen_loss += hole_loss + self.add_summary(self.gen_writer, 'loss/hole_loss', hole_loss.item()) + + valid_loss = self.l1_loss(pred_imgs * (1 - masks), frames * (1 - masks)) + valid_loss = valid_loss / torch.mean(1-masks) * self.config['losses']['valid_weight'] + gen_loss += valid_loss + self.add_summary(self.gen_writer, 'loss/valid_loss', valid_loss.item()) + + # perceptual loss + if self.config['losses']['perceptual_weight'] > 0: + perc_loss = self.perc_loss(pred_imgs.view(-1,3,h,w), frames.view(-1,3,h,w))[0] * self.config['losses']['perceptual_weight'] + gen_loss += perc_loss + self.add_summary(self.gen_writer, 'loss/perc_loss', perc_loss.item()) + + # gan loss + if not self.config['model']['no_dis']: + # generator adversarial loss + gen_clip = self.netD(comp_imgs) + gan_loss = self.adversarial_loss(gen_clip, True, False) + gan_loss = gan_loss * self.config['losses']['adversarial_weight'] + gen_loss += gan_loss + self.add_summary(self.gen_writer, 'loss/gan_loss', gan_loss.item()) + gen_loss.backward() + self.optimG.step() + + if not self.config['model']['no_dis']: + # optimize net_d + for p in self.netD.parameters(): + p.requires_grad = True + self.optimD.zero_grad() + + # discriminator adversarial loss + real_clip = self.netD(frames) + fake_clip = self.netD(comp_imgs.detach()) + dis_real_loss = self.adversarial_loss(real_clip, True, True) + dis_fake_loss = self.adversarial_loss(fake_clip, False, True) + dis_loss += (dis_real_loss + dis_fake_loss) / 2 + self.add_summary(self.dis_writer, 'loss/dis_vid_real', dis_real_loss.item()) + self.add_summary(self.dis_writer, 'loss/dis_vid_fake', dis_fake_loss.item()) + dis_loss.backward() + self.optimD.step() + + self.update_learning_rate() + + # write image to tensorboard + if self.iteration % 200 == 0: + # img to cpu + t = 0 + gt_local_frames_cpu = ((gt_local_frames.view(b,-1,3,h,w) + 1)/2.0).cpu() + masked_local_frames = ((masked_local_frames.view(b,-1,3,h,w) + 1)/2.0).cpu() + prop_local_frames_cpu = ((prop_local_frames.view(b,-1,3,h,w) + 1)/2.0).cpu() + pred_local_frames_cpu = ((pred_local_frames.view(b,-1,3,h,w) + 1)/2.0).cpu() + img_results = torch.cat([masked_local_frames[0][t], gt_local_frames_cpu[0][t], + prop_local_frames_cpu[0][t], pred_local_frames_cpu[0][t]], 1) + img_results = torchvision.utils.make_grid(img_results, nrow=1, normalize=True) + if self.gen_writer is not None: + self.gen_writer.add_image(f'img/img:inp-gt-res-{t}', img_results, self.iteration) + + t = 5 + if masked_local_frames.shape[1] > 5: + img_results = torch.cat([masked_local_frames[0][t], 
gt_local_frames_cpu[0][t], + prop_local_frames_cpu[0][t], pred_local_frames_cpu[0][t]], 1) + img_results = torchvision.utils.make_grid(img_results, nrow=1, normalize=True) + if self.gen_writer is not None: + self.gen_writer.add_image(f'img/img:inp-gt-res-{t}', img_results, self.iteration) + + # flow to cpu + gt_flows_forward_cpu = flow_to_image(gt_flows_bi[0][0]).cpu() + masked_flows_forward_cpu = (gt_flows_forward_cpu[0] * (1-local_masks[0][0].cpu())).to(gt_flows_forward_cpu) + pred_flows_forward_cpu = flow_to_image(pred_flows_bi[0][0]).cpu() + + flow_results = torch.cat([gt_flows_forward_cpu[0], masked_flows_forward_cpu, pred_flows_forward_cpu[0]], 1) + if self.gen_writer is not None: + self.gen_writer.add_image('img/flow:gt-pred', flow_results, self.iteration) + + # console logs + if self.config['global_rank'] == 0: + pbar.update(1) + if not self.config['model']['no_dis']: + pbar.set_description((f"d: {dis_loss.item():.3f}; " + f"hole: {hole_loss.item():.3f}; " + f"valid: {valid_loss.item():.3f}")) + else: + pbar.set_description((f"hole: {hole_loss.item():.3f}; " + f"valid: {valid_loss.item():.3f}")) + + if self.iteration % self.train_args['log_freq'] == 0: + if not self.config['model']['no_dis']: + logging.info(f"[Iter {self.iteration}] " + f"d: {dis_loss.item():.4f}; " + f"hole: {hole_loss.item():.4f}; " + f"valid: {valid_loss.item():.4f}") + else: + logging.info(f"[Iter {self.iteration}] " + f"hole: {hole_loss.item():.4f}; " + f"valid: {valid_loss.item():.4f}") + + # saving models + if self.iteration % self.train_args['save_freq'] == 0: + self.save(int(self.iteration)) + + if self.iteration > self.train_args['iterations']: + break + + train_data = self.prefetcher.next() \ No newline at end of file diff --git a/backend/inpaint/video/core/trainer_flow_w_edge.py b/backend/inpaint/video/core/trainer_flow_w_edge.py new file mode 100644 index 0000000..d4eba04 --- /dev/null +++ b/backend/inpaint/video/core/trainer_flow_w_edge.py @@ -0,0 +1,380 @@ +import os +import glob +import logging +import importlib +from tqdm import tqdm + +import torch +import torch.nn as nn +import torch.nn.functional as F +from core.prefetch_dataloader import PrefetchDataLoader, CPUPrefetcher +from torch.utils.data.distributed import DistributedSampler +from torch.nn.parallel import DistributedDataParallel as DDP + +from torch.utils.tensorboard import SummaryWriter + +from core.lr_scheduler import MultiStepRestartLR, CosineAnnealingRestartLR +from core.dataset import TrainDataset + +from model.modules.flow_comp_raft import RAFT_bi, FlowLoss, EdgeLoss + +# from skimage.feature import canny +from model.canny.canny_filter import Canny +from RAFT.utils.flow_viz_pt import flow_to_image + + +class Trainer: + def __init__(self, config): + self.config = config + self.epoch = 0 + self.iteration = 0 + self.num_local_frames = config['train_data_loader']['num_local_frames'] + self.num_ref_frames = config['train_data_loader']['num_ref_frames'] + + # setup data set and data loader + self.train_dataset = TrainDataset(config['train_data_loader']) + + self.train_sampler = None + self.train_args = config['trainer'] + if config['distributed']: + self.train_sampler = DistributedSampler( + self.train_dataset, + num_replicas=config['world_size'], + rank=config['global_rank']) + + dataloader_args = dict( + dataset=self.train_dataset, + batch_size=self.train_args['batch_size'] // config['world_size'], + shuffle=(self.train_sampler is None), + num_workers=self.train_args['num_workers'], + sampler=self.train_sampler, + drop_last=True) + + 
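# The PrefetchDataLoader / CPUPrefetcher pair built next is the same data
# pipeline as in the main trainer above: a background thread fills a small
# queue, and the epoch loop pulls batches until next() returns None. A
# minimal runnable sketch, with a toy TensorDataset standing in for
# TrainDataset (illustration only):
import torch
from torch.utils.data import TensorDataset

toy_set = TensorDataset(torch.arange(8).float().view(8, 1))
toy_loader = PrefetchDataLoader(num_prefetch_queue=2,
                                dataset=toy_set, batch_size=2, shuffle=False)
toy_prefetcher = CPUPrefetcher(toy_loader)

toy_prefetcher.reset()               # start (or restart) an epoch
batch = toy_prefetcher.next()
while batch is not None:             # None marks the end of the epoch
    # ... a training step on `batch` would go here ...
    batch = toy_prefetcher.next()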
self.train_loader = PrefetchDataLoader(self.train_args['num_prefetch_queue'], **dataloader_args) + self.prefetcher = CPUPrefetcher(self.train_loader) + + # set raft + self.fix_raft = RAFT_bi(device = self.config['device']) + self.flow_loss = FlowLoss() + self.edge_loss = EdgeLoss() + self.canny = Canny(sigma=(2,2), low_threshold=0.1, high_threshold=0.2) + + # setup models including generator and discriminator + net = importlib.import_module('model.' + config['model']['net']) + self.netG = net.RecurrentFlowCompleteNet() + # print(self.netG) + self.netG = self.netG.to(self.config['device']) + + # setup optimizers and schedulers + self.setup_optimizers() + self.setup_schedulers() + self.load() + + if config['distributed']: + self.netG = DDP(self.netG, + device_ids=[self.config['local_rank']], + output_device=self.config['local_rank'], + broadcast_buffers=True, + find_unused_parameters=True) + + # set summary writer + self.dis_writer = None + self.gen_writer = None + self.summary = {} + if self.config['global_rank'] == 0 or (not config['distributed']): + self.gen_writer = SummaryWriter( + os.path.join(config['save_dir'], 'gen')) + + def setup_optimizers(self): + """Set up optimizers.""" + backbone_params = [] + for name, param in self.netG.named_parameters(): + if param.requires_grad: + backbone_params.append(param) + else: + print(f'Params {name} will not be optimized.') + + optim_params = [ + { + 'params': backbone_params, + 'lr': self.config['trainer']['lr'] + }, + ] + + self.optimG = torch.optim.Adam(optim_params, + betas=(self.config['trainer']['beta1'], + self.config['trainer']['beta2'])) + + + def setup_schedulers(self): + """Set up schedulers.""" + scheduler_opt = self.config['trainer']['scheduler'] + scheduler_type = scheduler_opt.pop('type') + + if scheduler_type in ['MultiStepLR', 'MultiStepRestartLR']: + self.scheG = MultiStepRestartLR( + self.optimG, + milestones=scheduler_opt['milestones'], + gamma=scheduler_opt['gamma']) + elif scheduler_type == 'CosineAnnealingRestartLR': + self.scheG = CosineAnnealingRestartLR( + self.optimG, + periods=scheduler_opt['periods'], + restart_weights=scheduler_opt['restart_weights']) + else: + raise NotImplementedError( + f'Scheduler {scheduler_type} is not implemented yet.') + + def update_learning_rate(self): + """Update learning rate.""" + self.scheG.step() + + def get_lr(self): + """Get current learning rate.""" + return self.optimG.param_groups[0]['lr'] + + def add_summary(self, writer, name, val): + """Add tensorboard summary.""" + if name not in self.summary: + self.summary[name] = 0 + self.summary[name] += val + n = self.train_args['log_freq'] + if writer is not None and self.iteration % n == 0: + writer.add_scalar(name, self.summary[name] / n, self.iteration) + self.summary[name] = 0 + + def load(self): + """Load netG.""" + # get the latest checkpoint + model_path = self.config['save_dir'] + if os.path.isfile(os.path.join(model_path, 'latest.ckpt')): + latest_epoch = open(os.path.join(model_path, 'latest.ckpt'), + 'r').read().splitlines()[-1] + else: + ckpts = [ + os.path.basename(i).split('.pth')[0] + for i in glob.glob(os.path.join(model_path, '*.pth')) + ] + ckpts.sort() + latest_epoch = ckpts[-1][4:] if len(ckpts) > 0 else None + + if latest_epoch is not None: + gen_path = os.path.join(model_path, f'gen_{int(latest_epoch):06d}.pth') + opt_path = os.path.join(model_path,f'opt_{int(latest_epoch):06d}.pth') + + if self.config['global_rank'] == 0: + print(f'Loading model from {gen_path}...') + dataG = torch.load(gen_path, 
map_location=self.config['device']) + self.netG.load_state_dict(dataG) + + + data_opt = torch.load(opt_path, map_location=self.config['device']) + self.optimG.load_state_dict(data_opt['optimG']) + self.scheG.load_state_dict(data_opt['scheG']) + + self.epoch = data_opt['epoch'] + self.iteration = data_opt['iteration'] + + else: + if self.config['global_rank'] == 0: + print('Warnning: There is no trained model found.' + 'An initialized model will be used.') + + def save(self, it): + """Save parameters every eval_epoch""" + if self.config['global_rank'] == 0: + # configure path + gen_path = os.path.join(self.config['save_dir'], + f'gen_{it:06d}.pth') + opt_path = os.path.join(self.config['save_dir'], + f'opt_{it:06d}.pth') + print(f'\nsaving model to {gen_path} ...') + + # remove .module for saving + if isinstance(self.netG, torch.nn.DataParallel) or isinstance(self.netG, DDP): + netG = self.netG.module + else: + netG = self.netG + + # save checkpoints + torch.save(netG.state_dict(), gen_path) + torch.save( + { + 'epoch': self.epoch, + 'iteration': self.iteration, + 'optimG': self.optimG.state_dict(), + 'scheG': self.scheG.state_dict() + }, opt_path) + + latest_path = os.path.join(self.config['save_dir'], 'latest.ckpt') + os.system(f"echo {it:06d} > {latest_path}") + + def train(self): + """training entry""" + pbar = range(int(self.train_args['iterations'])) + if self.config['global_rank'] == 0: + pbar = tqdm(pbar, + initial=self.iteration, + dynamic_ncols=True, + smoothing=0.01) + + os.makedirs('logs', exist_ok=True) + + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(filename)s[line:%(lineno)d]" + "%(levelname)s %(message)s", + datefmt="%a, %d %b %Y %H:%M:%S", + filename=f"logs/{self.config['save_dir'].split('/')[-1]}.log", + filemode='w') + + while True: + self.epoch += 1 + self.prefetcher.reset() + if self.config['distributed']: + self.train_sampler.set_epoch(self.epoch) + self._train_epoch(pbar) + if self.iteration > self.train_args['iterations']: + break + print('\nEnd training....') + + # def get_edges(self, flows): # fgvc + # # (b, t, 2, H, W) + # b, t, _, h, w = flows.shape + # flows = flows.view(-1, 2, h, w) + # flows_list = flows.permute(0, 2, 3, 1).cpu().numpy() + # edges = [] + # for f in list(flows_list): + # flows_gray = (f[:, :, 0] ** 2 + f[:, :, 1] ** 2) ** 0.5 + # if flows_gray.max() < 1: + # flows_gray = flows_gray*0 + # else: + # flows_gray = flows_gray / flows_gray.max() + + # edge = canny(flows_gray, sigma=2, low_threshold=0.1, high_threshold=0.2) # fgvc + # edge = torch.from_numpy(edge).view(1, 1, h, w).float() + # edges.append(edge) + # edges = torch.stack(edges, dim=0).to(self.config['device']) + # edges = edges.view(b, t, 1, h, w) + # return edges + + def get_edges(self, flows): + # (b, t, 2, H, W) + b, t, _, h, w = flows.shape + flows = flows.view(-1, 2, h, w) + flows_gray = (flows[:, 0, None] ** 2 + flows[:, 1, None] ** 2) ** 0.5 + if flows_gray.max() < 1: + flows_gray = flows_gray*0 + else: + flows_gray = flows_gray / flows_gray.max() + + magnitude, edges = self.canny(flows_gray.float()) + edges = edges.view(b, t, 1, h, w) + return edges + + def _train_epoch(self, pbar): + """Process input and calculate loss every training epoch""" + device = self.config['device'] + train_data = self.prefetcher.next() + while train_data is not None: + self.iteration += 1 + frames, masks, flows_f, flows_b, _ = train_data + frames, masks = frames.to(device), masks.to(device) + masks = masks.float() + + l_t = self.num_local_frames + b, t, c, h, w = frames.size() + 
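# get_edges() above reduces a flow field to a single-channel magnitude map
# (normalised to [0, 1]) before running the Canny filter on it. A minimal
# sketch of that magnitude step on a dummy flow tensor, with the Canny
# filtering itself omitted:
import torch

dummy_flows = torch.randn(2, 5, 2, 64, 64)        # (b, t, 2, H, W)
db, dt, _, dh, dw = dummy_flows.shape
dummy_flows = dummy_flows.view(-1, 2, dh, dw)
mag = (dummy_flows[:, 0, None] ** 2 + dummy_flows[:, 1, None] ** 2) ** 0.5
mag = mag * 0 if mag.max() < 1 else mag / mag.max()   # same guard as get_edges()
# `mag` is (b*t, 1, H, W) in [0, 1]; get_edges() feeds this to self.canny.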
gt_local_frames = frames[:, :l_t, ...] + local_masks = masks[:, :l_t, ...].contiguous() + + # get gt optical flow + if flows_f[0] == 'None' or flows_b[0] == 'None': + gt_flows_bi = self.fix_raft(gt_local_frames) + else: + gt_flows_bi = (flows_f.to(device), flows_b.to(device)) + + # get gt edge + gt_edges_forward = self.get_edges(gt_flows_bi[0]) + gt_edges_backward = self.get_edges(gt_flows_bi[1]) + gt_edges_bi = [gt_edges_forward, gt_edges_backward] + + # complete flow + pred_flows_bi, pred_edges_bi = self.netG.module.forward_bidirect_flow(gt_flows_bi, local_masks) + + # optimize net_g + self.optimG.zero_grad() + + # compulte flow_loss + flow_loss, warp_loss = self.flow_loss(pred_flows_bi, gt_flows_bi, local_masks, gt_local_frames) + flow_loss = flow_loss * self.config['losses']['flow_weight'] + warp_loss = warp_loss * 0.01 + self.add_summary(self.gen_writer, 'loss/flow_loss', flow_loss.item()) + self.add_summary(self.gen_writer, 'loss/warp_loss', warp_loss.item()) + + # compute edge loss + edge_loss = self.edge_loss(pred_edges_bi, gt_edges_bi, local_masks) + edge_loss = edge_loss*1.0 + self.add_summary(self.gen_writer, 'loss/edge_loss', edge_loss.item()) + + loss = flow_loss + warp_loss + edge_loss + loss.backward() + self.optimG.step() + self.update_learning_rate() + + # write image to tensorboard + # if self.iteration % 200 == 0: + if self.iteration % 200 == 0 and self.gen_writer is not None: + t = 5 + # forward to cpu + gt_flows_forward_cpu = flow_to_image(gt_flows_bi[0][0]).cpu() + masked_flows_forward_cpu = (gt_flows_forward_cpu[t] * (1-local_masks[0][t].cpu())).to(gt_flows_forward_cpu) + pred_flows_forward_cpu = flow_to_image(pred_flows_bi[0][0]).cpu() + + flow_results = torch.cat([gt_flows_forward_cpu[t], masked_flows_forward_cpu, pred_flows_forward_cpu[t]], 1) + self.gen_writer.add_image('img/flow-f:gt-pred', flow_results, self.iteration) + + # backward to cpu + gt_flows_backward_cpu = flow_to_image(gt_flows_bi[1][0]).cpu() + masked_flows_backward_cpu = (gt_flows_backward_cpu[t] * (1-local_masks[0][t+1].cpu())).to(gt_flows_backward_cpu) + pred_flows_backward_cpu = flow_to_image(pred_flows_bi[1][0]).cpu() + + flow_results = torch.cat([gt_flows_backward_cpu[t], masked_flows_backward_cpu, pred_flows_backward_cpu[t]], 1) + self.gen_writer.add_image('img/flow-b:gt-pred', flow_results, self.iteration) + + # TODO: show edge + # forward + gt_edges_forward_cpu = gt_edges_bi[0][0].cpu() + masked_edges_forward_cpu = (gt_edges_forward_cpu[t] * (1-local_masks[0][t].cpu())).to(gt_edges_forward_cpu) + pred_edges_forward_cpu = pred_edges_bi[0][0].cpu() + + edge_results = torch.cat([gt_edges_forward_cpu[t], masked_edges_forward_cpu, pred_edges_forward_cpu[t]], 1) + self.gen_writer.add_image('img/edge-f:gt-pred', edge_results, self.iteration) + # backward + gt_edges_backward_cpu = gt_edges_bi[1][0].cpu() + masked_edges_backward_cpu = (gt_edges_backward_cpu[t] * (1-local_masks[0][t+1].cpu())).to(gt_edges_backward_cpu) + pred_edges_backward_cpu = pred_edges_bi[1][0].cpu() + + edge_results = torch.cat([gt_edges_backward_cpu[t], masked_edges_backward_cpu, pred_edges_backward_cpu[t]], 1) + self.gen_writer.add_image('img/edge-b:gt-pred', edge_results, self.iteration) + + # console logs + if self.config['global_rank'] == 0: + pbar.update(1) + pbar.set_description((f"flow: {flow_loss.item():.3f}; " + f"warp: {warp_loss.item():.3f}; " + f"edge: {edge_loss.item():.3f}; " + f"lr: {self.get_lr()}")) + + if self.iteration % self.train_args['log_freq'] == 0: + logging.info(f"[Iter {self.iteration}] " + f"flow: 
{flow_loss.item():.4f}; " + f"warp: {warp_loss.item():.4f}") + + # saving models + if self.iteration % self.train_args['save_freq'] == 0: + self.save(int(self.iteration)) + + if self.iteration > self.train_args['iterations']: + break + + train_data = self.prefetcher.next() \ No newline at end of file diff --git a/backend/inpaint/video/core/utils.py b/backend/inpaint/video/core/utils.py new file mode 100644 index 0000000..37dccb2 --- /dev/null +++ b/backend/inpaint/video/core/utils.py @@ -0,0 +1,371 @@ +import os +import io +import cv2 +import random +import numpy as np +from PIL import Image, ImageOps +import zipfile +import math + +import torch +import matplotlib +import matplotlib.patches as patches +from matplotlib.path import Path +from matplotlib import pyplot as plt +from torchvision import transforms + +# matplotlib.use('agg') + +# ########################################################################### +# Directory IO +# ########################################################################### + + +def read_dirnames_under_root(root_dir): + dirnames = [ + name for i, name in enumerate(sorted(os.listdir(root_dir))) + if os.path.isdir(os.path.join(root_dir, name)) + ] + print(f'Reading directories under {root_dir}, num: {len(dirnames)}') + return dirnames + + +class TrainZipReader(object): + file_dict = dict() + + def __init__(self): + super(TrainZipReader, self).__init__() + + @staticmethod + def build_file_dict(path): + file_dict = TrainZipReader.file_dict + if path in file_dict: + return file_dict[path] + else: + file_handle = zipfile.ZipFile(path, 'r') + file_dict[path] = file_handle + return file_dict[path] + + @staticmethod + def imread(path, idx): + zfile = TrainZipReader.build_file_dict(path) + filelist = zfile.namelist() + filelist.sort() + data = zfile.read(filelist[idx]) + # + im = Image.open(io.BytesIO(data)) + return im + + +class TestZipReader(object): + file_dict = dict() + + def __init__(self): + super(TestZipReader, self).__init__() + + @staticmethod + def build_file_dict(path): + file_dict = TestZipReader.file_dict + if path in file_dict: + return file_dict[path] + else: + file_handle = zipfile.ZipFile(path, 'r') + file_dict[path] = file_handle + return file_dict[path] + + @staticmethod + def imread(path, idx): + zfile = TestZipReader.build_file_dict(path) + filelist = zfile.namelist() + filelist.sort() + data = zfile.read(filelist[idx]) + file_bytes = np.asarray(bytearray(data), dtype=np.uint8) + im = cv2.imdecode(file_bytes, cv2.IMREAD_COLOR) + im = Image.fromarray(cv2.cvtColor(im, cv2.COLOR_BGR2RGB)) + # im = Image.open(io.BytesIO(data)) + return im + + +# ########################################################################### +# Data augmentation +# ########################################################################### + + +def to_tensors(): + return transforms.Compose([Stack(), ToTorchFormatTensor()]) + + +class GroupRandomHorizontalFlowFlip(object): + """Randomly horizontally flips the given PIL.Image with a probability of 0.5 + """ + def __call__(self, img_group, flowF_group, flowB_group): + v = random.random() + if v < 0.5: + ret_img = [ + img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group + ] + ret_flowF = [ff[:, ::-1] * [-1.0, 1.0] for ff in flowF_group] + ret_flowB = [fb[:, ::-1] * [-1.0, 1.0] for fb in flowB_group] + return ret_img, ret_flowF, ret_flowB + else: + return img_group, flowF_group, flowB_group + + +class GroupRandomHorizontalFlip(object): + """Randomly horizontally flips the given PIL.Image with a probability of 0.5 + """ 
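# GroupRandomHorizontalFlowFlip above flips frames and flow maps together;
# note that mirroring a flow field left-right also requires negating its
# horizontal (u) component, which is what `ff[:, ::-1] * [-1.0, 1.0]` does.
# A tiny numpy check of that rule:
import numpy as np

flow = np.zeros((4, 4, 2), dtype=np.float32)
flow[..., 0] = 1.0                        # every pixel moves 1 px to the right
flipped = flow[:, ::-1] * [-1.0, 1.0]     # mirror width axis, negate u, keep v
assert (flipped[..., 0] == -1.0).all()    # in the mirrored frame it moves left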
+ def __call__(self, img_group, is_flow=False): + v = random.random() + if v < 0.5: + ret = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group] + if is_flow: + for i in range(0, len(ret), 2): + # invert flow pixel values when flipping + ret[i] = ImageOps.invert(ret[i]) + return ret + else: + return img_group + + +class Stack(object): + def __init__(self, roll=False): + self.roll = roll + + def __call__(self, img_group): + mode = img_group[0].mode + if mode == '1': + img_group = [img.convert('L') for img in img_group] + mode = 'L' + if mode == 'L': + return np.stack([np.expand_dims(x, 2) for x in img_group], axis=2) + elif mode == 'RGB': + if self.roll: + return np.stack([np.array(x)[:, :, ::-1] for x in img_group], + axis=2) + else: + return np.stack(img_group, axis=2) + else: + raise NotImplementedError(f"Image mode {mode}") + + +class ToTorchFormatTensor(object): + """ Converts a PIL.Image (RGB) or numpy.ndarray (H x W x C) in the range [0, 255] + to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] """ + def __init__(self, div=True): + self.div = div + + def __call__(self, pic): + if isinstance(pic, np.ndarray): + # numpy img: [L, C, H, W] + img = torch.from_numpy(pic).permute(2, 3, 0, 1).contiguous() + else: + # handle PIL Image + img = torch.ByteTensor(torch.ByteStorage.from_buffer( + pic.tobytes())) + img = img.view(pic.size[1], pic.size[0], len(pic.mode)) + # put it from HWC to CHW format + # yikes, this transpose takes 80% of the loading time/CPU + img = img.transpose(0, 1).transpose(0, 2).contiguous() + img = img.float().div(255) if self.div else img.float() + return img + + +# ########################################################################### +# Create masks with random shape +# ########################################################################### + + +def create_random_shape_with_random_motion(video_length, + imageHeight=240, + imageWidth=432): + # get a random shape + height = random.randint(imageHeight // 3, imageHeight - 1) + width = random.randint(imageWidth // 3, imageWidth - 1) + edge_num = random.randint(6, 8) + ratio = random.randint(6, 8) / 10 + + region = get_random_shape(edge_num=edge_num, + ratio=ratio, + height=height, + width=width) + region_width, region_height = region.size + # get random position + x, y = random.randint(0, imageHeight - region_height), random.randint( + 0, imageWidth - region_width) + velocity = get_random_velocity(max_speed=3) + m = Image.fromarray(np.zeros((imageHeight, imageWidth)).astype(np.uint8)) + m.paste(region, (y, x, y + region.size[0], x + region.size[1])) + masks = [m.convert('L')] + # return fixed masks + if random.uniform(0, 1) > 0.5: + return masks * video_length + # return moving masks + for _ in range(video_length - 1): + x, y, velocity = random_move_control_points(x, + y, + imageHeight, + imageWidth, + velocity, + region.size, + maxLineAcceleration=(3, + 0.5), + maxInitSpeed=3) + m = Image.fromarray( + np.zeros((imageHeight, imageWidth)).astype(np.uint8)) + m.paste(region, (y, x, y + region.size[0], x + region.size[1])) + masks.append(m.convert('L')) + return masks + + +def create_random_shape_with_random_motion_zoom_rotation(video_length, zoomin=0.9, zoomout=1.1, rotmin=1, rotmax=10, imageHeight=240, imageWidth=432): + # get a random shape + assert zoomin < 1, "Zoom-in parameter must be smaller than 1" + assert zoomout > 1, "Zoom-out parameter must be larger than 1" + assert rotmin < rotmax, "Minimum value of rotation must be smaller than maximun value !" 
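# The to_tensors() pipeline above (Stack followed by ToTorchFormatTensor) is
# what converts a list of PIL frames into the (T, C, H, W) tensor consumed by
# the I3D/VFID metric code. A small sketch on dummy frames (illustration only,
# shapes as produced by the classes above):
import numpy as np
from PIL import Image

dummy_frames = [Image.fromarray(np.zeros((240, 432, 3), dtype=np.uint8))
                for _ in range(5)]             # 5 black RGB frames
video = to_tensors()(dummy_frames)             # -> shape (5, 3, 240, 432)
batched = video.unsqueeze(0)                   # -> (1, T, C, H, W), as in calculate_i3d_activations()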
+ height = random.randint(imageHeight//3, imageHeight-1) + width = random.randint(imageWidth//3, imageWidth-1) + edge_num = random.randint(6, 8) + ratio = random.randint(6, 8)/10 + region = get_random_shape( + edge_num=edge_num, ratio=ratio, height=height, width=width) + region_width, region_height = region.size + # get random position + x, y = random.randint( + 0, imageHeight-region_height), random.randint(0, imageWidth-region_width) + velocity = get_random_velocity(max_speed=3) + m = Image.fromarray(np.zeros((imageHeight, imageWidth)).astype(np.uint8)) + m.paste(region, (y, x, y+region.size[0], x+region.size[1])) + masks = [m.convert('L')] + # return fixed masks + if random.uniform(0, 1) > 0.5: + return masks*video_length # -> directly copy all the base masks + # return moving masks + for _ in range(video_length-1): + x, y, velocity = random_move_control_points( + x, y, imageHeight, imageWidth, velocity, region.size, maxLineAcceleration=(3, 0.5), maxInitSpeed=3) + m = Image.fromarray( + np.zeros((imageHeight, imageWidth)).astype(np.uint8)) + ### add by kaidong, to simulate zoon-in, zoom-out and rotation + extra_transform = random.uniform(0, 1) + # zoom in and zoom out + if extra_transform > 0.75: + resize_coefficient = random.uniform(zoomin, zoomout) + region = region.resize((math.ceil(region_width * resize_coefficient), math.ceil(region_height * resize_coefficient)), Image.NEAREST) + m.paste(region, (y, x, y + region.size[0], x + region.size[1])) + region_width, region_height = region.size + # rotation + elif extra_transform > 0.5: + m.paste(region, (y, x, y + region.size[0], x + region.size[1])) + m = m.rotate(random.randint(rotmin, rotmax)) + # region_width, region_height = region.size + ### end + else: + m.paste(region, (y, x, y+region.size[0], x+region.size[1])) + masks.append(m.convert('L')) + return masks + + +def get_random_shape(edge_num=9, ratio=0.7, width=432, height=240): + ''' + There is the initial point and 3 points per cubic bezier curve. + Thus, the curve will only pass though n points, which will be the sharp edges. + The other 2 modify the shape of the bezier curve. 
+      edge_num, Number of possibly sharp edges
+      points_num, number of points in the Path
+      ratio, (0, 1) magnitude of the perturbation from the unit circle,
+    '''
+    points_num = edge_num*3 + 1
+    angles = np.linspace(0, 2*np.pi, points_num)
+    codes = np.full(points_num, Path.CURVE4)
+    codes[0] = Path.MOVETO
+    # Using this instead of Path.CLOSEPOLY avoids an unnecessary straight line
+    verts = np.stack((np.cos(angles), np.sin(angles))).T * \
+        (2*ratio*np.random.random(points_num)+1-ratio)[:, None]
+    verts[-1, :] = verts[0, :]
+    path = Path(verts, codes)
+    # draw paths into images
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+    patch = patches.PathPatch(path, facecolor='black', lw=2)
+    ax.add_patch(patch)
+    ax.set_xlim(np.min(verts)*1.1, np.max(verts)*1.1)
+    ax.set_ylim(np.min(verts)*1.1, np.max(verts)*1.1)
+    ax.axis('off')  # removes the axis to leave only the shape
+    fig.canvas.draw()
+    # convert plt images into numpy images
+    data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
+    data = data.reshape((fig.canvas.get_width_height()[::-1] + (3,)))
+    plt.close(fig)
+    # postprocess
+    data = cv2.resize(data, (width, height))[:, :, 0]
+    data = (1 - np.array(data > 0).astype(np.uint8))*255
+    coordinates = np.where(data > 0)
+    xmin, xmax, ymin, ymax = np.min(coordinates[0]), np.max(
+        coordinates[0]), np.min(coordinates[1]), np.max(coordinates[1])
+    region = Image.fromarray(data).crop((ymin, xmin, ymax, xmax))
+    return region
+
+
+def random_accelerate(velocity, maxAcceleration, dist='uniform'):
+    speed, angle = velocity
+    d_speed, d_angle = maxAcceleration
+    if dist == 'uniform':
+        speed += np.random.uniform(-d_speed, d_speed)
+        angle += np.random.uniform(-d_angle, d_angle)
+    elif dist == 'guassian':
+        speed += np.random.normal(0, d_speed / 2)
+        angle += np.random.normal(0, d_angle / 2)
+    else:
+        raise NotImplementedError(
+            f'Distribution type {dist} is not supported.')
+    return (speed, angle)
+
+
+def get_random_velocity(max_speed=3, dist='uniform'):
+    if dist == 'uniform':
+        # speed uniform in [0, max_speed)
+        speed = np.random.uniform(0, max_speed)
+    elif dist == 'guassian':
+        speed = np.abs(np.random.normal(0, max_speed / 2))
+    else:
+        raise NotImplementedError(
+            f'Distribution type {dist} is not supported.')
+    angle = np.random.uniform(0, 2 * np.pi)
+    return (speed, angle)
+
+
+def random_move_control_points(X,
+                               Y,
+                               imageHeight,
+                               imageWidth,
+                               lineVelocity,
+                               region_size,
+                               maxLineAcceleration=(3, 0.5),
+                               maxInitSpeed=3):
+    region_width, region_height = region_size
+    speed, angle = lineVelocity
+    X += int(speed * np.cos(angle))
+    Y += int(speed * np.sin(angle))
+    lineVelocity = random_accelerate(lineVelocity,
+                                     maxLineAcceleration,
+                                     dist='guassian')
+    if ((X > imageHeight - region_height) or (X < 0)
+            or (Y > imageWidth - region_width) or (Y < 0)):
+        lineVelocity = get_random_velocity(maxInitSpeed, dist='guassian')
+    new_X = np.clip(X, 0, imageHeight - region_height)
+    new_Y = np.clip(Y, 0, imageWidth - region_width)
+    return new_X, new_Y, lineVelocity
+
+
+if __name__ == '__main__':
+
+    trials = 10
+    for _ in range(trials):
+        video_length = 10
+        # The returned masks are either stationary (50%) or moving (50%)
+        masks = create_random_shape_with_random_motion(video_length,
+                                                       imageHeight=240,
+                                                       imageWidth=432)
+
+        for m in masks:
+            cv2.imshow('mask', np.array(m))
+            cv2.waitKey(500)
diff --git a/backend/inpaint/video/model/__init__.py b/backend/inpaint/video/model/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/backend/inpaint/video/model/__init__.py
@@ -0,0 +1 @@
+
diff --git
a/backend/inpaint/video/model/canny/canny_filter.py b/backend/inpaint/video/model/canny/canny_filter.py new file mode 100644 index 0000000..3d16195 --- /dev/null +++ b/backend/inpaint/video/model/canny/canny_filter.py @@ -0,0 +1,256 @@ +import math +from typing import Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .gaussian import gaussian_blur2d +from .kernels import get_canny_nms_kernel, get_hysteresis_kernel +from .sobel import spatial_gradient + +def rgb_to_grayscale(image, rgb_weights = None): + if len(image.shape) < 3 or image.shape[-3] != 3: + raise ValueError(f"Input size must have a shape of (*, 3, H, W). Got {image.shape}") + + if rgb_weights is None: + # 8 bit images + if image.dtype == torch.uint8: + rgb_weights = torch.tensor([76, 150, 29], device=image.device, dtype=torch.uint8) + # floating point images + elif image.dtype in (torch.float16, torch.float32, torch.float64): + rgb_weights = torch.tensor([0.299, 0.587, 0.114], device=image.device, dtype=image.dtype) + else: + raise TypeError(f"Unknown data type: {image.dtype}") + else: + # is tensor that we make sure is in the same device/dtype + rgb_weights = rgb_weights.to(image) + + # unpack the color image channels with RGB order + r = image[..., 0:1, :, :] + g = image[..., 1:2, :, :] + b = image[..., 2:3, :, :] + + w_r, w_g, w_b = rgb_weights.unbind() + return w_r * r + w_g * g + w_b * b + + +def canny( + input: torch.Tensor, + low_threshold: float = 0.1, + high_threshold: float = 0.2, + kernel_size: Tuple[int, int] = (5, 5), + sigma: Tuple[float, float] = (1, 1), + hysteresis: bool = True, + eps: float = 1e-6, +) -> Tuple[torch.Tensor, torch.Tensor]: + r"""Find edges of the input image and filters them using the Canny algorithm. + + .. image:: _static/img/canny.png + + Args: + input: input image tensor with shape :math:`(B,C,H,W)`. + low_threshold: lower threshold for the hysteresis procedure. + high_threshold: upper threshold for the hysteresis procedure. + kernel_size: the size of the kernel for the gaussian blur. + sigma: the standard deviation of the kernel for the gaussian blur. + hysteresis: if True, applies the hysteresis edge tracking. + Otherwise, the edges are divided between weak (0.5) and strong (1) edges. + eps: regularization number to avoid NaN during backprop. + + Returns: + - the canny edge magnitudes map, shape of :math:`(B,1,H,W)`. + - the canny edge detection filtered by thresholds and hysteresis, shape of :math:`(B,1,H,W)`. + + .. note:: + See a working example `here `__. + + Example: + >>> input = torch.rand(5, 3, 4, 4) + >>> magnitude, edges = canny(input) # 5x3x4x4 + >>> magnitude.shape + torch.Size([5, 1, 4, 4]) + >>> edges.shape + torch.Size([5, 1, 4, 4]) + """ + if not isinstance(input, torch.Tensor): + raise TypeError(f"Input type is not a torch.Tensor. Got {type(input)}") + + if not len(input.shape) == 4: + raise ValueError(f"Invalid input shape, we expect BxCxHxW. Got: {input.shape}") + + if low_threshold > high_threshold: + raise ValueError( + "Invalid input thresholds. low_threshold should be smaller than the high_threshold. Got: {}>{}".format( + low_threshold, high_threshold + ) + ) + + if low_threshold < 0 and low_threshold > 1: + raise ValueError(f"Invalid input threshold. low_threshold should be in range (0,1). Got: {low_threshold}") + + if high_threshold < 0 and high_threshold > 1: + raise ValueError(f"Invalid input threshold. high_threshold should be in range (0,1). 
Got: {high_threshold}") + + device: torch.device = input.device + dtype: torch.dtype = input.dtype + + # To Grayscale + if input.shape[1] == 3: + input = rgb_to_grayscale(input) + + # Gaussian filter + blurred: torch.Tensor = gaussian_blur2d(input, kernel_size, sigma) + + # Compute the gradients + gradients: torch.Tensor = spatial_gradient(blurred, normalized=False) + + # Unpack the edges + gx: torch.Tensor = gradients[:, :, 0] + gy: torch.Tensor = gradients[:, :, 1] + + # Compute gradient magnitude and angle + magnitude: torch.Tensor = torch.sqrt(gx * gx + gy * gy + eps) + angle: torch.Tensor = torch.atan2(gy, gx) + + # Radians to Degrees + angle = 180.0 * angle / math.pi + + # Round angle to the nearest 45 degree + angle = torch.round(angle / 45) * 45 + + # Non-maximal suppression + nms_kernels: torch.Tensor = get_canny_nms_kernel(device, dtype) + nms_magnitude: torch.Tensor = F.conv2d(magnitude, nms_kernels, padding=nms_kernels.shape[-1] // 2) + + # Get the indices for both directions + positive_idx: torch.Tensor = (angle / 45) % 8 + positive_idx = positive_idx.long() + + negative_idx: torch.Tensor = ((angle / 45) + 4) % 8 + negative_idx = negative_idx.long() + + # Apply the non-maximum suppression to the different directions + channel_select_filtered_positive: torch.Tensor = torch.gather(nms_magnitude, 1, positive_idx) + channel_select_filtered_negative: torch.Tensor = torch.gather(nms_magnitude, 1, negative_idx) + + channel_select_filtered: torch.Tensor = torch.stack( + [channel_select_filtered_positive, channel_select_filtered_negative], 1 + ) + + is_max: torch.Tensor = channel_select_filtered.min(dim=1)[0] > 0.0 + + magnitude = magnitude * is_max + + # Threshold + edges: torch.Tensor = F.threshold(magnitude, low_threshold, 0.0) + + low: torch.Tensor = magnitude > low_threshold + high: torch.Tensor = magnitude > high_threshold + + edges = low * 0.5 + high * 0.5 + edges = edges.to(dtype) + + # Hysteresis + if hysteresis: + edges_old: torch.Tensor = -torch.ones(edges.shape, device=edges.device, dtype=dtype) + hysteresis_kernels: torch.Tensor = get_hysteresis_kernel(device, dtype) + + while ((edges_old - edges).abs() != 0).any(): + weak: torch.Tensor = (edges == 0.5).float() + strong: torch.Tensor = (edges == 1).float() + + hysteresis_magnitude: torch.Tensor = F.conv2d( + edges, hysteresis_kernels, padding=hysteresis_kernels.shape[-1] // 2 + ) + hysteresis_magnitude = (hysteresis_magnitude == 1).any(1, keepdim=True).to(dtype) + hysteresis_magnitude = hysteresis_magnitude * weak + strong + + edges_old = edges.clone() + edges = hysteresis_magnitude + (hysteresis_magnitude == 0) * weak * 0.5 + + edges = hysteresis_magnitude + + return magnitude, edges + + +class Canny(nn.Module): + r"""Module that finds edges of the input image and filters them using the Canny algorithm. + + Args: + input: input image tensor with shape :math:`(B,C,H,W)`. + low_threshold: lower threshold for the hysteresis procedure. + high_threshold: upper threshold for the hysteresis procedure. + kernel_size: the size of the kernel for the gaussian blur. + sigma: the standard deviation of the kernel for the gaussian blur. + hysteresis: if True, applies the hysteresis edge tracking. + Otherwise, the edges are divided between weak (0.5) and strong (1) edges. + eps: regularization number to avoid NaN during backprop. + + Returns: + - the canny edge magnitudes map, shape of :math:`(B,1,H,W)`. + - the canny edge detection filtered by thresholds and hysteresis, shape of :math:`(B,1,H,W)`. 
+ + Example: + >>> input = torch.rand(5, 3, 4, 4) + >>> magnitude, edges = Canny()(input) # 5x3x4x4 + >>> magnitude.shape + torch.Size([5, 1, 4, 4]) + >>> edges.shape + torch.Size([5, 1, 4, 4]) + """ + + def __init__( + self, + low_threshold: float = 0.1, + high_threshold: float = 0.2, + kernel_size: Tuple[int, int] = (5, 5), + sigma: Tuple[float, float] = (1, 1), + hysteresis: bool = True, + eps: float = 1e-6, + ) -> None: + super().__init__() + + if low_threshold > high_threshold: + raise ValueError( + "Invalid input thresholds. low_threshold should be\ + smaller than the high_threshold. Got: {}>{}".format( + low_threshold, high_threshold + ) + ) + + if low_threshold < 0 or low_threshold > 1: + raise ValueError(f"Invalid input threshold. low_threshold should be in range (0,1). Got: {low_threshold}") + + if high_threshold < 0 or high_threshold > 1: + raise ValueError(f"Invalid input threshold. high_threshold should be in range (0,1). Got: {high_threshold}") + + # Gaussian blur parameters + self.kernel_size = kernel_size + self.sigma = sigma + + # Double threshold + self.low_threshold = low_threshold + self.high_threshold = high_threshold + + # Hysteresis + self.hysteresis = hysteresis + + self.eps: float = eps + + def __repr__(self) -> str: + return ''.join( + ( + f'{type(self).__name__}(', + ', '.join( + f'{name}={getattr(self, name)}' for name in sorted(self.__dict__) if not name.startswith('_') + ), + ')', + ) + ) + + def forward(self, input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + return canny( + input, self.low_threshold, self.high_threshold, self.kernel_size, self.sigma, self.hysteresis, self.eps + ) \ No newline at end of file diff --git a/backend/inpaint/video/model/canny/filter.py b/backend/inpaint/video/model/canny/filter.py new file mode 100644 index 0000000..e39d44d --- /dev/null +++ b/backend/inpaint/video/model/canny/filter.py @@ -0,0 +1,288 @@ +from typing import List + +import torch +import torch.nn.functional as F + +from .kernels import normalize_kernel2d + + +def _compute_padding(kernel_size: List[int]) -> List[int]: + """Compute padding tuple.""" + # 4 or 6 ints: (padding_left, padding_right,padding_top,padding_bottom) + # https://pytorch.org/docs/stable/nn.html#torch.nn.functional.pad + if len(kernel_size) < 2: + raise AssertionError(kernel_size) + computed = [k - 1 for k in kernel_size] + + # for even kernels we need to do asymmetric padding :( + out_padding = 2 * len(kernel_size) * [0] + + for i in range(len(kernel_size)): + computed_tmp = computed[-(i + 1)] + + pad_front = computed_tmp // 2 + pad_rear = computed_tmp - pad_front + + out_padding[2 * i + 0] = pad_front + out_padding[2 * i + 1] = pad_rear + + return out_padding + + +def filter2d( + input: torch.Tensor, + kernel: torch.Tensor, + border_type: str = 'reflect', + normalized: bool = False, + padding: str = 'same', +) -> torch.Tensor: + r"""Convolve a tensor with a 2d kernel. + + The function applies a given kernel to a tensor. The kernel is applied + independently at each depth channel of the tensor. Before applying the + kernel, the function applies padding according to the specified mode so + that the output remains in the same shape. + + Args: + input: the input tensor with shape of + :math:`(B, C, H, W)`. + kernel: the kernel to be convolved with the input + tensor. The kernel shape must be :math:`(1, kH, kW)` or :math:`(B, kH, kW)`. + border_type: the padding mode to be applied before convolving. + The expected modes are: ``'constant'``, ``'reflect'``, + ``'replicate'`` or ``'circular'``. 
+ normalized: If True, kernel will be L1 normalized. + padding: This defines the type of padding. + 2 modes available ``'same'`` or ``'valid'``. + + Return: + torch.Tensor: the convolved tensor of same size and numbers of channels + as the input with shape :math:`(B, C, H, W)`. + + Example: + >>> input = torch.tensor([[[ + ... [0., 0., 0., 0., 0.], + ... [0., 0., 0., 0., 0.], + ... [0., 0., 5., 0., 0.], + ... [0., 0., 0., 0., 0.], + ... [0., 0., 0., 0., 0.],]]]) + >>> kernel = torch.ones(1, 3, 3) + >>> filter2d(input, kernel, padding='same') + tensor([[[[0., 0., 0., 0., 0.], + [0., 5., 5., 5., 0.], + [0., 5., 5., 5., 0.], + [0., 5., 5., 5., 0.], + [0., 0., 0., 0., 0.]]]]) + """ + if not isinstance(input, torch.Tensor): + raise TypeError(f"Input input is not torch.Tensor. Got {type(input)}") + + if not isinstance(kernel, torch.Tensor): + raise TypeError(f"Input kernel is not torch.Tensor. Got {type(kernel)}") + + if not isinstance(border_type, str): + raise TypeError(f"Input border_type is not string. Got {type(border_type)}") + + if border_type not in ['constant', 'reflect', 'replicate', 'circular']: + raise ValueError( + f"Invalid border type, we expect 'constant', \ + 'reflect', 'replicate', 'circular'. Got:{border_type}" + ) + + if not isinstance(padding, str): + raise TypeError(f"Input padding is not string. Got {type(padding)}") + + if padding not in ['valid', 'same']: + raise ValueError(f"Invalid padding mode, we expect 'valid' or 'same'. Got: {padding}") + + if not len(input.shape) == 4: + raise ValueError(f"Invalid input shape, we expect BxCxHxW. Got: {input.shape}") + + if (not len(kernel.shape) == 3) and not ((kernel.shape[0] == 0) or (kernel.shape[0] == input.shape[0])): + raise ValueError(f"Invalid kernel shape, we expect 1xHxW or BxHxW. Got: {kernel.shape}") + + # prepare kernel + b, c, h, w = input.shape + tmp_kernel: torch.Tensor = kernel.unsqueeze(1).to(input) + + if normalized: + tmp_kernel = normalize_kernel2d(tmp_kernel) + + tmp_kernel = tmp_kernel.expand(-1, c, -1, -1) + + height, width = tmp_kernel.shape[-2:] + + # pad the input tensor + if padding == 'same': + padding_shape: List[int] = _compute_padding([height, width]) + input = F.pad(input, padding_shape, mode=border_type) + + # kernel and input tensor reshape to align element-wise or batch-wise params + tmp_kernel = tmp_kernel.reshape(-1, 1, height, width) + input = input.view(-1, tmp_kernel.size(0), input.size(-2), input.size(-1)) + + # convolve the tensor with the kernel. + output = F.conv2d(input, tmp_kernel, groups=tmp_kernel.size(0), padding=0, stride=1) + + if padding == 'same': + out = output.view(b, c, h, w) + else: + out = output.view(b, c, h - height + 1, w - width + 1) + + return out + + +def filter2d_separable( + input: torch.Tensor, + kernel_x: torch.Tensor, + kernel_y: torch.Tensor, + border_type: str = 'reflect', + normalized: bool = False, + padding: str = 'same', +) -> torch.Tensor: + r"""Convolve a tensor with two 1d kernels, in x and y directions. + + The function applies a given kernel to a tensor. The kernel is applied + independently at each depth channel of the tensor. Before applying the + kernel, the function applies padding according to the specified mode so + that the output remains in the same shape. + + Args: + input: the input tensor with shape of + :math:`(B, C, H, W)`. + kernel_x: the kernel to be convolved with the input + tensor. The kernel shape must be :math:`(1, kW)` or :math:`(B, kW)`. + kernel_y: the kernel to be convolved with the input + tensor. 
The kernel shape must be :math:`(1, kH)` or :math:`(B, kH)`. + border_type: the padding mode to be applied before convolving. + The expected modes are: ``'constant'``, ``'reflect'``, + ``'replicate'`` or ``'circular'``. + normalized: If True, kernel will be L1 normalized. + padding: This defines the type of padding. + 2 modes available ``'same'`` or ``'valid'``. + + Return: + torch.Tensor: the convolved tensor of same size and numbers of channels + as the input with shape :math:`(B, C, H, W)`. + + Example: + >>> input = torch.tensor([[[ + ... [0., 0., 0., 0., 0.], + ... [0., 0., 0., 0., 0.], + ... [0., 0., 5., 0., 0.], + ... [0., 0., 0., 0., 0.], + ... [0., 0., 0., 0., 0.],]]]) + >>> kernel = torch.ones(1, 3) + + >>> filter2d_separable(input, kernel, kernel, padding='same') + tensor([[[[0., 0., 0., 0., 0.], + [0., 5., 5., 5., 0.], + [0., 5., 5., 5., 0.], + [0., 5., 5., 5., 0.], + [0., 0., 0., 0., 0.]]]]) + """ + out_x = filter2d(input, kernel_x.unsqueeze(0), border_type, normalized, padding) + out = filter2d(out_x, kernel_y.unsqueeze(-1), border_type, normalized, padding) + return out + + +def filter3d( + input: torch.Tensor, kernel: torch.Tensor, border_type: str = 'replicate', normalized: bool = False +) -> torch.Tensor: + r"""Convolve a tensor with a 3d kernel. + + The function applies a given kernel to a tensor. The kernel is applied + independently at each depth channel of the tensor. Before applying the + kernel, the function applies padding according to the specified mode so + that the output remains in the same shape. + + Args: + input: the input tensor with shape of + :math:`(B, C, D, H, W)`. + kernel: the kernel to be convolved with the input + tensor. The kernel shape must be :math:`(1, kD, kH, kW)` or :math:`(B, kD, kH, kW)`. + border_type: the padding mode to be applied before convolving. + The expected modes are: ``'constant'``, + ``'replicate'`` or ``'circular'``. + normalized: If True, kernel will be L1 normalized. + + Return: + the convolved tensor of same size and numbers of channels + as the input with shape :math:`(B, C, D, H, W)`. + + Example: + >>> input = torch.tensor([[[ + ... [[0., 0., 0., 0., 0.], + ... [0., 0., 0., 0., 0.], + ... [0., 0., 0., 0., 0.], + ... [0., 0., 0., 0., 0.], + ... [0., 0., 0., 0., 0.]], + ... [[0., 0., 0., 0., 0.], + ... [0., 0., 0., 0., 0.], + ... [0., 0., 5., 0., 0.], + ... [0., 0., 0., 0., 0.], + ... [0., 0., 0., 0., 0.]], + ... [[0., 0., 0., 0., 0.], + ... [0., 0., 0., 0., 0.], + ... [0., 0., 0., 0., 0.], + ... [0., 0., 0., 0., 0.], + ... [0., 0., 0., 0., 0.]] + ... ]]]) + >>> kernel = torch.ones(1, 3, 3, 3) + >>> filter3d(input, kernel) + tensor([[[[[0., 0., 0., 0., 0.], + [0., 5., 5., 5., 0.], + [0., 5., 5., 5., 0.], + [0., 5., 5., 5., 0.], + [0., 0., 0., 0., 0.]], + + [[0., 0., 0., 0., 0.], + [0., 5., 5., 5., 0.], + [0., 5., 5., 5., 0.], + [0., 5., 5., 5., 0.], + [0., 0., 0., 0., 0.]], + + [[0., 0., 0., 0., 0.], + [0., 5., 5., 5., 0.], + [0., 5., 5., 5., 0.], + [0., 5., 5., 5., 0.], + [0., 0., 0., 0., 0.]]]]]) + """ + if not isinstance(input, torch.Tensor): + raise TypeError(f"Input border_type is not torch.Tensor. Got {type(input)}") + + if not isinstance(kernel, torch.Tensor): + raise TypeError(f"Input border_type is not torch.Tensor. Got {type(kernel)}") + + if not isinstance(border_type, str): + raise TypeError(f"Input border_type is not string. Got {type(kernel)}") + + if not len(input.shape) == 5: + raise ValueError(f"Invalid input shape, we expect BxCxDxHxW. 
Got: {input.shape}") + + if not len(kernel.shape) == 4 and kernel.shape[0] != 1: + raise ValueError(f"Invalid kernel shape, we expect 1xDxHxW. Got: {kernel.shape}") + + # prepare kernel + b, c, d, h, w = input.shape + tmp_kernel: torch.Tensor = kernel.unsqueeze(1).to(input) + + if normalized: + bk, dk, hk, wk = kernel.shape + tmp_kernel = normalize_kernel2d(tmp_kernel.view(bk, dk, hk * wk)).view_as(tmp_kernel) + + tmp_kernel = tmp_kernel.expand(-1, c, -1, -1, -1) + + # pad the input tensor + depth, height, width = tmp_kernel.shape[-3:] + padding_shape: List[int] = _compute_padding([depth, height, width]) + input_pad: torch.Tensor = F.pad(input, padding_shape, mode=border_type) + + # kernel and input tensor reshape to align element-wise or batch-wise params + tmp_kernel = tmp_kernel.reshape(-1, 1, depth, height, width) + input_pad = input_pad.view(-1, tmp_kernel.size(0), input_pad.size(-3), input_pad.size(-2), input_pad.size(-1)) + + # convolve the tensor with the kernel. + output = F.conv3d(input_pad, tmp_kernel, groups=tmp_kernel.size(0), padding=0, stride=1) + + return output.view(b, c, d, h, w) \ No newline at end of file diff --git a/backend/inpaint/video/model/canny/gaussian.py b/backend/inpaint/video/model/canny/gaussian.py new file mode 100644 index 0000000..182f05c --- /dev/null +++ b/backend/inpaint/video/model/canny/gaussian.py @@ -0,0 +1,116 @@ +from typing import Tuple + +import torch +import torch.nn as nn + +from .filter import filter2d, filter2d_separable +from .kernels import get_gaussian_kernel1d, get_gaussian_kernel2d + + +def gaussian_blur2d( + input: torch.Tensor, + kernel_size: Tuple[int, int], + sigma: Tuple[float, float], + border_type: str = 'reflect', + separable: bool = True, +) -> torch.Tensor: + r"""Create an operator that blurs a tensor using a Gaussian filter. + + .. image:: _static/img/gaussian_blur2d.png + + The operator smooths the given tensor with a gaussian kernel by convolving + it to each channel. It supports batched operation. + + Arguments: + input: the input tensor with shape :math:`(B,C,H,W)`. + kernel_size: the size of the kernel. + sigma: the standard deviation of the kernel. + border_type: the padding mode to be applied before convolving. + The expected modes are: ``'constant'``, ``'reflect'``, + ``'replicate'`` or ``'circular'``. Default: ``'reflect'``. + separable: run as composition of two 1d-convolutions. + + Returns: + the blurred tensor with shape :math:`(B, C, H, W)`. + + .. note:: + See a working example `here `__. + + Examples: + >>> input = torch.rand(2, 4, 5, 5) + >>> output = gaussian_blur2d(input, (3, 3), (1.5, 1.5)) + >>> output.shape + torch.Size([2, 4, 5, 5]) + """ + if separable: + kernel_x: torch.Tensor = get_gaussian_kernel1d(kernel_size[1], sigma[1]) + kernel_y: torch.Tensor = get_gaussian_kernel1d(kernel_size[0], sigma[0]) + out = filter2d_separable(input, kernel_x[None], kernel_y[None], border_type) + else: + kernel: torch.Tensor = get_gaussian_kernel2d(kernel_size, sigma) + out = filter2d(input, kernel[None], border_type) + return out + + +class GaussianBlur2d(nn.Module): + r"""Create an operator that blurs a tensor using a Gaussian filter. + + The operator smooths the given tensor with a gaussian kernel by convolving + it to each channel. It supports batched operation. + + Arguments: + kernel_size: the size of the kernel. + sigma: the standard deviation of the kernel. + border_type: the padding mode to be applied before convolving. 
+ The expected modes are: ``'constant'``, ``'reflect'``, + ``'replicate'`` or ``'circular'``. Default: ``'reflect'``. + separable: run as composition of two 1d-convolutions. + + Returns: + the blurred tensor. + + Shape: + - Input: :math:`(B, C, H, W)` + - Output: :math:`(B, C, H, W)` + + Examples:: + + >>> input = torch.rand(2, 4, 5, 5) + >>> gauss = GaussianBlur2d((3, 3), (1.5, 1.5)) + >>> output = gauss(input) # 2x4x5x5 + >>> output.shape + torch.Size([2, 4, 5, 5]) + """ + + def __init__( + self, + kernel_size: Tuple[int, int], + sigma: Tuple[float, float], + border_type: str = 'reflect', + separable: bool = True, + ) -> None: + super().__init__() + self.kernel_size: Tuple[int, int] = kernel_size + self.sigma: Tuple[float, float] = sigma + self.border_type = border_type + self.separable = separable + + def __repr__(self) -> str: + return ( + self.__class__.__name__ + + '(kernel_size=' + + str(self.kernel_size) + + ', ' + + 'sigma=' + + str(self.sigma) + + ', ' + + 'border_type=' + + self.border_type + + 'separable=' + + str(self.separable) + + ')' + ) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return gaussian_blur2d(input, self.kernel_size, self.sigma, self.border_type, self.separable) \ No newline at end of file diff --git a/backend/inpaint/video/model/canny/kernels.py b/backend/inpaint/video/model/canny/kernels.py new file mode 100644 index 0000000..ae1ee25 --- /dev/null +++ b/backend/inpaint/video/model/canny/kernels.py @@ -0,0 +1,690 @@ +import math +from math import sqrt +from typing import List, Optional, Tuple + +import torch + + +def normalize_kernel2d(input: torch.Tensor) -> torch.Tensor: + r"""Normalize both derivative and smoothing kernel.""" + if len(input.size()) < 2: + raise TypeError(f"input should be at least 2D tensor. Got {input.size()}") + norm: torch.Tensor = input.abs().sum(dim=-1).sum(dim=-1) + return input / (norm.unsqueeze(-1).unsqueeze(-1)) + + +def gaussian(window_size: int, sigma: float) -> torch.Tensor: + device, dtype = None, None + if isinstance(sigma, torch.Tensor): + device, dtype = sigma.device, sigma.dtype + x = torch.arange(window_size, device=device, dtype=dtype) - window_size // 2 + if window_size % 2 == 0: + x = x + 0.5 + + gauss = torch.exp((-x.pow(2.0) / (2 * sigma**2)).float()) + return gauss / gauss.sum() + + +def gaussian_discrete_erf(window_size: int, sigma) -> torch.Tensor: + r"""Discrete Gaussian by interpolating the error function. 
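+
+    Each coefficient is the Gaussian probability mass of a unit-wide bin,
+    0.5 * (erf((x + 0.5) / (sigma * sqrt(2))) - erf((x - 0.5) / (sigma * sqrt(2)))),
+    which stays accurate for small sigma where plain point sampling would
+    under-resolve the peak; the result is then renormalized to sum to one.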
+ + Adapted from: + https://github.com/Project-MONAI/MONAI/blob/master/monai/networks/layers/convutils.py + """ + device = sigma.device if isinstance(sigma, torch.Tensor) else None + sigma = torch.as_tensor(sigma, dtype=torch.float, device=device) + x = torch.arange(window_size).float() - window_size // 2 + t = 0.70710678 / torch.abs(sigma) + gauss = 0.5 * ((t * (x + 0.5)).erf() - (t * (x - 0.5)).erf()) + gauss = gauss.clamp(min=0) + return gauss / gauss.sum() + + +def _modified_bessel_0(x: torch.Tensor) -> torch.Tensor: + r"""Adapted from: + + https://github.com/Project-MONAI/MONAI/blob/master/monai/networks/layers/convutils.py + """ + if torch.abs(x) < 3.75: + y = (x / 3.75) * (x / 3.75) + return 1.0 + y * ( + 3.5156229 + y * (3.0899424 + y * (1.2067492 + y * (0.2659732 + y * (0.360768e-1 + y * 0.45813e-2)))) + ) + ax = torch.abs(x) + y = 3.75 / ax + ans = 0.916281e-2 + y * (-0.2057706e-1 + y * (0.2635537e-1 + y * (-0.1647633e-1 + y * 0.392377e-2))) + coef = 0.39894228 + y * (0.1328592e-1 + y * (0.225319e-2 + y * (-0.157565e-2 + y * ans))) + return (torch.exp(ax) / torch.sqrt(ax)) * coef + + +def _modified_bessel_1(x: torch.Tensor) -> torch.Tensor: + r"""adapted from: + + https://github.com/Project-MONAI/MONAI/blob/master/monai/networks/layers/convutils.py + """ + if torch.abs(x) < 3.75: + y = (x / 3.75) * (x / 3.75) + ans = 0.51498869 + y * (0.15084934 + y * (0.2658733e-1 + y * (0.301532e-2 + y * 0.32411e-3))) + return torch.abs(x) * (0.5 + y * (0.87890594 + y * ans)) + ax = torch.abs(x) + y = 3.75 / ax + ans = 0.2282967e-1 + y * (-0.2895312e-1 + y * (0.1787654e-1 - y * 0.420059e-2)) + ans = 0.39894228 + y * (-0.3988024e-1 + y * (-0.362018e-2 + y * (0.163801e-2 + y * (-0.1031555e-1 + y * ans)))) + ans = ans * torch.exp(ax) / torch.sqrt(ax) + return -ans if x < 0.0 else ans + + +def _modified_bessel_i(n: int, x: torch.Tensor) -> torch.Tensor: + r"""adapted from: + + https://github.com/Project-MONAI/MONAI/blob/master/monai/networks/layers/convutils.py + """ + if n < 2: + raise ValueError("n must be greater than 1.") + if x == 0.0: + return x + device = x.device + tox = 2.0 / torch.abs(x) + ans = torch.tensor(0.0, device=device) + bip = torch.tensor(0.0, device=device) + bi = torch.tensor(1.0, device=device) + m = int(2 * (n + int(sqrt(40.0 * n)))) + for j in range(m, 0, -1): + bim = bip + float(j) * tox * bi + bip = bi + bi = bim + if abs(bi) > 1.0e10: + ans = ans * 1.0e-10 + bi = bi * 1.0e-10 + bip = bip * 1.0e-10 + if j == n: + ans = bip + ans = ans * _modified_bessel_0(x) / bi + return -ans if x < 0.0 and (n % 2) == 1 else ans + + +def gaussian_discrete(window_size, sigma) -> torch.Tensor: + r"""Discrete Gaussian kernel based on the modified Bessel functions. 
+ + Adapted from: + https://github.com/Project-MONAI/MONAI/blob/master/monai/networks/layers/convutils.py + """ + device = sigma.device if isinstance(sigma, torch.Tensor) else None + sigma = torch.as_tensor(sigma, dtype=torch.float, device=device) + sigma2 = sigma * sigma + tail = int(window_size // 2) + out_pos: List[Optional[torch.Tensor]] = [None] * (tail + 1) + out_pos[0] = _modified_bessel_0(sigma2) + out_pos[1] = _modified_bessel_1(sigma2) + for k in range(2, len(out_pos)): + out_pos[k] = _modified_bessel_i(k, sigma2) + out = out_pos[:0:-1] + out.extend(out_pos) + out = torch.stack(out) * torch.exp(sigma2) # type: ignore + return out / out.sum() # type: ignore + + +def laplacian_1d(window_size) -> torch.Tensor: + r"""One could also use the Laplacian of Gaussian formula to design the filter.""" + + filter_1d = torch.ones(window_size) + filter_1d[window_size // 2] = 1 - window_size + laplacian_1d: torch.Tensor = filter_1d + return laplacian_1d + + +def get_box_kernel2d(kernel_size: Tuple[int, int]) -> torch.Tensor: + r"""Utility function that returns a box filter.""" + kx: float = float(kernel_size[0]) + ky: float = float(kernel_size[1]) + scale: torch.Tensor = torch.tensor(1.0) / torch.tensor([kx * ky]) + tmp_kernel: torch.Tensor = torch.ones(1, kernel_size[0], kernel_size[1]) + return scale.to(tmp_kernel.dtype) * tmp_kernel + + +def get_binary_kernel2d(window_size: Tuple[int, int]) -> torch.Tensor: + r"""Create a binary kernel to extract the patches. + + If the window size is HxW will create a (H*W)xHxW kernel. + """ + window_range: int = window_size[0] * window_size[1] + kernel: torch.Tensor = torch.zeros(window_range, window_range) + for i in range(window_range): + kernel[i, i] += 1.0 + return kernel.view(window_range, 1, window_size[0], window_size[1]) + + +def get_sobel_kernel_3x3() -> torch.Tensor: + """Utility function that returns a sobel kernel of 3x3.""" + return torch.tensor([[-1.0, 0.0, 1.0], [-2.0, 0.0, 2.0], [-1.0, 0.0, 1.0]]) + + +def get_sobel_kernel_5x5_2nd_order() -> torch.Tensor: + """Utility function that returns a 2nd order sobel kernel of 5x5.""" + return torch.tensor( + [ + [-1.0, 0.0, 2.0, 0.0, -1.0], + [-4.0, 0.0, 8.0, 0.0, -4.0], + [-6.0, 0.0, 12.0, 0.0, -6.0], + [-4.0, 0.0, 8.0, 0.0, -4.0], + [-1.0, 0.0, 2.0, 0.0, -1.0], + ] + ) + + +def _get_sobel_kernel_5x5_2nd_order_xy() -> torch.Tensor: + """Utility function that returns a 2nd order sobel kernel of 5x5.""" + return torch.tensor( + [ + [-1.0, -2.0, 0.0, 2.0, 1.0], + [-2.0, -4.0, 0.0, 4.0, 2.0], + [0.0, 0.0, 0.0, 0.0, 0.0], + [2.0, 4.0, 0.0, -4.0, -2.0], + [1.0, 2.0, 0.0, -2.0, -1.0], + ] + ) + + +def get_diff_kernel_3x3() -> torch.Tensor: + """Utility function that returns a first order derivative kernel of 3x3.""" + return torch.tensor([[-0.0, 0.0, 0.0], [-1.0, 0.0, 1.0], [-0.0, 0.0, 0.0]]) + + +def get_diff_kernel3d(device=torch.device('cpu'), dtype=torch.float) -> torch.Tensor: + """Utility function that returns a first order derivative kernel of 3x3x3.""" + kernel: torch.Tensor = torch.tensor( + [ + [ + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + [[0.0, 0.0, 0.0], [-0.5, 0.0, 0.5], [0.0, 0.0, 0.0]], + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + ], + [ + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + [[0.0, -0.5, 0.0], [0.0, 0.0, 0.0], [0.0, 0.5, 0.0]], + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + ], + [ + [[0.0, 0.0, 0.0], [0.0, -0.5, 0.0], [0.0, 0.0, 0.0]], + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + [[0.0, 0.0, 0.0], [0.0, 0.5, 0.0], [0.0, 0.0, 
0.0]], + ], + ], + device=device, + dtype=dtype, + ) + return kernel.unsqueeze(1) + + +def get_diff_kernel3d_2nd_order(device=torch.device('cpu'), dtype=torch.float) -> torch.Tensor: + """Utility function that returns a first order derivative kernel of 3x3x3.""" + kernel: torch.Tensor = torch.tensor( + [ + [ + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + [[0.0, 0.0, 0.0], [1.0, -2.0, 1.0], [0.0, 0.0, 0.0]], + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + ], + [ + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + [[0.0, 1.0, 0.0], [0.0, -2.0, 0.0], [0.0, 1.0, 0.0]], + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + ], + [ + [[0.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 0.0]], + [[0.0, 0.0, 0.0], [0.0, -2.0, 0.0], [0.0, 0.0, 0.0]], + [[0.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 0.0]], + ], + [ + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + [[1.0, 0.0, -1.0], [0.0, 0.0, 0.0], [-1.0, 0.0, 1.0]], + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + ], + [ + [[0.0, 1.0, 0.0], [0.0, 0.0, 0.0], [0.0, -1.0, 0.0]], + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + [[0.0, -1.0, 0.0], [0.0, 0.0, 0.0], [0.0, 1.0, 0.0]], + ], + [ + [[0.0, 0.0, 0.0], [1.0, 0.0, -1.0], [0.0, 0.0, 0.0]], + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + [[0.0, 0.0, 0.0], [-1.0, 0.0, 1.0], [0.0, 0.0, 0.0]], + ], + ], + device=device, + dtype=dtype, + ) + return kernel.unsqueeze(1) + + +def get_sobel_kernel2d() -> torch.Tensor: + kernel_x: torch.Tensor = get_sobel_kernel_3x3() + kernel_y: torch.Tensor = kernel_x.transpose(0, 1) + return torch.stack([kernel_x, kernel_y]) + + +def get_diff_kernel2d() -> torch.Tensor: + kernel_x: torch.Tensor = get_diff_kernel_3x3() + kernel_y: torch.Tensor = kernel_x.transpose(0, 1) + return torch.stack([kernel_x, kernel_y]) + + +def get_sobel_kernel2d_2nd_order() -> torch.Tensor: + gxx: torch.Tensor = get_sobel_kernel_5x5_2nd_order() + gyy: torch.Tensor = gxx.transpose(0, 1) + gxy: torch.Tensor = _get_sobel_kernel_5x5_2nd_order_xy() + return torch.stack([gxx, gxy, gyy]) + + +def get_diff_kernel2d_2nd_order() -> torch.Tensor: + gxx: torch.Tensor = torch.tensor([[0.0, 0.0, 0.0], [1.0, -2.0, 1.0], [0.0, 0.0, 0.0]]) + gyy: torch.Tensor = gxx.transpose(0, 1) + gxy: torch.Tensor = torch.tensor([[-1.0, 0.0, 1.0], [0.0, 0.0, 0.0], [1.0, 0.0, -1.0]]) + return torch.stack([gxx, gxy, gyy]) + + +def get_spatial_gradient_kernel2d(mode: str, order: int) -> torch.Tensor: + r"""Function that returns kernel for 1st or 2nd order image gradients, using one of the following operators: + + sobel, diff. + """ + if mode not in ['sobel', 'diff']: + raise TypeError( + "mode should be either sobel\ + or diff. 
Got {}".format( + mode + ) + ) + if order not in [1, 2]: + raise TypeError( + "order should be either 1 or 2\ + Got {}".format( + order + ) + ) + if mode == 'sobel' and order == 1: + kernel: torch.Tensor = get_sobel_kernel2d() + elif mode == 'sobel' and order == 2: + kernel = get_sobel_kernel2d_2nd_order() + elif mode == 'diff' and order == 1: + kernel = get_diff_kernel2d() + elif mode == 'diff' and order == 2: + kernel = get_diff_kernel2d_2nd_order() + else: + raise NotImplementedError("") + return kernel + + +def get_spatial_gradient_kernel3d(mode: str, order: int, device=torch.device('cpu'), dtype=torch.float) -> torch.Tensor: + r"""Function that returns kernel for 1st or 2nd order scale pyramid gradients, using one of the following + operators: sobel, diff.""" + if mode not in ['sobel', 'diff']: + raise TypeError( + "mode should be either sobel\ + or diff. Got {}".format( + mode + ) + ) + if order not in [1, 2]: + raise TypeError( + "order should be either 1 or 2\ + Got {}".format( + order + ) + ) + if mode == 'sobel': + raise NotImplementedError("Sobel kernel for 3d gradient is not implemented yet") + if mode == 'diff' and order == 1: + kernel = get_diff_kernel3d(device, dtype) + elif mode == 'diff' and order == 2: + kernel = get_diff_kernel3d_2nd_order(device, dtype) + else: + raise NotImplementedError("") + return kernel + + +def get_gaussian_kernel1d(kernel_size: int, sigma: float, force_even: bool = False) -> torch.Tensor: + r"""Function that returns Gaussian filter coefficients. + + Args: + kernel_size: filter size. It should be odd and positive. + sigma: gaussian standard deviation. + force_even: overrides requirement for odd kernel size. + + Returns: + 1D tensor with gaussian filter coefficients. + + Shape: + - Output: :math:`(\text{kernel_size})` + + Examples: + + >>> get_gaussian_kernel1d(3, 2.5) + tensor([0.3243, 0.3513, 0.3243]) + + >>> get_gaussian_kernel1d(5, 1.5) + tensor([0.1201, 0.2339, 0.2921, 0.2339, 0.1201]) + """ + if not isinstance(kernel_size, int) or ((kernel_size % 2 == 0) and not force_even) or (kernel_size <= 0): + raise TypeError("kernel_size must be an odd positive integer. " "Got {}".format(kernel_size)) + window_1d: torch.Tensor = gaussian(kernel_size, sigma) + return window_1d + + +def get_gaussian_discrete_kernel1d(kernel_size: int, sigma: float, force_even: bool = False) -> torch.Tensor: + r"""Function that returns Gaussian filter coefficients based on the modified Bessel functions. Adapted from: + https://github.com/Project-MONAI/MONAI/blob/master/monai/networks/layers/convutils.py. + + Args: + kernel_size: filter size. It should be odd and positive. + sigma: gaussian standard deviation. + force_even: overrides requirement for odd kernel size. + + Returns: + 1D tensor with gaussian filter coefficients. + + Shape: + - Output: :math:`(\text{kernel_size})` + + Examples: + + >>> get_gaussian_discrete_kernel1d(3, 2.5) + tensor([0.3235, 0.3531, 0.3235]) + + >>> get_gaussian_discrete_kernel1d(5, 1.5) + tensor([0.1096, 0.2323, 0.3161, 0.2323, 0.1096]) + """ + if not isinstance(kernel_size, int) or ((kernel_size % 2 == 0) and not force_even) or (kernel_size <= 0): + raise TypeError("kernel_size must be an odd positive integer. 
" "Got {}".format(kernel_size)) + window_1d = gaussian_discrete(kernel_size, sigma) + return window_1d + + +def get_gaussian_erf_kernel1d(kernel_size: int, sigma: float, force_even: bool = False) -> torch.Tensor: + r"""Function that returns Gaussian filter coefficients by interpolating the error function, adapted from: + https://github.com/Project-MONAI/MONAI/blob/master/monai/networks/layers/convutils.py. + + Args: + kernel_size: filter size. It should be odd and positive. + sigma: gaussian standard deviation. + force_even: overrides requirement for odd kernel size. + + Returns: + 1D tensor with gaussian filter coefficients. + + Shape: + - Output: :math:`(\text{kernel_size})` + + Examples: + + >>> get_gaussian_erf_kernel1d(3, 2.5) + tensor([0.3245, 0.3511, 0.3245]) + + >>> get_gaussian_erf_kernel1d(5, 1.5) + tensor([0.1226, 0.2331, 0.2887, 0.2331, 0.1226]) + """ + if not isinstance(kernel_size, int) or ((kernel_size % 2 == 0) and not force_even) or (kernel_size <= 0): + raise TypeError("kernel_size must be an odd positive integer. " "Got {}".format(kernel_size)) + window_1d = gaussian_discrete_erf(kernel_size, sigma) + return window_1d + + +def get_gaussian_kernel2d( + kernel_size: Tuple[int, int], sigma: Tuple[float, float], force_even: bool = False +) -> torch.Tensor: + r"""Function that returns Gaussian filter matrix coefficients. + + Args: + kernel_size: filter sizes in the x and y direction. + Sizes should be odd and positive. + sigma: gaussian standard deviation in the x and y + direction. + force_even: overrides requirement for odd kernel size. + + Returns: + 2D tensor with gaussian filter matrix coefficients. + + Shape: + - Output: :math:`(\text{kernel_size}_x, \text{kernel_size}_y)` + + Examples: + >>> get_gaussian_kernel2d((3, 3), (1.5, 1.5)) + tensor([[0.0947, 0.1183, 0.0947], + [0.1183, 0.1478, 0.1183], + [0.0947, 0.1183, 0.0947]]) + >>> get_gaussian_kernel2d((3, 5), (1.5, 1.5)) + tensor([[0.0370, 0.0720, 0.0899, 0.0720, 0.0370], + [0.0462, 0.0899, 0.1123, 0.0899, 0.0462], + [0.0370, 0.0720, 0.0899, 0.0720, 0.0370]]) + """ + if not isinstance(kernel_size, tuple) or len(kernel_size) != 2: + raise TypeError(f"kernel_size must be a tuple of length two. Got {kernel_size}") + if not isinstance(sigma, tuple) or len(sigma) != 2: + raise TypeError(f"sigma must be a tuple of length two. Got {sigma}") + ksize_x, ksize_y = kernel_size + sigma_x, sigma_y = sigma + kernel_x: torch.Tensor = get_gaussian_kernel1d(ksize_x, sigma_x, force_even) + kernel_y: torch.Tensor = get_gaussian_kernel1d(ksize_y, sigma_y, force_even) + kernel_2d: torch.Tensor = torch.matmul(kernel_x.unsqueeze(-1), kernel_y.unsqueeze(-1).t()) + return kernel_2d + + +def get_laplacian_kernel1d(kernel_size: int) -> torch.Tensor: + r"""Function that returns the coefficients of a 1D Laplacian filter. + + Args: + kernel_size: filter size. It should be odd and positive. + + Returns: + 1D tensor with laplacian filter coefficients. + + Shape: + - Output: math:`(\text{kernel_size})` + + Examples: + >>> get_laplacian_kernel1d(3) + tensor([ 1., -2., 1.]) + >>> get_laplacian_kernel1d(5) + tensor([ 1., 1., -4., 1., 1.]) + """ + if not isinstance(kernel_size, int) or kernel_size % 2 == 0 or kernel_size <= 0: + raise TypeError(f"ksize must be an odd positive integer. Got {kernel_size}") + window_1d: torch.Tensor = laplacian_1d(kernel_size) + return window_1d + + +def get_laplacian_kernel2d(kernel_size: int) -> torch.Tensor: + r"""Function that returns Gaussian filter matrix coefficients. 
+ + Args: + kernel_size: filter size should be odd. + + Returns: + 2D tensor with laplacian filter matrix coefficients. + + Shape: + - Output: :math:`(\text{kernel_size}_x, \text{kernel_size}_y)` + + Examples: + >>> get_laplacian_kernel2d(3) + tensor([[ 1., 1., 1.], + [ 1., -8., 1.], + [ 1., 1., 1.]]) + >>> get_laplacian_kernel2d(5) + tensor([[ 1., 1., 1., 1., 1.], + [ 1., 1., 1., 1., 1.], + [ 1., 1., -24., 1., 1.], + [ 1., 1., 1., 1., 1.], + [ 1., 1., 1., 1., 1.]]) + """ + if not isinstance(kernel_size, int) or kernel_size % 2 == 0 or kernel_size <= 0: + raise TypeError(f"ksize must be an odd positive integer. Got {kernel_size}") + + kernel = torch.ones((kernel_size, kernel_size)) + mid = kernel_size // 2 + kernel[mid, mid] = 1 - kernel_size**2 + kernel_2d: torch.Tensor = kernel + return kernel_2d + + +def get_pascal_kernel_2d(kernel_size: int, norm: bool = True) -> torch.Tensor: + """Generate pascal filter kernel by kernel size. + + Args: + kernel_size: height and width of the kernel. + norm: if to normalize the kernel or not. Default: True. + + Returns: + kernel shaped as :math:`(kernel_size, kernel_size)` + + Examples: + >>> get_pascal_kernel_2d(1) + tensor([[1.]]) + >>> get_pascal_kernel_2d(4) + tensor([[0.0156, 0.0469, 0.0469, 0.0156], + [0.0469, 0.1406, 0.1406, 0.0469], + [0.0469, 0.1406, 0.1406, 0.0469], + [0.0156, 0.0469, 0.0469, 0.0156]]) + >>> get_pascal_kernel_2d(4, norm=False) + tensor([[1., 3., 3., 1.], + [3., 9., 9., 3.], + [3., 9., 9., 3.], + [1., 3., 3., 1.]]) + """ + a = get_pascal_kernel_1d(kernel_size) + + filt = a[:, None] * a[None, :] + if norm: + filt = filt / torch.sum(filt) + return filt + + +def get_pascal_kernel_1d(kernel_size: int, norm: bool = False) -> torch.Tensor: + """Generate Yang Hui triangle (Pascal's triangle) by a given number. + + Args: + kernel_size: height and width of the kernel. + norm: if to normalize the kernel or not. Default: False. 
+ + Returns: + kernel shaped as :math:`(kernel_size,)` + + Examples: + >>> get_pascal_kernel_1d(1) + tensor([1.]) + >>> get_pascal_kernel_1d(2) + tensor([1., 1.]) + >>> get_pascal_kernel_1d(3) + tensor([1., 2., 1.]) + >>> get_pascal_kernel_1d(4) + tensor([1., 3., 3., 1.]) + >>> get_pascal_kernel_1d(5) + tensor([1., 4., 6., 4., 1.]) + >>> get_pascal_kernel_1d(6) + tensor([ 1., 5., 10., 10., 5., 1.]) + """ + pre: List[float] = [] + cur: List[float] = [] + for i in range(kernel_size): + cur = [1.0] * (i + 1) + + for j in range(1, i // 2 + 1): + value = pre[j - 1] + pre[j] + cur[j] = value + if i != 2 * j: + cur[-j - 1] = value + pre = cur + + out = torch.as_tensor(cur) + if norm: + out = out / torch.sum(out) + return out + + +def get_canny_nms_kernel(device=torch.device('cpu'), dtype=torch.float) -> torch.Tensor: + """Utility function that returns 3x3 kernels for the Canny Non-maximal suppression.""" + kernel: torch.Tensor = torch.tensor( + [ + [[0.0, 0.0, 0.0], [0.0, 1.0, -1.0], [0.0, 0.0, 0.0]], + [[0.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, -1.0]], + [[0.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, -1.0, 0.0]], + [[0.0, 0.0, 0.0], [0.0, 1.0, 0.0], [-1.0, 0.0, 0.0]], + [[0.0, 0.0, 0.0], [-1.0, 1.0, 0.0], [0.0, 0.0, 0.0]], + [[-1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 0.0]], + [[0.0, -1.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 0.0]], + [[0.0, 0.0, -1.0], [0.0, 1.0, 0.0], [0.0, 0.0, 0.0]], + ], + device=device, + dtype=dtype, + ) + return kernel.unsqueeze(1) + + +def get_hysteresis_kernel(device=torch.device('cpu'), dtype=torch.float) -> torch.Tensor: + """Utility function that returns the 3x3 kernels for the Canny hysteresis.""" + kernel: torch.Tensor = torch.tensor( + [ + [[0.0, 0.0, 0.0], [0.0, 0.0, 1.0], [0.0, 0.0, 0.0]], + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 1.0]], + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 1.0, 0.0]], + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [1.0, 0.0, 0.0]], + [[0.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + [[1.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + [[0.0, 1.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + [[0.0, 0.0, 1.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + ], + device=device, + dtype=dtype, + ) + return kernel.unsqueeze(1) + + +def get_hanning_kernel1d(kernel_size: int, device=torch.device('cpu'), dtype=torch.float) -> torch.Tensor: + r"""Returns Hanning (also known as Hann) kernel, used in signal processing and KCF tracker. + + .. math:: w(n) = 0.5 - 0.5cos\\left(\\frac{2\\pi{n}}{M-1}\\right) + \\qquad 0 \\leq n \\leq M-1 + + See further in numpy docs https://numpy.org/doc/stable/reference/generated/numpy.hanning.html + + Args: + kernel_size: The size the of the kernel. It should be positive. + + Returns: + 1D tensor with Hanning filter coefficients. + .. math:: w(n) = 0.5 - 0.5cos\\left(\\frac{2\\pi{n}}{M-1}\\right) + + Shape: + - Output: math:`(\text{kernel_size})` + + Examples: + >>> get_hanning_kernel1d(4) + tensor([0.0000, 0.7500, 0.7500, 0.0000]) + """ + if not isinstance(kernel_size, int) or kernel_size <= 2: + raise TypeError(f"ksize must be an positive integer > 2. Got {kernel_size}") + + x: torch.Tensor = torch.arange(kernel_size, device=device, dtype=dtype) + x = 0.5 - 0.5 * torch.cos(2.0 * math.pi * x / float(kernel_size - 1)) + return x + + +def get_hanning_kernel2d(kernel_size: Tuple[int, int], device=torch.device('cpu'), dtype=torch.float) -> torch.Tensor: + r"""Returns 2d Hanning kernel, used in signal processing and KCF tracker. + + Args: + kernel_size: The size of the kernel for the filter. It should be positive. 
+ + Returns: + 2D tensor with Hanning filter coefficients. + .. math:: w(n) = 0.5 - 0.5cos\\left(\\frac{2\\pi{n}}{M-1}\\right) + + Shape: + - Output: math:`(\text{kernel_size[0], kernel_size[1]})` + """ + if kernel_size[0] <= 2 or kernel_size[1] <= 2: + raise TypeError(f"ksize must be an tuple of positive integers > 2. Got {kernel_size}") + ky: torch.Tensor = get_hanning_kernel1d(kernel_size[0], device, dtype)[None].T + kx: torch.Tensor = get_hanning_kernel1d(kernel_size[1], device, dtype)[None] + kernel2d = ky @ kx + return kernel2d \ No newline at end of file diff --git a/backend/inpaint/video/model/canny/sobel.py b/backend/inpaint/video/model/canny/sobel.py new file mode 100644 index 0000000..d780c5c --- /dev/null +++ b/backend/inpaint/video/model/canny/sobel.py @@ -0,0 +1,263 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .kernels import get_spatial_gradient_kernel2d, get_spatial_gradient_kernel3d, normalize_kernel2d + + +def spatial_gradient(input: torch.Tensor, mode: str = 'sobel', order: int = 1, normalized: bool = True) -> torch.Tensor: + r"""Compute the first order image derivative in both x and y using a Sobel operator. + + .. image:: _static/img/spatial_gradient.png + + Args: + input: input image tensor with shape :math:`(B, C, H, W)`. + mode: derivatives modality, can be: `sobel` or `diff`. + order: the order of the derivatives. + normalized: whether the output is normalized. + + Return: + the derivatives of the input feature map. with shape :math:`(B, C, 2, H, W)`. + + .. note:: + See a working example `here `__. + + Examples: + >>> input = torch.rand(1, 3, 4, 4) + >>> output = spatial_gradient(input) # 1x3x2x4x4 + >>> output.shape + torch.Size([1, 3, 2, 4, 4]) + """ + if not isinstance(input, torch.Tensor): + raise TypeError(f"Input type is not a torch.Tensor. Got {type(input)}") + + if not len(input.shape) == 4: + raise ValueError(f"Invalid input shape, we expect BxCxHxW. Got: {input.shape}") + # allocate kernel + kernel: torch.Tensor = get_spatial_gradient_kernel2d(mode, order) + if normalized: + kernel = normalize_kernel2d(kernel) + + # prepare kernel + b, c, h, w = input.shape + tmp_kernel: torch.Tensor = kernel.to(input).detach() + tmp_kernel = tmp_kernel.unsqueeze(1).unsqueeze(1) + + # convolve input tensor with sobel kernel + kernel_flip: torch.Tensor = tmp_kernel.flip(-3) + + # Pad with "replicate for spatial dims, but with zeros for channel + spatial_pad = [kernel.size(1) // 2, kernel.size(1) // 2, kernel.size(2) // 2, kernel.size(2) // 2] + out_channels: int = 3 if order == 2 else 2 + padded_inp: torch.Tensor = F.pad(input.reshape(b * c, 1, h, w), spatial_pad, 'replicate')[:, :, None] + + return F.conv3d(padded_inp, kernel_flip, padding=0).view(b, c, out_channels, h, w) + + +def spatial_gradient3d(input: torch.Tensor, mode: str = 'diff', order: int = 1) -> torch.Tensor: + r"""Compute the first and second order volume derivative in x, y and d using a diff operator. + + Args: + input: input features tensor with shape :math:`(B, C, D, H, W)`. + mode: derivatives modality, can be: `sobel` or `diff`. + order: the order of the derivatives. + + Return: + the spatial gradients of the input feature map with shape math:`(B, C, 3, D, H, W)` + or :math:`(B, C, 6, D, H, W)`. + + Examples: + >>> input = torch.rand(1, 4, 2, 4, 4) + >>> output = spatial_gradient3d(input) + >>> output.shape + torch.Size([1, 4, 3, 2, 4, 4]) + """ + if not isinstance(input, torch.Tensor): + raise TypeError(f"Input type is not a torch.Tensor. 
Got {type(input)}") + + if not len(input.shape) == 5: + raise ValueError(f"Invalid input shape, we expect BxCxDxHxW. Got: {input.shape}") + b, c, d, h, w = input.shape + dev = input.device + dtype = input.dtype + if (mode == 'diff') and (order == 1): + # we go for the special case implementation due to conv3d bad speed + x: torch.Tensor = F.pad(input, 6 * [1], 'replicate') + center = slice(1, -1) + left = slice(0, -2) + right = slice(2, None) + out = torch.empty(b, c, 3, d, h, w, device=dev, dtype=dtype) + out[..., 0, :, :, :] = x[..., center, center, right] - x[..., center, center, left] + out[..., 1, :, :, :] = x[..., center, right, center] - x[..., center, left, center] + out[..., 2, :, :, :] = x[..., right, center, center] - x[..., left, center, center] + out = 0.5 * out + else: + # prepare kernel + # allocate kernel + kernel: torch.Tensor = get_spatial_gradient_kernel3d(mode, order) + + tmp_kernel: torch.Tensor = kernel.to(input).detach() + tmp_kernel = tmp_kernel.repeat(c, 1, 1, 1, 1) + + # convolve input tensor with grad kernel + kernel_flip: torch.Tensor = tmp_kernel.flip(-3) + + # Pad with "replicate for spatial dims, but with zeros for channel + spatial_pad = [ + kernel.size(2) // 2, + kernel.size(2) // 2, + kernel.size(3) // 2, + kernel.size(3) // 2, + kernel.size(4) // 2, + kernel.size(4) // 2, + ] + out_ch: int = 6 if order == 2 else 3 + out = F.conv3d(F.pad(input, spatial_pad, 'replicate'), kernel_flip, padding=0, groups=c).view( + b, c, out_ch, d, h, w + ) + return out + + +def sobel(input: torch.Tensor, normalized: bool = True, eps: float = 1e-6) -> torch.Tensor: + r"""Compute the Sobel operator and returns the magnitude per channel. + + .. image:: _static/img/sobel.png + + Args: + input: the input image with shape :math:`(B,C,H,W)`. + normalized: if True, L1 norm of the kernel is set to 1. + eps: regularization number to avoid NaN during backprop. + + Return: + the sobel edge gradient magnitudes map with shape :math:`(B,C,H,W)`. + + .. note:: + See a working example `here `__. + + Example: + >>> input = torch.rand(1, 3, 4, 4) + >>> output = sobel(input) # 1x3x4x4 + >>> output.shape + torch.Size([1, 3, 4, 4]) + """ + if not isinstance(input, torch.Tensor): + raise TypeError(f"Input type is not a torch.Tensor. Got {type(input)}") + + if not len(input.shape) == 4: + raise ValueError(f"Invalid input shape, we expect BxCxHxW. Got: {input.shape}") + + # comput the x/y gradients + edges: torch.Tensor = spatial_gradient(input, normalized=normalized) + + # unpack the edges + gx: torch.Tensor = edges[:, :, 0] + gy: torch.Tensor = edges[:, :, 1] + + # compute gradient maginitude + magnitude: torch.Tensor = torch.sqrt(gx * gx + gy * gy + eps) + + return magnitude + + +class SpatialGradient(nn.Module): + r"""Compute the first order image derivative in both x and y using a Sobel operator. + + Args: + mode: derivatives modality, can be: `sobel` or `diff`. + order: the order of the derivatives. + normalized: whether the output is normalized. + + Return: + the sobel edges of the input feature map. 
+ + Shape: + - Input: :math:`(B, C, H, W)` + - Output: :math:`(B, C, 2, H, W)` + + Examples: + >>> input = torch.rand(1, 3, 4, 4) + >>> output = SpatialGradient()(input) # 1x3x2x4x4 + """ + + def __init__(self, mode: str = 'sobel', order: int = 1, normalized: bool = True) -> None: + super().__init__() + self.normalized: bool = normalized + self.order: int = order + self.mode: str = mode + + def __repr__(self) -> str: + return ( + self.__class__.__name__ + '(' + 'order=' + str(self.order) + ', ' + 'normalized=' + str(self.normalized) + ', ' + 'mode=' + self.mode + ')' + ) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return spatial_gradient(input, self.mode, self.order, self.normalized) + + +class SpatialGradient3d(nn.Module): + r"""Compute the first and second order volume derivative in x, y and d using a diff operator. + + Args: + mode: derivatives modality, can be: `sobel` or `diff`. + order: the order of the derivatives. + + Return: + the spatial gradients of the input feature map. + + Shape: + - Input: :math:`(B, C, D, H, W)`. D, H, W are spatial dimensions, gradient is calculated w.r.t to them. + - Output: :math:`(B, C, 3, D, H, W)` or :math:`(B, C, 6, D, H, W)` + + Examples: + >>> input = torch.rand(1, 4, 2, 4, 4) + >>> output = SpatialGradient3d()(input) + >>> output.shape + torch.Size([1, 4, 3, 2, 4, 4]) + """ + + def __init__(self, mode: str = 'diff', order: int = 1) -> None: + super().__init__() + self.order: int = order + self.mode: str = mode + self.kernel = get_spatial_gradient_kernel3d(mode, order) + return + + def __repr__(self) -> str: + return self.__class__.__name__ + '(' 'order=' + str(self.order) + ', ' + 'mode=' + self.mode + ')' + + def forward(self, input: torch.Tensor) -> torch.Tensor: # type: ignore + return spatial_gradient3d(input, self.mode, self.order) + + +class Sobel(nn.Module): + r"""Compute the Sobel operator and returns the magnitude per channel. + + Args: + normalized: if True, L1 norm of the kernel is set to 1. + eps: regularization number to avoid NaN during backprop. + + Return: + the sobel edge gradient magnitudes map. + + Shape: + - Input: :math:`(B, C, H, W)` + - Output: :math:`(B, C, H, W)` + + Examples: + >>> input = torch.rand(1, 3, 4, 4) + >>> output = Sobel()(input) # 1x3x4x4 + """ + + def __init__(self, normalized: bool = True, eps: float = 1e-6) -> None: + super().__init__() + self.normalized: bool = normalized + self.eps: float = eps + + def __repr__(self) -> str: + return self.__class__.__name__ + '(' 'normalized=' + str(self.normalized) + ')' + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return sobel(input, self.normalized, self.eps) \ No newline at end of file diff --git a/backend/inpaint/video/model/misc.py b/backend/inpaint/video/model/misc.py new file mode 100644 index 0000000..097c67a --- /dev/null +++ b/backend/inpaint/video/model/misc.py @@ -0,0 +1,133 @@ +import os +import re +import random +import time +import torch +import torch.nn as nn +import logging +import numpy as np +from os import path as osp + +def constant_init(module, val, bias=0): + if hasattr(module, 'weight') and module.weight is not None: + nn.init.constant_(module.weight, val) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + +initialized_logger = {} +def get_root_logger(logger_name='basicsr', log_level=logging.INFO, log_file=None): + """Get the root logger. + The logger will be initialized if it has not been initialized. By default a + StreamHandler will be added. 
If `log_file` is specified, a FileHandler will + also be added. + Args: + logger_name (str): root logger name. Default: 'basicsr'. + log_file (str | None): The log filename. If specified, a FileHandler + will be added to the root logger. + log_level (int): The root logger level. Note that only the process of + rank 0 is affected, while other processes will set the level to + "Error" and be silent most of the time. + Returns: + logging.Logger: The root logger. + """ + logger = logging.getLogger(logger_name) + # if the logger has been initialized, just return it + if logger_name in initialized_logger: + return logger + + format_str = '%(asctime)s %(levelname)s: %(message)s' + stream_handler = logging.StreamHandler() + stream_handler.setFormatter(logging.Formatter(format_str)) + logger.addHandler(stream_handler) + logger.propagate = False + + if log_file is not None: + logger.setLevel(log_level) + # add file handler + # file_handler = logging.FileHandler(log_file, 'w') + file_handler = logging.FileHandler(log_file, 'a') #Shangchen: keep the previous log + file_handler.setFormatter(logging.Formatter(format_str)) + file_handler.setLevel(log_level) + logger.addHandler(file_handler) + initialized_logger[logger_name] = True + return logger + + +IS_HIGH_VERSION = [int(m) for m in list(re.findall(r"^([0-9]+)\.([0-9]+)\.([0-9]+)([^0-9][a-zA-Z0-9]*)?(\+git.*)?$",\ + torch.__version__)[0][:3])] >= [1, 12, 0] + + +def gpu_is_available(): + if IS_HIGH_VERSION: + if torch.backends.mps.is_available(): + return True + return True if torch.cuda.is_available() and torch.backends.cudnn.is_available() else False + + +def get_device(gpu_id=None): + if gpu_id is None: + gpu_str = '' + elif isinstance(gpu_id, int): + gpu_str = f':{gpu_id}' + else: + raise TypeError('Input should be int value.') + + if IS_HIGH_VERSION: + if torch.backends.mps.is_available(): + return torch.device('mps'+gpu_str) + return torch.device('cuda'+gpu_str if torch.cuda.is_available() and torch.backends.cudnn.is_available() else 'cpu') + + +def set_random_seed(seed): + """Set random seeds.""" + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def get_time_str(): + return time.strftime('%Y%m%d_%H%M%S', time.localtime()) + + +def scandir(dir_path, suffix=None, recursive=False, full_path=False): + """Scan a directory to find the interested files. + + Args: + dir_path (str): Path of the directory. + suffix (str | tuple(str), optional): File suffix that we are + interested in. Default: None. + recursive (bool, optional): If set to True, recursively scan the + directory. Default: False. + full_path (bool, optional): If set to True, include the dir_path. + Default: False. + + Returns: + A generator for all the interested files with relative pathes. 
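+
+    Example (illustrative only; the 'frames' directory and the .png suffix are
+    assumptions for this sketch, not part of the original code):
+        for path in scandir('frames', suffix='.png', recursive=True):
+            print(path)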
+ """ + + if (suffix is not None) and not isinstance(suffix, (str, tuple)): + raise TypeError('"suffix" must be a string or tuple of strings') + + root = dir_path + + def _scandir(dir_path, suffix, recursive): + for entry in os.scandir(dir_path): + if not entry.name.startswith('.') and entry.is_file(): + if full_path: + return_path = entry.path + else: + return_path = osp.relpath(entry.path, root) + + if suffix is None: + yield return_path + elif return_path.endswith(suffix): + yield return_path + else: + if recursive: + yield from _scandir(entry.path, suffix=suffix, recursive=recursive) + else: + continue + + return _scandir(dir_path, suffix=suffix, recursive=recursive) \ No newline at end of file diff --git a/backend/inpaint/video/model/modules/base_module.py b/backend/inpaint/video/model/modules/base_module.py new file mode 100644 index 0000000..b28c094 --- /dev/null +++ b/backend/inpaint/video/model/modules/base_module.py @@ -0,0 +1,131 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from functools import reduce + +class BaseNetwork(nn.Module): + def __init__(self): + super(BaseNetwork, self).__init__() + + def print_network(self): + if isinstance(self, list): + self = self[0] + num_params = 0 + for param in self.parameters(): + num_params += param.numel() + print( + 'Network [%s] was created. Total number of parameters: %.1f million. ' + 'To see the architecture, do print(network).' % + (type(self).__name__, num_params / 1000000)) + + def init_weights(self, init_type='normal', gain=0.02): + ''' + initialize network's weights + init_type: normal | xavier | kaiming | orthogonal + https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/9451e70673400885567d08a9e97ade2524c700d0/models/networks.py#L39 + ''' + def init_func(m): + classname = m.__class__.__name__ + if classname.find('InstanceNorm2d') != -1: + if hasattr(m, 'weight') and m.weight is not None: + nn.init.constant_(m.weight.data, 1.0) + if hasattr(m, 'bias') and m.bias is not None: + nn.init.constant_(m.bias.data, 0.0) + elif hasattr(m, 'weight') and (classname.find('Conv') != -1 + or classname.find('Linear') != -1): + if init_type == 'normal': + nn.init.normal_(m.weight.data, 0.0, gain) + elif init_type == 'xavier': + nn.init.xavier_normal_(m.weight.data, gain=gain) + elif init_type == 'xavier_uniform': + nn.init.xavier_uniform_(m.weight.data, gain=1.0) + elif init_type == 'kaiming': + nn.init.kaiming_normal_(m.weight.data, a=0, mode='fan_in') + elif init_type == 'orthogonal': + nn.init.orthogonal_(m.weight.data, gain=gain) + elif init_type == 'none': # uses pytorch's default init method + m.reset_parameters() + else: + raise NotImplementedError( + 'initialization method [%s] is not implemented' % + init_type) + if hasattr(m, 'bias') and m.bias is not None: + nn.init.constant_(m.bias.data, 0.0) + + self.apply(init_func) + + # propagate to children + for m in self.children(): + if hasattr(m, 'init_weights'): + m.init_weights(init_type, gain) + + +class Vec2Feat(nn.Module): + def __init__(self, channel, hidden, kernel_size, stride, padding): + super(Vec2Feat, self).__init__() + self.relu = nn.LeakyReLU(0.2, inplace=True) + c_out = reduce((lambda x, y: x * y), kernel_size) * channel + self.embedding = nn.Linear(hidden, c_out) + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.bias_conv = nn.Conv2d(channel, + channel, + kernel_size=3, + stride=1, + padding=1) + + def forward(self, x, t, output_size): + b_, _, _, _, c_ = x.shape + x = x.view(b_, -1, c_) + feat = 
self.embedding(x) + b, _, c = feat.size() + feat = feat.view(b * t, -1, c).permute(0, 2, 1) + feat = F.fold(feat, + output_size=output_size, + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding) + feat = self.bias_conv(feat) + return feat + + +class FusionFeedForward(nn.Module): + def __init__(self, dim, hidden_dim=1960, t2t_params=None): + super(FusionFeedForward, self).__init__() + # We set hidden_dim as a default to 1960 + self.fc1 = nn.Sequential(nn.Linear(dim, hidden_dim)) + self.fc2 = nn.Sequential(nn.GELU(), nn.Linear(hidden_dim, dim)) + assert t2t_params is not None + self.t2t_params = t2t_params + self.kernel_shape = reduce((lambda x, y: x * y), t2t_params['kernel_size']) # 49 + + def forward(self, x, output_size): + n_vecs = 1 + for i, d in enumerate(self.t2t_params['kernel_size']): + n_vecs *= int((output_size[i] + 2 * self.t2t_params['padding'][i] - + (d - 1) - 1) / self.t2t_params['stride'][i] + 1) + + x = self.fc1(x) + b, n, c = x.size() + normalizer = x.new_ones(b, n, self.kernel_shape).view(-1, n_vecs, self.kernel_shape).permute(0, 2, 1) + normalizer = F.fold(normalizer, + output_size=output_size, + kernel_size=self.t2t_params['kernel_size'], + padding=self.t2t_params['padding'], + stride=self.t2t_params['stride']) + + x = F.fold(x.view(-1, n_vecs, c).permute(0, 2, 1), + output_size=output_size, + kernel_size=self.t2t_params['kernel_size'], + padding=self.t2t_params['padding'], + stride=self.t2t_params['stride']) + + x = F.unfold(x / normalizer, + kernel_size=self.t2t_params['kernel_size'], + padding=self.t2t_params['padding'], + stride=self.t2t_params['stride']).permute( + 0, 2, 1).contiguous().view(b, n, c) + x = self.fc2(x) + return x diff --git a/backend/inpaint/video/model/modules/deformconv.py b/backend/inpaint/video/model/modules/deformconv.py new file mode 100644 index 0000000..89cb31b --- /dev/null +++ b/backend/inpaint/video/model/modules/deformconv.py @@ -0,0 +1,54 @@ +import torch +import torch.nn as nn +from torch.nn import init as init +from torch.nn.modules.utils import _pair, _single +import math + +class ModulatedDeformConv2d(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + deform_groups=1, + bias=True): + super(ModulatedDeformConv2d, self).__init__() + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + self.deform_groups = deform_groups + self.with_bias = bias + # enable compatibility with nn.Conv2d + self.transposed = False + self.output_padding = _single(0) + + self.weight = nn.Parameter(torch.Tensor(out_channels, in_channels // groups, *self.kernel_size)) + if bias: + self.bias = nn.Parameter(torch.Tensor(out_channels)) + else: + self.register_parameter('bias', None) + self.init_weights() + + def init_weights(self): + n = self.in_channels + for k in self.kernel_size: + n *= k + stdv = 1. 
/ math.sqrt(n) + self.weight.data.uniform_(-stdv, stdv) + if self.bias is not None: + self.bias.data.zero_() + + if hasattr(self, 'conv_offset'): + self.conv_offset.weight.data.zero_() + self.conv_offset.bias.data.zero_() + + def forward(self, x, offset, mask): + pass \ No newline at end of file diff --git a/backend/inpaint/video/model/modules/flow_comp_raft.py b/backend/inpaint/video/model/modules/flow_comp_raft.py new file mode 100644 index 0000000..1d4b81f --- /dev/null +++ b/backend/inpaint/video/model/modules/flow_comp_raft.py @@ -0,0 +1,265 @@ +import argparse +import torch +import torch.nn as nn +import torch.nn.functional as F + +from backend.inpaint.video.raft import RAFT +from backend.inpaint.video.model.modules.flow_loss_utils import flow_warp, ternary_loss2 + + +def initialize_RAFT(model_path='weights/raft-things.pth', device='cuda'): + """Initializes the RAFT model. + """ + args = argparse.ArgumentParser() + args.raft_model = model_path + args.small = False + args.mixed_precision = False + args.alternate_corr = False + model = torch.nn.DataParallel(RAFT(args)) + model.load_state_dict(torch.load(args.raft_model, map_location='cpu')) + model = model.module + + model.to(device) + + return model + + +class RAFT_bi(nn.Module): + """Flow completion loss""" + def __init__(self, model_path='weights/raft-things.pth', device='cuda'): + super().__init__() + self.fix_raft = initialize_RAFT(model_path, device=device) + + for p in self.fix_raft.parameters(): + p.requires_grad = False + + self.l1_criterion = nn.L1Loss() + self.eval() + + def forward(self, gt_local_frames, iters=20): + b, l_t, c, h, w = gt_local_frames.size() + # print(gt_local_frames.shape) + + with torch.no_grad(): + gtlf_1 = gt_local_frames[:, :-1, :, :, :].reshape(-1, c, h, w) + gtlf_2 = gt_local_frames[:, 1:, :, :, :].reshape(-1, c, h, w) + # print(gtlf_1.shape) + + _, gt_flows_forward = self.fix_raft(gtlf_1, gtlf_2, iters=iters, test_mode=True) + _, gt_flows_backward = self.fix_raft(gtlf_2, gtlf_1, iters=iters, test_mode=True) + + + gt_flows_forward = gt_flows_forward.view(b, l_t-1, 2, h, w) + gt_flows_backward = gt_flows_backward.view(b, l_t-1, 2, h, w) + + return gt_flows_forward, gt_flows_backward + + +################################################################################## +def smoothness_loss(flow, cmask): + delta_u, delta_v, mask = smoothness_deltas(flow) + loss_u = charbonnier_loss(delta_u, cmask) + loss_v = charbonnier_loss(delta_v, cmask) + return loss_u + loss_v + + +def smoothness_deltas(flow): + """ + flow: [b, c, h, w] + """ + mask_x = create_mask(flow, [[0, 0], [0, 1]]) + mask_y = create_mask(flow, [[0, 1], [0, 0]]) + mask = torch.cat((mask_x, mask_y), dim=1) + mask = mask.to(flow.device) + filter_x = torch.tensor([[0, 0, 0.], [0, 1, -1], [0, 0, 0]]) + filter_y = torch.tensor([[0, 0, 0.], [0, 1, 0], [0, -1, 0]]) + weights = torch.ones([2, 1, 3, 3]) + weights[0, 0] = filter_x + weights[1, 0] = filter_y + weights = weights.to(flow.device) + + flow_u, flow_v = torch.split(flow, split_size_or_sections=1, dim=1) + delta_u = F.conv2d(flow_u, weights, stride=1, padding=1) + delta_v = F.conv2d(flow_v, weights, stride=1, padding=1) + return delta_u, delta_v, mask + + +def second_order_loss(flow, cmask): + delta_u, delta_v, mask = second_order_deltas(flow) + loss_u = charbonnier_loss(delta_u, cmask) + loss_v = charbonnier_loss(delta_v, cmask) + return loss_u + loss_v + + +def charbonnier_loss(x, mask=None, truncate=None, alpha=0.45, beta=1.0, epsilon=0.001): + """ + Compute the generalized charbonnier loss 
of the difference tensor x + All positions where mask == 0 are not taken into account + x: a tensor of shape [b, c, h, w] + mask: a mask of shape [b, mc, h, w], where mask channels must be either 1 or the same as + the number of channels of x. Entries should be 0 or 1 + return: loss + """ + b, c, h, w = x.shape + norm = b * c * h * w + error = torch.pow(torch.square(x * beta) + torch.square(torch.tensor(epsilon)), alpha) + if mask is not None: + error = mask * error + if truncate is not None: + error = torch.min(error, truncate) + return torch.sum(error) / norm + + +def second_order_deltas(flow): + """ + consider the single flow first + flow shape: [b, c, h, w] + """ + # create mask + mask_x = create_mask(flow, [[0, 0], [1, 1]]) + mask_y = create_mask(flow, [[1, 1], [0, 0]]) + mask_diag = create_mask(flow, [[1, 1], [1, 1]]) + mask = torch.cat((mask_x, mask_y, mask_diag, mask_diag), dim=1) + mask = mask.to(flow.device) + + filter_x = torch.tensor([[0, 0, 0.], [1, -2, 1], [0, 0, 0]]) + filter_y = torch.tensor([[0, 1, 0.], [0, -2, 0], [0, 1, 0]]) + filter_diag1 = torch.tensor([[1, 0, 0.], [0, -2, 0], [0, 0, 1]]) + filter_diag2 = torch.tensor([[0, 0, 1.], [0, -2, 0], [1, 0, 0]]) + weights = torch.ones([4, 1, 3, 3]) + weights[0] = filter_x + weights[1] = filter_y + weights[2] = filter_diag1 + weights[3] = filter_diag2 + weights = weights.to(flow.device) + + # split the flow into flow_u and flow_v, conv them with the weights + flow_u, flow_v = torch.split(flow, split_size_or_sections=1, dim=1) + delta_u = F.conv2d(flow_u, weights, stride=1, padding=1) + delta_v = F.conv2d(flow_v, weights, stride=1, padding=1) + return delta_u, delta_v, mask + +def create_mask(tensor, paddings): + """ + tensor shape: [b, c, h, w] + paddings: [2 x 2] shape list, the first row indicates up and down paddings + the second row indicates left and right paddings + | | + | x | + | x * x | + | x | + | | + """ + shape = tensor.shape + inner_height = shape[2] - (paddings[0][0] + paddings[0][1]) + inner_width = shape[3] - (paddings[1][0] + paddings[1][1]) + inner = torch.ones([inner_height, inner_width]) + torch_paddings = [paddings[1][0], paddings[1][1], paddings[0][0], paddings[0][1]] # left, right, up and down + mask2d = F.pad(inner, pad=torch_paddings) + mask3d = mask2d.unsqueeze(0).repeat(shape[0], 1, 1) + mask4d = mask3d.unsqueeze(1) + return mask4d.detach() + +def ternary_loss(flow_comp, flow_gt, mask, current_frame, shift_frame, scale_factor=1): + if scale_factor != 1: + current_frame = F.interpolate(current_frame, scale_factor=1 / scale_factor, mode='bilinear') + shift_frame = F.interpolate(shift_frame, scale_factor=1 / scale_factor, mode='bilinear') + warped_sc = flow_warp(shift_frame, flow_gt.permute(0, 2, 3, 1)) + noc_mask = torch.exp(-50. * torch.sum(torch.abs(current_frame - warped_sc), dim=1).pow(2)).unsqueeze(1) + warped_comp_sc = flow_warp(shift_frame, flow_comp.permute(0, 2, 3, 1)) + loss = ternary_loss2(current_frame, warped_comp_sc, noc_mask, mask) + return loss + +class FlowLoss(nn.Module): + def __init__(self): + super().__init__() + self.l1_criterion = nn.L1Loss() + + def forward(self, pred_flows, gt_flows, masks, frames): + # pred_flows: b t-1 2 h w + loss = 0 + warp_loss = 0 + h, w = pred_flows[0].shape[-2:] + masks = [masks[:,:-1,...].contiguous(), masks[:, 1:, ...].contiguous()] + frames0 = frames[:,:-1,...] + frames1 = frames[:,1:,...] 
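+        # Note (assuming pred_flows/gt_flows are ordered (forward, backward) as
+        # returned by RAFT_bi): index 0 pairs the earlier frames with the forward
+        # flows, and index 1 pairs the later frames with the backward flows.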
+ current_frames = [frames0, frames1] + next_frames = [frames1, frames0] + for i in range(len(pred_flows)): + # print(pred_flows[i].shape) + combined_flow = pred_flows[i] * masks[i] + gt_flows[i] * (1-masks[i]) + l1_loss = self.l1_criterion(pred_flows[i] * masks[i], gt_flows[i] * masks[i]) / torch.mean(masks[i]) + l1_loss += self.l1_criterion(pred_flows[i] * (1-masks[i]), gt_flows[i] * (1-masks[i])) / torch.mean((1-masks[i])) + + smooth_loss = smoothness_loss(combined_flow.reshape(-1,2,h,w), masks[i].reshape(-1,1,h,w)) + smooth_loss2 = second_order_loss(combined_flow.reshape(-1,2,h,w), masks[i].reshape(-1,1,h,w)) + + warp_loss_i = ternary_loss(combined_flow.reshape(-1,2,h,w), gt_flows[i].reshape(-1,2,h,w), + masks[i].reshape(-1,1,h,w), current_frames[i].reshape(-1,3,h,w), next_frames[i].reshape(-1,3,h,w)) + + loss += l1_loss + smooth_loss + smooth_loss2 + + warp_loss += warp_loss_i + + return loss, warp_loss + + +def edgeLoss(preds_edges, edges): + """ + + Args: + preds_edges: with shape [b, c, h , w] + edges: with shape [b, c, h, w] + + Returns: Edge losses + + """ + mask = (edges > 0.5).float() + b, c, h, w = mask.shape + num_pos = torch.sum(mask, dim=[1, 2, 3]).float() # Shape: [b,]. + num_neg = c * h * w - num_pos # Shape: [b,]. + neg_weights = (num_neg / (num_pos + num_neg)).unsqueeze(1).unsqueeze(2).unsqueeze(3) + pos_weights = (num_pos / (num_pos + num_neg)).unsqueeze(1).unsqueeze(2).unsqueeze(3) + weight = neg_weights * mask + pos_weights * (1 - mask) # weight for debug + losses = F.binary_cross_entropy_with_logits(preds_edges.float(), edges.float(), weight=weight, reduction='none') + loss = torch.mean(losses) + return loss + +class EdgeLoss(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, pred_edges, gt_edges, masks): + # pred_flows: b t-1 1 h w + loss = 0 + h, w = pred_edges[0].shape[-2:] + masks = [masks[:,:-1,...].contiguous(), masks[:, 1:, ...].contiguous()] + for i in range(len(pred_edges)): + # print(f'edges_{i}', torch.sum(gt_edges[i])) # debug + combined_edge = pred_edges[i] * masks[i] + gt_edges[i] * (1-masks[i]) + edge_loss = (edgeLoss(pred_edges[i].reshape(-1,1,h,w), gt_edges[i].reshape(-1,1,h,w)) \ + + 5 * edgeLoss(combined_edge.reshape(-1,1,h,w), gt_edges[i].reshape(-1,1,h,w))) + loss += edge_loss + + return loss + + +class FlowSimpleLoss(nn.Module): + def __init__(self): + super().__init__() + self.l1_criterion = nn.L1Loss() + + def forward(self, pred_flows, gt_flows): + # pred_flows: b t-1 2 h w + loss = 0 + h, w = pred_flows[0].shape[-2:] + h_orig, w_orig = gt_flows[0].shape[-2:] + pred_flows = [f.view(-1, 2, h, w) for f in pred_flows] + gt_flows = [f.view(-1, 2, h_orig, w_orig) for f in gt_flows] + + ds_factor = 1.0*h/h_orig + gt_flows = [F.interpolate(f, scale_factor=ds_factor, mode='area') * ds_factor for f in gt_flows] + for i in range(len(pred_flows)): + loss += self.l1_criterion(pred_flows[i], gt_flows[i]) + + return loss \ No newline at end of file diff --git a/backend/inpaint/video/model/modules/flow_loss_utils.py b/backend/inpaint/video/model/modules/flow_loss_utils.py new file mode 100755 index 0000000..6e465c0 --- /dev/null +++ b/backend/inpaint/video/model/modules/flow_loss_utils.py @@ -0,0 +1,142 @@ +import torch +import numpy as np +import torch.nn as nn +import torch.nn.functional as F + +def flow_warp(x, + flow, + interpolation='bilinear', + padding_mode='zeros', + align_corners=True): + """Warp an image or a feature map with optical flow. + Args: + x (Tensor): Tensor with size (n, c, h, w). 
+ flow (Tensor): Tensor with size (n, h, w, 2). The last dimension is + a two-channel, denoting the width and height relative offsets. + Note that the values are not normalized to [-1, 1]. + interpolation (str): Interpolation mode: 'nearest' or 'bilinear'. + Default: 'bilinear'. + padding_mode (str): Padding mode: 'zeros' or 'border' or 'reflection'. + Default: 'zeros'. + align_corners (bool): Whether align corners. Default: True. + Returns: + Tensor: Warped image or feature map. + """ + if x.size()[-2:] != flow.size()[1:3]: + raise ValueError(f'The spatial sizes of input ({x.size()[-2:]}) and ' + f'flow ({flow.size()[1:3]}) are not the same.') + _, _, h, w = x.size() + # create mesh grid + device = flow.device + grid_y, grid_x = torch.meshgrid(torch.arange(0, h, device=device), torch.arange(0, w, device=device)) + grid = torch.stack((grid_x, grid_y), 2).type_as(x) # (w, h, 2) + grid.requires_grad = False + + grid_flow = grid + flow + # scale grid_flow to [-1,1] + grid_flow_x = 2.0 * grid_flow[:, :, :, 0] / max(w - 1, 1) - 1.0 + grid_flow_y = 2.0 * grid_flow[:, :, :, 1] / max(h - 1, 1) - 1.0 + grid_flow = torch.stack((grid_flow_x, grid_flow_y), dim=3) + output = F.grid_sample(x, + grid_flow, + mode=interpolation, + padding_mode=padding_mode, + align_corners=align_corners) + return output + + +# def image_warp(image, flow): +# b, c, h, w = image.size() +# device = image.device +# flow = torch.cat([flow[:, 0:1, :, :] / ((w - 1.0) / 2.0), flow[:, 1:2, :, :] / ((h - 1.0) / 2.0)], dim=1) # normalize to [-1~1](from upper left to lower right +# flow = flow.permute(0, 2, 3, 1) # if you wanna use grid_sample function, the channel(band) shape of show must be in the last dimension +# x = np.linspace(-1, 1, w) +# y = np.linspace(-1, 1, h) +# X, Y = np.meshgrid(x, y) +# grid = torch.cat((torch.from_numpy(X.astype('float32')).unsqueeze(0).unsqueeze(3), +# torch.from_numpy(Y.astype('float32')).unsqueeze(0).unsqueeze(3)), 3).to(device) +# output = torch.nn.functional.grid_sample(image, grid + flow, mode='bilinear', padding_mode='zeros') +# return output + + +def length_sq(x): + return torch.sum(torch.square(x), dim=1, keepdim=True) + + +def fbConsistencyCheck(flow_fw, flow_bw, alpha1=0.01, alpha2=0.5): + flow_bw_warped = flow_warp(flow_bw, flow_fw.permute(0, 2, 3, 1)) # wb(wf(x)) + flow_fw_warped = flow_warp(flow_fw, flow_bw.permute(0, 2, 3, 1)) # wf(wb(x)) + flow_diff_fw = flow_fw + flow_bw_warped # wf + wb(wf(x)) + flow_diff_bw = flow_bw + flow_fw_warped # wb + wf(wb(x)) + + mag_sq_fw = length_sq(flow_fw) + length_sq(flow_bw_warped) # |wf| + |wb(wf(x))| + mag_sq_bw = length_sq(flow_bw) + length_sq(flow_fw_warped) # |wb| + |wf(wb(x))| + occ_thresh_fw = alpha1 * mag_sq_fw + alpha2 + occ_thresh_bw = alpha1 * mag_sq_bw + alpha2 + + fb_occ_fw = (length_sq(flow_diff_fw) > occ_thresh_fw).float() + fb_occ_bw = (length_sq(flow_diff_bw) > occ_thresh_bw).float() + + return fb_occ_fw, fb_occ_bw # fb_occ_fw -> frame2 area occluded by frame1, fb_occ_bw -> frame1 area occluded by frame2 + + +def rgb2gray(image): + gray_image = image[:, 0] * 0.299 + image[:, 1] * 0.587 + 0.110 * image[:, 2] + gray_image = gray_image.unsqueeze(1) + return gray_image + + +def ternary_transform(image, max_distance=1): + device = image.device + patch_size = 2 * max_distance + 1 + intensities = rgb2gray(image) * 255 + out_channels = patch_size * patch_size + w = np.eye(out_channels).reshape(out_channels, 1, patch_size, patch_size) + weights = torch.from_numpy(w).float().to(device) + patches = F.conv2d(intensities, weights, stride=1, 
padding=1) + transf = patches - intensities + transf_norm = transf / torch.sqrt(0.81 + torch.square(transf)) + return transf_norm + + +def hamming_distance(t1, t2): + dist = torch.square(t1 - t2) + dist_norm = dist / (0.1 + dist) + dist_sum = torch.sum(dist_norm, dim=1, keepdim=True) + return dist_sum + + +def create_mask(mask, paddings): + """ + padding: [[top, bottom], [left, right]] + """ + shape = mask.shape + inner_height = shape[2] - (paddings[0][0] + paddings[0][1]) + inner_width = shape[3] - (paddings[1][0] + paddings[1][1]) + inner = torch.ones([inner_height, inner_width]) + + mask2d = F.pad(inner, pad=[paddings[1][0], paddings[1][1], paddings[0][0], paddings[0][1]]) + mask3d = mask2d.unsqueeze(0) + mask4d = mask3d.unsqueeze(0).repeat(shape[0], 1, 1, 1) + return mask4d.detach() + + +def ternary_loss2(frame1, warp_frame21, confMask, masks, max_distance=1): + """ + + Args: + frame1: torch tensor, with shape [b * t, c, h, w] + warp_frame21: torch tensor, with shape [b * t, c, h, w] + confMask: confidence mask, with shape [b * t, c, h, w] + masks: torch tensor, with shape [b * t, c, h, w] + max_distance: maximum distance. + + Returns: ternary loss + + """ + t1 = ternary_transform(frame1) + t21 = ternary_transform(warp_frame21) + dist = hamming_distance(t1, t21) + loss = torch.mean(dist * confMask * masks) / torch.mean(masks) + return loss + diff --git a/backend/inpaint/video/model/modules/sparse_transformer.py b/backend/inpaint/video/model/modules/sparse_transformer.py new file mode 100644 index 0000000..11028ff --- /dev/null +++ b/backend/inpaint/video/model/modules/sparse_transformer.py @@ -0,0 +1,344 @@ +import math +from functools import reduce +import torch +import torch.nn as nn +import torch.nn.functional as F + +class SoftSplit(nn.Module): + def __init__(self, channel, hidden, kernel_size, stride, padding): + super(SoftSplit, self).__init__() + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.t2t = nn.Unfold(kernel_size=kernel_size, + stride=stride, + padding=padding) + c_in = reduce((lambda x, y: x * y), kernel_size) * channel + self.embedding = nn.Linear(c_in, hidden) + + def forward(self, x, b, output_size): + f_h = int((output_size[0] + 2 * self.padding[0] - + (self.kernel_size[0] - 1) - 1) / self.stride[0] + 1) + f_w = int((output_size[1] + 2 * self.padding[1] - + (self.kernel_size[1] - 1) - 1) / self.stride[1] + 1) + + feat = self.t2t(x) + feat = feat.permute(0, 2, 1) + # feat shape [b*t, num_vec, ks*ks*c] + feat = self.embedding(feat) + # feat shape after embedding [b, t*num_vec, hidden] + feat = feat.view(b, -1, f_h, f_w, feat.size(2)) + return feat + + +class SoftComp(nn.Module): + def __init__(self, channel, hidden, kernel_size, stride, padding): + super(SoftComp, self).__init__() + self.relu = nn.LeakyReLU(0.2, inplace=True) + c_out = reduce((lambda x, y: x * y), kernel_size) * channel + self.embedding = nn.Linear(hidden, c_out) + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.bias_conv = nn.Conv2d(channel, + channel, + kernel_size=3, + stride=1, + padding=1) + + def forward(self, x, t, output_size): + b_, _, _, _, c_ = x.shape + x = x.view(b_, -1, c_) + feat = self.embedding(x) + b, _, c = feat.size() + feat = feat.view(b * t, -1, c).permute(0, 2, 1) + feat = F.fold(feat, + output_size=output_size, + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding) + feat = self.bias_conv(feat) + return feat + + +class FusionFeedForward(nn.Module): + def __init__(self, dim, 
hidden_dim=1960, t2t_params=None): + super(FusionFeedForward, self).__init__() + # We set hidden_dim as a default to 1960 + self.fc1 = nn.Sequential(nn.Linear(dim, hidden_dim)) + self.fc2 = nn.Sequential(nn.GELU(), nn.Linear(hidden_dim, dim)) + assert t2t_params is not None + self.t2t_params = t2t_params + self.kernel_shape = reduce((lambda x, y: x * y), t2t_params['kernel_size']) # 49 + + def forward(self, x, output_size): + n_vecs = 1 + for i, d in enumerate(self.t2t_params['kernel_size']): + n_vecs *= int((output_size[i] + 2 * self.t2t_params['padding'][i] - + (d - 1) - 1) / self.t2t_params['stride'][i] + 1) + + x = self.fc1(x) + b, n, c = x.size() + normalizer = x.new_ones(b, n, self.kernel_shape).view(-1, n_vecs, self.kernel_shape).permute(0, 2, 1) + normalizer = F.fold(normalizer, + output_size=output_size, + kernel_size=self.t2t_params['kernel_size'], + padding=self.t2t_params['padding'], + stride=self.t2t_params['stride']) + + x = F.fold(x.view(-1, n_vecs, c).permute(0, 2, 1), + output_size=output_size, + kernel_size=self.t2t_params['kernel_size'], + padding=self.t2t_params['padding'], + stride=self.t2t_params['stride']) + + x = F.unfold(x / normalizer, + kernel_size=self.t2t_params['kernel_size'], + padding=self.t2t_params['padding'], + stride=self.t2t_params['stride']).permute( + 0, 2, 1).contiguous().view(b, n, c) + x = self.fc2(x) + return x + + +def window_partition(x, window_size, n_head): + """ + Args: + x: shape is (B, T, H, W, C) + window_size (tuple[int]): window size + Returns: + windows: (B, num_windows_h, num_windows_w, n_head, T, window_size, window_size, C//n_head) + """ + B, T, H, W, C = x.shape + x = x.view(B, T, H // window_size[0], window_size[0], W // window_size[1], window_size[1], n_head, C//n_head) + windows = x.permute(0, 2, 4, 6, 1, 3, 5, 7).contiguous() + return windows + +class SparseWindowAttention(nn.Module): + def __init__(self, dim, n_head, window_size, pool_size=(4,4), qkv_bias=True, attn_drop=0., proj_drop=0., + pooling_token=True): + super().__init__() + assert dim % n_head == 0 + # key, query, value projections for all heads + self.key = nn.Linear(dim, dim, qkv_bias) + self.query = nn.Linear(dim, dim, qkv_bias) + self.value = nn.Linear(dim, dim, qkv_bias) + # regularization + self.attn_drop = nn.Dropout(attn_drop) + self.proj_drop = nn.Dropout(proj_drop) + # output projection + self.proj = nn.Linear(dim, dim) + self.n_head = n_head + self.window_size = window_size + self.pooling_token = pooling_token + if self.pooling_token: + ks, stride = pool_size, pool_size + self.pool_layer = nn.Conv2d(dim, dim, kernel_size=ks, stride=stride, padding=(0, 0), groups=dim) + self.pool_layer.weight.data.fill_(1. 
/ (pool_size[0] * pool_size[1])) + self.pool_layer.bias.data.fill_(0) + # self.expand_size = tuple(i // 2 for i in window_size) + self.expand_size = tuple((i + 1) // 2 for i in window_size) + + if any(i > 0 for i in self.expand_size): + # get mask for rolled k and rolled v + mask_tl = torch.ones(self.window_size[0], self.window_size[1]) + mask_tl[:-self.expand_size[0], :-self.expand_size[1]] = 0 + mask_tr = torch.ones(self.window_size[0], self.window_size[1]) + mask_tr[:-self.expand_size[0], self.expand_size[1]:] = 0 + mask_bl = torch.ones(self.window_size[0], self.window_size[1]) + mask_bl[self.expand_size[0]:, :-self.expand_size[1]] = 0 + mask_br = torch.ones(self.window_size[0], self.window_size[1]) + mask_br[self.expand_size[0]:, self.expand_size[1]:] = 0 + masrool_k = torch.stack((mask_tl, mask_tr, mask_bl, mask_br), 0).flatten(0) + self.register_buffer("valid_ind_rolled", masrool_k.nonzero(as_tuple=False).view(-1)) + + self.max_pool = nn.MaxPool2d(window_size, window_size, (0, 0)) + + + def forward(self, x, mask=None, T_ind=None, attn_mask=None): + b, t, h, w, c = x.shape # 20 36 + w_h, w_w = self.window_size[0], self.window_size[1] + c_head = c // self.n_head + n_wh = math.ceil(h / self.window_size[0]) + n_ww = math.ceil(w / self.window_size[1]) + new_h = n_wh * self.window_size[0] # 20 + new_w = n_ww * self.window_size[1] # 36 + pad_r = new_w - w + pad_b = new_h - h + # reverse order + if pad_r > 0 or pad_b > 0: + x = F.pad(x,(0, 0, 0, pad_r, 0, pad_b, 0, 0), mode='constant', value=0) + mask = F.pad(mask,(0, 0, 0, pad_r, 0, pad_b, 0, 0), mode='constant', value=0) + + # calculate query, key, values for all heads in batch and move head forward to be the batch dim + q = self.query(x) + k = self.key(x) + v = self.value(x) + win_q = window_partition(q.contiguous(), self.window_size, self.n_head).view(b, n_wh*n_ww, self.n_head, t, w_h*w_w, c_head) + win_k = window_partition(k.contiguous(), self.window_size, self.n_head).view(b, n_wh*n_ww, self.n_head, t, w_h*w_w, c_head) + win_v = window_partition(v.contiguous(), self.window_size, self.n_head).view(b, n_wh*n_ww, self.n_head, t, w_h*w_w, c_head) + # roll_k and roll_v + if any(i > 0 for i in self.expand_size): + (k_tl, v_tl) = map(lambda a: torch.roll(a, shifts=(-self.expand_size[0], -self.expand_size[1]), dims=(2, 3)), (k, v)) + (k_tr, v_tr) = map(lambda a: torch.roll(a, shifts=(-self.expand_size[0], self.expand_size[1]), dims=(2, 3)), (k, v)) + (k_bl, v_bl) = map(lambda a: torch.roll(a, shifts=(self.expand_size[0], -self.expand_size[1]), dims=(2, 3)), (k, v)) + (k_br, v_br) = map(lambda a: torch.roll(a, shifts=(self.expand_size[0], self.expand_size[1]), dims=(2, 3)), (k, v)) + + (k_tl_windows, k_tr_windows, k_bl_windows, k_br_windows) = map( + lambda a: window_partition(a, self.window_size, self.n_head).view(b, n_wh*n_ww, self.n_head, t, w_h*w_w, c_head), + (k_tl, k_tr, k_bl, k_br)) + (v_tl_windows, v_tr_windows, v_bl_windows, v_br_windows) = map( + lambda a: window_partition(a, self.window_size, self.n_head).view(b, n_wh*n_ww, self.n_head, t, w_h*w_w, c_head), + (v_tl, v_tr, v_bl, v_br)) + rool_k = torch.cat((k_tl_windows, k_tr_windows, k_bl_windows, k_br_windows), 4).contiguous() + rool_v = torch.cat((v_tl_windows, v_tr_windows, v_bl_windows, v_br_windows), 4).contiguous() # [b, n_wh*n_ww, n_head, t, w_h*w_w, c_head] + # mask out tokens in current window + rool_k = rool_k[:, :, :, :, self.valid_ind_rolled] + rool_v = rool_v[:, :, :, :, self.valid_ind_rolled] + roll_N = rool_k.shape[4] + rool_k = rool_k.view(b, n_wh*n_ww, self.n_head, 
t, roll_N, c // self.n_head) + rool_v = rool_v.view(b, n_wh*n_ww, self.n_head, t, roll_N, c // self.n_head) + win_k = torch.cat((win_k, rool_k), dim=4) + win_v = torch.cat((win_v, rool_v), dim=4) + else: + win_k = win_k + win_v = win_v + + # pool_k and pool_v + if self.pooling_token: + pool_x = self.pool_layer(x.view(b*t, new_h, new_w, c).permute(0,3,1,2)) + _, _, p_h, p_w = pool_x.shape + pool_x = pool_x.permute(0,2,3,1).view(b, t, p_h, p_w, c) + # pool_k + pool_k = self.key(pool_x).unsqueeze(1).repeat(1, n_wh*n_ww, 1, 1, 1, 1) # [b, n_wh*n_ww, t, p_h, p_w, c] + pool_k = pool_k.view(b, n_wh*n_ww, t, p_h, p_w, self.n_head, c_head).permute(0,1,5,2,3,4,6) + pool_k = pool_k.contiguous().view(b, n_wh*n_ww, self.n_head, t, p_h*p_w, c_head) + win_k = torch.cat((win_k, pool_k), dim=4) + # pool_v + pool_v = self.value(pool_x).unsqueeze(1).repeat(1, n_wh*n_ww, 1, 1, 1, 1) # [b, n_wh*n_ww, t, p_h, p_w, c] + pool_v = pool_v.view(b, n_wh*n_ww, t, p_h, p_w, self.n_head, c_head).permute(0,1,5,2,3,4,6) + pool_v = pool_v.contiguous().view(b, n_wh*n_ww, self.n_head, t, p_h*p_w, c_head) + win_v = torch.cat((win_v, pool_v), dim=4) + + # [b, n_wh*n_ww, n_head, t, w_h*w_w, c_head] + out = torch.zeros_like(win_q) + l_t = mask.size(1) + + mask = self.max_pool(mask.view(b * l_t, new_h, new_w)) + mask = mask.view(b, l_t, n_wh*n_ww) + mask = torch.sum(mask, dim=1) # [b, n_wh*n_ww] + for i in range(win_q.shape[0]): + ### For masked windows + mask_ind_i = mask[i].nonzero(as_tuple=False).view(-1) + # mask out quary in current window + # [b, n_wh*n_ww, n_head, t, w_h*w_w, c_head] + mask_n = len(mask_ind_i) + if mask_n > 0: + win_q_t = win_q[i, mask_ind_i].view(mask_n, self.n_head, t*w_h*w_w, c_head) + win_k_t = win_k[i, mask_ind_i] + win_v_t = win_v[i, mask_ind_i] + # mask out key and value + if T_ind is not None: + # key [n_wh*n_ww, n_head, t, w_h*w_w, c_head] + win_k_t = win_k_t[:, :, T_ind.view(-1)].view(mask_n, self.n_head, -1, c_head) + # value + win_v_t = win_v_t[:, :, T_ind.view(-1)].view(mask_n, self.n_head, -1, c_head) + else: + win_k_t = win_k_t.view(n_wh*n_ww, self.n_head, t*w_h*w_w, c_head) + win_v_t = win_v_t.view(n_wh*n_ww, self.n_head, t*w_h*w_w, c_head) + + att_t = (win_q_t @ win_k_t.transpose(-2, -1)) * (1.0 / math.sqrt(win_q_t.size(-1))) + att_t = F.softmax(att_t, dim=-1) + att_t = self.attn_drop(att_t) + y_t = att_t @ win_v_t + + out[i, mask_ind_i] = y_t.view(-1, self.n_head, t, w_h*w_w, c_head) + + ### For unmasked windows + unmask_ind_i = (mask[i] == 0).nonzero(as_tuple=False).view(-1) + # mask out quary in current window + # [b, n_wh*n_ww, n_head, t, w_h*w_w, c_head] + win_q_s = win_q[i, unmask_ind_i] + win_k_s = win_k[i, unmask_ind_i, :, :, :w_h*w_w] + win_v_s = win_v[i, unmask_ind_i, :, :, :w_h*w_w] + + att_s = (win_q_s @ win_k_s.transpose(-2, -1)) * (1.0 / math.sqrt(win_q_s.size(-1))) + att_s = F.softmax(att_s, dim=-1) + att_s = self.attn_drop(att_s) + y_s = att_s @ win_v_s + out[i, unmask_ind_i] = y_s + + # re-assemble all head outputs side by side + out = out.view(b, n_wh, n_ww, self.n_head, t, w_h, w_w, c_head) + out = out.permute(0, 4, 1, 5, 2, 6, 3, 7).contiguous().view(b, t, new_h, new_w, c) + + + if pad_r > 0 or pad_b > 0: + out = out[:, :, :h, :w, :] + + # output projection + out = self.proj_drop(self.proj(out)) + return out + + +class TemporalSparseTransformer(nn.Module): + def __init__(self, dim, n_head, window_size, pool_size, + norm_layer=nn.LayerNorm, t2t_params=None): + super().__init__() + self.window_size = window_size + self.attention = SparseWindowAttention(dim, n_head, 
window_size, pool_size) + self.norm1 = norm_layer(dim) + self.norm2 = norm_layer(dim) + self.mlp = FusionFeedForward(dim, t2t_params=t2t_params) + + def forward(self, x, fold_x_size, mask=None, T_ind=None): + """ + Args: + x: image tokens, shape [B T H W C] + fold_x_size: fold feature size, shape [60 108] + mask: mask tokens, shape [B T H W 1] + Returns: + out_tokens: shape [B T H W C] + """ + B, T, H, W, C = x.shape # 20 36 + + shortcut = x + x = self.norm1(x) + att_x = self.attention(x, mask, T_ind) + + # FFN + x = shortcut + att_x + y = self.norm2(x) + x = x + self.mlp(y.view(B, T * H * W, C), fold_x_size).view(B, T, H, W, C) + + return x + + +class TemporalSparseTransformerBlock(nn.Module): + def __init__(self, dim, n_head, window_size, pool_size, depths, t2t_params=None): + super().__init__() + blocks = [] + for i in range(depths): + blocks.append( + TemporalSparseTransformer(dim, n_head, window_size, pool_size, t2t_params=t2t_params) + ) + self.transformer = nn.Sequential(*blocks) + self.depths = depths + + def forward(self, x, fold_x_size, l_mask=None, t_dilation=2): + """ + Args: + x: image tokens, shape [B T H W C] + fold_x_size: fold feature size, shape [60 108] + l_mask: local mask tokens, shape [B T H W 1] + Returns: + out_tokens: shape [B T H W C] + """ + assert self.depths % t_dilation == 0, 'wrong t_dilation input.' + T = x.size(1) + T_ind = [torch.arange(i, T, t_dilation) for i in range(t_dilation)] * (self.depths // t_dilation) + + for i in range(0, self.depths): + x = self.transformer[i](x, fold_x_size, l_mask, T_ind[i]) + + return x diff --git a/backend/inpaint/video/model/modules/spectral_norm.py b/backend/inpaint/video/model/modules/spectral_norm.py new file mode 100644 index 0000000..f38c34e --- /dev/null +++ b/backend/inpaint/video/model/modules/spectral_norm.py @@ -0,0 +1,288 @@ +""" +Spectral Normalization from https://arxiv.org/abs/1802.05957 +""" +import torch +from torch.nn.functional import normalize + + +class SpectralNorm(object): + # Invariant before and after each forward call: + # u = normalize(W @ v) + # NB: At initialization, this invariant is not enforced + + _version = 1 + + # At version 1: + # made `W` not a buffer, + # added `v` as a buffer, and + # made eval mode use `W = u @ W_orig @ v` rather than the stored `W`. + + def __init__(self, name='weight', n_power_iterations=1, dim=0, eps=1e-12): + self.name = name + self.dim = dim + if n_power_iterations <= 0: + raise ValueError( + 'Expected n_power_iterations to be positive, but ' + 'got n_power_iterations={}'.format(n_power_iterations)) + self.n_power_iterations = n_power_iterations + self.eps = eps + + def reshape_weight_to_matrix(self, weight): + weight_mat = weight + if self.dim != 0: + # permute dim to front + weight_mat = weight_mat.permute( + self.dim, + *[d for d in range(weight_mat.dim()) if d != self.dim]) + height = weight_mat.size(0) + return weight_mat.reshape(height, -1) + + def compute_weight(self, module, do_power_iteration): + # NB: If `do_power_iteration` is set, the `u` and `v` vectors are + # updated in power iteration **in-place**. This is very important + # because in `DataParallel` forward, the vectors (being buffers) are + # broadcast from the parallelized module to each module replica, + # which is a new module object created on the fly. And each replica + # runs its own spectral norm power iteration. So simply assigning + # the updated vectors to the module this function runs on will cause + # the update to be lost forever. 
And the next time the parallelized + # module is replicated, the same randomly initialized vectors are + # broadcast and used! + # + # Therefore, to make the change propagate back, we rely on two + # important behaviors (also enforced via tests): + # 1. `DataParallel` doesn't clone storage if the broadcast tensor + # is already on correct device; and it makes sure that the + # parallelized module is already on `device[0]`. + # 2. If the out tensor in `out=` kwarg has correct shape, it will + # just fill in the values. + # Therefore, since the same power iteration is performed on all + # devices, simply updating the tensors in-place will make sure that + # the module replica on `device[0]` will update the _u vector on the + # parallized module (by shared storage). + # + # However, after we update `u` and `v` in-place, we need to **clone** + # them before using them to normalize the weight. This is to support + # backproping through two forward passes, e.g., the common pattern in + # GAN training: loss = D(real) - D(fake). Otherwise, engine will + # complain that variables needed to do backward for the first forward + # (i.e., the `u` and `v` vectors) are changed in the second forward. + weight = getattr(module, self.name + '_orig') + u = getattr(module, self.name + '_u') + v = getattr(module, self.name + '_v') + weight_mat = self.reshape_weight_to_matrix(weight) + + if do_power_iteration: + with torch.no_grad(): + for _ in range(self.n_power_iterations): + # Spectral norm of weight equals to `u^T W v`, where `u` and `v` + # are the first left and right singular vectors. + # This power iteration produces approximations of `u` and `v`. + v = normalize(torch.mv(weight_mat.t(), u), + dim=0, + eps=self.eps, + out=v) + u = normalize(torch.mv(weight_mat, v), + dim=0, + eps=self.eps, + out=u) + if self.n_power_iterations > 0: + # See above on why we need to clone + u = u.clone() + v = v.clone() + + sigma = torch.dot(u, torch.mv(weight_mat, v)) + weight = weight / sigma + return weight + + def remove(self, module): + with torch.no_grad(): + weight = self.compute_weight(module, do_power_iteration=False) + delattr(module, self.name) + delattr(module, self.name + '_u') + delattr(module, self.name + '_v') + delattr(module, self.name + '_orig') + module.register_parameter(self.name, + torch.nn.Parameter(weight.detach())) + + def __call__(self, module, inputs): + setattr( + module, self.name, + self.compute_weight(module, do_power_iteration=module.training)) + + def _solve_v_and_rescale(self, weight_mat, u, target_sigma): + # Tries to returns a vector `v` s.t. `u = normalize(W @ v)` + # (the invariant at top of this class) and `u @ W @ v = sigma`. + # This uses pinverse in case W^T W is not invertible. 
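+        # i.e. v = (W^T W)^+ W^T u, rescaled below so that u @ W @ v equals target_sigma.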
+ v = torch.chain_matmul(weight_mat.t().mm(weight_mat).pinverse(), + weight_mat.t(), u.unsqueeze(1)).squeeze(1) + return v.mul_(target_sigma / torch.dot(u, torch.mv(weight_mat, v))) + + @staticmethod + def apply(module, name, n_power_iterations, dim, eps): + for k, hook in module._forward_pre_hooks.items(): + if isinstance(hook, SpectralNorm) and hook.name == name: + raise RuntimeError( + "Cannot register two spectral_norm hooks on " + "the same parameter {}".format(name)) + + fn = SpectralNorm(name, n_power_iterations, dim, eps) + weight = module._parameters[name] + + with torch.no_grad(): + weight_mat = fn.reshape_weight_to_matrix(weight) + + h, w = weight_mat.size() + # randomly initialize `u` and `v` + u = normalize(weight.new_empty(h).normal_(0, 1), dim=0, eps=fn.eps) + v = normalize(weight.new_empty(w).normal_(0, 1), dim=0, eps=fn.eps) + + delattr(module, fn.name) + module.register_parameter(fn.name + "_orig", weight) + # We still need to assign weight back as fn.name because all sorts of + # things may assume that it exists, e.g., when initializing weights. + # However, we can't directly assign as it could be an nn.Parameter and + # gets added as a parameter. Instead, we register weight.data as a plain + # attribute. + setattr(module, fn.name, weight.data) + module.register_buffer(fn.name + "_u", u) + module.register_buffer(fn.name + "_v", v) + + module.register_forward_pre_hook(fn) + + module._register_state_dict_hook(SpectralNormStateDictHook(fn)) + module._register_load_state_dict_pre_hook( + SpectralNormLoadStateDictPreHook(fn)) + return fn + + +# This is a top level class because Py2 pickle doesn't like inner class nor an +# instancemethod. +class SpectralNormLoadStateDictPreHook(object): + # See docstring of SpectralNorm._version on the changes to spectral_norm. + def __init__(self, fn): + self.fn = fn + + # For state_dict with version None, (assuming that it has gone through at + # least one training forward), we have + # + # u = normalize(W_orig @ v) + # W = W_orig / sigma, where sigma = u @ W_orig @ v + # + # To compute `v`, we solve `W_orig @ x = u`, and let + # v = x / (u @ W_orig @ x) * (W / W_orig). + def __call__(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + fn = self.fn + version = local_metadata.get('spectral_norm', + {}).get(fn.name + '.version', None) + if version is None or version < 1: + with torch.no_grad(): + weight_orig = state_dict[prefix + fn.name + '_orig'] + # weight = state_dict.pop(prefix + fn.name) + # sigma = (weight_orig / weight).mean() + weight_mat = fn.reshape_weight_to_matrix(weight_orig) + u = state_dict[prefix + fn.name + '_u'] + # v = fn._solve_v_and_rescale(weight_mat, u, sigma) + # state_dict[prefix + fn.name + '_v'] = v + + +# This is a top level class because Py2 pickle doesn't like inner class nor an +# instancemethod. +class SpectralNormStateDictHook(object): + # See docstring of SpectralNorm._version on the changes to spectral_norm. 
+ def __init__(self, fn): + self.fn = fn + + def __call__(self, module, state_dict, prefix, local_metadata): + if 'spectral_norm' not in local_metadata: + local_metadata['spectral_norm'] = {} + key = self.fn.name + '.version' + if key in local_metadata['spectral_norm']: + raise RuntimeError( + "Unexpected key in metadata['spectral_norm']: {}".format(key)) + local_metadata['spectral_norm'][key] = self.fn._version + + +def spectral_norm(module, + name='weight', + n_power_iterations=1, + eps=1e-12, + dim=None): + r"""Applies spectral normalization to a parameter in the given module. + + .. math:: + \mathbf{W}_{SN} = \dfrac{\mathbf{W}}{\sigma(\mathbf{W})}, + \sigma(\mathbf{W}) = \max_{\mathbf{h}: \mathbf{h} \ne 0} \dfrac{\|\mathbf{W} \mathbf{h}\|_2}{\|\mathbf{h}\|_2} + + Spectral normalization stabilizes the training of discriminators (critics) + in Generative Adversarial Networks (GANs) by rescaling the weight tensor + with spectral norm :math:`\sigma` of the weight matrix calculated using + power iteration method. If the dimension of the weight tensor is greater + than 2, it is reshaped to 2D in power iteration method to get spectral + norm. This is implemented via a hook that calculates spectral norm and + rescales weight before every :meth:`~Module.forward` call. + + See `Spectral Normalization for Generative Adversarial Networks`_ . + + .. _`Spectral Normalization for Generative Adversarial Networks`: https://arxiv.org/abs/1802.05957 + + Args: + module (nn.Module): containing module + name (str, optional): name of weight parameter + n_power_iterations (int, optional): number of power iterations to + calculate spectral norm + eps (float, optional): epsilon for numerical stability in + calculating norms + dim (int, optional): dimension corresponding to number of outputs, + the default is ``0``, except for modules that are instances of + ConvTranspose{1,2,3}d, when it is ``1`` + + Returns: + The original module with the spectral norm hook + + Example:: + + >>> m = spectral_norm(nn.Linear(20, 40)) + >>> m + Linear(in_features=20, out_features=40, bias=True) + >>> m.weight_u.size() + torch.Size([40]) + + """ + if dim is None: + if isinstance(module, + (torch.nn.ConvTranspose1d, torch.nn.ConvTranspose2d, + torch.nn.ConvTranspose3d)): + dim = 1 + else: + dim = 0 + SpectralNorm.apply(module, name, n_power_iterations, dim, eps) + return module + + +def remove_spectral_norm(module, name='weight'): + r"""Removes the spectral normalization reparameterization from a module. 
+ + Args: + module (Module): containing module + name (str, optional): name of weight parameter + + Example: + >>> m = spectral_norm(nn.Linear(40, 10)) + >>> remove_spectral_norm(m) + """ + for k, hook in module._forward_pre_hooks.items(): + if isinstance(hook, SpectralNorm) and hook.name == name: + hook.remove(module) + del module._forward_pre_hooks[k] + return module + + raise ValueError("spectral_norm of '{}' not found in {}".format( + name, module)) + + +def use_spectral_norm(module, use_sn=False): + if use_sn: + return spectral_norm(module) + return module \ No newline at end of file diff --git a/backend/inpaint/video/model/propainter.py b/backend/inpaint/video/model/propainter.py new file mode 100644 index 0000000..a83ed3d --- /dev/null +++ b/backend/inpaint/video/model/propainter.py @@ -0,0 +1,539 @@ +''' Towards An End-to-End Framework for Video Inpainting +''' + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision + +from einops import rearrange + +from backend.inpaint.video.model.modules.base_module import BaseNetwork +from backend.inpaint.video.model.modules.sparse_transformer import TemporalSparseTransformerBlock, SoftSplit, SoftComp +from backend.inpaint.video.model.modules.spectral_norm import spectral_norm as _spectral_norm +from backend.inpaint.video.model.modules.flow_loss_utils import flow_warp +from backend.inpaint.video.model.modules.deformconv import ModulatedDeformConv2d + +from .misc import constant_init + + +def length_sq(x): + return torch.sum(torch.square(x), dim=1, keepdim=True) + + +def fbConsistencyCheck(flow_fw, flow_bw, alpha1=0.01, alpha2=0.5): + flow_bw_warped = flow_warp(flow_bw, flow_fw.permute(0, 2, 3, 1)) # wb(wf(x)) + flow_diff_fw = flow_fw + flow_bw_warped # wf + wb(wf(x)) + + mag_sq_fw = length_sq(flow_fw) + length_sq(flow_bw_warped) # |wf| + |wb(wf(x))| + occ_thresh_fw = alpha1 * mag_sq_fw + alpha2 + + # fb_valid_fw = (length_sq(flow_diff_fw) < occ_thresh_fw).float() + fb_valid_fw = (length_sq(flow_diff_fw) < occ_thresh_fw).to(flow_fw) + return fb_valid_fw + + +class DeformableAlignment(ModulatedDeformConv2d): + """Second-order deformable alignment module.""" + + def __init__(self, *args, **kwargs): + # self.max_residue_magnitude = kwargs.pop('max_residue_magnitude', 10) + self.max_residue_magnitude = kwargs.pop('max_residue_magnitude', 3) + + super(DeformableAlignment, self).__init__(*args, **kwargs) + + self.conv_offset = nn.Sequential( + nn.Conv2d(2 * self.out_channels + 2 + 1 + 2, self.out_channels, 3, 1, 1), + nn.LeakyReLU(negative_slope=0.1, inplace=True), + nn.Conv2d(self.out_channels, self.out_channels, 3, 1, 1), + nn.LeakyReLU(negative_slope=0.1, inplace=True), + nn.Conv2d(self.out_channels, self.out_channels, 3, 1, 1), + nn.LeakyReLU(negative_slope=0.1, inplace=True), + nn.Conv2d(self.out_channels, 27 * self.deform_groups, 3, 1, 1), + ) + self.init_offset() + + def init_offset(self): + constant_init(self.conv_offset[-1], val=0, bias=0) + + def forward(self, x, cond_feat, flow): + out = self.conv_offset(cond_feat) + o1, o2, mask = torch.chunk(out, 3, dim=1) + + # offset + offset = self.max_residue_magnitude * torch.tanh(torch.cat((o1, o2), dim=1)) + offset = offset + flow.flip(1).repeat(1, offset.size(1) // 2, 1, 1) + + # mask + mask = torch.sigmoid(mask) + + return torchvision.ops.deform_conv2d(x, offset, self.weight, self.bias, + self.stride, self.padding, + self.dilation, mask) + + +class BidirectionalPropagation(nn.Module): + def __init__(self, channel, learnable=True): + 
super(BidirectionalPropagation, self).__init__() + self.deform_align = nn.ModuleDict() + self.backbone = nn.ModuleDict() + self.channel = channel + self.prop_list = ['backward_1', 'forward_1'] + self.learnable = learnable + + if self.learnable: + for i, module in enumerate(self.prop_list): + self.deform_align[module] = DeformableAlignment( + channel, channel, 3, padding=1, deform_groups=16) + + self.backbone[module] = nn.Sequential( + nn.Conv2d(2 * channel + 2, channel, 3, 1, 1), + nn.LeakyReLU(negative_slope=0.2, inplace=True), + nn.Conv2d(channel, channel, 3, 1, 1), + ) + + self.fuse = nn.Sequential( + nn.Conv2d(2 * channel + 2, channel, 3, 1, 1), + nn.LeakyReLU(negative_slope=0.2, inplace=True), + nn.Conv2d(channel, channel, 3, 1, 1), + ) + + def binary_mask(self, mask, th=0.1): + mask[mask > th] = 1 + mask[mask <= th] = 0 + # return mask.float() + return mask.to(mask) + + def forward(self, x, flows_forward, flows_backward, mask, interpolation='bilinear'): + """ + x shape : [b, t, c, h, w] + return [b, t, c, h, w] + """ + + # For backward warping + # pred_flows_forward for backward feature propagation + # pred_flows_backward for forward feature propagation + b, t, c, h, w = x.shape + feats, masks = {}, {} + feats['input'] = [x[:, i, :, :, :] for i in range(0, t)] + masks['input'] = [mask[:, i, :, :, :] for i in range(0, t)] + + prop_list = ['backward_1', 'forward_1'] + cache_list = ['input'] + prop_list + + for p_i, module_name in enumerate(prop_list): + feats[module_name] = [] + masks[module_name] = [] + + if 'backward' in module_name: + frame_idx = range(0, t) + frame_idx = frame_idx[::-1] + flow_idx = frame_idx + flows_for_prop = flows_forward + flows_for_check = flows_backward + else: + frame_idx = range(0, t) + flow_idx = range(-1, t - 1) + flows_for_prop = flows_backward + flows_for_check = flows_forward + + for i, idx in enumerate(frame_idx): + feat_current = feats[cache_list[p_i]][idx] + mask_current = masks[cache_list[p_i]][idx] + + if i == 0: + feat_prop = feat_current + mask_prop = mask_current + else: + flow_prop = flows_for_prop[:, flow_idx[i], :, :, :] + flow_check = flows_for_check[:, flow_idx[i], :, :, :] + flow_vaild_mask = fbConsistencyCheck(flow_prop, flow_check) + feat_warped = flow_warp(feat_prop, flow_prop.permute(0, 2, 3, 1), interpolation) + + if self.learnable: + cond = torch.cat([feat_current, feat_warped, flow_prop, flow_vaild_mask, mask_current], dim=1) + feat_prop = self.deform_align[module_name](feat_prop, cond, flow_prop) + mask_prop = mask_current + else: + mask_prop_valid = flow_warp(mask_prop, flow_prop.permute(0, 2, 3, 1)) + mask_prop_valid = self.binary_mask(mask_prop_valid) + + union_vaild_mask = self.binary_mask(mask_current * flow_vaild_mask * (1 - mask_prop_valid)) + feat_prop = union_vaild_mask * feat_warped + (1 - union_vaild_mask) * feat_current + # update mask + mask_prop = self.binary_mask(mask_current * (1 - (flow_vaild_mask * (1 - mask_prop_valid)))) + + # refine + if self.learnable: + feat = torch.cat([feat_current, feat_prop, mask_current], dim=1) + feat_prop = feat_prop + self.backbone[module_name](feat) + # feat_prop = self.backbone[module_name](feat_prop) + + feats[module_name].append(feat_prop) + masks[module_name].append(mask_prop) + + # end for + if 'backward' in module_name: + feats[module_name] = feats[module_name][::-1] + masks[module_name] = masks[module_name][::-1] + + outputs_b = torch.stack(feats['backward_1'], dim=1).view(-1, c, h, w) + outputs_f = torch.stack(feats['forward_1'], dim=1).view(-1, c, h, w) + + if 
self.learnable: + mask_in = mask.view(-1, 2, h, w) + masks_b, masks_f = None, None + outputs = self.fuse(torch.cat([outputs_b, outputs_f, mask_in], dim=1)) + x.view(-1, c, h, w) + else: + masks_b = torch.stack(masks['backward_1'], dim=1) + masks_f = torch.stack(masks['forward_1'], dim=1) + outputs = outputs_f + + return outputs_b.view(b, -1, c, h, w), outputs_f.view(b, -1, c, h, w), \ + outputs.view(b, -1, c, h, w), masks_f + + +class Encoder(nn.Module): + def __init__(self): + super(Encoder, self).__init__() + self.group = [1, 2, 4, 8, 1] + self.layers = nn.ModuleList([ + nn.Conv2d(5, 64, kernel_size=3, stride=2, padding=1), + nn.LeakyReLU(0.2, inplace=True), + nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1), + nn.LeakyReLU(0.2, inplace=True), + nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1), + nn.LeakyReLU(0.2, inplace=True), + nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1), + nn.LeakyReLU(0.2, inplace=True), + nn.Conv2d(256, 384, kernel_size=3, stride=1, padding=1, groups=1), + nn.LeakyReLU(0.2, inplace=True), + nn.Conv2d(640, 512, kernel_size=3, stride=1, padding=1, groups=2), + nn.LeakyReLU(0.2, inplace=True), + nn.Conv2d(768, 384, kernel_size=3, stride=1, padding=1, groups=4), + nn.LeakyReLU(0.2, inplace=True), + nn.Conv2d(640, 256, kernel_size=3, stride=1, padding=1, groups=8), + nn.LeakyReLU(0.2, inplace=True), + nn.Conv2d(512, 128, kernel_size=3, stride=1, padding=1, groups=1), + nn.LeakyReLU(0.2, inplace=True) + ]) + + def forward(self, x): + bt, c, _, _ = x.size() + # h, w = h//4, w//4 + out = x + for i, layer in enumerate(self.layers): + if i == 8: + x0 = out + _, _, h, w = x0.size() + if i > 8 and i % 2 == 0: + g = self.group[(i - 8) // 2] + x = x0.view(bt, g, -1, h, w) + o = out.view(bt, g, -1, h, w) + out = torch.cat([x, o], 2).view(bt, -1, h, w) + out = layer(out) + return out + + +class deconv(nn.Module): + def __init__(self, + input_channel, + output_channel, + kernel_size=3, + padding=0): + super().__init__() + self.conv = nn.Conv2d(input_channel, + output_channel, + kernel_size=kernel_size, + stride=1, + padding=padding) + + def forward(self, x): + x = F.interpolate(x, + scale_factor=2, + mode='bilinear', + align_corners=True) + return self.conv(x) + + +class InpaintGenerator(BaseNetwork): + def __init__(self, init_weights=True, model_path=None): + super(InpaintGenerator, self).__init__() + channel = 128 + hidden = 512 + + # encoder + self.encoder = Encoder() + + # decoder + self.decoder = nn.Sequential( + deconv(channel, 128, kernel_size=3, padding=1), + nn.LeakyReLU(0.2, inplace=True), + nn.Conv2d(128, 64, kernel_size=3, stride=1, padding=1), + nn.LeakyReLU(0.2, inplace=True), + deconv(64, 64, kernel_size=3, padding=1), + nn.LeakyReLU(0.2, inplace=True), + nn.Conv2d(64, 3, kernel_size=3, stride=1, padding=1)) + + # soft split and soft composition + kernel_size = (7, 7) + padding = (3, 3) + stride = (3, 3) + t2t_params = { + 'kernel_size': kernel_size, + 'stride': stride, + 'padding': padding + } + self.ss = SoftSplit(channel, hidden, kernel_size, stride, padding) + self.sc = SoftComp(channel, hidden, kernel_size, stride, padding) + self.max_pool = nn.MaxPool2d(kernel_size, stride, padding) + + # feature propagation module + self.img_prop_module = BidirectionalPropagation(3, learnable=False) + self.feat_prop_module = BidirectionalPropagation(128, learnable=True) + + depths = 8 + num_heads = 4 + window_size = (5, 9) + pool_size = (4, 4) + self.transformers = TemporalSparseTransformerBlock(dim=hidden, + n_head=num_heads, + window_size=window_size, 
+ pool_size=pool_size, + depths=depths, + t2t_params=t2t_params) + if init_weights: + self.init_weights() + + if model_path is not None: + print('Pretrained ProPainter has loaded...') + ckpt = torch.load(model_path, map_location='cpu') + self.load_state_dict(ckpt, strict=True) + + # print network parameter number + self.print_network() + + def img_propagation(self, masked_frames, completed_flows, masks, interpolation='nearest'): + _, _, prop_frames, updated_masks = self.img_prop_module(masked_frames, completed_flows[0], completed_flows[1], + masks, interpolation) + return prop_frames, updated_masks + + def forward(self, masked_frames, completed_flows, masks_in, masks_updated, num_local_frames, + interpolation='bilinear', t_dilation=2): + """ + Args: + masks_in: original mask + masks_updated: updated mask after image propagation + """ + + l_t = num_local_frames + b, t, _, ori_h, ori_w = masked_frames.size() + + # extracting features + enc_feat = self.encoder(torch.cat([masked_frames.view(b * t, 3, ori_h, ori_w), + masks_in.view(b * t, 1, ori_h, ori_w), + masks_updated.view(b * t, 1, ori_h, ori_w)], dim=1)) + _, c, h, w = enc_feat.size() + local_feat = enc_feat.view(b, t, c, h, w)[:, :l_t, ...] + ref_feat = enc_feat.view(b, t, c, h, w)[:, l_t:, ...] + fold_feat_size = (h, w) + + ds_flows_f = F.interpolate(completed_flows[0].view(-1, 2, ori_h, ori_w), scale_factor=1 / 4, mode='bilinear', + align_corners=False).view(b, l_t - 1, 2, h, w) / 4.0 + ds_flows_b = F.interpolate(completed_flows[1].view(-1, 2, ori_h, ori_w), scale_factor=1 / 4, mode='bilinear', + align_corners=False).view(b, l_t - 1, 2, h, w) / 4.0 + ds_mask_in = F.interpolate(masks_in.reshape(-1, 1, ori_h, ori_w), scale_factor=1 / 4, mode='nearest').view(b, t, + 1, h, + w) + ds_mask_in_local = ds_mask_in[:, :l_t] + ds_mask_updated_local = F.interpolate(masks_updated[:, :l_t].reshape(-1, 1, ori_h, ori_w), scale_factor=1 / 4, + mode='nearest').view(b, l_t, 1, h, w) + + if self.training: + mask_pool_l = self.max_pool(ds_mask_in.view(-1, 1, h, w)) + mask_pool_l = mask_pool_l.view(b, t, 1, mask_pool_l.size(-2), mask_pool_l.size(-1)) + else: + mask_pool_l = self.max_pool(ds_mask_in_local.view(-1, 1, h, w)) + mask_pool_l = mask_pool_l.view(b, l_t, 1, mask_pool_l.size(-2), mask_pool_l.size(-1)) + + prop_mask_in = torch.cat([ds_mask_in_local, ds_mask_updated_local], dim=2) + _, _, local_feat, _ = self.feat_prop_module(local_feat, ds_flows_f, ds_flows_b, prop_mask_in, interpolation) + enc_feat = torch.cat((local_feat, ref_feat), dim=1) + + trans_feat = self.ss(enc_feat.view(-1, c, h, w), b, fold_feat_size) + mask_pool_l = rearrange(mask_pool_l, 'b t c h w -> b t h w c').contiguous() + trans_feat = self.transformers(trans_feat, fold_feat_size, mask_pool_l, t_dilation=t_dilation) + trans_feat = self.sc(trans_feat, t, fold_feat_size) + trans_feat = trans_feat.view(b, t, -1, h, w) + + enc_feat = enc_feat + trans_feat + + if self.training: + output = self.decoder(enc_feat.view(-1, c, h, w)) + output = torch.tanh(output).view(b, t, 3, ori_h, ori_w) + else: + output = self.decoder(enc_feat[:, :l_t].view(-1, c, h, w)) + output = torch.tanh(output).view(b, l_t, 3, ori_h, ori_w) + + return output + + +# ###################################################################### +# Discriminator for Temporal Patch GAN +# ###################################################################### +class Discriminator(BaseNetwork): + def __init__(self, + in_channels=3, + use_sigmoid=False, + use_spectral_norm=True, + init_weights=True): + super(Discriminator, 
self).__init__() + self.use_sigmoid = use_sigmoid + nf = 32 + + self.conv = nn.Sequential( + spectral_norm( + nn.Conv3d(in_channels=in_channels, + out_channels=nf * 1, + kernel_size=(3, 5, 5), + stride=(1, 2, 2), + padding=1, + bias=not use_spectral_norm), use_spectral_norm), + # nn.InstanceNorm2d(64, track_running_stats=False), + nn.LeakyReLU(0.2, inplace=True), + spectral_norm( + nn.Conv3d(nf * 1, + nf * 2, + kernel_size=(3, 5, 5), + stride=(1, 2, 2), + padding=(1, 2, 2), + bias=not use_spectral_norm), use_spectral_norm), + # nn.InstanceNorm2d(128, track_running_stats=False), + nn.LeakyReLU(0.2, inplace=True), + spectral_norm( + nn.Conv3d(nf * 2, + nf * 4, + kernel_size=(3, 5, 5), + stride=(1, 2, 2), + padding=(1, 2, 2), + bias=not use_spectral_norm), use_spectral_norm), + # nn.InstanceNorm2d(256, track_running_stats=False), + nn.LeakyReLU(0.2, inplace=True), + spectral_norm( + nn.Conv3d(nf * 4, + nf * 4, + kernel_size=(3, 5, 5), + stride=(1, 2, 2), + padding=(1, 2, 2), + bias=not use_spectral_norm), use_spectral_norm), + # nn.InstanceNorm2d(256, track_running_stats=False), + nn.LeakyReLU(0.2, inplace=True), + spectral_norm( + nn.Conv3d(nf * 4, + nf * 4, + kernel_size=(3, 5, 5), + stride=(1, 2, 2), + padding=(1, 2, 2), + bias=not use_spectral_norm), use_spectral_norm), + # nn.InstanceNorm2d(256, track_running_stats=False), + nn.LeakyReLU(0.2, inplace=True), + nn.Conv3d(nf * 4, + nf * 4, + kernel_size=(3, 5, 5), + stride=(1, 2, 2), + padding=(1, 2, 2))) + + if init_weights: + self.init_weights() + + def forward(self, xs): + # T, C, H, W = xs.shape (old) + # B, T, C, H, W (new) + xs_t = torch.transpose(xs, 1, 2) + feat = self.conv(xs_t) + if self.use_sigmoid: + feat = torch.sigmoid(feat) + out = torch.transpose(feat, 1, 2) # B, T, C, H, W + return out + + +class Discriminator_2D(BaseNetwork): + def __init__(self, + in_channels=3, + use_sigmoid=False, + use_spectral_norm=True, + init_weights=True): + super(Discriminator_2D, self).__init__() + self.use_sigmoid = use_sigmoid + nf = 32 + + self.conv = nn.Sequential( + spectral_norm( + nn.Conv3d(in_channels=in_channels, + out_channels=nf * 1, + kernel_size=(1, 5, 5), + stride=(1, 2, 2), + padding=(0, 2, 2), + bias=not use_spectral_norm), use_spectral_norm), + # nn.InstanceNorm2d(64, track_running_stats=False), + nn.LeakyReLU(0.2, inplace=True), + spectral_norm( + nn.Conv3d(nf * 1, + nf * 2, + kernel_size=(1, 5, 5), + stride=(1, 2, 2), + padding=(0, 2, 2), + bias=not use_spectral_norm), use_spectral_norm), + # nn.InstanceNorm2d(128, track_running_stats=False), + nn.LeakyReLU(0.2, inplace=True), + spectral_norm( + nn.Conv3d(nf * 2, + nf * 4, + kernel_size=(1, 5, 5), + stride=(1, 2, 2), + padding=(0, 2, 2), + bias=not use_spectral_norm), use_spectral_norm), + # nn.InstanceNorm2d(256, track_running_stats=False), + nn.LeakyReLU(0.2, inplace=True), + spectral_norm( + nn.Conv3d(nf * 4, + nf * 4, + kernel_size=(1, 5, 5), + stride=(1, 2, 2), + padding=(0, 2, 2), + bias=not use_spectral_norm), use_spectral_norm), + # nn.InstanceNorm2d(256, track_running_stats=False), + nn.LeakyReLU(0.2, inplace=True), + spectral_norm( + nn.Conv3d(nf * 4, + nf * 4, + kernel_size=(1, 5, 5), + stride=(1, 2, 2), + padding=(0, 2, 2), + bias=not use_spectral_norm), use_spectral_norm), + # nn.InstanceNorm2d(256, track_running_stats=False), + nn.LeakyReLU(0.2, inplace=True), + nn.Conv3d(nf * 4, + nf * 4, + kernel_size=(1, 5, 5), + stride=(1, 2, 2), + padding=(0, 2, 2))) + + if init_weights: + self.init_weights() + + def forward(self, xs): + # T, C, H, W = xs.shape (old) + # B, 
T, C, H, W (new) + xs_t = torch.transpose(xs, 1, 2) + feat = self.conv(xs_t) + if self.use_sigmoid: + feat = torch.sigmoid(feat) + out = torch.transpose(feat, 1, 2) # B, T, C, H, W + return out + + +def spectral_norm(module, mode=True): + if mode: + return _spectral_norm(module) + return module diff --git a/backend/inpaint/video/model/recurrent_flow_completion.py b/backend/inpaint/video/model/recurrent_flow_completion.py new file mode 100644 index 0000000..7038e34 --- /dev/null +++ b/backend/inpaint/video/model/recurrent_flow_completion.py @@ -0,0 +1,348 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision + +from backend.inpaint.video.model.modules.deformconv import ModulatedDeformConv2d +from .misc import constant_init + + +class SecondOrderDeformableAlignment(ModulatedDeformConv2d): + """Second-order deformable alignment module.""" + + def __init__(self, *args, **kwargs): + self.max_residue_magnitude = kwargs.pop('max_residue_magnitude', 5) + + super(SecondOrderDeformableAlignment, self).__init__(*args, **kwargs) + + self.conv_offset = nn.Sequential( + nn.Conv2d(3 * self.out_channels, self.out_channels, 3, 1, 1), + nn.LeakyReLU(negative_slope=0.1, inplace=True), + nn.Conv2d(self.out_channels, self.out_channels, 3, 1, 1), + nn.LeakyReLU(negative_slope=0.1, inplace=True), + nn.Conv2d(self.out_channels, self.out_channels, 3, 1, 1), + nn.LeakyReLU(negative_slope=0.1, inplace=True), + nn.Conv2d(self.out_channels, 27 * self.deform_groups, 3, 1, 1), + ) + self.init_offset() + + def init_offset(self): + constant_init(self.conv_offset[-1], val=0, bias=0) + + def forward(self, x, extra_feat): + out = self.conv_offset(extra_feat) + o1, o2, mask = torch.chunk(out, 3, dim=1) + + # offset + offset = self.max_residue_magnitude * torch.tanh(torch.cat((o1, o2), dim=1)) + offset_1, offset_2 = torch.chunk(offset, 2, dim=1) + offset = torch.cat([offset_1, offset_2], dim=1) + + # mask + mask = torch.sigmoid(mask) + + return torchvision.ops.deform_conv2d(x, offset, self.weight, self.bias, + self.stride, self.padding, + self.dilation, mask) + + +class BidirectionalPropagation(nn.Module): + def __init__(self, channel): + super(BidirectionalPropagation, self).__init__() + modules = ['backward_', 'forward_'] + self.deform_align = nn.ModuleDict() + self.backbone = nn.ModuleDict() + self.channel = channel + + for i, module in enumerate(modules): + self.deform_align[module] = SecondOrderDeformableAlignment( + 2 * channel, channel, 3, padding=1, deform_groups=16) + + self.backbone[module] = nn.Sequential( + nn.Conv2d((2 + i) * channel, channel, 3, 1, 1), + nn.LeakyReLU(negative_slope=0.1, inplace=True), + nn.Conv2d(channel, channel, 3, 1, 1), + ) + + self.fusion = nn.Conv2d(2 * channel, channel, 1, 1, 0) + + def forward(self, x): + """ + x shape : [b, t, c, h, w] + return [b, t, c, h, w] + """ + b, t, c, h, w = x.shape + feats = {} + feats['spatial'] = [x[:, i, :, :, :] for i in range(0, t)] + + for module_name in ['backward_', 'forward_']: + + feats[module_name] = [] + + frame_idx = range(0, t) + mapping_idx = list(range(0, len(feats['spatial']))) + mapping_idx += mapping_idx[::-1] + + if 'backward' in module_name: + frame_idx = frame_idx[::-1] + + feat_prop = x.new_zeros(b, self.channel, h, w) + for i, idx in enumerate(frame_idx): + feat_current = feats['spatial'][mapping_idx[idx]] + if i > 0: + cond_n1 = feat_prop + + # initialize second-order features + feat_n2 = torch.zeros_like(feat_prop) + cond_n2 = torch.zeros_like(cond_n1) + if i > 1: # second-order features + feat_n2 
= feats[module_name][-2] + cond_n2 = feat_n2 + + cond = torch.cat([cond_n1, feat_current, cond_n2], + dim=1) # condition information, cond(flow warped 1st/2nd feature) + feat_prop = torch.cat([feat_prop, feat_n2], dim=1) # two order feat_prop -1 & -2 + feat_prop = self.deform_align[module_name](feat_prop, cond) + + # fuse current features + feat = [feat_current] + \ + [feats[k][idx] for k in feats if k not in ['spatial', module_name]] \ + + [feat_prop] + + feat = torch.cat(feat, dim=1) + # embed current features + feat_prop = feat_prop + self.backbone[module_name](feat) + + feats[module_name].append(feat_prop) + + # end for + if 'backward' in module_name: + feats[module_name] = feats[module_name][::-1] + + outputs = [] + for i in range(0, t): + align_feats = [feats[k].pop(0) for k in feats if k != 'spatial'] + align_feats = torch.cat(align_feats, dim=1) + outputs.append(self.fusion(align_feats)) + + return torch.stack(outputs, dim=1) + x + + +class deconv(nn.Module): + def __init__(self, + input_channel, + output_channel, + kernel_size=3, + padding=0): + super().__init__() + self.conv = nn.Conv2d(input_channel, + output_channel, + kernel_size=kernel_size, + stride=1, + padding=padding) + + def forward(self, x): + x = F.interpolate(x, + scale_factor=2, + mode='bilinear', + align_corners=True) + return self.conv(x) + + +class P3DBlock(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, stride, padding, use_residual=0, bias=True): + super().__init__() + self.conv1 = nn.Sequential( + nn.Conv3d(in_channels, out_channels, kernel_size=(1, kernel_size, kernel_size), + stride=(1, stride, stride), padding=(0, padding, padding), bias=bias), + nn.LeakyReLU(0.2, inplace=True) + ) + self.conv2 = nn.Sequential( + nn.Conv3d(out_channels, out_channels, kernel_size=(3, 1, 1), stride=(1, 1, 1), + padding=(2, 0, 0), dilation=(2, 1, 1), bias=bias) + ) + self.use_residual = use_residual + + def forward(self, feats): + feat1 = self.conv1(feats) + feat2 = self.conv2(feat1) + if self.use_residual: + output = feats + feat2 + else: + output = feat2 + return output + + +class EdgeDetection(nn.Module): + def __init__(self, in_ch=2, out_ch=1, mid_ch=16): + super().__init__() + self.projection = nn.Sequential( + nn.Conv2d(in_ch, mid_ch, 3, 1, 1), + nn.LeakyReLU(0.2, inplace=True) + ) + + self.mid_layer_1 = nn.Sequential( + nn.Conv2d(mid_ch, mid_ch, 3, 1, 1), + nn.LeakyReLU(0.2, inplace=True) + ) + + self.mid_layer_2 = nn.Sequential( + nn.Conv2d(mid_ch, mid_ch, 3, 1, 1) + ) + + self.l_relu = nn.LeakyReLU(0.01, inplace=True) + + self.out_layer = nn.Conv2d(mid_ch, out_ch, 1, 1, 0) + + def forward(self, flow): + flow = self.projection(flow) + edge = self.mid_layer_1(flow) + edge = self.mid_layer_2(edge) + edge = self.l_relu(flow + edge) + edge = self.out_layer(edge) + edge = torch.sigmoid(edge) + return edge + + +class RecurrentFlowCompleteNet(nn.Module): + def __init__(self, model_path=None): + super().__init__() + self.downsample = nn.Sequential( + nn.Conv3d(3, 32, kernel_size=(1, 5, 5), stride=(1, 2, 2), + padding=(0, 2, 2), padding_mode='replicate'), + nn.LeakyReLU(0.2, inplace=True) + ) + + self.encoder1 = nn.Sequential( + P3DBlock(32, 32, 3, 1, 1), + nn.LeakyReLU(0.2, inplace=True), + P3DBlock(32, 64, 3, 2, 1), + nn.LeakyReLU(0.2, inplace=True) + ) # 4x + + self.encoder2 = nn.Sequential( + P3DBlock(64, 64, 3, 1, 1), + nn.LeakyReLU(0.2, inplace=True), + P3DBlock(64, 128, 3, 2, 1), + nn.LeakyReLU(0.2, inplace=True) + ) # 8x + + self.mid_dilation = nn.Sequential( + nn.Conv3d(128, 128, (1, 3, 3), (1, 
1, 1), padding=(0, 3, 3), dilation=(1, 3, 3)), # p = d*(k-1)/2 + nn.LeakyReLU(0.2, inplace=True), + nn.Conv3d(128, 128, (1, 3, 3), (1, 1, 1), padding=(0, 2, 2), dilation=(1, 2, 2)), + nn.LeakyReLU(0.2, inplace=True), + nn.Conv3d(128, 128, (1, 3, 3), (1, 1, 1), padding=(0, 1, 1), dilation=(1, 1, 1)), + nn.LeakyReLU(0.2, inplace=True) + ) + + # feature propagation module + self.feat_prop_module = BidirectionalPropagation(128) + + self.decoder2 = nn.Sequential( + nn.Conv2d(128, 128, 3, 1, 1), + nn.LeakyReLU(0.2, inplace=True), + deconv(128, 64, 3, 1), + nn.LeakyReLU(0.2, inplace=True) + ) # 4x + + self.decoder1 = nn.Sequential( + nn.Conv2d(64, 64, 3, 1, 1), + nn.LeakyReLU(0.2, inplace=True), + deconv(64, 32, 3, 1), + nn.LeakyReLU(0.2, inplace=True) + ) # 2x + + self.upsample = nn.Sequential( + nn.Conv2d(32, 32, 3, padding=1), + nn.LeakyReLU(0.2, inplace=True), + deconv(32, 2, 3, 1) + ) + + # edge loss + self.edgeDetector = EdgeDetection(in_ch=2, out_ch=1, mid_ch=16) + + # Need to initial the weights of MSDeformAttn specifically + for m in self.modules(): + if isinstance(m, SecondOrderDeformableAlignment): + m.init_offset() + + if model_path is not None: + print('Pretrained flow completion model has loaded...') + ckpt = torch.load(model_path, map_location='cpu') + self.load_state_dict(ckpt, strict=True) + + def forward(self, masked_flows, masks): + # masked_flows: b t-1 2 h w + # masks: b t-1 2 h w + b, t, _, h, w = masked_flows.size() + masked_flows = masked_flows.permute(0, 2, 1, 3, 4) + masks = masks.permute(0, 2, 1, 3, 4) + + inputs = torch.cat((masked_flows, masks), dim=1) + + x = self.downsample(inputs) + + feat_e1 = self.encoder1(x) + feat_e2 = self.encoder2(feat_e1) # b c t h w + feat_mid = self.mid_dilation(feat_e2) # b c t h w + feat_mid = feat_mid.permute(0, 2, 1, 3, 4) # b t c h w + + feat_prop = self.feat_prop_module(feat_mid) + feat_prop = feat_prop.view(-1, 128, h // 8, w // 8) # b*t c h w + + _, c, _, h_f, w_f = feat_e1.shape + feat_e1 = feat_e1.permute(0, 2, 1, 3, 4).contiguous().view(-1, c, h_f, w_f) # b*t c h w + feat_d2 = self.decoder2(feat_prop) + feat_e1 + + _, c, _, h_f, w_f = x.shape + x = x.permute(0, 2, 1, 3, 4).contiguous().view(-1, c, h_f, w_f) # b*t c h w + + feat_d1 = self.decoder1(feat_d2) + + flow = self.upsample(feat_d1) + if self.training: + edge = self.edgeDetector(flow) + edge = edge.view(b, t, 1, h, w) + else: + edge = None + + flow = flow.view(b, t, 2, h, w) + + return flow, edge + + def forward_bidirect_flow(self, masked_flows_bi, masks): + """ + Args: + masked_flows_bi: [masked_flows_f, masked_flows_b] | (b t-1 2 h w), (b t-1 2 h w) + masks: b t 1 h w + """ + masks_forward = masks[:, :-1, ...].contiguous() + masks_backward = masks[:, 1:, ...].contiguous() + + # mask flow + masked_flows_forward = masked_flows_bi[0] * (1 - masks_forward) + masked_flows_backward = masked_flows_bi[1] * (1 - masks_backward) + + # -- completion -- + # forward + pred_flows_forward, pred_edges_forward = self.forward(masked_flows_forward, masks_forward) + + # backward + masked_flows_backward = torch.flip(masked_flows_backward, dims=[1]) + masks_backward = torch.flip(masks_backward, dims=[1]) + pred_flows_backward, pred_edges_backward = self.forward(masked_flows_backward, masks_backward) + pred_flows_backward = torch.flip(pred_flows_backward, dims=[1]) + if self.training: + pred_edges_backward = torch.flip(pred_edges_backward, dims=[1]) + + return [pred_flows_forward, pred_flows_backward], [pred_edges_forward, pred_edges_backward] + + def combine_flow(self, masked_flows_bi, 
pred_flows_bi, masks): + masks_forward = masks[:, :-1, ...].contiguous() + masks_backward = masks[:, 1:, ...].contiguous() + + pred_flows_forward = pred_flows_bi[0] * masks_forward + masked_flows_bi[0] * (1 - masks_forward) + pred_flows_backward = pred_flows_bi[1] * masks_backward + masked_flows_bi[1] * (1 - masks_backward) + + return pred_flows_forward, pred_flows_backward diff --git a/backend/inpaint/video/model/vgg_arch.py b/backend/inpaint/video/model/vgg_arch.py new file mode 100644 index 0000000..43fc2ff --- /dev/null +++ b/backend/inpaint/video/model/vgg_arch.py @@ -0,0 +1,157 @@ +import os +import torch +from collections import OrderedDict +from torch import nn as nn +from torchvision.models import vgg as vgg + +VGG_PRETRAIN_PATH = 'experiments/pretrained_models/vgg19-dcbb9e9d.pth' +NAMES = { + 'vgg11': [ + 'conv1_1', 'relu1_1', 'pool1', 'conv2_1', 'relu2_1', 'pool2', 'conv3_1', 'relu3_1', 'conv3_2', 'relu3_2', + 'pool3', 'conv4_1', 'relu4_1', 'conv4_2', 'relu4_2', 'pool4', 'conv5_1', 'relu5_1', 'conv5_2', 'relu5_2', + 'pool5' + ], + 'vgg13': [ + 'conv1_1', 'relu1_1', 'conv1_2', 'relu1_2', 'pool1', 'conv2_1', 'relu2_1', 'conv2_2', 'relu2_2', 'pool2', + 'conv3_1', 'relu3_1', 'conv3_2', 'relu3_2', 'pool3', 'conv4_1', 'relu4_1', 'conv4_2', 'relu4_2', 'pool4', + 'conv5_1', 'relu5_1', 'conv5_2', 'relu5_2', 'pool5' + ], + 'vgg16': [ + 'conv1_1', 'relu1_1', 'conv1_2', 'relu1_2', 'pool1', 'conv2_1', 'relu2_1', 'conv2_2', 'relu2_2', 'pool2', + 'conv3_1', 'relu3_1', 'conv3_2', 'relu3_2', 'conv3_3', 'relu3_3', 'pool3', 'conv4_1', 'relu4_1', 'conv4_2', + 'relu4_2', 'conv4_3', 'relu4_3', 'pool4', 'conv5_1', 'relu5_1', 'conv5_2', 'relu5_2', 'conv5_3', 'relu5_3', + 'pool5' + ], + 'vgg19': [ + 'conv1_1', 'relu1_1', 'conv1_2', 'relu1_2', 'pool1', 'conv2_1', 'relu2_1', 'conv2_2', 'relu2_2', 'pool2', + 'conv3_1', 'relu3_1', 'conv3_2', 'relu3_2', 'conv3_3', 'relu3_3', 'conv3_4', 'relu3_4', 'pool3', 'conv4_1', + 'relu4_1', 'conv4_2', 'relu4_2', 'conv4_3', 'relu4_3', 'conv4_4', 'relu4_4', 'pool4', 'conv5_1', 'relu5_1', + 'conv5_2', 'relu5_2', 'conv5_3', 'relu5_3', 'conv5_4', 'relu5_4', 'pool5' + ] +} + + +def insert_bn(names): + """Insert bn layer after each conv. + + Args: + names (list): The list of layer names. + + Returns: + list: The list of layer names with bn layers. + """ + names_bn = [] + for name in names: + names_bn.append(name) + if 'conv' in name: + position = name.replace('conv', '') + names_bn.append('bn' + position) + return names_bn + +class VGGFeatureExtractor(nn.Module): + """VGG network for feature extraction. + + In this implementation, we allow users to choose whether use normalization + in the input feature and the type of vgg network. Note that the pretrained + path must fit the vgg type. + + Args: + layer_name_list (list[str]): Forward function returns the corresponding + features according to the layer_name_list. + Example: {'relu1_1', 'relu2_1', 'relu3_1'}. + vgg_type (str): Set the type of vgg network. Default: 'vgg19'. + use_input_norm (bool): If True, normalize the input image. Importantly, + the input feature must in the range [0, 1]. Default: True. + range_norm (bool): If True, norm images with range [-1, 1] to [0, 1]. + Default: False. + requires_grad (bool): If true, the parameters of VGG network will be + optimized. Default: False. + remove_pooling (bool): If true, the max pooling operations in VGG net + will be removed. Default: False. + pooling_stride (int): The stride of max pooling operation. Default: 2. 
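+
+    Example (an illustrative sketch, not part of the original code; it assumes the
+    torchvision VGG19 weights are available and uses a 224x224 input):
+        >>> extractor = VGGFeatureExtractor(layer_name_list=['relu1_1', 'relu3_1'])
+        >>> feats = extractor(torch.rand(1, 3, 224, 224))
+        >>> sorted(feats.keys())        # one entry per requested layer
+        ['relu1_1', 'relu3_1']
+        >>> feats['relu3_1'].shape      # 256 channels at 1/4 resolution
+        torch.Size([1, 256, 56, 56])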
+ """ + + def __init__(self, + layer_name_list, + vgg_type='vgg19', + use_input_norm=True, + range_norm=False, + requires_grad=False, + remove_pooling=False, + pooling_stride=2): + super(VGGFeatureExtractor, self).__init__() + + self.layer_name_list = layer_name_list + self.use_input_norm = use_input_norm + self.range_norm = range_norm + + self.names = NAMES[vgg_type.replace('_bn', '')] + if 'bn' in vgg_type: + self.names = insert_bn(self.names) + + # only borrow layers that will be used to avoid unused params + max_idx = 0 + for v in layer_name_list: + idx = self.names.index(v) + if idx > max_idx: + max_idx = idx + + if os.path.exists(VGG_PRETRAIN_PATH): + vgg_net = getattr(vgg, vgg_type)(pretrained=False) + state_dict = torch.load(VGG_PRETRAIN_PATH, map_location=lambda storage, loc: storage) + vgg_net.load_state_dict(state_dict) + else: + vgg_net = getattr(vgg, vgg_type)(pretrained=True) + + features = vgg_net.features[:max_idx + 1] + + modified_net = OrderedDict() + for k, v in zip(self.names, features): + if 'pool' in k: + # if remove_pooling is true, pooling operation will be removed + if remove_pooling: + continue + else: + # in some cases, we may want to change the default stride + modified_net[k] = nn.MaxPool2d(kernel_size=2, stride=pooling_stride) + else: + modified_net[k] = v + + self.vgg_net = nn.Sequential(modified_net) + + if not requires_grad: + self.vgg_net.eval() + for param in self.parameters(): + param.requires_grad = False + else: + self.vgg_net.train() + for param in self.parameters(): + param.requires_grad = True + + if self.use_input_norm: + # the mean is for image with range [0, 1] + self.register_buffer('mean', torch.Tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)) + # the std is for image with range [0, 1] + self.register_buffer('std', torch.Tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)) + + def forward(self, x): + """Forward function. + + Args: + x (Tensor): Input tensor with shape (n, c, h, w). + + Returns: + Tensor: Forward results. 
+ """ + if self.range_norm: + x = (x + 1) / 2 + if self.use_input_norm: + x = (x - self.mean) / self.std + output = {} + + for key, layer in self.vgg_net._modules.items(): + x = layer(x) + if key in self.layer_name_list: + output[key] = x.clone() + + return output diff --git a/backend/inpaint/video/raft/__init__.py b/backend/inpaint/video/raft/__init__.py new file mode 100755 index 0000000..e7179ea --- /dev/null +++ b/backend/inpaint/video/raft/__init__.py @@ -0,0 +1,2 @@ +# from .demo import RAFT_infer +from .raft import RAFT diff --git a/backend/inpaint/video/raft/corr.py b/backend/inpaint/video/raft/corr.py new file mode 100755 index 0000000..34603a8 --- /dev/null +++ b/backend/inpaint/video/raft/corr.py @@ -0,0 +1,91 @@ +import torch +import torch.nn.functional as F +from .utils.utils import bilinear_sampler, coords_grid + +try: + import alt_cuda_corr +except: + # alt_cuda_corr is not compiled + pass + + +class CorrBlock: + def __init__(self, fmap1, fmap2, num_levels=4, radius=4): + self.num_levels = num_levels + self.radius = radius + self.corr_pyramid = [] + + # all pairs correlation + corr = CorrBlock.corr(fmap1, fmap2) + + batch, h1, w1, dim, h2, w2 = corr.shape + corr = corr.reshape(batch*h1*w1, dim, h2, w2) + + self.corr_pyramid.append(corr) + for i in range(self.num_levels-1): + corr = F.avg_pool2d(corr, 2, stride=2) + self.corr_pyramid.append(corr) + + def __call__(self, coords): + r = self.radius + coords = coords.permute(0, 2, 3, 1) + batch, h1, w1, _ = coords.shape + + out_pyramid = [] + for i in range(self.num_levels): + corr = self.corr_pyramid[i] + dx = torch.linspace(-r, r, 2*r+1) + dy = torch.linspace(-r, r, 2*r+1) + delta = torch.stack(torch.meshgrid(dy, dx), axis=-1).to(coords.device) + + centroid_lvl = coords.reshape(batch*h1*w1, 1, 1, 2) / 2**i + delta_lvl = delta.view(1, 2*r+1, 2*r+1, 2) + coords_lvl = centroid_lvl + delta_lvl + + corr = bilinear_sampler(corr, coords_lvl) + corr = corr.view(batch, h1, w1, -1) + out_pyramid.append(corr) + + out = torch.cat(out_pyramid, dim=-1) + return out.permute(0, 3, 1, 2).contiguous().float() + + @staticmethod + def corr(fmap1, fmap2): + batch, dim, ht, wd = fmap1.shape + fmap1 = fmap1.view(batch, dim, ht*wd) + fmap2 = fmap2.view(batch, dim, ht*wd) + + corr = torch.matmul(fmap1.transpose(1,2), fmap2) + corr = corr.view(batch, ht, wd, 1, ht, wd) + return corr / torch.sqrt(torch.tensor(dim).float()) + + +class AlternateCorrBlock: + def __init__(self, fmap1, fmap2, num_levels=4, radius=4): + self.num_levels = num_levels + self.radius = radius + + self.pyramid = [(fmap1, fmap2)] + for i in range(self.num_levels): + fmap1 = F.avg_pool2d(fmap1, 2, stride=2) + fmap2 = F.avg_pool2d(fmap2, 2, stride=2) + self.pyramid.append((fmap1, fmap2)) + + def __call__(self, coords): + + coords = coords.permute(0, 2, 3, 1) + B, H, W, _ = coords.shape + + corr_list = [] + for i in range(self.num_levels): + r = self.radius + fmap1_i = self.pyramid[0][0].permute(0, 2, 3, 1) + fmap2_i = self.pyramid[i][1].permute(0, 2, 3, 1) + + coords_i = (coords / 2**i).reshape(B, 1, H, W, 2).contiguous() + corr = alt_cuda_corr(fmap1_i, fmap2_i, coords_i, r) + corr_list.append(corr.squeeze(1)) + + corr = torch.stack(corr_list, dim=1) + corr = corr.reshape(B, -1, H, W) + return corr / 16.0 diff --git a/backend/inpaint/video/raft/datasets.py b/backend/inpaint/video/raft/datasets.py new file mode 100755 index 0000000..3411fda --- /dev/null +++ b/backend/inpaint/video/raft/datasets.py @@ -0,0 +1,235 @@ +# Data loading based on https://github.com/NVIDIA/flownet2-pytorch + 
+import numpy as np +import torch +import torch.utils.data as data +import torch.nn.functional as F + +import os +import math +import random +from glob import glob +import os.path as osp + +from utils import frame_utils +from utils.augmentor import FlowAugmentor, SparseFlowAugmentor + + +class FlowDataset(data.Dataset): + def __init__(self, aug_params=None, sparse=False): + self.augmentor = None + self.sparse = sparse + if aug_params is not None: + if sparse: + self.augmentor = SparseFlowAugmentor(**aug_params) + else: + self.augmentor = FlowAugmentor(**aug_params) + + self.is_test = False + self.init_seed = False + self.flow_list = [] + self.image_list = [] + self.extra_info = [] + + def __getitem__(self, index): + + if self.is_test: + img1 = frame_utils.read_gen(self.image_list[index][0]) + img2 = frame_utils.read_gen(self.image_list[index][1]) + img1 = np.array(img1).astype(np.uint8)[..., :3] + img2 = np.array(img2).astype(np.uint8)[..., :3] + img1 = torch.from_numpy(img1).permute(2, 0, 1).float() + img2 = torch.from_numpy(img2).permute(2, 0, 1).float() + return img1, img2, self.extra_info[index] + + if not self.init_seed: + worker_info = torch.utils.data.get_worker_info() + if worker_info is not None: + torch.manual_seed(worker_info.id) + np.random.seed(worker_info.id) + random.seed(worker_info.id) + self.init_seed = True + + index = index % len(self.image_list) + valid = None + if self.sparse: + flow, valid = frame_utils.readFlowKITTI(self.flow_list[index]) + else: + flow = frame_utils.read_gen(self.flow_list[index]) + + img1 = frame_utils.read_gen(self.image_list[index][0]) + img2 = frame_utils.read_gen(self.image_list[index][1]) + + flow = np.array(flow).astype(np.float32) + img1 = np.array(img1).astype(np.uint8) + img2 = np.array(img2).astype(np.uint8) + + # grayscale images + if len(img1.shape) == 2: + img1 = np.tile(img1[...,None], (1, 1, 3)) + img2 = np.tile(img2[...,None], (1, 1, 3)) + else: + img1 = img1[..., :3] + img2 = img2[..., :3] + + if self.augmentor is not None: + if self.sparse: + img1, img2, flow, valid = self.augmentor(img1, img2, flow, valid) + else: + img1, img2, flow = self.augmentor(img1, img2, flow) + + img1 = torch.from_numpy(img1).permute(2, 0, 1).float() + img2 = torch.from_numpy(img2).permute(2, 0, 1).float() + flow = torch.from_numpy(flow).permute(2, 0, 1).float() + + if valid is not None: + valid = torch.from_numpy(valid) + else: + valid = (flow[0].abs() < 1000) & (flow[1].abs() < 1000) + + return img1, img2, flow, valid.float() + + + def __rmul__(self, v): + self.flow_list = v * self.flow_list + self.image_list = v * self.image_list + return self + + def __len__(self): + return len(self.image_list) + + +class MpiSintel(FlowDataset): + def __init__(self, aug_params=None, split='training', root='datasets/Sintel', dstype='clean'): + super(MpiSintel, self).__init__(aug_params) + flow_root = osp.join(root, split, 'flow') + image_root = osp.join(root, split, dstype) + + if split == 'test': + self.is_test = True + + for scene in os.listdir(image_root): + image_list = sorted(glob(osp.join(image_root, scene, '*.png'))) + for i in range(len(image_list)-1): + self.image_list += [ [image_list[i], image_list[i+1]] ] + self.extra_info += [ (scene, i) ] # scene and frame_id + + if split != 'test': + self.flow_list += sorted(glob(osp.join(flow_root, scene, '*.flo'))) + + +class FlyingChairs(FlowDataset): + def __init__(self, aug_params=None, split='train', root='datasets/FlyingChairs_release/data'): + super(FlyingChairs, self).__init__(aug_params) + + images = 
sorted(glob(osp.join(root, '*.ppm'))) + flows = sorted(glob(osp.join(root, '*.flo'))) + assert (len(images)//2 == len(flows)) + + split_list = np.loadtxt('chairs_split.txt', dtype=np.int32) + for i in range(len(flows)): + xid = split_list[i] + if (split=='training' and xid==1) or (split=='validation' and xid==2): + self.flow_list += [ flows[i] ] + self.image_list += [ [images[2*i], images[2*i+1]] ] + + +class FlyingThings3D(FlowDataset): + def __init__(self, aug_params=None, root='datasets/FlyingThings3D', dstype='frames_cleanpass'): + super(FlyingThings3D, self).__init__(aug_params) + + for cam in ['left']: + for direction in ['into_future', 'into_past']: + image_dirs = sorted(glob(osp.join(root, dstype, 'TRAIN/*/*'))) + image_dirs = sorted([osp.join(f, cam) for f in image_dirs]) + + flow_dirs = sorted(glob(osp.join(root, 'optical_flow/TRAIN/*/*'))) + flow_dirs = sorted([osp.join(f, direction, cam) for f in flow_dirs]) + + for idir, fdir in zip(image_dirs, flow_dirs): + images = sorted(glob(osp.join(idir, '*.png')) ) + flows = sorted(glob(osp.join(fdir, '*.pfm')) ) + for i in range(len(flows)-1): + if direction == 'into_future': + self.image_list += [ [images[i], images[i+1]] ] + self.flow_list += [ flows[i] ] + elif direction == 'into_past': + self.image_list += [ [images[i+1], images[i]] ] + self.flow_list += [ flows[i+1] ] + + +class KITTI(FlowDataset): + def __init__(self, aug_params=None, split='training', root='datasets/KITTI'): + super(KITTI, self).__init__(aug_params, sparse=True) + if split == 'testing': + self.is_test = True + + root = osp.join(root, split) + images1 = sorted(glob(osp.join(root, 'image_2/*_10.png'))) + images2 = sorted(glob(osp.join(root, 'image_2/*_11.png'))) + + for img1, img2 in zip(images1, images2): + frame_id = img1.split('/')[-1] + self.extra_info += [ [frame_id] ] + self.image_list += [ [img1, img2] ] + + if split == 'training': + self.flow_list = sorted(glob(osp.join(root, 'flow_occ/*_10.png'))) + + +class HD1K(FlowDataset): + def __init__(self, aug_params=None, root='datasets/HD1k'): + super(HD1K, self).__init__(aug_params, sparse=True) + + seq_ix = 0 + while 1: + flows = sorted(glob(os.path.join(root, 'hd1k_flow_gt', 'flow_occ/%06d_*.png' % seq_ix))) + images = sorted(glob(os.path.join(root, 'hd1k_input', 'image_2/%06d_*.png' % seq_ix))) + + if len(flows) == 0: + break + + for i in range(len(flows)-1): + self.flow_list += [flows[i]] + self.image_list += [ [images[i], images[i+1]] ] + + seq_ix += 1 + + +def fetch_dataloader(args, TRAIN_DS='C+T+K+S+H'): + """ Create the data loader for the corresponding trainign set """ + + if args.stage == 'chairs': + aug_params = {'crop_size': args.image_size, 'min_scale': -0.1, 'max_scale': 1.0, 'do_flip': True} + train_dataset = FlyingChairs(aug_params, split='training') + + elif args.stage == 'things': + aug_params = {'crop_size': args.image_size, 'min_scale': -0.4, 'max_scale': 0.8, 'do_flip': True} + clean_dataset = FlyingThings3D(aug_params, dstype='frames_cleanpass') + final_dataset = FlyingThings3D(aug_params, dstype='frames_finalpass') + train_dataset = clean_dataset + final_dataset + + elif args.stage == 'sintel': + aug_params = {'crop_size': args.image_size, 'min_scale': -0.2, 'max_scale': 0.6, 'do_flip': True} + things = FlyingThings3D(aug_params, dstype='frames_cleanpass') + sintel_clean = MpiSintel(aug_params, split='training', dstype='clean') + sintel_final = MpiSintel(aug_params, split='training', dstype='final') + + if TRAIN_DS == 'C+T+K+S+H': + kitti = KITTI({'crop_size': args.image_size, 
'min_scale': -0.3, 'max_scale': 0.5, 'do_flip': True}) + hd1k = HD1K({'crop_size': args.image_size, 'min_scale': -0.5, 'max_scale': 0.2, 'do_flip': True}) + train_dataset = 100*sintel_clean + 100*sintel_final + 200*kitti + 5*hd1k + things + + elif TRAIN_DS == 'C+T+K/S': + train_dataset = 100*sintel_clean + 100*sintel_final + things + + elif args.stage == 'kitti': + aug_params = {'crop_size': args.image_size, 'min_scale': -0.2, 'max_scale': 0.4, 'do_flip': False} + train_dataset = KITTI(aug_params, split='training') + + train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, + pin_memory=False, shuffle=True, num_workers=4, drop_last=True) + + print('Training with %d image pairs' % len(train_dataset)) + return train_loader + diff --git a/backend/inpaint/video/raft/demo.py b/backend/inpaint/video/raft/demo.py new file mode 100755 index 0000000..096963b --- /dev/null +++ b/backend/inpaint/video/raft/demo.py @@ -0,0 +1,79 @@ +import sys +import argparse +import os +import cv2 +import glob +import numpy as np +import torch +from PIL import Image + +from .raft import RAFT +from .utils import flow_viz +from .utils.utils import InputPadder + + + +DEVICE = 'cuda' + +def load_image(imfile): + img = np.array(Image.open(imfile)).astype(np.uint8) + img = torch.from_numpy(img).permute(2, 0, 1).float() + return img + + +def load_image_list(image_files): + images = [] + for imfile in sorted(image_files): + images.append(load_image(imfile)) + + images = torch.stack(images, dim=0) + images = images.to(DEVICE) + + padder = InputPadder(images.shape) + return padder.pad(images)[0] + + +def viz(img, flo): + img = img[0].permute(1,2,0).cpu().numpy() + flo = flo[0].permute(1,2,0).cpu().numpy() + + # map flow to rgb image + flo = flow_viz.flow_to_image(flo) + # img_flo = np.concatenate([img, flo], axis=0) + img_flo = flo + + cv2.imwrite('/home/chengao/test/flow.png', img_flo[:, :, [2,1,0]]) + # cv2.imshow('image', img_flo[:, :, [2,1,0]]/255.0) + # cv2.waitKey() + + +def demo(args): + model = torch.nn.DataParallel(RAFT(args)) + model.load_state_dict(torch.load(args.model)) + + model = model.module + model.to(DEVICE) + model.eval() + + with torch.no_grad(): + images = glob.glob(os.path.join(args.path, '*.png')) + \ + glob.glob(os.path.join(args.path, '*.jpg')) + + images = load_image_list(images) + for i in range(images.shape[0]-1): + image1 = images[i,None] + image2 = images[i+1,None] + + flow_low, flow_up = model(image1, image2, iters=20, test_mode=True) + viz(image1, flow_up) + + +def RAFT_infer(args): + model = torch.nn.DataParallel(RAFT(args)) + model.load_state_dict(torch.load(args.model)) + + model = model.module + model.to(DEVICE) + model.eval() + + return model diff --git a/backend/inpaint/video/raft/extractor.py b/backend/inpaint/video/raft/extractor.py new file mode 100755 index 0000000..9a9c759 --- /dev/null +++ b/backend/inpaint/video/raft/extractor.py @@ -0,0 +1,267 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class ResidualBlock(nn.Module): + def __init__(self, in_planes, planes, norm_fn='group', stride=1): + super(ResidualBlock, self).__init__() + + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, padding=1, stride=stride) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1) + self.relu = nn.ReLU(inplace=True) + + num_groups = planes // 8 + + if norm_fn == 'group': + self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes) + self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes) + if not stride == 1: + 
self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes) + + elif norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(planes) + self.norm2 = nn.BatchNorm2d(planes) + if not stride == 1: + self.norm3 = nn.BatchNorm2d(planes) + + elif norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(planes) + self.norm2 = nn.InstanceNorm2d(planes) + if not stride == 1: + self.norm3 = nn.InstanceNorm2d(planes) + + elif norm_fn == 'none': + self.norm1 = nn.Sequential() + self.norm2 = nn.Sequential() + if not stride == 1: + self.norm3 = nn.Sequential() + + if stride == 1: + self.downsample = None + + else: + self.downsample = nn.Sequential( + nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3) + + + def forward(self, x): + y = x + y = self.relu(self.norm1(self.conv1(y))) + y = self.relu(self.norm2(self.conv2(y))) + + if self.downsample is not None: + x = self.downsample(x) + + return self.relu(x+y) + + + +class BottleneckBlock(nn.Module): + def __init__(self, in_planes, planes, norm_fn='group', stride=1): + super(BottleneckBlock, self).__init__() + + self.conv1 = nn.Conv2d(in_planes, planes//4, kernel_size=1, padding=0) + self.conv2 = nn.Conv2d(planes//4, planes//4, kernel_size=3, padding=1, stride=stride) + self.conv3 = nn.Conv2d(planes//4, planes, kernel_size=1, padding=0) + self.relu = nn.ReLU(inplace=True) + + num_groups = planes // 8 + + if norm_fn == 'group': + self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes//4) + self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes//4) + self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes) + if not stride == 1: + self.norm4 = nn.GroupNorm(num_groups=num_groups, num_channels=planes) + + elif norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(planes//4) + self.norm2 = nn.BatchNorm2d(planes//4) + self.norm3 = nn.BatchNorm2d(planes) + if not stride == 1: + self.norm4 = nn.BatchNorm2d(planes) + + elif norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(planes//4) + self.norm2 = nn.InstanceNorm2d(planes//4) + self.norm3 = nn.InstanceNorm2d(planes) + if not stride == 1: + self.norm4 = nn.InstanceNorm2d(planes) + + elif norm_fn == 'none': + self.norm1 = nn.Sequential() + self.norm2 = nn.Sequential() + self.norm3 = nn.Sequential() + if not stride == 1: + self.norm4 = nn.Sequential() + + if stride == 1: + self.downsample = None + + else: + self.downsample = nn.Sequential( + nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm4) + + + def forward(self, x): + y = x + y = self.relu(self.norm1(self.conv1(y))) + y = self.relu(self.norm2(self.conv2(y))) + y = self.relu(self.norm3(self.conv3(y))) + + if self.downsample is not None: + x = self.downsample(x) + + return self.relu(x+y) + +class BasicEncoder(nn.Module): + def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0): + super(BasicEncoder, self).__init__() + self.norm_fn = norm_fn + + if self.norm_fn == 'group': + self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64) + + elif self.norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(64) + + elif self.norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(64) + + elif self.norm_fn == 'none': + self.norm1 = nn.Sequential() + + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3) + self.relu1 = nn.ReLU(inplace=True) + + self.in_planes = 64 + self.layer1 = self._make_layer(64, stride=1) + self.layer2 = self._make_layer(96, stride=2) + self.layer3 = self._make_layer(128, stride=2) + + # output convolution + self.conv2 = nn.Conv2d(128, output_dim, 
kernel_size=1) + + self.dropout = None + if dropout > 0: + self.dropout = nn.Dropout2d(p=dropout) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)): + if m.weight is not None: + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def _make_layer(self, dim, stride=1): + layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride) + layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1) + layers = (layer1, layer2) + + self.in_planes = dim + return nn.Sequential(*layers) + + + def forward(self, x): + + # if input is list, combine batch dimension + is_list = isinstance(x, tuple) or isinstance(x, list) + if is_list: + batch_dim = x[0].shape[0] + x = torch.cat(x, dim=0) + + x = self.conv1(x) + x = self.norm1(x) + x = self.relu1(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + + x = self.conv2(x) + + if self.training and self.dropout is not None: + x = self.dropout(x) + + if is_list: + x = torch.split(x, [batch_dim, batch_dim], dim=0) + + return x + + +class SmallEncoder(nn.Module): + def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0): + super(SmallEncoder, self).__init__() + self.norm_fn = norm_fn + + if self.norm_fn == 'group': + self.norm1 = nn.GroupNorm(num_groups=8, num_channels=32) + + elif self.norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(32) + + elif self.norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(32) + + elif self.norm_fn == 'none': + self.norm1 = nn.Sequential() + + self.conv1 = nn.Conv2d(3, 32, kernel_size=7, stride=2, padding=3) + self.relu1 = nn.ReLU(inplace=True) + + self.in_planes = 32 + self.layer1 = self._make_layer(32, stride=1) + self.layer2 = self._make_layer(64, stride=2) + self.layer3 = self._make_layer(96, stride=2) + + self.dropout = None + if dropout > 0: + self.dropout = nn.Dropout2d(p=dropout) + + self.conv2 = nn.Conv2d(96, output_dim, kernel_size=1) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)): + if m.weight is not None: + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def _make_layer(self, dim, stride=1): + layer1 = BottleneckBlock(self.in_planes, dim, self.norm_fn, stride=stride) + layer2 = BottleneckBlock(dim, dim, self.norm_fn, stride=1) + layers = (layer1, layer2) + + self.in_planes = dim + return nn.Sequential(*layers) + + + def forward(self, x): + + # if input is list, combine batch dimension + is_list = isinstance(x, tuple) or isinstance(x, list) + if is_list: + batch_dim = x[0].shape[0] + x = torch.cat(x, dim=0) + + x = self.conv1(x) + x = self.norm1(x) + x = self.relu1(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.conv2(x) + + if self.training and self.dropout is not None: + x = self.dropout(x) + + if is_list: + x = torch.split(x, [batch_dim, batch_dim], dim=0) + + return x diff --git a/backend/inpaint/video/raft/raft.py b/backend/inpaint/video/raft/raft.py new file mode 100755 index 0000000..829ef97 --- /dev/null +++ b/backend/inpaint/video/raft/raft.py @@ -0,0 +1,146 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .update import BasicUpdateBlock, SmallUpdateBlock +from .extractor import BasicEncoder, SmallEncoder 
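+# A minimal construction sketch (illustrative only; __init__ reads args.small,
+# forward() reads args.mixed_precision, and corr_levels / corr_radius / dropout /
+# alternate_corr are assigned inside __init__):
+#   args = argparse.Namespace(small=False, mixed_precision=False)
+#   model = RAFT(args)
+#   flow_low, flow_up = model(image1, image2, iters=20, test_mode=True)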
+from .corr import CorrBlock, AlternateCorrBlock +from .utils.utils import bilinear_sampler, coords_grid, upflow8 + +try: + autocast = torch.cuda.amp.autocast +except: + # dummy autocast for PyTorch < 1.6 + class autocast: + def __init__(self, enabled): + pass + def __enter__(self): + pass + def __exit__(self, *args): + pass + + +class RAFT(nn.Module): + def __init__(self, args): + super(RAFT, self).__init__() + self.args = args + + if args.small: + self.hidden_dim = hdim = 96 + self.context_dim = cdim = 64 + args.corr_levels = 4 + args.corr_radius = 3 + + else: + self.hidden_dim = hdim = 128 + self.context_dim = cdim = 128 + args.corr_levels = 4 + args.corr_radius = 4 + + if 'dropout' not in args._get_kwargs(): + args.dropout = 0 + + if 'alternate_corr' not in args._get_kwargs(): + args.alternate_corr = False + + # feature network, context network, and update block + if args.small: + self.fnet = SmallEncoder(output_dim=128, norm_fn='instance', dropout=args.dropout) + self.cnet = SmallEncoder(output_dim=hdim+cdim, norm_fn='none', dropout=args.dropout) + self.update_block = SmallUpdateBlock(self.args, hidden_dim=hdim) + + else: + self.fnet = BasicEncoder(output_dim=256, norm_fn='instance', dropout=args.dropout) + self.cnet = BasicEncoder(output_dim=hdim+cdim, norm_fn='batch', dropout=args.dropout) + self.update_block = BasicUpdateBlock(self.args, hidden_dim=hdim) + + + def freeze_bn(self): + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + + def initialize_flow(self, img): + """ Flow is represented as difference between two coordinate grids flow = coords1 - coords0""" + N, C, H, W = img.shape + coords0 = coords_grid(N, H//8, W//8).to(img.device) + coords1 = coords_grid(N, H//8, W//8).to(img.device) + + # optical flow computed as difference: flow = coords1 - coords0 + return coords0, coords1 + + def upsample_flow(self, flow, mask): + """ Upsample flow field [H/8, W/8, 2] -> [H, W, 2] using convex combination """ + N, _, H, W = flow.shape + mask = mask.view(N, 1, 9, 8, 8, H, W) + mask = torch.softmax(mask, dim=2) + + up_flow = F.unfold(8 * flow, [3,3], padding=1) + up_flow = up_flow.view(N, 2, 9, 1, 1, H, W) + + up_flow = torch.sum(mask * up_flow, dim=2) + up_flow = up_flow.permute(0, 1, 4, 2, 5, 3) + return up_flow.reshape(N, 2, 8*H, 8*W) + + + def forward(self, image1, image2, iters=12, flow_init=None, test_mode=True): + """ Estimate optical flow between pair of frames """ + + # image1 = 2 * (image1 / 255.0) - 1.0 + # image2 = 2 * (image2 / 255.0) - 1.0 + + image1 = image1.contiguous() + image2 = image2.contiguous() + + hdim = self.hidden_dim + cdim = self.context_dim + + # run the feature network + with autocast(enabled=self.args.mixed_precision): + fmap1, fmap2 = self.fnet([image1, image2]) + + fmap1 = fmap1.float() + fmap2 = fmap2.float() + + if self.args.alternate_corr: + corr_fn = AlternateCorrBlock(fmap1, fmap2, radius=self.args.corr_radius) + else: + corr_fn = CorrBlock(fmap1, fmap2, radius=self.args.corr_radius) + + # run the context network + with autocast(enabled=self.args.mixed_precision): + cnet = self.cnet(image1) + net, inp = torch.split(cnet, [hdim, cdim], dim=1) + net = torch.tanh(net) + inp = torch.relu(inp) + + coords0, coords1 = self.initialize_flow(image1) + + if flow_init is not None: + coords1 = coords1 + flow_init + + flow_predictions = [] + for itr in range(iters): + coords1 = coords1.detach() + corr = corr_fn(coords1) # index correlation volume + + flow = coords1 - coords0 + with autocast(enabled=self.args.mixed_precision): + net, up_mask, 
delta_flow = self.update_block(net, inp, corr, flow) + + # F(t+1) = F(t) + \Delta(t) + coords1 = coords1 + delta_flow + + # upsample predictions + if up_mask is None: + flow_up = upflow8(coords1 - coords0) + else: + flow_up = self.upsample_flow(coords1 - coords0, up_mask) + + flow_predictions.append(flow_up) + + if test_mode: + return coords1 - coords0, flow_up + + return flow_predictions diff --git a/backend/inpaint/video/raft/update.py b/backend/inpaint/video/raft/update.py new file mode 100755 index 0000000..f940497 --- /dev/null +++ b/backend/inpaint/video/raft/update.py @@ -0,0 +1,139 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class FlowHead(nn.Module): + def __init__(self, input_dim=128, hidden_dim=256): + super(FlowHead, self).__init__() + self.conv1 = nn.Conv2d(input_dim, hidden_dim, 3, padding=1) + self.conv2 = nn.Conv2d(hidden_dim, 2, 3, padding=1) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + return self.conv2(self.relu(self.conv1(x))) + +class ConvGRU(nn.Module): + def __init__(self, hidden_dim=128, input_dim=192+128): + super(ConvGRU, self).__init__() + self.convz = nn.Conv2d(hidden_dim+input_dim, hidden_dim, 3, padding=1) + self.convr = nn.Conv2d(hidden_dim+input_dim, hidden_dim, 3, padding=1) + self.convq = nn.Conv2d(hidden_dim+input_dim, hidden_dim, 3, padding=1) + + def forward(self, h, x): + hx = torch.cat([h, x], dim=1) + + z = torch.sigmoid(self.convz(hx)) + r = torch.sigmoid(self.convr(hx)) + q = torch.tanh(self.convq(torch.cat([r*h, x], dim=1))) + + h = (1-z) * h + z * q + return h + +class SepConvGRU(nn.Module): + def __init__(self, hidden_dim=128, input_dim=192+128): + super(SepConvGRU, self).__init__() + self.convz1 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2)) + self.convr1 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2)) + self.convq1 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2)) + + self.convz2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0)) + self.convr2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0)) + self.convq2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0)) + + + def forward(self, h, x): + # horizontal + hx = torch.cat([h, x], dim=1) + z = torch.sigmoid(self.convz1(hx)) + r = torch.sigmoid(self.convr1(hx)) + q = torch.tanh(self.convq1(torch.cat([r*h, x], dim=1))) + h = (1-z) * h + z * q + + # vertical + hx = torch.cat([h, x], dim=1) + z = torch.sigmoid(self.convz2(hx)) + r = torch.sigmoid(self.convr2(hx)) + q = torch.tanh(self.convq2(torch.cat([r*h, x], dim=1))) + h = (1-z) * h + z * q + + return h + +class SmallMotionEncoder(nn.Module): + def __init__(self, args): + super(SmallMotionEncoder, self).__init__() + cor_planes = args.corr_levels * (2*args.corr_radius + 1)**2 + self.convc1 = nn.Conv2d(cor_planes, 96, 1, padding=0) + self.convf1 = nn.Conv2d(2, 64, 7, padding=3) + self.convf2 = nn.Conv2d(64, 32, 3, padding=1) + self.conv = nn.Conv2d(128, 80, 3, padding=1) + + def forward(self, flow, corr): + cor = F.relu(self.convc1(corr)) + flo = F.relu(self.convf1(flow)) + flo = F.relu(self.convf2(flo)) + cor_flo = torch.cat([cor, flo], dim=1) + out = F.relu(self.conv(cor_flo)) + return torch.cat([out, flow], dim=1) + +class BasicMotionEncoder(nn.Module): + def __init__(self, args): + super(BasicMotionEncoder, self).__init__() + cor_planes = args.corr_levels * (2*args.corr_radius + 1)**2 + self.convc1 = nn.Conv2d(cor_planes, 256, 1, padding=0) + self.convc2 = nn.Conv2d(256, 192, 3, 
padding=1) + self.convf1 = nn.Conv2d(2, 128, 7, padding=3) + self.convf2 = nn.Conv2d(128, 64, 3, padding=1) + self.conv = nn.Conv2d(64+192, 128-2, 3, padding=1) + + def forward(self, flow, corr): + cor = F.relu(self.convc1(corr)) + cor = F.relu(self.convc2(cor)) + flo = F.relu(self.convf1(flow)) + flo = F.relu(self.convf2(flo)) + + cor_flo = torch.cat([cor, flo], dim=1) + out = F.relu(self.conv(cor_flo)) + return torch.cat([out, flow], dim=1) + +class SmallUpdateBlock(nn.Module): + def __init__(self, args, hidden_dim=96): + super(SmallUpdateBlock, self).__init__() + self.encoder = SmallMotionEncoder(args) + self.gru = ConvGRU(hidden_dim=hidden_dim, input_dim=82+64) + self.flow_head = FlowHead(hidden_dim, hidden_dim=128) + + def forward(self, net, inp, corr, flow): + motion_features = self.encoder(flow, corr) + inp = torch.cat([inp, motion_features], dim=1) + net = self.gru(net, inp) + delta_flow = self.flow_head(net) + + return net, None, delta_flow + +class BasicUpdateBlock(nn.Module): + def __init__(self, args, hidden_dim=128, input_dim=128): + super(BasicUpdateBlock, self).__init__() + self.args = args + self.encoder = BasicMotionEncoder(args) + self.gru = SepConvGRU(hidden_dim=hidden_dim, input_dim=128+hidden_dim) + self.flow_head = FlowHead(hidden_dim, hidden_dim=256) + + self.mask = nn.Sequential( + nn.Conv2d(128, 256, 3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 64*9, 1, padding=0)) + + def forward(self, net, inp, corr, flow, upsample=True): + motion_features = self.encoder(flow, corr) + inp = torch.cat([inp, motion_features], dim=1) + + net = self.gru(net, inp) + delta_flow = self.flow_head(net) + + # scale mask to balence gradients + mask = .25 * self.mask(net) + return net, mask, delta_flow + + + diff --git a/backend/inpaint/video/raft/utils/__init__.py b/backend/inpaint/video/raft/utils/__init__.py new file mode 100755 index 0000000..0437149 --- /dev/null +++ b/backend/inpaint/video/raft/utils/__init__.py @@ -0,0 +1,2 @@ +from .flow_viz import flow_to_image +from .frame_utils import writeFlow diff --git a/backend/inpaint/video/raft/utils/augmentor.py b/backend/inpaint/video/raft/utils/augmentor.py new file mode 100755 index 0000000..e81c4f2 --- /dev/null +++ b/backend/inpaint/video/raft/utils/augmentor.py @@ -0,0 +1,246 @@ +import numpy as np +import random +import math +from PIL import Image + +import cv2 +cv2.setNumThreads(0) +cv2.ocl.setUseOpenCL(False) + +import torch +from torchvision.transforms import ColorJitter +import torch.nn.functional as F + + +class FlowAugmentor: + def __init__(self, crop_size, min_scale=-0.2, max_scale=0.5, do_flip=True): + + # spatial augmentation params + self.crop_size = crop_size + self.min_scale = min_scale + self.max_scale = max_scale + self.spatial_aug_prob = 0.8 + self.stretch_prob = 0.8 + self.max_stretch = 0.2 + + # flip augmentation params + self.do_flip = do_flip + self.h_flip_prob = 0.5 + self.v_flip_prob = 0.1 + + # photometric augmentation params + self.photo_aug = ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.5/3.14) + self.asymmetric_color_aug_prob = 0.2 + self.eraser_aug_prob = 0.5 + + def color_transform(self, img1, img2): + """ Photometric augmentation """ + + # asymmetric + if np.random.rand() < self.asymmetric_color_aug_prob: + img1 = np.array(self.photo_aug(Image.fromarray(img1)), dtype=np.uint8) + img2 = np.array(self.photo_aug(Image.fromarray(img2)), dtype=np.uint8) + + # symmetric + else: + image_stack = np.concatenate([img1, img2], axis=0) + image_stack = 
np.array(self.photo_aug(Image.fromarray(image_stack)), dtype=np.uint8) + img1, img2 = np.split(image_stack, 2, axis=0) + + return img1, img2 + + def eraser_transform(self, img1, img2, bounds=[50, 100]): + """ Occlusion augmentation """ + + ht, wd = img1.shape[:2] + if np.random.rand() < self.eraser_aug_prob: + mean_color = np.mean(img2.reshape(-1, 3), axis=0) + for _ in range(np.random.randint(1, 3)): + x0 = np.random.randint(0, wd) + y0 = np.random.randint(0, ht) + dx = np.random.randint(bounds[0], bounds[1]) + dy = np.random.randint(bounds[0], bounds[1]) + img2[y0:y0+dy, x0:x0+dx, :] = mean_color + + return img1, img2 + + def spatial_transform(self, img1, img2, flow): + # randomly sample scale + ht, wd = img1.shape[:2] + min_scale = np.maximum( + (self.crop_size[0] + 8) / float(ht), + (self.crop_size[1] + 8) / float(wd)) + + scale = 2 ** np.random.uniform(self.min_scale, self.max_scale) + scale_x = scale + scale_y = scale + if np.random.rand() < self.stretch_prob: + scale_x *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch) + scale_y *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch) + + scale_x = np.clip(scale_x, min_scale, None) + scale_y = np.clip(scale_y, min_scale, None) + + if np.random.rand() < self.spatial_aug_prob: + # rescale the images + img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) + img2 = cv2.resize(img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) + flow = cv2.resize(flow, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) + flow = flow * [scale_x, scale_y] + + if self.do_flip: + if np.random.rand() < self.h_flip_prob: # h-flip + img1 = img1[:, ::-1] + img2 = img2[:, ::-1] + flow = flow[:, ::-1] * [-1.0, 1.0] + + if np.random.rand() < self.v_flip_prob: # v-flip + img1 = img1[::-1, :] + img2 = img2[::-1, :] + flow = flow[::-1, :] * [1.0, -1.0] + + y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0]) + x0 = np.random.randint(0, img1.shape[1] - self.crop_size[1]) + + img1 = img1[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] + img2 = img2[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] + flow = flow[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] + + return img1, img2, flow + + def __call__(self, img1, img2, flow): + img1, img2 = self.color_transform(img1, img2) + img1, img2 = self.eraser_transform(img1, img2) + img1, img2, flow = self.spatial_transform(img1, img2, flow) + + img1 = np.ascontiguousarray(img1) + img2 = np.ascontiguousarray(img2) + flow = np.ascontiguousarray(flow) + + return img1, img2, flow + +class SparseFlowAugmentor: + def __init__(self, crop_size, min_scale=-0.2, max_scale=0.5, do_flip=False): + # spatial augmentation params + self.crop_size = crop_size + self.min_scale = min_scale + self.max_scale = max_scale + self.spatial_aug_prob = 0.8 + self.stretch_prob = 0.8 + self.max_stretch = 0.2 + + # flip augmentation params + self.do_flip = do_flip + self.h_flip_prob = 0.5 + self.v_flip_prob = 0.1 + + # photometric augmentation params + self.photo_aug = ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.3/3.14) + self.asymmetric_color_aug_prob = 0.2 + self.eraser_aug_prob = 0.5 + + def color_transform(self, img1, img2): + image_stack = np.concatenate([img1, img2], axis=0) + image_stack = np.array(self.photo_aug(Image.fromarray(image_stack)), dtype=np.uint8) + img1, img2 = np.split(image_stack, 2, axis=0) + return img1, img2 + + def eraser_transform(self, img1, img2): + ht, wd = img1.shape[:2] + if np.random.rand() < self.eraser_aug_prob: + 
mean_color = np.mean(img2.reshape(-1, 3), axis=0) + for _ in range(np.random.randint(1, 3)): + x0 = np.random.randint(0, wd) + y0 = np.random.randint(0, ht) + dx = np.random.randint(50, 100) + dy = np.random.randint(50, 100) + img2[y0:y0+dy, x0:x0+dx, :] = mean_color + + return img1, img2 + + def resize_sparse_flow_map(self, flow, valid, fx=1.0, fy=1.0): + ht, wd = flow.shape[:2] + coords = np.meshgrid(np.arange(wd), np.arange(ht)) + coords = np.stack(coords, axis=-1) + + coords = coords.reshape(-1, 2).astype(np.float32) + flow = flow.reshape(-1, 2).astype(np.float32) + valid = valid.reshape(-1).astype(np.float32) + + coords0 = coords[valid>=1] + flow0 = flow[valid>=1] + + ht1 = int(round(ht * fy)) + wd1 = int(round(wd * fx)) + + coords1 = coords0 * [fx, fy] + flow1 = flow0 * [fx, fy] + + xx = np.round(coords1[:,0]).astype(np.int32) + yy = np.round(coords1[:,1]).astype(np.int32) + + v = (xx > 0) & (xx < wd1) & (yy > 0) & (yy < ht1) + xx = xx[v] + yy = yy[v] + flow1 = flow1[v] + + flow_img = np.zeros([ht1, wd1, 2], dtype=np.float32) + valid_img = np.zeros([ht1, wd1], dtype=np.int32) + + flow_img[yy, xx] = flow1 + valid_img[yy, xx] = 1 + + return flow_img, valid_img + + def spatial_transform(self, img1, img2, flow, valid): + # randomly sample scale + + ht, wd = img1.shape[:2] + min_scale = np.maximum( + (self.crop_size[0] + 1) / float(ht), + (self.crop_size[1] + 1) / float(wd)) + + scale = 2 ** np.random.uniform(self.min_scale, self.max_scale) + scale_x = np.clip(scale, min_scale, None) + scale_y = np.clip(scale, min_scale, None) + + if np.random.rand() < self.spatial_aug_prob: + # rescale the images + img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) + img2 = cv2.resize(img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) + flow, valid = self.resize_sparse_flow_map(flow, valid, fx=scale_x, fy=scale_y) + + if self.do_flip: + if np.random.rand() < 0.5: # h-flip + img1 = img1[:, ::-1] + img2 = img2[:, ::-1] + flow = flow[:, ::-1] * [-1.0, 1.0] + valid = valid[:, ::-1] + + margin_y = 20 + margin_x = 50 + + y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0] + margin_y) + x0 = np.random.randint(-margin_x, img1.shape[1] - self.crop_size[1] + margin_x) + + y0 = np.clip(y0, 0, img1.shape[0] - self.crop_size[0]) + x0 = np.clip(x0, 0, img1.shape[1] - self.crop_size[1]) + + img1 = img1[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] + img2 = img2[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] + flow = flow[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] + valid = valid[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] + return img1, img2, flow, valid + + + def __call__(self, img1, img2, flow, valid): + img1, img2 = self.color_transform(img1, img2) + img1, img2 = self.eraser_transform(img1, img2) + img1, img2, flow, valid = self.spatial_transform(img1, img2, flow, valid) + + img1 = np.ascontiguousarray(img1) + img2 = np.ascontiguousarray(img2) + flow = np.ascontiguousarray(flow) + valid = np.ascontiguousarray(valid) + + return img1, img2, flow, valid diff --git a/backend/inpaint/video/raft/utils/flow_viz.py b/backend/inpaint/video/raft/utils/flow_viz.py new file mode 100755 index 0000000..dcee65e --- /dev/null +++ b/backend/inpaint/video/raft/utils/flow_viz.py @@ -0,0 +1,132 @@ +# Flow visualization code used from https://github.com/tomrunia/OpticalFlow_Visualization + + +# MIT License +# +# Copyright (c) 2018 Tom Runia +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated 
documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to conditions. +# +# Author: Tom Runia +# Date Created: 2018-08-03 + +import numpy as np + +def make_colorwheel(): + """ + Generates a color wheel for optical flow visualization as presented in: + Baker et al. "A Database and Evaluation Methodology for Optical Flow" (ICCV, 2007) + URL: http://vision.middlebury.edu/flow/flowEval-iccv07.pdf + + Code follows the original C++ source code of Daniel Scharstein. + Code follows the the Matlab source code of Deqing Sun. + + Returns: + np.ndarray: Color wheel + """ + + RY = 15 + YG = 6 + GC = 4 + CB = 11 + BM = 13 + MR = 6 + + ncols = RY + YG + GC + CB + BM + MR + colorwheel = np.zeros((ncols, 3)) + col = 0 + + # RY + colorwheel[0:RY, 0] = 255 + colorwheel[0:RY, 1] = np.floor(255*np.arange(0,RY)/RY) + col = col+RY + # YG + colorwheel[col:col+YG, 0] = 255 - np.floor(255*np.arange(0,YG)/YG) + colorwheel[col:col+YG, 1] = 255 + col = col+YG + # GC + colorwheel[col:col+GC, 1] = 255 + colorwheel[col:col+GC, 2] = np.floor(255*np.arange(0,GC)/GC) + col = col+GC + # CB + colorwheel[col:col+CB, 1] = 255 - np.floor(255*np.arange(CB)/CB) + colorwheel[col:col+CB, 2] = 255 + col = col+CB + # BM + colorwheel[col:col+BM, 2] = 255 + colorwheel[col:col+BM, 0] = np.floor(255*np.arange(0,BM)/BM) + col = col+BM + # MR + colorwheel[col:col+MR, 2] = 255 - np.floor(255*np.arange(MR)/MR) + colorwheel[col:col+MR, 0] = 255 + return colorwheel + + +def flow_uv_to_colors(u, v, convert_to_bgr=False): + """ + Applies the flow color wheel to (possibly clipped) flow components u and v. + + According to the C++ source code of Daniel Scharstein + According to the Matlab source code of Deqing Sun + + Args: + u (np.ndarray): Input horizontal flow of shape [H,W] + v (np.ndarray): Input vertical flow of shape [H,W] + convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False. + + Returns: + np.ndarray: Flow visualization image of shape [H,W,3] + """ + flow_image = np.zeros((u.shape[0], u.shape[1], 3), np.uint8) + colorwheel = make_colorwheel() # shape [55x3] + ncols = colorwheel.shape[0] + rad = np.sqrt(np.square(u) + np.square(v)) + a = np.arctan2(-v, -u)/np.pi + fk = (a+1) / 2*(ncols-1) + k0 = np.floor(fk).astype(np.int32) + k1 = k0 + 1 + k1[k1 == ncols] = 0 + f = fk - k0 + for i in range(colorwheel.shape[1]): + tmp = colorwheel[:,i] + col0 = tmp[k0] / 255.0 + col1 = tmp[k1] / 255.0 + col = (1-f)*col0 + f*col1 + idx = (rad <= 1) + col[idx] = 1 - rad[idx] * (1-col[idx]) + col[~idx] = col[~idx] * 0.75 # out of range + # Note the 2-i => BGR instead of RGB + ch_idx = 2-i if convert_to_bgr else i + flow_image[:,:,ch_idx] = np.floor(255 * col) + return flow_image + + +def flow_to_image(flow_uv, clip_flow=None, convert_to_bgr=False): + """ + Expects a two dimensional flow image of shape. + + Args: + flow_uv (np.ndarray): Flow UV image of shape [H,W,2] + clip_flow (float, optional): Clip maximum of flow values. Defaults to None. + convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False. 
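
    Example (illustrative, shapes are arbitrary):
        >>> flow = np.random.randn(480, 640, 2).astype(np.float32)
        >>> img = flow_to_image(flow)  # uint8 array of shape (480, 640, 3)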
+ + Returns: + np.ndarray: Flow visualization image of shape [H,W,3] + """ + assert flow_uv.ndim == 3, 'input flow must have three dimensions' + assert flow_uv.shape[2] == 2, 'input flow must have shape [H,W,2]' + if clip_flow is not None: + flow_uv = np.clip(flow_uv, 0, clip_flow) + u = flow_uv[:,:,0] + v = flow_uv[:,:,1] + rad = np.sqrt(np.square(u) + np.square(v)) + rad_max = np.max(rad) + epsilon = 1e-5 + u = u / (rad_max + epsilon) + v = v / (rad_max + epsilon) + return flow_uv_to_colors(u, v, convert_to_bgr) \ No newline at end of file diff --git a/backend/inpaint/video/raft/utils/flow_viz_pt.py b/backend/inpaint/video/raft/utils/flow_viz_pt.py new file mode 100644 index 0000000..12e666a --- /dev/null +++ b/backend/inpaint/video/raft/utils/flow_viz_pt.py @@ -0,0 +1,118 @@ +# Flow visualization code adapted from https://github.com/tomrunia/OpticalFlow_Visualization +import torch +torch.pi = torch.acos(torch.zeros(1)).item() * 2 # which is 3.1415927410125732 + +@torch.no_grad() +def flow_to_image(flow: torch.Tensor) -> torch.Tensor: + + """ + Converts a flow to an RGB image. + + Args: + flow (Tensor): Flow of shape (N, 2, H, W) or (2, H, W) and dtype torch.float. + + Returns: + img (Tensor): Image Tensor of dtype uint8 where each color corresponds + to a given flow direction. Shape is (N, 3, H, W) or (3, H, W) depending on the input. + """ + + if flow.dtype != torch.float: + raise ValueError(f"Flow should be of dtype torch.float, got {flow.dtype}.") + + orig_shape = flow.shape + if flow.ndim == 3: + flow = flow[None] # Add batch dim + + if flow.ndim != 4 or flow.shape[1] != 2: + raise ValueError(f"Input flow should have shape (2, H, W) or (N, 2, H, W), got {orig_shape}.") + + max_norm = torch.sum(flow**2, dim=1).sqrt().max() + epsilon = torch.finfo((flow).dtype).eps + normalized_flow = flow / (max_norm + epsilon) + img = _normalized_flow_to_image(normalized_flow) + + if len(orig_shape) == 3: + img = img[0] # Remove batch dim + return img + +@torch.no_grad() +def _normalized_flow_to_image(normalized_flow: torch.Tensor) -> torch.Tensor: + + """ + Converts a batch of normalized flow to an RGB image. + + Args: + normalized_flow (torch.Tensor): Normalized flow tensor of shape (N, 2, H, W) + Returns: + img (Tensor(N, 3, H, W)): Flow visualization image of dtype uint8. + """ + + N, _, H, W = normalized_flow.shape + device = normalized_flow.device + flow_image = torch.zeros((N, 3, H, W), dtype=torch.uint8, device=device) + colorwheel = _make_colorwheel().to(device) # shape [55x3] + num_cols = colorwheel.shape[0] + norm = torch.sum(normalized_flow**2, dim=1).sqrt() + a = torch.atan2(-normalized_flow[:, 1, :, :], -normalized_flow[:, 0, :, :]) / torch.pi + fk = (a + 1) / 2 * (num_cols - 1) + k0 = torch.floor(fk).to(torch.long) + k1 = k0 + 1 + k1[k1 == num_cols] = 0 + f = fk - k0 + + for c in range(colorwheel.shape[1]): + tmp = colorwheel[:, c] + col0 = tmp[k0] / 255.0 + col1 = tmp[k1] / 255.0 + col = (1 - f) * col0 + f * col1 + col = 1 - norm * (1 - col) + flow_image[:, c, :, :] = torch.floor(255. * col) + return flow_image + + +@torch.no_grad() +def _make_colorwheel() -> torch.Tensor: + """ + Generates a color wheel for optical flow visualization as presented in: + Baker et al. "A Database and Evaluation Methodology for Optical Flow" (ICCV, 2007) + URL: http://vision.middlebury.edu/flow/flowEval-iccv07.pdf. + + Returns: + colorwheel (Tensor[55, 3]): Colorwheel Tensor. 
+ """ + + RY = 15 + YG = 6 + GC = 4 + CB = 11 + BM = 13 + MR = 6 + + ncols = RY + YG + GC + CB + BM + MR + colorwheel = torch.zeros((ncols, 3)) + col = 0 + + # RY + colorwheel[0:RY, 0] = 255 + colorwheel[0:RY, 1] = torch.floor(255. * torch.arange(0., RY) / RY) + col = col + RY + # YG + colorwheel[col : col + YG, 0] = 255 - torch.floor(255. * torch.arange(0., YG) / YG) + colorwheel[col : col + YG, 1] = 255 + col = col + YG + # GC + colorwheel[col : col + GC, 1] = 255 + colorwheel[col : col + GC, 2] = torch.floor(255. * torch.arange(0., GC) / GC) + col = col + GC + # CB + colorwheel[col : col + CB, 1] = 255 - torch.floor(255. * torch.arange(CB) / CB) + colorwheel[col : col + CB, 2] = 255 + col = col + CB + # BM + colorwheel[col : col + BM, 2] = 255 + colorwheel[col : col + BM, 0] = torch.floor(255. * torch.arange(0., BM) / BM) + col = col + BM + # MR + colorwheel[col : col + MR, 2] = 255 - torch.floor(255. * torch.arange(MR) / MR) + colorwheel[col : col + MR, 0] = 255 + return colorwheel diff --git a/backend/inpaint/video/raft/utils/frame_utils.py b/backend/inpaint/video/raft/utils/frame_utils.py new file mode 100755 index 0000000..6c49113 --- /dev/null +++ b/backend/inpaint/video/raft/utils/frame_utils.py @@ -0,0 +1,137 @@ +import numpy as np +from PIL import Image +from os.path import * +import re + +import cv2 +cv2.setNumThreads(0) +cv2.ocl.setUseOpenCL(False) + +TAG_CHAR = np.array([202021.25], np.float32) + +def readFlow(fn): + """ Read .flo file in Middlebury format""" + # Code adapted from: + # http://stackoverflow.com/questions/28013200/reading-middlebury-flow-files-with-python-bytes-array-numpy + + # WARNING: this will work on little-endian architectures (eg Intel x86) only! + # print 'fn = %s'%(fn) + with open(fn, 'rb') as f: + magic = np.fromfile(f, np.float32, count=1) + if 202021.25 != magic: + print('Magic number incorrect. Invalid .flo file') + return None + else: + w = np.fromfile(f, np.int32, count=1) + h = np.fromfile(f, np.int32, count=1) + # print 'Reading %d x %d flo file\n' % (w, h) + data = np.fromfile(f, np.float32, count=2*int(w)*int(h)) + # Reshape data into 3D array (columns, rows, bands) + # The reshape here is for visualization, the original code is (w,h,2) + return np.resize(data, (int(h), int(w), 2)) + +def readPFM(file): + file = open(file, 'rb') + + color = None + width = None + height = None + scale = None + endian = None + + header = file.readline().rstrip() + if header == b'PF': + color = True + elif header == b'Pf': + color = False + else: + raise Exception('Not a PFM file.') + + dim_match = re.match(rb'^(\d+)\s(\d+)\s$', file.readline()) + if dim_match: + width, height = map(int, dim_match.groups()) + else: + raise Exception('Malformed PFM header.') + + scale = float(file.readline().rstrip()) + if scale < 0: # little-endian + endian = '<' + scale = -scale + else: + endian = '>' # big-endian + + data = np.fromfile(file, endian + 'f') + shape = (height, width, 3) if color else (height, width) + + data = np.reshape(data, shape) + data = np.flipud(data) + return data + +def writeFlow(filename,uv,v=None): + """ Write optical flow to file. + + If v is None, uv is assumed to contain both u and v channels, + stacked in depth. + Original code by Deqing Sun, adapted from Daniel Scharstein. 
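
    Example (illustrative):
        writeFlow('out.flo', np.zeros((436, 1024, 2), dtype=np.float32))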
+ """ + nBands = 2 + + if v is None: + assert(uv.ndim == 3) + assert(uv.shape[2] == 2) + u = uv[:,:,0] + v = uv[:,:,1] + else: + u = uv + + assert(u.shape == v.shape) + height,width = u.shape + f = open(filename,'wb') + # write the header + f.write(TAG_CHAR) + np.array(width).astype(np.int32).tofile(f) + np.array(height).astype(np.int32).tofile(f) + # arrange into matrix form + tmp = np.zeros((height, width*nBands)) + tmp[:,np.arange(width)*2] = u + tmp[:,np.arange(width)*2 + 1] = v + tmp.astype(np.float32).tofile(f) + f.close() + + +def readFlowKITTI(filename): + flow = cv2.imread(filename, cv2.IMREAD_ANYDEPTH|cv2.IMREAD_COLOR) + flow = flow[:,:,::-1].astype(np.float32) + flow, valid = flow[:, :, :2], flow[:, :, 2] + flow = (flow - 2**15) / 64.0 + return flow, valid + +def readDispKITTI(filename): + disp = cv2.imread(filename, cv2.IMREAD_ANYDEPTH) / 256.0 + valid = disp > 0.0 + flow = np.stack([-disp, np.zeros_like(disp)], -1) + return flow, valid + + +def writeFlowKITTI(filename, uv): + uv = 64.0 * uv + 2**15 + valid = np.ones([uv.shape[0], uv.shape[1], 1]) + uv = np.concatenate([uv, valid], axis=-1).astype(np.uint16) + cv2.imwrite(filename, uv[..., ::-1]) + + +def read_gen(file_name, pil=False): + ext = splitext(file_name)[-1] + if ext == '.png' or ext == '.jpeg' or ext == '.ppm' or ext == '.jpg': + return Image.open(file_name) + elif ext == '.bin' or ext == '.raw': + return np.load(file_name) + elif ext == '.flo': + return readFlow(file_name).astype(np.float32) + elif ext == '.pfm': + flow = readPFM(file_name).astype(np.float32) + if len(flow.shape) == 2: + return flow + else: + return flow[:, :, :-1] + return [] \ No newline at end of file diff --git a/backend/inpaint/video/raft/utils/utils.py b/backend/inpaint/video/raft/utils/utils.py new file mode 100755 index 0000000..5f32d28 --- /dev/null +++ b/backend/inpaint/video/raft/utils/utils.py @@ -0,0 +1,82 @@ +import torch +import torch.nn.functional as F +import numpy as np +from scipy import interpolate + + +class InputPadder: + """ Pads images such that dimensions are divisible by 8 """ + def __init__(self, dims, mode='sintel'): + self.ht, self.wd = dims[-2:] + pad_ht = (((self.ht // 8) + 1) * 8 - self.ht) % 8 + pad_wd = (((self.wd // 8) + 1) * 8 - self.wd) % 8 + if mode == 'sintel': + self._pad = [pad_wd//2, pad_wd - pad_wd//2, pad_ht//2, pad_ht - pad_ht//2] + else: + self._pad = [pad_wd//2, pad_wd - pad_wd//2, 0, pad_ht] + + def pad(self, *inputs): + return [F.pad(x, self._pad, mode='replicate') for x in inputs] + + def unpad(self,x): + ht, wd = x.shape[-2:] + c = [self._pad[2], ht-self._pad[3], self._pad[0], wd-self._pad[1]] + return x[..., c[0]:c[1], c[2]:c[3]] + +def forward_interpolate(flow): + flow = flow.detach().cpu().numpy() + dx, dy = flow[0], flow[1] + + ht, wd = dx.shape + x0, y0 = np.meshgrid(np.arange(wd), np.arange(ht)) + + x1 = x0 + dx + y1 = y0 + dy + + x1 = x1.reshape(-1) + y1 = y1.reshape(-1) + dx = dx.reshape(-1) + dy = dy.reshape(-1) + + valid = (x1 > 0) & (x1 < wd) & (y1 > 0) & (y1 < ht) + x1 = x1[valid] + y1 = y1[valid] + dx = dx[valid] + dy = dy[valid] + + flow_x = interpolate.griddata( + (x1, y1), dx, (x0, y0), method='nearest', fill_value=0) + + flow_y = interpolate.griddata( + (x1, y1), dy, (x0, y0), method='nearest', fill_value=0) + + flow = np.stack([flow_x, flow_y], axis=0) + return torch.from_numpy(flow).float() + + +def bilinear_sampler(img, coords, mode='bilinear', mask=False): + """ Wrapper for grid_sample, uses pixel coordinates """ + H, W = img.shape[-2:] + xgrid, ygrid = coords.split([1,1], 
dim=-1) + xgrid = 2*xgrid/(W-1) - 1 + ygrid = 2*ygrid/(H-1) - 1 + + grid = torch.cat([xgrid, ygrid], dim=-1) + img = F.grid_sample(img, grid, align_corners=True) + + if mask: + mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1) + return img, mask.float() + + return img + + +def coords_grid(batch, ht, wd): + coords = torch.meshgrid(torch.arange(ht), torch.arange(wd)) + coords = torch.stack(coords[::-1], dim=0).float() + return coords[None].repeat(batch, 1, 1, 1) + + +def upflow8(flow, mode='bilinear'): + new_size = (8 * flow.shape[2], 8 * flow.shape[3]) + return 8 * F.interpolate(flow, size=new_size, mode=mode, align_corners=True) diff --git a/backend/inpaint/video_inpaint.py b/backend/inpaint/video_inpaint.py new file mode 100644 index 0000000..19ee802 --- /dev/null +++ b/backend/inpaint/video_inpaint.py @@ -0,0 +1,378 @@ +# -*- coding: utf-8 -*- +import os +import cv2 +import numpy as np +import scipy.ndimage +from PIL import Image + +import torch +import torchvision + +from backend import config +from backend.inpaint.video.model.modules.flow_comp_raft import RAFT_bi +from backend.inpaint.video.model.recurrent_flow_completion import RecurrentFlowCompleteNet +from backend.inpaint.video.model.propainter import InpaintGenerator +from backend.inpaint.video.core.utils import to_tensors +from backend.inpaint.video.model.misc import get_device + +import warnings + +warnings.filterwarnings("ignore") + + +def binary_mask(mask, th=0.1): + mask[mask > th] = 1 + mask[mask <= th] = 0 + return mask + + +# read frame-wise masks +def read_mask(mpath, length, size, flow_mask_dilates=8, mask_dilates=5): + masks_img = [] + masks_dilated = [] + flow_masks = [] + # 如果传入的直接为numpy array + if isinstance(mpath, np.ndarray): + masks_img = [Image.fromarray(mpath)] + # input single img path + else: + if isinstance(mpath, str): + if mpath.endswith(('jpg', 'jpeg', 'png', 'JPG', 'JPEG', 'PNG')): + masks_img = [Image.open(mpath)] + else: + mnames = sorted(os.listdir(mpath)) + for mp in mnames: + masks_img.append(Image.open(os.path.join(mpath, mp))) + + for mask_img in masks_img: + mask_img = np.array(mask_img.convert('L')) + + # Dilate 8 pixel so that all known pixel is trustworthy + if flow_mask_dilates > 0: + flow_mask_img = scipy.ndimage.binary_dilation(mask_img, iterations=flow_mask_dilates).astype(np.uint8) + else: + flow_mask_img = binary_mask(mask_img).astype(np.uint8) + # Close the small holes inside the foreground objects + # flow_mask_img = cv2.morphologyEx(flow_mask_img, cv2.MORPH_CLOSE, np.ones((21, 21),np.uint8)).astype(bool) + # flow_mask_img = scipy.ndimage.binary_fill_holes(flow_mask_img).astype(np.uint8) + flow_masks.append(Image.fromarray(flow_mask_img * 255)) + + if mask_dilates > 0: + mask_img = scipy.ndimage.binary_dilation(mask_img, iterations=mask_dilates).astype(np.uint8) + else: + mask_img = binary_mask(mask_img).astype(np.uint8) + masks_dilated.append(Image.fromarray(mask_img * 255)) + + if len(masks_img) == 1: + flow_masks = flow_masks * length + masks_dilated = masks_dilated * length + + return flow_masks, masks_dilated + + +def extrapolation(video_ori, scale): + """Prepares the data for video outpainting. + """ + nFrame = len(video_ori) + imgW, imgH = video_ori[0].size + + # Defines new FOV. + imgH_extr = int(scale[0] * imgH) + imgW_extr = int(scale[1] * imgW) + imgH_extr = imgH_extr - imgH_extr % 8 + imgW_extr = imgW_extr - imgW_extr % 8 + H_start = int((imgH_extr - imgH) / 2) + W_start = int((imgW_extr - imgW) / 2) + + # Extrapolates the FOV for video. 
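    # Outpainting setup sketched below: each original frame is pasted into the center of the
    # enlarged canvas; flow_masks then cover the new border plus a small rim (dilate_h/dilate_w
    # pixels) of the original content, while masks_dilated cover exactly the border region
    # that has to be synthesized.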
+ frames = [] + for v in video_ori: + frame = np.zeros((imgH_extr, imgW_extr, 3), dtype=np.uint8) + frame[H_start: H_start + imgH, W_start: W_start + imgW, :] = v + frames.append(Image.fromarray(frame)) + + # Generates the mask for missing region. + masks_dilated = [] + flow_masks = [] + + dilate_h = 4 if H_start > 10 else 0 + dilate_w = 4 if W_start > 10 else 0 + mask = np.ones(((imgH_extr, imgW_extr)), dtype=np.uint8) + + mask[H_start + dilate_h: H_start + imgH - dilate_h, + W_start + dilate_w: W_start + imgW - dilate_w] = 0 + flow_masks.append(Image.fromarray(mask * 255)) + + mask[H_start: H_start + imgH, W_start: W_start + imgW] = 0 + masks_dilated.append(Image.fromarray(mask * 255)) + + flow_masks = flow_masks * nFrame + masks_dilated = masks_dilated * nFrame + + return frames, flow_masks, masks_dilated, (imgW_extr, imgH_extr) + + +def get_ref_index(mid_neighbor_id, neighbor_ids, length, ref_stride=10, ref_num=-1): + ref_index = [] + if ref_num == -1: + for i in range(0, length, ref_stride): + if i not in neighbor_ids: + ref_index.append(i) + else: + start_idx = max(0, mid_neighbor_id - ref_stride * (ref_num // 2)) + end_idx = min(length, mid_neighbor_id + ref_stride * (ref_num // 2)) + for i in range(start_idx, end_idx, ref_stride): + if i not in neighbor_ids: + if len(ref_index) > ref_num: + break + ref_index.append(i) + return ref_index + + +class VideoInpaint: + def __init__(self, sub_video_length=config.MAX_PROCESS_NUM, use_fp16=True): + self.device = get_device() + self.use_fp16 = use_fp16 + self.use_half = True if self.use_fp16 else False + if self.device == torch.device('cpu'): + self.use_half = False + # Length of sub-video for long video inference. + self.sub_video_length = sub_video_length + # Length of local neighboring frames.' + self.neighbor_length = 10 + # Mask dilation for video and flow masking + self.mask_dilation = 4 + # Stride of global reference frames + self.ref_stride = 10 + # Iterations for RAFT inference + self.raft_iter = 20 + # Stride of global reference frames + self.ref_stride = 10 + # 设置raft模型 + self.fix_raft = self.init_raft_model() + # 设置fix_flow模型 + self.fix_flow_complete = self.init_fix_flow_model() + # 设置inpaint模型 + self.model = self.init_inpaint_model() + + def init_raft_model(self): + # set up RAFT and flow competition model + return RAFT_bi(os.path.join(config.VIDEO_INPAINT_MODEL_PATH, 'raft-things.pth'), self.device) + + def init_fix_flow_model(self): + fix_flow_complete_model = RecurrentFlowCompleteNet( + os.path.join(config.VIDEO_INPAINT_MODEL_PATH, 'recurrent_flow_completion.pth')) + for p in fix_flow_complete_model.parameters(): + p.requires_grad = False + fix_flow_complete_model.to(self.device) + fix_flow_complete_model.eval() + return fix_flow_complete_model + + def init_inpaint_model(self): + # set up ProPainter model + return InpaintGenerator(model_path=os.path.join(config.VIDEO_INPAINT_MODEL_PATH, 'ProPainter.pth')).to( + self.device).eval() + + def inpaint(self, frames, mask): + if isinstance(frames[0], np.ndarray): + frames = [Image.fromarray(cv2.cvtColor(f, cv2.COLOR_BGR2RGB)) for f in frames] + size = frames[0].size + frames_len = len(frames) + flow_masks, masks_dilated = read_mask(mask, frames_len, size, + flow_mask_dilates=self.mask_dilation, + mask_dilates=self.mask_dilation) + w, h = size + # for saving the masked frames or video + masked_frame_for_save = [] + for i in range(len(frames)): + mask_ = np.expand_dims(np.array(masks_dilated[i]), 2).repeat(3, axis=2) / 255. 
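            # Preview frames: pixels inside the dilated mask are blended with pure green
            # (alpha = 0.6) while pixels outside keep their original values, so the saved
            # masked frames show exactly which region will be inpainted.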
+ img = np.array(frames[i]) + green = np.zeros([h, w, 3]) + green[:, :, 1] = 255 + alpha = 0.6 + # alpha = 1.0 + fuse_img = (1 - alpha) * img + alpha * green + fuse_img = mask_ * fuse_img + (1 - mask_) * img + masked_frame_for_save.append(fuse_img.astype(np.uint8)) + + frames_inp = [np.array(f).astype(np.uint8) for f in frames] + frames = to_tensors()(frames).unsqueeze(0) * 2 - 1 + flow_masks = to_tensors()(flow_masks).unsqueeze(0) + masks_dilated = to_tensors()(masks_dilated).unsqueeze(0) + frames, flow_masks, masks_dilated = frames.to(self.device), flow_masks.to(self.device), masks_dilated.to( + self.device) + video_length = frames.size(1) + with torch.no_grad(): + # ---- compute flow ---- + if frames.size(-1) <= 640: + short_clip_len = 12 + elif frames.size(-1) <= 720: + short_clip_len = 8 + elif frames.size(-1) <= 1280: + short_clip_len = 4 + else: + short_clip_len = 2 + + # use fp32 for RAFT + if frames.size(1) > short_clip_len: + gt_flows_f_list, gt_flows_b_list = [], [] + for f in range(0, video_length, short_clip_len): + end_f = min(video_length, f + short_clip_len) + if f == 0: + flows_f, flows_b = self.fix_raft(frames[:, f:end_f], iters=self.raft_iter) + else: + flows_f, flows_b = self.fix_raft(frames[:, f - 1:end_f], iters=self.raft_iter) + gt_flows_f_list.append(flows_f) + gt_flows_b_list.append(flows_b) + torch.cuda.empty_cache() + gt_flows_f = torch.cat(gt_flows_f_list, dim=1) + gt_flows_b = torch.cat(gt_flows_b_list, dim=1) + gt_flows_bi = (gt_flows_f, gt_flows_b) + else: + gt_flows_bi = self.fix_raft(frames, iters=self.raft_iter) + torch.cuda.empty_cache() + + if self.use_half: + frames, flow_masks, masks_dilated = frames.half(), flow_masks.half(), masks_dilated.half() + gt_flows_bi = (gt_flows_bi[0].half(), gt_flows_bi[1].half()) + fix_flow_complete = self.fix_flow_complete.half() + self.model = self.model.half() + + # ---- complete flow ---- + flow_length = gt_flows_bi[0].size(1) + if flow_length > self.sub_video_length: + pred_flows_f, pred_flows_b = [], [] + pad_len = 5 + for f in range(0, flow_length, self.sub_video_length): + s_f = max(0, f - pad_len) + e_f = min(flow_length, f + self.sub_video_length + pad_len) + pad_len_s = max(0, f) - s_f + pad_len_e = e_f - min(flow_length, f + self.sub_video_length) + pred_flows_bi_sub, _ = fix_flow_complete.forward_bidirect_flow( + (gt_flows_bi[0][:, s_f:e_f], gt_flows_bi[1][:, s_f:e_f]), + flow_masks[:, s_f:e_f + 1]) + pred_flows_bi_sub = fix_flow_complete.combine_flow( + (gt_flows_bi[0][:, s_f:e_f], gt_flows_bi[1][:, s_f:e_f]), + pred_flows_bi_sub, + flow_masks[:, s_f:e_f + 1]) + + pred_flows_f.append(pred_flows_bi_sub[0][:, pad_len_s:e_f - s_f - pad_len_e]) + pred_flows_b.append(pred_flows_bi_sub[1][:, pad_len_s:e_f - s_f - pad_len_e]) + torch.cuda.empty_cache() + + pred_flows_f = torch.cat(pred_flows_f, dim=1) + pred_flows_b = torch.cat(pred_flows_b, dim=1) + pred_flows_bi = (pred_flows_f, pred_flows_b) + else: + pred_flows_bi, _ = fix_flow_complete.forward_bidirect_flow(gt_flows_bi, flow_masks) + pred_flows_bi = fix_flow_complete.combine_flow(gt_flows_bi, pred_flows_bi, flow_masks) + torch.cuda.empty_cache() + + # ---- image propagation ---- + masked_frames = frames * (1 - masks_dilated) + # ensure a minimum of 100 frames for image propagation + subvideo_length_img_prop = min(100, self.sub_video_length) + if video_length > subvideo_length_img_prop: + updated_frames, updated_masks = [], [] + pad_len = 10 + for f in range(0, video_length, subvideo_length_img_prop): + s_f = max(0, f - pad_len) + e_f = min(video_length, f + 
subvideo_length_img_prop + pad_len) + pad_len_s = max(0, f) - s_f + pad_len_e = e_f - min(video_length, f + subvideo_length_img_prop) + + b, t, _, _, _ = masks_dilated[:, s_f:e_f].size() + pred_flows_bi_sub = (pred_flows_bi[0][:, s_f:e_f - 1], pred_flows_bi[1][:, s_f:e_f - 1]) + prop_imgs_sub, updated_local_masks_sub = self.model.img_propagation(masked_frames[:, s_f:e_f], + pred_flows_bi_sub, + masks_dilated[:, s_f:e_f], + 'nearest') + updated_frames_sub = frames[:, s_f:e_f] * (1 - masks_dilated[:, s_f:e_f]) + prop_imgs_sub.view(b, t, 3, h, w) * masks_dilated[:, s_f:e_f] + updated_masks_sub = updated_local_masks_sub.view(b, t, 1, h, w) + updated_frames.append(updated_frames_sub[:, pad_len_s:e_f - s_f - pad_len_e]) + updated_masks.append(updated_masks_sub[:, pad_len_s:e_f - s_f - pad_len_e]) + torch.cuda.empty_cache() + + updated_frames = torch.cat(updated_frames, dim=1) + updated_masks = torch.cat(updated_masks, dim=1) + else: + b, t, _, _, _ = masks_dilated.size() + prop_imgs, updated_local_masks = self.model.img_propagation(masked_frames, pred_flows_bi, masks_dilated, + 'nearest') + updated_frames = frames * (1 - masks_dilated) + prop_imgs.view(b, t, 3, h, w) * masks_dilated + updated_masks = updated_local_masks.view(b, t, 1, h, w) + torch.cuda.empty_cache() + + ori_frames = frames_inp + comp_frames = [None] * video_length + + neighbor_stride = self.neighbor_length // 2 + if video_length > self.sub_video_length: + ref_num = self.sub_video_length // self.ref_stride + else: + ref_num = -1 + + # ---- feature propagation + transformer ---- + for f in range(0, video_length, neighbor_stride): + neighbor_ids = [ + i for i in range(max(0, f - neighbor_stride), + min(video_length, f + neighbor_stride + 1)) + ] + ref_ids = get_ref_index(f, neighbor_ids, video_length, self.ref_stride, ref_num) + selected_imgs = updated_frames[:, neighbor_ids + ref_ids, :, :, :] + selected_masks = masks_dilated[:, neighbor_ids + ref_ids, :, :, :] + selected_update_masks = updated_masks[:, neighbor_ids + ref_ids, :, :, :] + selected_pred_flows_bi = ( + pred_flows_bi[0][:, neighbor_ids[:-1], :, :, :], pred_flows_bi[1][:, neighbor_ids[:-1], :, :, :]) + + with torch.no_grad(): + # 1.0 indicates mask + l_t = len(neighbor_ids) + pred_img = self.model(selected_imgs, selected_pred_flows_bi, selected_masks, selected_update_masks, l_t) + pred_img = pred_img.view(-1, 3, h, w) + pred_img = (pred_img + 1) / 2 + pred_img = pred_img.cpu().permute(0, 2, 3, 1).numpy() * 255 + binary_masks = masks_dilated[0, neighbor_ids, :, :, :].cpu().permute( + 0, 2, 3, 1).numpy().astype(np.uint8) + for i in range(len(neighbor_ids)): + idx = neighbor_ids[i] + img = np.array(pred_img[i]).astype(np.uint8) * binary_masks[i] \ + + ori_frames[idx] * (1 - binary_masks[i]) + if comp_frames[idx] is None: + comp_frames[idx] = img + else: + comp_frames[idx] = comp_frames[idx].astype(np.float32) * 0.5 + img.astype(np.float32) * 0.5 + comp_frames[idx] = comp_frames[idx].astype(np.uint8) + torch.cuda.empty_cache() + # save videos frame + comp_frames = [cv2.cvtColor(i, cv2.COLOR_RGB2BGR) for i in comp_frames] + return comp_frames + + +def read_frames(v_path): + video_cap = cv2.VideoCapture(v_path) + video_frames = [] + while True: + ret, frame = video_cap.read() + if not ret: + break + video_frames.append(frame) + video_frames = [Image.fromarray(f) for f in video_frames] + return video_frames + + +if __name__ == '__main__': + # VideoInpaint + video_inpaint = VideoInpaint(sub_video_length=80) + frames = 
read_frames('/home/yao/Documents/Project/video-subtitle-remover/local_test/test1.mp4') + mask = cv2.imread('/home/yao/Documents/Project/video-subtitle-remover/local_test/test1_mask.png') + inpainted_frames = video_inpaint.inpaint(frames, mask) + save_root = '/home/yao/Documents/Project/video-subtitle-remover/local_test/' + video_out_path = os.path.join(save_root, 'inpaint_out.mp4') + print("size: ", inpainted_frames[0].shape) + video_writer = cv2.VideoWriter(video_out_path, cv2.VideoWriter_fourcc(*'mp4v'), 24, (640, 360)) + for comp_frame in inpainted_frames: + video_writer.write(comp_frame) + video_writer.release() + print(f'\nAll results are saved in {save_root}') + diff --git a/backend/main.py b/backend/main.py new file mode 100644 index 0000000..b0f7213 --- /dev/null +++ b/backend/main.py @@ -0,0 +1,684 @@ +import shutil +import subprocess +import os +from pathlib import Path +import threading +import cv2 +import sys +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from backend.inpaint.lama_inpaint import LamaInpaint +from backend.inpaint.video_inpaint import VideoInpaint +from backend.tools.inpaint_tools import create_mask, batch_generator +import config +import importlib +import platform +import tempfile +import torch +import multiprocessing +from shapely.geometry import Polygon +import time +from tqdm import tqdm +from tools.infer import utility +from tools.infer.predict_det import TextDetector + + +class SubtitleDetect: + """ + 文本框检测类,用于检测视频帧中是否存在文本框 + """ + + def __init__(self, video_path, sub_area=None): + # 获取参数对象 + importlib.reload(config) + args = utility.parse_args() + args.det_algorithm = 'DB' + args.det_model_dir = config.DET_MODEL_PATH + self.text_detector = TextDetector(args) + self.video_path = video_path + self.sub_area = sub_area + + def detect_subtitle(self, img): + dt_boxes, elapse = self.text_detector(img) + return dt_boxes, elapse + + @staticmethod + def get_coordinates(dt_box): + """ + 从返回的检测框中获取坐标 + :param dt_box 检测框返回结果 + :return list 坐标点列表 + """ + coordinate_list = list() + if isinstance(dt_box, list): + for i in dt_box: + i = list(i) + (x1, y1) = int(i[0][0]), int(i[0][1]) + (x2, y2) = int(i[1][0]), int(i[1][1]) + (x3, y3) = int(i[2][0]), int(i[2][1]) + (x4, y4) = int(i[3][0]), int(i[3][1]) + xmin = max(x1, x4) + xmax = min(x2, x3) + ymin = max(y1, y2) + ymax = min(y3, y4) + coordinate_list.append((xmin, xmax, ymin, ymax)) + return coordinate_list + + def find_subtitle_frame_no(self, sub_remover=None): + video_cap = cv2.VideoCapture(self.video_path) + frame_count = video_cap.get(cv2.CAP_PROP_FRAME_COUNT) + tbar = tqdm(total=int(frame_count), unit='frame', position=0, file=sys.__stdout__, desc='Subtitle Finding') + current_frame_no = 0 + subtitle_frame_no_box_dict = {} + print('[Processing] start finding subtitles...') + while video_cap.isOpened(): + ret, frame = video_cap.read() + # 如果读取视频帧失败(视频读到最后一帧) + if not ret: + break + # 读取视频帧成功 + current_frame_no += 1 + dt_boxes, elapse = self.detect_subtitle(frame) + coordinate_list = self.get_coordinates(dt_boxes.tolist()) + if coordinate_list: + temp_list = [] + for coordinate in coordinate_list: + xmin, xmax, ymin, ymax = coordinate + if self.sub_area is not None: + s_ymin, s_ymax, s_xmin, s_xmax = self.sub_area + if (s_xmin <= xmin and xmax <= s_xmax + and s_ymin <= ymin + and ymax <= s_ymax): + temp_list.append((xmin, xmax, ymin, ymax)) + else: + temp_list.append((xmin, xmax, ymin, ymax)) + if len(temp_list) > 0: + 
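                        # Only boxes that survive the optional sub_area filter reach temp_list;
                        # frames without any qualifying text box are simply not recorded.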
subtitle_frame_no_box_dict[current_frame_no] = temp_list + tbar.update(1) + if sub_remover: + sub_remover.progress_total = (100 * float(current_frame_no) / float(frame_count)) // 2 + subtitle_frame_no_box_dict = self.unify_regions(subtitle_frame_no_box_dict) + # if config.UNITE_COORDINATES: + # subtitle_frame_no_box_dict = self.get_subtitle_frame_no_box_dict_with_united_coordinates(subtitle_frame_no_box_dict) + # if sub_remover is not None: + # try: + # # 当帧数大于1时,说明并非图片或单帧 + # if sub_remover.frame_count > 1: + # subtitle_frame_no_box_dict = self.filter_mistake_sub_area(subtitle_frame_no_box_dict, + # sub_remover.fps) + # except Exception: + # pass + # subtitle_frame_no_box_dict = self.prevent_missed_detection(subtitle_frame_no_box_dict) + print('[Finished] Finished finding subtitles...') + new_subtitle_frame_no_box_dict = dict() + for key in subtitle_frame_no_box_dict.keys(): + if len(subtitle_frame_no_box_dict[key]) > 0: + new_subtitle_frame_no_box_dict[key] = subtitle_frame_no_box_dict[key] + return new_subtitle_frame_no_box_dict + + @staticmethod + def are_similar(region1, region2): + """判断两个区域是否相似。""" + xmin1, xmax1, ymin1, ymax1 = region1 + xmin2, xmax2, ymin2, ymax2 = region2 + + return abs(xmin1 - xmin2) <= config.PIXEL_TOLERANCE_X and abs(xmax1 - xmax2) <= config.PIXEL_TOLERANCE_X and \ + abs(ymin1 - ymin2) <= config.PIXEL_TOLERANCE_Y and abs(ymax1 - ymax2) <= config.PIXEL_TOLERANCE_Y + + def unify_regions(self, raw_regions): + """将连续相似的区域统一,保持列表结构。""" + keys = sorted(raw_regions.keys()) # 对键进行排序以确保它们是连续的 + unified_regions = {} + + # 初始化 + last_key = keys[0] + unify_value_map = {last_key: raw_regions[last_key]} + + for key in keys[1:]: + current_regions = raw_regions[key] + + # 新增一个列表来存放匹配过的标准区间 + new_unify_values = [] + + for idx, region in enumerate(current_regions): + last_standard_region = unify_value_map[last_key][idx] if idx < len(unify_value_map[last_key]) else None + + # 如果当前的区间与前一个键的对应区间相似,我们统一它们 + if last_standard_region and self.are_similar(region, last_standard_region): + new_unify_values.append(last_standard_region) + else: + new_unify_values.append(region) + + # 更新unify_value_map为最新的区间值 + unify_value_map[key] = new_unify_values + last_key = key + + # 将最终统一后的结果传递给unified_regions + for key in keys: + unified_regions[key] = unify_value_map[key] + return unified_regions + + @staticmethod + def find_continuous_ranges(subtitle_frame_no_box_dict): + """ + 获取字幕出现的起始帧号与结束帧号 + """ + numbers = sorted(list(subtitle_frame_no_box_dict.keys())) + ranges = [] + start = numbers[0] # 初始区间开始值 + + for i in range(1, len(numbers)): + # 如果当前数字与前一个数字间隔超过1, + # 则上一个区间结束,记录当前区间的开始与结束 + if numbers[i] - numbers[i - 1] != 1: + end = numbers[i - 1] # 则该数字是当前连续区间的终点 + ranges.append((start, end)) + start = numbers[i] # 开始下一个连续区间 + # 添加最后一个区间 + ranges.append((start, numbers[-1])) + return ranges + + @staticmethod + def find_continuous_ranges_with_same_mask(subtitle_frame_no_box_dict): + numbers = sorted(list(subtitle_frame_no_box_dict.keys())) + ranges = [] + start = numbers[0] # 初始区间开始值 + for i in range(1, len(numbers)): + # 如果当前帧号与前一个帧号间隔超过1, + # 则上一个区间结束,记录当前区间的开始与结束 + if numbers[i] - numbers[i - 1] != 1: + end = numbers[i - 1] # 则该数字是当前连续区间的终点 + ranges.append((start, end)) + start = numbers[i] # 开始下一个连续区间 + # 如果当前帧号与前一个帧号间隔为1,且当前帧号对应的坐标点与上一帧号对应的坐标点不一致 + # 记录当前区间的开始与结束 + if numbers[i] - numbers[i - 1] == 1: + if subtitle_frame_no_box_dict[numbers[i]] != subtitle_frame_no_box_dict[numbers[i - 1]]: + end = numbers[i - 1] # 则该数字是当前连续区间的终点 + ranges.append((start, end)) + start = numbers[i] # 开始下一个连续区间 
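            # Illustrative result (hypothetical frame numbers): identical boxes on frames 3-5,
            # a different box on frames 6-8 and a lone box on frame 12 give
            # [(3, 5), (6, 8), (12, 12)] -- runs are split both by frame gaps and by mask changes.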
+ # 添加最后一个区间 + ranges.append((start, numbers[-1])) + return ranges + + @staticmethod + def sub_area_to_polygon(sub_area): + """ + xmin, xmax, ymin, ymax = sub_area + """ + s_xmin = sub_area[0] + s_xmax = sub_area[1] + s_ymin = sub_area[2] + s_ymax = sub_area[3] + return Polygon([[s_xmin, s_ymin], [s_xmax, s_ymin], [s_xmax, s_ymax], [s_xmin, s_ymax]]) + + def compute_iou(self, box1, box2): + box1_polygon = self.sub_area_to_polygon(box1) + box2_polygon = self.sub_area_to_polygon(box2) + intersection = box1_polygon.intersection(box2_polygon) + if intersection.is_empty: + return -1 + else: + union_area = (box1_polygon.area + box2_polygon.area - intersection.area) + if union_area > 0: + intersection_area_rate = intersection.area / union_area + else: + intersection_area_rate = 0 + return intersection_area_rate + + def get_area_max_box_dict(self, sub_frame_no_list_continuous, subtitle_frame_no_box_dict): + _area_max_box_dict = dict() + for start_no, end_no in sub_frame_no_list_continuous: + # 寻找面积最大文本框 + current_no = start_no + # 查找当前区间矩形框最大面积 + area_max_box_list = [] + while current_no <= end_no: + for coord in subtitle_frame_no_box_dict[current_no]: + # 取出每一个文本框坐标 + xmin, xmax, ymin, ymax = coord + # 计算当前文本框坐标面积 + current_area = abs(xmax - xmin) * abs(ymax - ymin) + # 如果区间最大框列表为空,则当前面积为区间最大面积 + if len(area_max_box_list) < 1: + area_max_box_list.append({ + 'area': current_area, + 'xmin': xmin, + 'xmax': xmax, + 'ymin': ymin, + 'ymax': ymax + }) + # 如果列表非空,判断当前文本框是与区间最大文本框在同一区域 + else: + has_same_position = False + # 遍历每个区间最大文本框,判断当前文本框位置是否与区间最大文本框列表的某个文本框位于同一行且交叉 + for area_max_box in area_max_box_list: + if (area_max_box['ymin'] - config.TOLERANCE_Y <= ymin + and ymax <= area_max_box['ymax'] + config.TOLERANCE_Y): + if self.compute_iou((xmin, xmax, ymin, ymax), ( + area_max_box['xmin'], area_max_box['xmax'], area_max_box['ymin'], + area_max_box['ymax'])) != -1: + # 如果高度差异不一样 + if abs(abs(area_max_box['ymax'] - area_max_box['ymin']) - abs( + ymax - ymin)) < config.THRESHOLD_HEIGHT_DIFFERENCE: + has_same_position = True + # 如果在同一行,则计算当前面积是不是最大 + # 判断面积大小,若当前面积更大,则将当前行的最大区域坐标点更新 + if has_same_position and current_area > area_max_box['area']: + area_max_box['area'] = current_area + area_max_box['xmin'] = xmin + area_max_box['xmax'] = xmax + area_max_box['ymin'] = ymin + area_max_box['ymax'] = ymax + # 如果遍历了所有的区间最大文本框列表,发现是新的一行,则直接添加 + if not has_same_position: + new_large_area = { + 'area': current_area, + 'xmin': xmin, + 'xmax': xmax, + 'ymin': ymin, + 'ymax': ymax + } + if new_large_area not in area_max_box_list: + area_max_box_list.append(new_large_area) + break + current_no += 1 + _area_max_box_list = list() + for area_max_box in area_max_box_list: + if area_max_box not in _area_max_box_list: + _area_max_box_list.append(area_max_box) + _area_max_box_dict[f'{start_no}->{end_no}'] = _area_max_box_list + return _area_max_box_dict + + def get_subtitle_frame_no_box_dict_with_united_coordinates(self, subtitle_frame_no_box_dict): + """ + 将多个视频帧的文本区域坐标统一 + """ + subtitle_frame_no_box_dict_with_united_coordinates = dict() + frame_no_list = self.find_continuous_ranges_with_same_mask(subtitle_frame_no_box_dict) + area_max_box_dict = self.get_area_max_box_dict(frame_no_list, subtitle_frame_no_box_dict) + for start_no, end_no in frame_no_list: + current_no = start_no + while True: + area_max_box_list = area_max_box_dict[f'{start_no}->{end_no}'] + current_boxes = subtitle_frame_no_box_dict[current_no] + new_subtitle_frame_no_box_list = [] + for current_box in current_boxes: + current_xmin, current_xmax, 
current_ymin, current_ymax = current_box + for max_box in area_max_box_list: + large_xmin = max_box['xmin'] + large_xmax = max_box['xmax'] + large_ymin = max_box['ymin'] + large_ymax = max_box['ymax'] + box1 = (current_xmin, current_xmax, current_ymin, current_ymax) + box2 = (large_xmin, large_xmax, large_ymin, large_ymax) + res = self.compute_iou(box1, box2) + if res != -1: + new_subtitle_frame_no_box = (large_xmin, large_xmax, large_ymin, large_ymax) + if new_subtitle_frame_no_box not in new_subtitle_frame_no_box_list: + new_subtitle_frame_no_box_list.append(new_subtitle_frame_no_box) + subtitle_frame_no_box_dict_with_united_coordinates[current_no] = new_subtitle_frame_no_box_list + current_no += 1 + if current_no > end_no: + break + return subtitle_frame_no_box_dict_with_united_coordinates + + def prevent_missed_detection(self, subtitle_frame_no_box_dict): + """ + 添加额外的文本框,防止漏检 + """ + frame_no_list = self.find_continuous_ranges_with_same_mask(subtitle_frame_no_box_dict) + for start_no, end_no in frame_no_list: + current_no = start_no + while True: + current_box_list = subtitle_frame_no_box_dict[current_no] + if current_no + 1 != end_no and (current_no + 1) in subtitle_frame_no_box_dict.keys(): + next_box_list = subtitle_frame_no_box_dict[current_no + 1] + if set(current_box_list).issubset(set(next_box_list)): + subtitle_frame_no_box_dict[current_no] = subtitle_frame_no_box_dict[current_no + 1] + current_no += 1 + if current_no > end_no: + break + return subtitle_frame_no_box_dict + + @staticmethod + def get_frequency_in_range(sub_frame_no_list_continuous, subtitle_frame_no_box_dict): + sub_area_with_frequency = {} + for start_no, end_no in sub_frame_no_list_continuous: + current_no = start_no + while True: + current_box_list = subtitle_frame_no_box_dict[current_no] + for current_box in current_box_list: + if str(current_box) not in sub_area_with_frequency.keys(): + sub_area_with_frequency[f'{current_box}'] = 1 + else: + sub_area_with_frequency[f'{current_box}'] += 1 + current_no += 1 + if current_no > end_no: + break + return sub_area_with_frequency + + def filter_mistake_sub_area(self, subtitle_frame_no_box_dict, fps): + """ + 过滤错误的字幕区域 + """ + sub_frame_no_list_continuous = self.find_continuous_ranges_with_same_mask(subtitle_frame_no_box_dict) + sub_area_with_frequency = self.get_frequency_in_range(sub_frame_no_list_continuous, subtitle_frame_no_box_dict) + correct_sub_area = [] + for sub_area in sub_area_with_frequency.keys(): + if sub_area_with_frequency[sub_area] >= (fps // 2): + correct_sub_area.append(sub_area) + else: + print(f'drop {sub_area}') + correct_subtitle_frame_no_box_dict = dict() + for frame_no in subtitle_frame_no_box_dict.keys(): + current_box_list = subtitle_frame_no_box_dict[frame_no] + new_box_list = [] + for current_box in current_box_list: + if str(current_box) in correct_sub_area and current_box not in new_box_list: + new_box_list.append(current_box) + correct_subtitle_frame_no_box_dict[frame_no] = new_box_list + return correct_subtitle_frame_no_box_dict + + +class SubtitleRemover: + def __init__(self, vd_path, sub_area=None): + importlib.reload(config) + # 线程锁 + self.lock = threading.RLock() + # 用户指定的字幕区域位置 + self.sub_area = sub_area + # 判断是否为图片 + self.is_picture = False + if str(vd_path).endswith(('.bmp', '.dib', '.png', '.jpg', '.jpeg', '.pbm', '.pgm', '.ppm', '.tif', '.tiff')): + self.sub_area = None + self.is_picture = True + # 视频路径 + self.video_path = vd_path + self.video_cap = cv2.VideoCapture(vd_path) + # 通过视频路径获取视频名称 + self.vd_name = 
Path(self.video_path).stem + # 视频帧总数 + self.frame_count = self.video_cap.get(cv2.CAP_PROP_FRAME_COUNT) + # 视频帧率 + self.fps = self.video_cap.get(cv2.CAP_PROP_FPS) + # 视频尺寸 + self.size = (int(self.video_cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(self.video_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))) + self.mask_size = (int(self.video_cap.get(cv2.CAP_PROP_FRAME_HEIGHT)), int(self.video_cap.get(cv2.CAP_PROP_FRAME_WIDTH))) + self.frame_height = int(self.video_cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + self.frame_width = int(self.video_cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + # 创建字幕检测对象 + self.sub_detector = SubtitleDetect(self.video_path, self.sub_area) + # 创建视频临时对象,windows下delete=True会有permission denied的报错 + self.video_temp_file = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) + # 创建视频写对象 + self.video_writer = cv2.VideoWriter(self.video_temp_file.name, cv2.VideoWriter_fourcc(*'mp4v'), self.fps, + self.size) + self.video_out_name = os.path.join(os.path.dirname(self.video_path), f'{self.vd_name}_no_sub.mp4') + self.video_inpaint = None + self.lama_inpaint = None + self.ext = os.path.splitext(vd_path)[-1] + if self.is_picture: + pic_dir = os.path.join(os.path.dirname(self.video_path), 'no_sub') + if not os.path.exists(pic_dir): + os.makedirs(pic_dir) + self.video_out_name = os.path.join(pic_dir, f'{self.vd_name}{self.ext}') + if torch.cuda.is_available(): + print('use GPU for acceleration') + # 总处理进度 + self.progress_total = 0 + self.progress_remover = 0 + self.isFinished = False + # 预览帧 + self.preview_frame = None + # 是否将原音频嵌入到去除字幕后的视频 + self.is_successful_merged = False + + @staticmethod + def get_coordinates(dt_box): + """ + 从返回的检测框中获取坐标 + :param dt_box 检测框返回结果 + :return list 坐标点列表 + """ + coordinate_list = list() + if isinstance(dt_box, list): + for i in dt_box: + i = list(i) + (x1, y1) = int(i[0][0]), int(i[0][1]) + (x2, y2) = int(i[1][0]), int(i[1][1]) + (x3, y3) = int(i[2][0]), int(i[2][1]) + (x4, y4) = int(i[3][0]), int(i[3][1]) + xmin = max(x1, x4) + xmax = min(x2, x3) + ymin = max(y1, y2) + ymax = min(y3, y4) + coordinate_list.append((xmin, xmax, ymin, ymax)) + return coordinate_list + + @staticmethod + def is_current_frame_no_start(frame_no, continuous_frame_no_list): + """ + 判断给定的帧号是否为开头,是的话返回结束帧号,不是的话返回-1 + """ + for start_no, end_no in continuous_frame_no_list: + if start_no == frame_no: + return True + return False + + @staticmethod + def find_frame_no_end(frame_no, continuous_frame_no_list): + """ + 判断给定的帧号是否为开头,是的话返回结束帧号,不是的话返回-1 + """ + for start_no, end_no in continuous_frame_no_list: + if start_no <= frame_no <= end_no: + return end_no + return -1 + + def update_progress(self, tbar, increment): + tbar.update(increment) + current_percentage = (tbar.n / tbar.total) * 100 + self.progress_remover = int(current_percentage) // 2 + self.progress_total = 50 + self.progress_remover + + def run(self): + + # 记录开始时间 + start_time = time.time() + # 寻找字幕帧 + self.progress_total = 0 + sub_list = self.sub_detector.find_subtitle_frame_no(sub_remover=self) + # 测试代码 + # from test1_dict_raw import test1_raw + # sub_list = self.sub_detector.unify_regions(test1_raw) + continuous_frame_no_list = self.sub_detector.find_continuous_ranges_with_same_mask(sub_list) + tbar = tqdm(total=int(self.frame_count), unit='frame', position=0, file=sys.__stdout__, + desc='Subtitle Removing') + print('[Processing] start removing subtitles...') + + if self.is_picture: + self.lama_inpaint = LamaInpaint() + original_frame = cv2.imread(self.video_path) + mask = create_mask(original_frame.shape[0:2], sub_list[1]) + frame = 
self.lama_inpaint(original_frame, mask) + cv2.imencode(self.ext, frame)[1].tofile(self.video_out_name) + self.preview_frame = cv2.hconcat([original_frame, frame]) + tbar.update(1) + self.progress_total = 100 + else: + if config.ACCURATE_MODE: + # *********************** 批推理方案 start *********************** + print('use accurate mode') + self.video_inpaint = VideoInpaint(config.MAX_PROCESS_NUM) + index = 0 + while True: + ret, frame = self.video_cap.read() + if not ret: + break + index += 1 + # 如果当前帧没有水印/文本则直接写 + if index not in sub_list.keys(): + self.video_writer.write(frame) + print(f'write frame: {index}') + self.update_progress(tbar, increment=1) + continue + # 如果有水印,判断该帧是不是开头帧 + else: + # 如果是开头帧,则批推理到尾帧 + if self.is_current_frame_no_start(index, continuous_frame_no_list): + # print(f'No 1 Current index: {index}') + start_frame_no = index + print(f'find start: {start_frame_no}') + # 找到结束帧 + end_frame_no = self.find_frame_no_end(index, continuous_frame_no_list) + # 判断当前帧号是不是字幕起始位置 + # 如果获取的结束帧号不为-1则说明 + if end_frame_no != -1: + print(f'find end: {end_frame_no}') + # ************ 读取该区间所有帧 start ************ + temp_frames = list() + # 将头帧加入处理列表 + temp_frames.append(frame) + inner_index = 0 + # 一直读取到尾帧 + while index < end_frame_no: + ret, frame = self.video_cap.read() + if not ret: + break + index += 1 + temp_frames.append(frame) + # ************ 读取该区间所有帧 end ************ + if len(temp_frames) < 1: + # 没有待处理,直接跳过 + continue + elif len(temp_frames) == 1: + inner_index += 1 + single_mask = create_mask(self.mask_size, sub_list[index]) + if self.lama_inpaint is None: + self.lama_inpaint = LamaInpaint() + inpainted_frame = self.lama_inpaint(frame, single_mask) + self.video_writer.write(inpainted_frame) + print(f'write frame: {start_frame_no + inner_index} with mask {sub_list[start_frame_no]}') + self.update_progress(tbar, increment=1) + continue + else: + # 将读取的视频帧分批处理 + # 1. 获取当前批次使用的mask + mask = create_mask(self.mask_size, sub_list[start_frame_no]) + for batch in batch_generator(temp_frames, config.MAX_LOAD_NUM): + # 2. 
调用批推理 + if len(batch) == 1: + single_mask = create_mask(self.mask_size, sub_list[start_frame_no]) + if self.lama_inpaint is None: + self.lama_inpaint = LamaInpaint() + inpainted_frame = self.lama_inpaint(frame, single_mask) + self.video_writer.write(inpainted_frame) + print(f'write frame: {start_frame_no + inner_index} with mask {sub_list[start_frame_no]}') + inner_index += 1 + self.update_progress(tbar, increment=1) + elif len(batch) > 1: + inpainted_frames = self.video_inpaint.inpaint(batch, mask) + for i, inpainted_frame in enumerate(inpainted_frames): + self.video_writer.write(inpainted_frame) + print(f'write frame: {start_frame_no + inner_index} with mask {sub_list[index]}') + inner_index += 1 + self.preview_frame = cv2.hconcat([batch[i], inpainted_frame]) + self.update_progress(tbar, increment=len(batch)) + # *********************** 批推理方案 end *********************** + else: + # *********************** 单线程方案 start *********************** + print('use normal mode') + if self.lama_inpaint is None: + self.lama_inpaint = LamaInpaint() + index = 0 + while True: + ret, frame = self.video_cap.read() + if not ret: + break + original_frame = frame + index += 1 + if index in sub_list.keys(): + mask = create_mask(self.mask_size, sub_list[index]) + if config.FAST_MODE: + frame = cv2.inpaint(frame, mask, 3, cv2.INPAINT_TELEA) + else: + frame = self.lama_inpaint(frame, mask) + self.preview_frame = cv2.hconcat([original_frame, frame]) + if self.is_picture: + cv2.imencode(self.ext, frame)[1].tofile(self.video_out_name) + else: + self.video_writer.write(frame) + tbar.update(1) + self.progress_remover = 100 * float(index) / float(self.frame_count) // 2 + self.progress_total = 50 + self.progress_remover + # *********************** 单线程方案 end *********************** + self.video_cap.release() + self.video_writer.release() + if not self.is_picture: + # 将原音频合并到新生成的视频文件中 + self.merge_audio_to_video() + print(f"[Finished]Subtitle successfully removed, video generated at:{self.video_out_name}") + else: + print(f"[Finished]Subtitle successfully removed, picture generated at:{self.video_out_name}") + print(f'time cost: {round(time.time() - start_time, 2)}s') + self.isFinished = True + self.progress_total = 100 + if os.path.exists(self.video_temp_file.name): + try: + os.remove(self.video_temp_file.name) + except Exception: + if platform.system() in ['Windows']: + pass + else: + print(f'failed to delete temp file {self.video_temp_file.name}') + + def merge_audio_to_video(self): + # 创建音频临时对象,windows下delete=True会有permission denied的报错 + temp = tempfile.NamedTemporaryFile(suffix='.aac', delete=False) + audio_extract_command = [config.FFMPEG_PATH, + "-y", "-i", self.video_path, + "-acodec", "copy", + "-vn", "-loglevel", "error", temp.name] + use_shell = True if os.name == "nt" else False + try: + subprocess.check_output(audio_extract_command, stdin=open(os.devnull), shell=use_shell) + except Exception: + print('fail to extract audio') + return + else: + if os.path.exists(self.video_temp_file.name): + audio_merge_command = [config.FFMPEG_PATH, + "-y", "-i", self.video_temp_file.name, + "-i", temp.name, + "-vcodec", "copy", + "-acodec", "copy", + "-loglevel", "error", self.video_out_name] + try: + subprocess.check_output(audio_merge_command, stdin=open(os.devnull), shell=use_shell) + except Exception: + print('fail to merge audio') + return + if os.path.exists(temp.name): + try: + os.remove(temp.name) + except Exception: + print(f'failed to delete temp file {temp.name}') + self.is_successful_merged = True + finally: + 
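            # The temporary AAC file is always closed here; if merging failed at any point,
            # is_successful_merged stays False and the silent temp video is copied to the
            # output path as a fallback below.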
temp.close() + if not self.is_successful_merged: + try: + shutil.copy2(self.video_temp_file.name, self.video_out_name) + except IOError as e: + print("Unable to copy file. %s" % e) + self.video_temp_file.close() + + +if __name__ == '__main__': + multiprocessing.set_start_method("spawn") + # 提示用户输入视频路径 + video_path = input(f"Please input video file path: ").strip() + # 新建字幕提取对象 + sd = SubtitleRemover(video_path) + sd.run() diff --git a/backend/models/V4/ch_det/fs_manifest.csv b/backend/models/V4/ch_det/fs_manifest.csv new file mode 100644 index 0000000..9cc6786 --- /dev/null +++ b/backend/models/V4/ch_det/fs_manifest.csv @@ -0,0 +1,4 @@ +filename,filesize,encoding,header +inference_1.pdiparams,50000000,, +inference_2.pdiparams,50000000,, +inference_3.pdiparams,13295054,, diff --git a/backend/models/V4/ch_det/inference.pdiparams.info b/backend/models/V4/ch_det/inference.pdiparams.info new file mode 100644 index 0000000..272488f Binary files /dev/null and b/backend/models/V4/ch_det/inference.pdiparams.info differ diff --git a/backend/models/V4/ch_det/inference.pdmodel b/backend/models/V4/ch_det/inference.pdmodel new file mode 100644 index 0000000..1579721 Binary files /dev/null and b/backend/models/V4/ch_det/inference.pdmodel differ diff --git a/backend/models/V4/ch_det/inference_1.pdiparams b/backend/models/V4/ch_det/inference_1.pdiparams new file mode 100644 index 0000000..322c93d Binary files /dev/null and b/backend/models/V4/ch_det/inference_1.pdiparams differ diff --git a/backend/models/V4/ch_det/inference_2.pdiparams b/backend/models/V4/ch_det/inference_2.pdiparams new file mode 100644 index 0000000..a3aa060 Binary files /dev/null and b/backend/models/V4/ch_det/inference_2.pdiparams differ diff --git a/backend/models/V4/ch_det/inference_3.pdiparams b/backend/models/V4/ch_det/inference_3.pdiparams new file mode 100644 index 0000000..030d488 Binary files /dev/null and b/backend/models/V4/ch_det/inference_3.pdiparams differ diff --git a/backend/models/V4/ch_det_fast/inference.pdiparams b/backend/models/V4/ch_det_fast/inference.pdiparams new file mode 100644 index 0000000..089594a Binary files /dev/null and b/backend/models/V4/ch_det_fast/inference.pdiparams differ diff --git a/backend/models/V4/ch_det_fast/inference.pdiparams.info b/backend/models/V4/ch_det_fast/inference.pdiparams.info new file mode 100644 index 0000000..082c148 Binary files /dev/null and b/backend/models/V4/ch_det_fast/inference.pdiparams.info differ diff --git a/backend/models/V4/ch_det_fast/inference.pdmodel b/backend/models/V4/ch_det_fast/inference.pdmodel new file mode 100644 index 0000000..223b861 Binary files /dev/null and b/backend/models/V4/ch_det_fast/inference.pdmodel differ diff --git a/backend/models/big-lama/big-lama_1.pt b/backend/models/big-lama/big-lama_1.pt new file mode 100644 index 0000000..0085190 Binary files /dev/null and b/backend/models/big-lama/big-lama_1.pt differ diff --git a/backend/models/big-lama/big-lama_2.pt b/backend/models/big-lama/big-lama_2.pt new file mode 100644 index 0000000..ca8e0ad Binary files /dev/null and b/backend/models/big-lama/big-lama_2.pt differ diff --git a/backend/models/big-lama/big-lama_3.pt b/backend/models/big-lama/big-lama_3.pt new file mode 100644 index 0000000..063c1fb Binary files /dev/null and b/backend/models/big-lama/big-lama_3.pt differ diff --git a/backend/models/big-lama/big-lama_4.pt b/backend/models/big-lama/big-lama_4.pt new file mode 100644 index 0000000..7e8fdcc Binary files /dev/null and b/backend/models/big-lama/big-lama_4.pt differ diff --git 
a/backend/models/big-lama/big-lama_5.pt b/backend/models/big-lama/big-lama_5.pt new file mode 100644 index 0000000..9198d0b Binary files /dev/null and b/backend/models/big-lama/big-lama_5.pt differ diff --git a/backend/models/big-lama/fs_manifest.csv b/backend/models/big-lama/fs_manifest.csv new file mode 100644 index 0000000..593582e --- /dev/null +++ b/backend/models/big-lama/fs_manifest.csv @@ -0,0 +1,6 @@ +filename,filesize,encoding,header +big-lama_1.pt,50000000,, +big-lama_2.pt,50000000,, +big-lama_3.pt,50000000,, +big-lama_4.pt,50000000,, +big-lama_5.pt,5803670,, diff --git a/backend/models/video/ProPainter_1.pth b/backend/models/video/ProPainter_1.pth new file mode 100644 index 0000000..0a85ad6 Binary files /dev/null and b/backend/models/video/ProPainter_1.pth differ diff --git a/backend/models/video/ProPainter_2.pth b/backend/models/video/ProPainter_2.pth new file mode 100644 index 0000000..948aebc Binary files /dev/null and b/backend/models/video/ProPainter_2.pth differ diff --git a/backend/models/video/ProPainter_3.pth b/backend/models/video/ProPainter_3.pth new file mode 100644 index 0000000..cc3586e Binary files /dev/null and b/backend/models/video/ProPainter_3.pth differ diff --git a/backend/models/video/ProPainter_4.pth b/backend/models/video/ProPainter_4.pth new file mode 100644 index 0000000..aff41a0 Binary files /dev/null and b/backend/models/video/ProPainter_4.pth differ diff --git a/backend/models/video/fs_manifest.csv b/backend/models/video/fs_manifest.csv new file mode 100644 index 0000000..3583bcc --- /dev/null +++ b/backend/models/video/fs_manifest.csv @@ -0,0 +1,5 @@ +filename,filesize,encoding,header +ProPainter_1.pth,50000000,, +ProPainter_2.pth,50000000,, +ProPainter_3.pth,50000000,, +ProPainter_4.pth,7780510,, diff --git a/backend/models/video/raft-things.pth b/backend/models/video/raft-things.pth new file mode 100644 index 0000000..dbe6f9f Binary files /dev/null and b/backend/models/video/raft-things.pth differ diff --git a/backend/models/video/recurrent_flow_completion.pth b/backend/models/video/recurrent_flow_completion.pth new file mode 100644 index 0000000..28d11ea Binary files /dev/null and b/backend/models/video/recurrent_flow_completion.pth differ diff --git a/backend/ppocr/__init__.py b/backend/ppocr/__init__.py new file mode 100755 index 0000000..e438e53 --- /dev/null +++ b/backend/ppocr/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import warnings +warnings.filterwarnings("ignore", category=Warning) +warnings.filterwarnings("ignore", category=DeprecationWarning) diff --git a/backend/ppocr/data/__init__.py b/backend/ppocr/data/__init__.py new file mode 100644 index 0000000..78c3279 --- /dev/null +++ b/backend/ppocr/data/__init__.py @@ -0,0 +1,109 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
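# Note on the fs_manifest.csv files above: each manifest appears to describe a large model
# file split into ~50 MB chunks (columns: filename, filesize, encoding, header), presumably
# to stay under hosting size limits. The code that re-joins the chunks is not shown in this
# excerpt, so the sketch below is only an assumed illustration of how such a manifest could
# be used; merge_chunks() and the example paths are hypothetical, not this repo's mechanism.
import csv
import os

def merge_chunks(model_dir, out_name):
    """Concatenate the chunk files listed in fs_manifest.csv into a single weights file."""
    manifest = os.path.join(model_dir, 'fs_manifest.csv')
    with open(manifest, newline='') as f:
        rows = list(csv.DictReader(f))
    with open(os.path.join(model_dir, out_name), 'wb') as merged:
        for row in rows:
            with open(os.path.join(model_dir, row['filename']), 'rb') as chunk:
                merged.write(chunk.read())

# merge_chunks('backend/models/big-lama', 'big-lama.pt')  # hypothetical usage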
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import os +import sys +import numpy as np +import skimage +import paddle +import signal +import random + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../..'))) + +import copy +from paddle.io import Dataset, DataLoader, BatchSampler, DistributedBatchSampler +import paddle.distributed as dist + +from ppocr.data.imaug import transform, create_operators +from ppocr.data.simple_dataset import SimpleDataSet +from ppocr.data.lmdb_dataset import LMDBDataSet +from ppocr.data.pgnet_dataset import PGDataSet +from ppocr.data.pubtab_dataset import PubTabDataSet + +__all__ = ['build_dataloader', 'transform', 'create_operators'] + + +def term_mp(sig_num, frame): + """ kill all child processes + """ + pid = os.getpid() + pgid = os.getpgid(os.getpid()) + print("main proc {} exit, kill process group " "{}".format(pid, pgid)) + os.killpg(pgid, signal.SIGKILL) + + +def build_dataloader(config, mode, device, logger, seed=None): + config = copy.deepcopy(config) + + support_dict = [ + 'SimpleDataSet', 'LMDBDataSet', 'PGDataSet', 'PubTabDataSet' + ] + module_name = config[mode]['dataset']['name'] + assert module_name in support_dict, Exception( + 'DataSet only support {}'.format(support_dict)) + assert mode in ['Train', 'Eval', 'Test' + ], "Mode should be Train, Eval or Test." + + dataset = eval(module_name)(config, mode, logger, seed) + loader_config = config[mode]['loader'] + batch_size = loader_config['batch_size_per_card'] + drop_last = loader_config['drop_last'] + shuffle = loader_config['shuffle'] + num_workers = loader_config['num_workers'] + if 'use_shared_memory' in loader_config.keys(): + use_shared_memory = loader_config['use_shared_memory'] + else: + use_shared_memory = True + + if mode == "Train": + # Distribute data to multiple cards + batch_sampler = DistributedBatchSampler( + dataset=dataset, + batch_size=batch_size, + shuffle=shuffle, + drop_last=drop_last) + else: + # Distribute data to single card + batch_sampler = BatchSampler( + dataset=dataset, + batch_size=batch_size, + shuffle=shuffle, + drop_last=drop_last) + + if 'collate_fn' in loader_config: + from . import collate_fn + collate_fn = getattr(collate_fn, loader_config['collate_fn'])() + else: + collate_fn = None + data_loader = DataLoader( + dataset=dataset, + batch_sampler=batch_sampler, + places=device, + num_workers=num_workers, + return_list=True, + use_shared_memory=use_shared_memory, + collate_fn=collate_fn) + + # support exit using ctrl+c + signal.signal(signal.SIGINT, term_mp) + signal.signal(signal.SIGTERM, term_mp) + + return data_loader diff --git a/backend/ppocr/data/collate_fn.py b/backend/ppocr/data/collate_fn.py new file mode 100644 index 0000000..0da6060 --- /dev/null +++ b/backend/ppocr/data/collate_fn.py @@ -0,0 +1,72 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
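# build_dataloader() above assembles the dataset, batch sampler and DataLoader from a nested
# config dict and installs SIGINT/SIGTERM handlers that kill the worker process group.
# A minimal usage sketch, assuming Paddle is installed: the dataset paths, transforms and
# logger below are illustrative placeholders, and the exact keys a real run needs depend on
# the chosen dataset class and transforms.
import logging
import paddle

logger = logging.getLogger("ppocr")
device = paddle.set_device('cpu')            # or 'gpu:0'
config = {
    'Global': {},                            # shared options passed into the transforms
    'Eval': {
        'dataset': {
            'name': 'SimpleDataSet',         # must be one of the names in support_dict
            'data_dir': './train_data/',     # placeholder path
            'label_file_list': ['./train_data/eval.txt'],
            'transforms': [],                # list of one-key dicts, e.g. {'DecodeImage': {...}}
        },
        'loader': {
            'batch_size_per_card': 8,
            'shuffle': False,
            'drop_last': False,
            'num_workers': 2,
        },
    },
}
# eval_loader = build_dataloader(config, 'Eval', device, logger)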
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import numbers +import numpy as np +from collections import defaultdict + + +class DictCollator(object): + """ + data batch + """ + + def __call__(self, batch): + # todo:support batch operators + data_dict = defaultdict(list) + to_tensor_keys = [] + for sample in batch: + for k, v in sample.items(): + if isinstance(v, (np.ndarray, paddle.Tensor, numbers.Number)): + if k not in to_tensor_keys: + to_tensor_keys.append(k) + data_dict[k].append(v) + for k in to_tensor_keys: + data_dict[k] = paddle.to_tensor(data_dict[k]) + return data_dict + + +class ListCollator(object): + """ + data batch + """ + + def __call__(self, batch): + # todo:support batch operators + data_dict = defaultdict(list) + to_tensor_idxs = [] + for sample in batch: + for idx, v in enumerate(sample): + if isinstance(v, (np.ndarray, paddle.Tensor, numbers.Number)): + if idx not in to_tensor_idxs: + to_tensor_idxs.append(idx) + data_dict[idx].append(v) + for idx in to_tensor_idxs: + data_dict[idx] = paddle.to_tensor(data_dict[idx]) + return list(data_dict.values()) + + +class SSLRotateCollate(object): + """ + bach: [ + [(4*3xH*W), (4,)] + [(4*3xH*W), (4,)] + ... + ] + """ + + def __call__(self, batch): + output = [np.concatenate(d, axis=0) for d in zip(*batch)] + return output diff --git a/backend/ppocr/data/imaug/ColorJitter.py b/backend/ppocr/data/imaug/ColorJitter.py new file mode 100644 index 0000000..4b542ab --- /dev/null +++ b/backend/ppocr/data/imaug/ColorJitter.py @@ -0,0 +1,26 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from paddle.vision.transforms import ColorJitter as pp_ColorJitter + +__all__ = ['ColorJitter'] + +class ColorJitter(object): + def __init__(self, brightness=0, contrast=0, saturation=0, hue=0,**kwargs): + self.aug = pp_ColorJitter(brightness, contrast, saturation, hue) + + def __call__(self, data): + image = data['image'] + image = self.aug(image) + data['image'] = image + return data diff --git a/backend/ppocr/data/imaug/__init__.py b/backend/ppocr/data/imaug/__init__.py new file mode 100644 index 0000000..548832f --- /dev/null +++ b/backend/ppocr/data/imaug/__init__.py @@ -0,0 +1,74 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
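# DictCollator above gathers same-named fields from a list of per-sample dicts and converts
# the numeric ones into batch tensors with paddle.to_tensor. The numpy-only sketch below
# mirrors that contract (np.stack standing in for paddle.to_tensor) so it runs without
# Paddle installed; it is an illustration of the behaviour, not the class itself.
import numpy as np

batch = [
    {'image': np.zeros((3, 32, 32), dtype=np.float32), 'label': 1, 'name': 'a.jpg'},
    {'image': np.ones((3, 32, 32), dtype=np.float32), 'label': 0, 'name': 'b.jpg'},
]
collated = {}
for sample in batch:
    for k, v in sample.items():
        if isinstance(v, (np.ndarray, int, float)):      # only numeric fields get batched
            collated.setdefault(k, []).append(v)
for k in collated:
    collated[k] = np.stack([np.asarray(v) for v in collated[k]])
print(collated['image'].shape, collated['label'])        # (2, 3, 32, 32) [1 0]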
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from .iaa_augment import IaaAugment +from .make_border_map import MakeBorderMap +from .make_shrink_map import MakeShrinkMap +from .random_crop_data import EastRandomCropData, RandomCropImgMask +from .make_pse_gt import MakePseGt + +from .rec_img_aug import RecAug, RecConAug, RecResizeImg, ClsResizeImg, \ + SRNRecResizeImg, NRTRRecResizeImg, SARRecResizeImg, PRENResizeImg +from .ssl_img_aug import SSLRotateResize +from .randaugment import RandAugment +from .copy_paste import CopyPaste +from .ColorJitter import ColorJitter +from .operators import * +from .label_ops import * + +from .east_process import * +from .sast_process import * +from .pg_process import * +from .gen_table_mask import * + +from .vqa import * + +from .fce_aug import * +from .fce_targets import FCENetTargets + + +def transform(data, ops=None): + """ transform """ + if ops is None: + ops = [] + for op in ops: + data = op(data) + if data is None: + return None + return data + + +def create_operators(op_param_list, global_config=None): + """ + create operators based on the config + + Args: + params(list): a dict list, used to create some operators + """ + assert isinstance(op_param_list, list), ('operator config should be a list') + ops = [] + for operator in op_param_list: + assert isinstance(operator, + dict) and len(operator) == 1, "yaml format error" + op_name = list(operator)[0] + param = {} if operator[op_name] is None else operator[op_name] + if global_config is not None: + param.update(global_config) + op = eval(op_name)(**param) + ops.append(op) + return ops diff --git a/backend/ppocr/data/imaug/copy_paste.py b/backend/ppocr/data/imaug/copy_paste.py new file mode 100644 index 0000000..0b3386c --- /dev/null +++ b/backend/ppocr/data/imaug/copy_paste.py @@ -0,0 +1,170 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
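# transform() and create_operators() above implement a simple operator-pipeline contract:
# each op is built from a one-key config dict and is a callable that takes the data dict and
# returns it (or None to drop the sample). The two toy operators below are hypothetical
# stand-ins used only to show that contract; they are not operators defined in this repo.
import numpy as np

class ToFloat32(object):
    def __init__(self, **kwargs):
        pass

    def __call__(self, data):
        data['image'] = data['image'].astype(np.float32)
        return data

class DropSmallImages(object):
    def __init__(self, min_side=8, **kwargs):
        self.min_side = min_side

    def __call__(self, data):
        h, w = data['image'].shape[:2]
        return data if min(h, w) >= self.min_side else None

# create_operators() would build the same pipeline from:
# [{'ToFloat32': None}, {'DropSmallImages': {'min_side': 8}}]
ops = [ToFloat32(), DropSmallImages(min_side=8)]
data = {'image': np.zeros((32, 100, 3), dtype=np.uint8)}
for op in ops:              # equivalent to: data = transform(data, ops)
    data = op(data)
    if data is None:
        break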
+import copy +import cv2 +import random +import numpy as np +from PIL import Image +from shapely.geometry import Polygon + +from ppocr.data.imaug.iaa_augment import IaaAugment +from ppocr.data.imaug.random_crop_data import is_poly_outside_rect +from tools.infer.utility import get_rotate_crop_image + + +class CopyPaste(object): + def __init__(self, objects_paste_ratio=0.2, limit_paste=True, **kwargs): + self.ext_data_num = 1 + self.objects_paste_ratio = objects_paste_ratio + self.limit_paste = limit_paste + augmenter_args = [{'type': 'Resize', 'args': {'size': [0.5, 3]}}] + self.aug = IaaAugment(augmenter_args) + + def __call__(self, data): + point_num = data['polys'].shape[1] + src_img = data['image'] + src_polys = data['polys'].tolist() + src_ignores = data['ignore_tags'].tolist() + ext_data = data['ext_data'][0] + ext_image = ext_data['image'] + ext_polys = ext_data['polys'] + ext_ignores = ext_data['ignore_tags'] + + indexs = [i for i in range(len(ext_ignores)) if not ext_ignores[i]] + select_num = max( + 1, min(int(self.objects_paste_ratio * len(ext_polys)), 30)) + + random.shuffle(indexs) + select_idxs = indexs[:select_num] + select_polys = ext_polys[select_idxs] + select_ignores = ext_ignores[select_idxs] + + src_img = cv2.cvtColor(src_img, cv2.COLOR_BGR2RGB) + ext_image = cv2.cvtColor(ext_image, cv2.COLOR_BGR2RGB) + src_img = Image.fromarray(src_img).convert('RGBA') + for poly, tag in zip(select_polys, select_ignores): + box_img = get_rotate_crop_image(ext_image, poly) + + src_img, box = self.paste_img(src_img, box_img, src_polys) + if box is not None: + box = box.tolist() + for _ in range(len(box), point_num): + box.append(box[-1]) + src_polys.append(box) + src_ignores.append(tag) + src_img = cv2.cvtColor(np.array(src_img), cv2.COLOR_RGB2BGR) + h, w = src_img.shape[:2] + src_polys = np.array(src_polys) + src_polys[:, :, 0] = np.clip(src_polys[:, :, 0], 0, w) + src_polys[:, :, 1] = np.clip(src_polys[:, :, 1], 0, h) + data['image'] = src_img + data['polys'] = src_polys + data['ignore_tags'] = np.array(src_ignores) + return data + + def paste_img(self, src_img, box_img, src_polys): + box_img_pil = Image.fromarray(box_img).convert('RGBA') + src_w, src_h = src_img.size + box_w, box_h = box_img_pil.size + + angle = np.random.randint(0, 360) + box = np.array([[[0, 0], [box_w, 0], [box_w, box_h], [0, box_h]]]) + box = rotate_bbox(box_img, box, angle)[0] + box_img_pil = box_img_pil.rotate(angle, expand=1) + box_w, box_h = box_img_pil.width, box_img_pil.height + if src_w - box_w < 0 or src_h - box_h < 0: + return src_img, None + + paste_x, paste_y = self.select_coord(src_polys, box, src_w - box_w, + src_h - box_h) + if paste_x is None: + return src_img, None + box[:, 0] += paste_x + box[:, 1] += paste_y + r, g, b, A = box_img_pil.split() + src_img.paste(box_img_pil, (paste_x, paste_y), mask=A) + + return src_img, box + + def select_coord(self, src_polys, box, endx, endy): + if self.limit_paste: + xmin, ymin, xmax, ymax = box[:, 0].min(), box[:, 1].min( + ), box[:, 0].max(), box[:, 1].max() + for _ in range(50): + paste_x = random.randint(0, endx) + paste_y = random.randint(0, endy) + xmin1 = xmin + paste_x + xmax1 = xmax + paste_x + ymin1 = ymin + paste_y + ymax1 = ymax + paste_y + + num_poly_in_rect = 0 + for poly in src_polys: + if not is_poly_outside_rect(poly, xmin1, ymin1, + xmax1 - xmin1, ymax1 - ymin1): + num_poly_in_rect += 1 + break + if num_poly_in_rect == 0: + return paste_x, paste_y + return None, None + else: + paste_x = random.randint(0, endx) + paste_y = random.randint(0, 
endy) + return paste_x, paste_y + + +def get_union(pD, pG): + return Polygon(pD).union(Polygon(pG)).area + + +def get_intersection_over_union(pD, pG): + return get_intersection(pD, pG) / get_union(pD, pG) + + +def get_intersection(pD, pG): + return Polygon(pD).intersection(Polygon(pG)).area + + +def rotate_bbox(img, text_polys, angle, scale=1): + """ + from https://github.com/WenmuZhou/DBNet.pytorch/blob/master/data_loader/modules/augment.py + Args: + img: np.ndarray + text_polys: np.ndarray N*4*2 + angle: int + scale: int + + Returns: + + """ + w = img.shape[1] + h = img.shape[0] + + rangle = np.deg2rad(angle) + nw = (abs(np.sin(rangle) * h) + abs(np.cos(rangle) * w)) + nh = (abs(np.cos(rangle) * h) + abs(np.sin(rangle) * w)) + rot_mat = cv2.getRotationMatrix2D((nw * 0.5, nh * 0.5), angle, scale) + rot_move = np.dot(rot_mat, np.array([(nw - w) * 0.5, (nh - h) * 0.5, 0])) + rot_mat[0, 2] += rot_move[0] + rot_mat[1, 2] += rot_move[1] + + # ---------------------- rotate box ---------------------- + rot_text_polys = list() + for bbox in text_polys: + point1 = np.dot(rot_mat, np.array([bbox[0, 0], bbox[0, 1], 1])) + point2 = np.dot(rot_mat, np.array([bbox[1, 0], bbox[1, 1], 1])) + point3 = np.dot(rot_mat, np.array([bbox[2, 0], bbox[2, 1], 1])) + point4 = np.dot(rot_mat, np.array([bbox[3, 0], bbox[3, 1], 1])) + rot_text_polys.append([point1, point2, point3, point4]) + return np.array(rot_text_polys, dtype=np.float32) diff --git a/backend/ppocr/data/imaug/east_process.py b/backend/ppocr/data/imaug/east_process.py new file mode 100644 index 0000000..df08adf --- /dev/null +++ b/backend/ppocr/data/imaug/east_process.py @@ -0,0 +1,436 @@ +#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
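# A small, self-contained check of the rotate_bbox() math above: rotate one 100x50 box by
# 30 degrees, expanding the canvas the same way the function does, and print the rotated
# corners. Only numpy/cv2 are used, with hypothetical sizes; it re-derives the same matrix.
import cv2
import numpy as np

w, h, angle = 200, 100, 30
rangle = np.deg2rad(angle)
nw = abs(np.sin(rangle) * h) + abs(np.cos(rangle) * w)   # expanded canvas width
nh = abs(np.cos(rangle) * h) + abs(np.sin(rangle) * w)   # expanded canvas height
rot_mat = cv2.getRotationMatrix2D((float(nw * 0.5), float(nh * 0.5)), angle, 1.0)
rot_move = np.dot(rot_mat, np.array([(nw - w) * 0.5, (nh - h) * 0.5, 0]))
rot_mat[0, 2] += rot_move[0]                             # shift the box into the new canvas
rot_mat[1, 2] += rot_move[1]
box = np.array([[10, 10], [110, 10], [110, 60], [10, 60]], dtype=np.float64)
corners = (rot_mat @ np.hstack([box, np.ones((4, 1))]).T).T
print(corners)                                           # 4x2 rotated corner coordinates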
+""" +This code is refered from: +https://github.com/songdejia/EAST/blob/master/data_utils.py +""" +import math +import cv2 +import numpy as np +import json +import sys +import os + +__all__ = ['EASTProcessTrain'] + + +class EASTProcessTrain(object): + def __init__(self, + image_shape=[512, 512], + background_ratio=0.125, + min_crop_side_ratio=0.1, + min_text_size=10, + **kwargs): + self.input_size = image_shape[1] + self.random_scale = np.array([0.5, 1, 2.0, 3.0]) + self.background_ratio = background_ratio + self.min_crop_side_ratio = min_crop_side_ratio + self.min_text_size = min_text_size + + def preprocess(self, im): + input_size = self.input_size + im_shape = im.shape + im_size_min = np.min(im_shape[0:2]) + im_size_max = np.max(im_shape[0:2]) + im_scale = float(input_size) / float(im_size_max) + im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale) + img_mean = [0.485, 0.456, 0.406] + img_std = [0.229, 0.224, 0.225] + # im = im[:, :, ::-1].astype(np.float32) + im = im / 255 + im -= img_mean + im /= img_std + new_h, new_w, _ = im.shape + im_padded = np.zeros((input_size, input_size, 3), dtype=np.float32) + im_padded[:new_h, :new_w, :] = im + im_padded = im_padded.transpose((2, 0, 1)) + im_padded = im_padded[np.newaxis, :] + return im_padded, im_scale + + def rotate_im_poly(self, im, text_polys): + """ + rotate image with 90 / 180 / 270 degre + """ + im_w, im_h = im.shape[1], im.shape[0] + dst_im = im.copy() + dst_polys = [] + rand_degree_ratio = np.random.rand() + rand_degree_cnt = 1 + if 0.333 < rand_degree_ratio < 0.666: + rand_degree_cnt = 2 + elif rand_degree_ratio > 0.666: + rand_degree_cnt = 3 + for i in range(rand_degree_cnt): + dst_im = np.rot90(dst_im) + rot_degree = -90 * rand_degree_cnt + rot_angle = rot_degree * math.pi / 180.0 + n_poly = text_polys.shape[0] + cx, cy = 0.5 * im_w, 0.5 * im_h + ncx, ncy = 0.5 * dst_im.shape[1], 0.5 * dst_im.shape[0] + for i in range(n_poly): + wordBB = text_polys[i] + poly = [] + for j in range(4): + sx, sy = wordBB[j][0], wordBB[j][1] + dx = math.cos(rot_angle) * (sx - cx)\ + - math.sin(rot_angle) * (sy - cy) + ncx + dy = math.sin(rot_angle) * (sx - cx)\ + + math.cos(rot_angle) * (sy - cy) + ncy + poly.append([dx, dy]) + dst_polys.append(poly) + dst_polys = np.array(dst_polys, dtype=np.float32) + return dst_im, dst_polys + + def polygon_area(self, poly): + """ + compute area of a polygon + :param poly: + :return: + """ + edge = [(poly[1][0] - poly[0][0]) * (poly[1][1] + poly[0][1]), + (poly[2][0] - poly[1][0]) * (poly[2][1] + poly[1][1]), + (poly[3][0] - poly[2][0]) * (poly[3][1] + poly[2][1]), + (poly[0][0] - poly[3][0]) * (poly[0][1] + poly[3][1])] + return np.sum(edge) / 2. 
+ + def check_and_validate_polys(self, polys, tags, img_height, img_width): + """ + check so that the text poly is in the same direction, + and also filter some invalid polygons + :param polys: + :param tags: + :return: + """ + h, w = img_height, img_width + if polys.shape[0] == 0: + return polys + polys[:, :, 0] = np.clip(polys[:, :, 0], 0, w - 1) + polys[:, :, 1] = np.clip(polys[:, :, 1], 0, h - 1) + + validated_polys = [] + validated_tags = [] + for poly, tag in zip(polys, tags): + p_area = self.polygon_area(poly) + #invalid poly + if abs(p_area) < 1: + continue + if p_area > 0: + #'poly in wrong direction' + if not tag: + tag = True #reversed cases should be ignore + poly = poly[(0, 3, 2, 1), :] + validated_polys.append(poly) + validated_tags.append(tag) + return np.array(validated_polys), np.array(validated_tags) + + def draw_img_polys(self, img, polys): + if len(img.shape) == 4: + img = np.squeeze(img, axis=0) + if img.shape[0] == 3: + img = img.transpose((1, 2, 0)) + img[:, :, 2] += 123.68 + img[:, :, 1] += 116.78 + img[:, :, 0] += 103.94 + cv2.imwrite("tmp.jpg", img) + img = cv2.imread("tmp.jpg") + for box in polys: + box = box.astype(np.int32).reshape((-1, 1, 2)) + cv2.polylines(img, [box], True, color=(255, 255, 0), thickness=2) + import random + ino = random.randint(0, 100) + cv2.imwrite("tmp_%d.jpg" % ino, img) + return + + def shrink_poly(self, poly, r): + """ + fit a poly inside the origin poly, maybe bugs here... + used for generate the score map + :param poly: the text poly + :param r: r in the paper + :return: the shrinked poly + """ + # shrink ratio + R = 0.3 + # find the longer pair + dist0 = np.linalg.norm(poly[0] - poly[1]) + dist1 = np.linalg.norm(poly[2] - poly[3]) + dist2 = np.linalg.norm(poly[0] - poly[3]) + dist3 = np.linalg.norm(poly[1] - poly[2]) + if dist0 + dist1 > dist2 + dist3: + # first move (p0, p1), (p2, p3), then (p0, p3), (p1, p2) + ## p0, p1 + theta = np.arctan2((poly[1][1] - poly[0][1]), + (poly[1][0] - poly[0][0])) + poly[0][0] += R * r[0] * np.cos(theta) + poly[0][1] += R * r[0] * np.sin(theta) + poly[1][0] -= R * r[1] * np.cos(theta) + poly[1][1] -= R * r[1] * np.sin(theta) + ## p2, p3 + theta = np.arctan2((poly[2][1] - poly[3][1]), + (poly[2][0] - poly[3][0])) + poly[3][0] += R * r[3] * np.cos(theta) + poly[3][1] += R * r[3] * np.sin(theta) + poly[2][0] -= R * r[2] * np.cos(theta) + poly[2][1] -= R * r[2] * np.sin(theta) + ## p0, p3 + theta = np.arctan2((poly[3][0] - poly[0][0]), + (poly[3][1] - poly[0][1])) + poly[0][0] += R * r[0] * np.sin(theta) + poly[0][1] += R * r[0] * np.cos(theta) + poly[3][0] -= R * r[3] * np.sin(theta) + poly[3][1] -= R * r[3] * np.cos(theta) + ## p1, p2 + theta = np.arctan2((poly[2][0] - poly[1][0]), + (poly[2][1] - poly[1][1])) + poly[1][0] += R * r[1] * np.sin(theta) + poly[1][1] += R * r[1] * np.cos(theta) + poly[2][0] -= R * r[2] * np.sin(theta) + poly[2][1] -= R * r[2] * np.cos(theta) + else: + ## p0, p3 + # print poly + theta = np.arctan2((poly[3][0] - poly[0][0]), + (poly[3][1] - poly[0][1])) + poly[0][0] += R * r[0] * np.sin(theta) + poly[0][1] += R * r[0] * np.cos(theta) + poly[3][0] -= R * r[3] * np.sin(theta) + poly[3][1] -= R * r[3] * np.cos(theta) + ## p1, p2 + theta = np.arctan2((poly[2][0] - poly[1][0]), + (poly[2][1] - poly[1][1])) + poly[1][0] += R * r[1] * np.sin(theta) + poly[1][1] += R * r[1] * np.cos(theta) + poly[2][0] -= R * r[2] * np.sin(theta) + poly[2][1] -= R * r[2] * np.cos(theta) + ## p0, p1 + theta = np.arctan2((poly[1][1] - poly[0][1]), + (poly[1][0] - poly[0][0])) + poly[0][0] += R * r[0] 
* np.cos(theta) + poly[0][1] += R * r[0] * np.sin(theta) + poly[1][0] -= R * r[1] * np.cos(theta) + poly[1][1] -= R * r[1] * np.sin(theta) + ## p2, p3 + theta = np.arctan2((poly[2][1] - poly[3][1]), + (poly[2][0] - poly[3][0])) + poly[3][0] += R * r[3] * np.cos(theta) + poly[3][1] += R * r[3] * np.sin(theta) + poly[2][0] -= R * r[2] * np.cos(theta) + poly[2][1] -= R * r[2] * np.sin(theta) + return poly + + def generate_quad(self, im_size, polys, tags): + """ + Generate quadrangle. + """ + h, w = im_size + poly_mask = np.zeros((h, w), dtype=np.uint8) + score_map = np.zeros((h, w), dtype=np.uint8) + # (x1, y1, ..., x4, y4, short_edge_norm) + geo_map = np.zeros((h, w, 9), dtype=np.float32) + # mask used during traning, to ignore some hard areas + training_mask = np.ones((h, w), dtype=np.uint8) + for poly_idx, poly_tag in enumerate(zip(polys, tags)): + poly = poly_tag[0] + tag = poly_tag[1] + + r = [None, None, None, None] + for i in range(4): + dist1 = np.linalg.norm(poly[i] - poly[(i + 1) % 4]) + dist2 = np.linalg.norm(poly[i] - poly[(i - 1) % 4]) + r[i] = min(dist1, dist2) + # score map + shrinked_poly = self.shrink_poly( + poly.copy(), r).astype(np.int32)[np.newaxis, :, :] + cv2.fillPoly(score_map, shrinked_poly, 1) + cv2.fillPoly(poly_mask, shrinked_poly, poly_idx + 1) + # if the poly is too small, then ignore it during training + poly_h = min( + np.linalg.norm(poly[0] - poly[3]), + np.linalg.norm(poly[1] - poly[2])) + poly_w = min( + np.linalg.norm(poly[0] - poly[1]), + np.linalg.norm(poly[2] - poly[3])) + if min(poly_h, poly_w) < self.min_text_size: + cv2.fillPoly(training_mask, + poly.astype(np.int32)[np.newaxis, :, :], 0) + + if tag: + cv2.fillPoly(training_mask, + poly.astype(np.int32)[np.newaxis, :, :], 0) + + xy_in_poly = np.argwhere(poly_mask == (poly_idx + 1)) + # geo map. 
+ y_in_poly = xy_in_poly[:, 0] + x_in_poly = xy_in_poly[:, 1] + poly[:, 0] = np.minimum(np.maximum(poly[:, 0], 0), w) + poly[:, 1] = np.minimum(np.maximum(poly[:, 1], 0), h) + for pno in range(4): + geo_channel_beg = pno * 2 + geo_map[y_in_poly, x_in_poly, geo_channel_beg] =\ + x_in_poly - poly[pno, 0] + geo_map[y_in_poly, x_in_poly, geo_channel_beg+1] =\ + y_in_poly - poly[pno, 1] + geo_map[y_in_poly, x_in_poly, 8] = \ + 1.0 / max(min(poly_h, poly_w), 1.0) + return score_map, geo_map, training_mask + + def crop_area(self, im, polys, tags, crop_background=False, max_tries=50): + """ + make random crop from the input image + :param im: + :param polys: + :param tags: + :param crop_background: + :param max_tries: + :return: + """ + h, w, _ = im.shape + pad_h = h // 10 + pad_w = w // 10 + h_array = np.zeros((h + pad_h * 2), dtype=np.int32) + w_array = np.zeros((w + pad_w * 2), dtype=np.int32) + for poly in polys: + poly = np.round(poly, decimals=0).astype(np.int32) + minx = np.min(poly[:, 0]) + maxx = np.max(poly[:, 0]) + w_array[minx + pad_w:maxx + pad_w] = 1 + miny = np.min(poly[:, 1]) + maxy = np.max(poly[:, 1]) + h_array[miny + pad_h:maxy + pad_h] = 1 + # ensure the cropped area not across a text + h_axis = np.where(h_array == 0)[0] + w_axis = np.where(w_array == 0)[0] + if len(h_axis) == 0 or len(w_axis) == 0: + return im, polys, tags + + for i in range(max_tries): + xx = np.random.choice(w_axis, size=2) + xmin = np.min(xx) - pad_w + xmax = np.max(xx) - pad_w + xmin = np.clip(xmin, 0, w - 1) + xmax = np.clip(xmax, 0, w - 1) + yy = np.random.choice(h_axis, size=2) + ymin = np.min(yy) - pad_h + ymax = np.max(yy) - pad_h + ymin = np.clip(ymin, 0, h - 1) + ymax = np.clip(ymax, 0, h - 1) + if xmax - xmin < self.min_crop_side_ratio * w or \ + ymax - ymin < self.min_crop_side_ratio * h: + # area too small + continue + if polys.shape[0] != 0: + poly_axis_in_area = (polys[:, :, 0] >= xmin)\ + & (polys[:, :, 0] <= xmax)\ + & (polys[:, :, 1] >= ymin)\ + & (polys[:, :, 1] <= ymax) + selected_polys = np.where( + np.sum(poly_axis_in_area, axis=1) == 4)[0] + else: + selected_polys = [] + + if len(selected_polys) == 0: + # no text in this area + if crop_background: + im = im[ymin:ymax + 1, xmin:xmax + 1, :] + polys = [] + tags = [] + return im, polys, tags + else: + continue + + im = im[ymin:ymax + 1, xmin:xmax + 1, :] + polys = polys[selected_polys] + tags = tags[selected_polys] + polys[:, :, 0] -= xmin + polys[:, :, 1] -= ymin + return im, polys, tags + return im, polys, tags + + def crop_background_infor(self, im, text_polys, text_tags): + im, text_polys, text_tags = self.crop_area( + im, text_polys, text_tags, crop_background=True) + + if len(text_polys) > 0: + return None + # pad and resize image + input_size = self.input_size + im, ratio = self.preprocess(im) + score_map = np.zeros((input_size, input_size), dtype=np.float32) + geo_map = np.zeros((input_size, input_size, 9), dtype=np.float32) + training_mask = np.ones((input_size, input_size), dtype=np.float32) + return im, score_map, geo_map, training_mask + + def crop_foreground_infor(self, im, text_polys, text_tags): + im, text_polys, text_tags = self.crop_area( + im, text_polys, text_tags, crop_background=False) + + if text_polys.shape[0] == 0: + return None + #continue for all ignore case + if np.sum((text_tags * 1.0)) >= text_tags.size: + return None + # pad and resize image + input_size = self.input_size + im, ratio = self.preprocess(im) + text_polys[:, :, 0] *= ratio + text_polys[:, :, 1] *= ratio + _, _, new_h, new_w = im.shape + # 
print(im.shape) + # self.draw_img_polys(im, text_polys) + score_map, geo_map, training_mask = self.generate_quad( + (new_h, new_w), text_polys, text_tags) + return im, score_map, geo_map, training_mask + + def __call__(self, data): + im = data['image'] + text_polys = data['polys'] + text_tags = data['ignore_tags'] + if im is None: + return None + if text_polys.shape[0] == 0: + return None + + #add rotate cases + if np.random.rand() < 0.5: + im, text_polys = self.rotate_im_poly(im, text_polys) + h, w, _ = im.shape + text_polys, text_tags = self.check_and_validate_polys(text_polys, + text_tags, h, w) + if text_polys.shape[0] == 0: + return None + + # random scale this image + rd_scale = np.random.choice(self.random_scale) + im = cv2.resize(im, dsize=None, fx=rd_scale, fy=rd_scale) + text_polys *= rd_scale + if np.random.rand() < self.background_ratio: + outs = self.crop_background_infor(im, text_polys, text_tags) + else: + outs = self.crop_foreground_infor(im, text_polys, text_tags) + + if outs is None: + return None + im, score_map, geo_map, training_mask = outs + score_map = score_map[np.newaxis, ::4, ::4].astype(np.float32) + geo_map = np.swapaxes(geo_map, 1, 2) + geo_map = np.swapaxes(geo_map, 1, 0) + geo_map = geo_map[:, ::4, ::4].astype(np.float32) + training_mask = training_mask[np.newaxis, ::4, ::4] + training_mask = training_mask.astype(np.float32) + + data['image'] = im[0] + data['score_map'] = score_map + data['geo_map'] = geo_map + data['training_mask'] = training_mask + return data diff --git a/backend/ppocr/data/imaug/fce_aug.py b/backend/ppocr/data/imaug/fce_aug.py new file mode 100644 index 0000000..66bafef --- /dev/null +++ b/backend/ppocr/data/imaug/fce_aug.py @@ -0,0 +1,564 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/datasets/pipelines/transforms.py +""" +import numpy as np +from PIL import Image, ImageDraw +import cv2 +from shapely.geometry import Polygon +import math +from ppocr.utils.poly_nms import poly_intersection + + +class RandomScaling: + def __init__(self, size=800, scale=(3. / 4, 5. / 2), **kwargs): + """Random scale the image while keeping aspect. + + Args: + size (int) : Base size before scaling. + scale (tuple(float)) : The range of scaling. 
+ """ + assert isinstance(size, int) + assert isinstance(scale, float) or isinstance(scale, tuple) + self.size = size + self.scale = scale if isinstance(scale, tuple) \ + else (1 - scale, 1 + scale) + + def __call__(self, data): + image = data['image'] + text_polys = data['polys'] + h, w, _ = image.shape + + aspect_ratio = np.random.uniform(min(self.scale), max(self.scale)) + scales = self.size * 1.0 / max(h, w) * aspect_ratio + scales = np.array([scales, scales]) + out_size = (int(h * scales[1]), int(w * scales[0])) + image = cv2.resize(image, out_size[::-1]) + + data['image'] = image + text_polys[:, :, 0::2] = text_polys[:, :, 0::2] * scales[1] + text_polys[:, :, 1::2] = text_polys[:, :, 1::2] * scales[0] + data['polys'] = text_polys + + return data + + +class RandomCropFlip: + def __init__(self, + pad_ratio=0.1, + crop_ratio=0.5, + iter_num=1, + min_area_ratio=0.2, + **kwargs): + """Random crop and flip a patch of the image. + + Args: + crop_ratio (float): The ratio of cropping. + iter_num (int): Number of operations. + min_area_ratio (float): Minimal area ratio between cropped patch + and original image. + """ + assert isinstance(crop_ratio, float) + assert isinstance(iter_num, int) + assert isinstance(min_area_ratio, float) + + self.pad_ratio = pad_ratio + self.epsilon = 1e-2 + self.crop_ratio = crop_ratio + self.iter_num = iter_num + self.min_area_ratio = min_area_ratio + + def __call__(self, results): + for i in range(self.iter_num): + results = self.random_crop_flip(results) + + return results + + def random_crop_flip(self, results): + image = results['image'] + polygons = results['polys'] + ignore_tags = results['ignore_tags'] + if len(polygons) == 0: + return results + + if np.random.random() >= self.crop_ratio: + return results + + h, w, _ = image.shape + area = h * w + pad_h = int(h * self.pad_ratio) + pad_w = int(w * self.pad_ratio) + h_axis, w_axis = self.generate_crop_target(image, polygons, pad_h, + pad_w) + if len(h_axis) == 0 or len(w_axis) == 0: + return results + + attempt = 0 + while attempt < 50: + attempt += 1 + polys_keep = [] + polys_new = [] + ignore_tags_keep = [] + ignore_tags_new = [] + xx = np.random.choice(w_axis, size=2) + xmin = np.min(xx) - pad_w + xmax = np.max(xx) - pad_w + xmin = np.clip(xmin, 0, w - 1) + xmax = np.clip(xmax, 0, w - 1) + yy = np.random.choice(h_axis, size=2) + ymin = np.min(yy) - pad_h + ymax = np.max(yy) - pad_h + ymin = np.clip(ymin, 0, h - 1) + ymax = np.clip(ymax, 0, h - 1) + if (xmax - xmin) * (ymax - ymin) < area * self.min_area_ratio: + # area too small + continue + + pts = np.stack([[xmin, xmax, xmax, xmin], + [ymin, ymin, ymax, ymax]]).T.astype(np.int32) + pp = Polygon(pts) + fail_flag = False + for polygon, ignore_tag in zip(polygons, ignore_tags): + ppi = Polygon(polygon.reshape(-1, 2)) + ppiou, _ = poly_intersection(ppi, pp, buffer=0) + if np.abs(ppiou - float(ppi.area)) > self.epsilon and \ + np.abs(ppiou) > self.epsilon: + fail_flag = True + break + elif np.abs(ppiou - float(ppi.area)) < self.epsilon: + polys_new.append(polygon) + ignore_tags_new.append(ignore_tag) + else: + polys_keep.append(polygon) + ignore_tags_keep.append(ignore_tag) + + if fail_flag: + continue + else: + break + + cropped = image[ymin:ymax, xmin:xmax, :] + select_type = np.random.randint(3) + if select_type == 0: + img = np.ascontiguousarray(cropped[:, ::-1]) + elif select_type == 1: + img = np.ascontiguousarray(cropped[::-1, :]) + else: + img = np.ascontiguousarray(cropped[::-1, ::-1]) + image[ymin:ymax, xmin:xmax, :] = img + results['img'] = image 
+ + if len(polys_new) != 0: + height, width, _ = cropped.shape + if select_type == 0: + for idx, polygon in enumerate(polys_new): + poly = polygon.reshape(-1, 2) + poly[:, 0] = width - poly[:, 0] + 2 * xmin + polys_new[idx] = poly + elif select_type == 1: + for idx, polygon in enumerate(polys_new): + poly = polygon.reshape(-1, 2) + poly[:, 1] = height - poly[:, 1] + 2 * ymin + polys_new[idx] = poly + else: + for idx, polygon in enumerate(polys_new): + poly = polygon.reshape(-1, 2) + poly[:, 0] = width - poly[:, 0] + 2 * xmin + poly[:, 1] = height - poly[:, 1] + 2 * ymin + polys_new[idx] = poly + polygons = polys_keep + polys_new + ignore_tags = ignore_tags_keep + ignore_tags_new + results['polys'] = np.array(polygons) + results['ignore_tags'] = ignore_tags + + return results + + def generate_crop_target(self, image, all_polys, pad_h, pad_w): + """Generate crop target and make sure not to crop the polygon + instances. + + Args: + image (ndarray): The image waited to be crop. + all_polys (list[list[ndarray]]): All polygons including ground + truth polygons and ground truth ignored polygons. + pad_h (int): Padding length of height. + pad_w (int): Padding length of width. + Returns: + h_axis (ndarray): Vertical cropping range. + w_axis (ndarray): Horizontal cropping range. + """ + h, w, _ = image.shape + h_array = np.zeros((h + pad_h * 2), dtype=np.int32) + w_array = np.zeros((w + pad_w * 2), dtype=np.int32) + + text_polys = [] + for polygon in all_polys: + rect = cv2.minAreaRect(polygon.astype(np.int32).reshape(-1, 2)) + box = cv2.boxPoints(rect) + box = np.int0(box) + text_polys.append([box[0], box[1], box[2], box[3]]) + + polys = np.array(text_polys, dtype=np.int32) + for poly in polys: + poly = np.round(poly, decimals=0).astype(np.int32) + minx = np.min(poly[:, 0]) + maxx = np.max(poly[:, 0]) + w_array[minx + pad_w:maxx + pad_w] = 1 + miny = np.min(poly[:, 1]) + maxy = np.max(poly[:, 1]) + h_array[miny + pad_h:maxy + pad_h] = 1 + + h_axis = np.where(h_array == 0)[0] + w_axis = np.where(w_array == 0)[0] + return h_axis, w_axis + + +class RandomCropPolyInstances: + """Randomly crop images and make sure to contain at least one intact + instance.""" + + def __init__(self, crop_ratio=5.0 / 8.0, min_side_ratio=0.4, **kwargs): + super().__init__() + self.crop_ratio = crop_ratio + self.min_side_ratio = min_side_ratio + + def sample_valid_start_end(self, valid_array, min_len, max_start, min_end): + + assert isinstance(min_len, int) + assert len(valid_array) > min_len + + start_array = valid_array.copy() + max_start = min(len(start_array) - min_len, max_start) + start_array[max_start:] = 0 + start_array[0] = 1 + diff_array = np.hstack([0, start_array]) - np.hstack([start_array, 0]) + region_starts = np.where(diff_array < 0)[0] + region_ends = np.where(diff_array > 0)[0] + region_ind = np.random.randint(0, len(region_starts)) + start = np.random.randint(region_starts[region_ind], + region_ends[region_ind]) + + end_array = valid_array.copy() + min_end = max(start + min_len, min_end) + end_array[:min_end] = 0 + end_array[-1] = 1 + diff_array = np.hstack([0, end_array]) - np.hstack([end_array, 0]) + region_starts = np.where(diff_array < 0)[0] + region_ends = np.where(diff_array > 0)[0] + region_ind = np.random.randint(0, len(region_starts)) + end = np.random.randint(region_starts[region_ind], + region_ends[region_ind]) + return start, end + + def sample_crop_box(self, img_size, results): + """Generate crop box and make sure not to crop the polygon instances. 
+ + Args: + img_size (tuple(int)): The image size (h, w). + results (dict): The results dict. + """ + + assert isinstance(img_size, tuple) + h, w = img_size[:2] + + key_masks = results['polys'] + + x_valid_array = np.ones(w, dtype=np.int32) + y_valid_array = np.ones(h, dtype=np.int32) + + selected_mask = key_masks[np.random.randint(0, len(key_masks))] + selected_mask = selected_mask.reshape((-1, 2)).astype(np.int32) + max_x_start = max(np.min(selected_mask[:, 0]) - 2, 0) + min_x_end = min(np.max(selected_mask[:, 0]) + 3, w - 1) + max_y_start = max(np.min(selected_mask[:, 1]) - 2, 0) + min_y_end = min(np.max(selected_mask[:, 1]) + 3, h - 1) + + for mask in key_masks: + mask = mask.reshape((-1, 2)).astype(np.int32) + clip_x = np.clip(mask[:, 0], 0, w - 1) + clip_y = np.clip(mask[:, 1], 0, h - 1) + min_x, max_x = np.min(clip_x), np.max(clip_x) + min_y, max_y = np.min(clip_y), np.max(clip_y) + + x_valid_array[min_x - 2:max_x + 3] = 0 + y_valid_array[min_y - 2:max_y + 3] = 0 + + min_w = int(w * self.min_side_ratio) + min_h = int(h * self.min_side_ratio) + + x1, x2 = self.sample_valid_start_end(x_valid_array, min_w, max_x_start, + min_x_end) + y1, y2 = self.sample_valid_start_end(y_valid_array, min_h, max_y_start, + min_y_end) + + return np.array([x1, y1, x2, y2]) + + def crop_img(self, img, bbox): + assert img.ndim == 3 + h, w, _ = img.shape + assert 0 <= bbox[1] < bbox[3] <= h + assert 0 <= bbox[0] < bbox[2] <= w + return img[bbox[1]:bbox[3], bbox[0]:bbox[2]] + + def __call__(self, results): + image = results['image'] + polygons = results['polys'] + ignore_tags = results['ignore_tags'] + if len(polygons) < 1: + return results + + if np.random.random_sample() < self.crop_ratio: + + crop_box = self.sample_crop_box(image.shape, results) + img = self.crop_img(image, crop_box) + results['image'] = img + # crop and filter masks + x1, y1, x2, y2 = crop_box + w = max(x2 - x1, 1) + h = max(y2 - y1, 1) + polygons[:, :, 0::2] = polygons[:, :, 0::2] - x1 + polygons[:, :, 1::2] = polygons[:, :, 1::2] - y1 + + valid_masks_list = [] + valid_tags_list = [] + for ind, polygon in enumerate(polygons): + if (polygon[:, ::2] > -4).all() and ( + polygon[:, ::2] < w + 4).all() and ( + polygon[:, 1::2] > -4).all() and ( + polygon[:, 1::2] < h + 4).all(): + polygon[:, ::2] = np.clip(polygon[:, ::2], 0, w) + polygon[:, 1::2] = np.clip(polygon[:, 1::2], 0, h) + valid_masks_list.append(polygon) + valid_tags_list.append(ignore_tags[ind]) + + results['polys'] = np.array(valid_masks_list) + results['ignore_tags'] = valid_tags_list + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + return repr_str + + +class RandomRotatePolyInstances: + def __init__(self, + rotate_ratio=0.5, + max_angle=10, + pad_with_fixed_color=False, + pad_value=(0, 0, 0), + **kwargs): + """Randomly rotate images and polygon masks. + + Args: + rotate_ratio (float): The ratio of samples to operate rotation. + max_angle (int): The maximum rotation angle. + pad_with_fixed_color (bool): The flag for whether to pad rotated + image with fixed value. If set to False, the rotated image will + be padded onto cropped image. + pad_value (tuple(int)): The color value for padding rotated image. + """ + self.rotate_ratio = rotate_ratio + self.max_angle = max_angle + self.pad_with_fixed_color = pad_with_fixed_color + self.pad_value = pad_value + + def rotate(self, center, points, theta, center_shift=(0, 0)): + # rotate points. 
+ (center_x, center_y) = center + center_y = -center_y + x, y = points[:, ::2], points[:, 1::2] + y = -y + + theta = theta / 180 * math.pi + cos = math.cos(theta) + sin = math.sin(theta) + + x = (x - center_x) + y = (y - center_y) + + _x = center_x + x * cos - y * sin + center_shift[0] + _y = -(center_y + x * sin + y * cos) + center_shift[1] + + points[:, ::2], points[:, 1::2] = _x, _y + return points + + def cal_canvas_size(self, ori_size, degree): + assert isinstance(ori_size, tuple) + angle = degree * math.pi / 180.0 + h, w = ori_size[:2] + + cos = math.cos(angle) + sin = math.sin(angle) + canvas_h = int(w * math.fabs(sin) + h * math.fabs(cos)) + canvas_w = int(w * math.fabs(cos) + h * math.fabs(sin)) + + canvas_size = (canvas_h, canvas_w) + return canvas_size + + def sample_angle(self, max_angle): + angle = np.random.random_sample() * 2 * max_angle - max_angle + return angle + + def rotate_img(self, img, angle, canvas_size): + h, w = img.shape[:2] + rotation_matrix = cv2.getRotationMatrix2D((w / 2, h / 2), angle, 1) + rotation_matrix[0, 2] += int((canvas_size[1] - w) / 2) + rotation_matrix[1, 2] += int((canvas_size[0] - h) / 2) + + if self.pad_with_fixed_color: + target_img = cv2.warpAffine( + img, + rotation_matrix, (canvas_size[1], canvas_size[0]), + flags=cv2.INTER_NEAREST, + borderValue=self.pad_value) + else: + mask = np.zeros_like(img) + (h_ind, w_ind) = (np.random.randint(0, h * 7 // 8), + np.random.randint(0, w * 7 // 8)) + img_cut = img[h_ind:(h_ind + h // 9), w_ind:(w_ind + w // 9)] + img_cut = cv2.resize(img_cut, (canvas_size[1], canvas_size[0])) + + mask = cv2.warpAffine( + mask, + rotation_matrix, (canvas_size[1], canvas_size[0]), + borderValue=[1, 1, 1]) + target_img = cv2.warpAffine( + img, + rotation_matrix, (canvas_size[1], canvas_size[0]), + borderValue=[0, 0, 0]) + target_img = target_img + img_cut * mask + + return target_img + + def __call__(self, results): + if np.random.random_sample() < self.rotate_ratio: + image = results['image'] + polygons = results['polys'] + h, w = image.shape[:2] + + angle = self.sample_angle(self.max_angle) + canvas_size = self.cal_canvas_size((h, w), angle) + center_shift = (int((canvas_size[1] - w) / 2), int( + (canvas_size[0] - h) / 2)) + image = self.rotate_img(image, angle, canvas_size) + results['image'] = image + # rotate polygons + rotated_masks = [] + for mask in polygons: + rotated_mask = self.rotate((w / 2, h / 2), mask, angle, + center_shift) + rotated_masks.append(rotated_mask) + results['polys'] = np.array(rotated_masks) + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + return repr_str + + +class SquareResizePad: + def __init__(self, + target_size, + pad_ratio=0.6, + pad_with_fixed_color=False, + pad_value=(0, 0, 0), + **kwargs): + """Resize or pad images to be square shape. + + Args: + target_size (int): The target size of square shaped image. + pad_with_fixed_color (bool): The flag for whether to pad rotated + image with fixed value. If set to False, the rescales image will + be padded onto cropped image. + pad_value (tuple(int)): The color value for padding rotated image. 
+ """ + assert isinstance(target_size, int) + assert isinstance(pad_ratio, float) + assert isinstance(pad_with_fixed_color, bool) + assert isinstance(pad_value, tuple) + + self.target_size = target_size + self.pad_ratio = pad_ratio + self.pad_with_fixed_color = pad_with_fixed_color + self.pad_value = pad_value + + def resize_img(self, img, keep_ratio=True): + h, w, _ = img.shape + if keep_ratio: + t_h = self.target_size if h >= w else int(h * self.target_size / w) + t_w = self.target_size if h <= w else int(w * self.target_size / h) + else: + t_h = t_w = self.target_size + img = cv2.resize(img, (t_w, t_h)) + return img, (t_h, t_w) + + def square_pad(self, img): + h, w = img.shape[:2] + if h == w: + return img, (0, 0) + pad_size = max(h, w) + if self.pad_with_fixed_color: + expand_img = np.ones((pad_size, pad_size, 3), dtype=np.uint8) + expand_img[:] = self.pad_value + else: + (h_ind, w_ind) = (np.random.randint(0, h * 7 // 8), + np.random.randint(0, w * 7 // 8)) + img_cut = img[h_ind:(h_ind + h // 9), w_ind:(w_ind + w // 9)] + expand_img = cv2.resize(img_cut, (pad_size, pad_size)) + if h > w: + y0, x0 = 0, (h - w) // 2 + else: + y0, x0 = (w - h) // 2, 0 + expand_img[y0:y0 + h, x0:x0 + w] = img + offset = (x0, y0) + + return expand_img, offset + + def square_pad_mask(self, points, offset): + x0, y0 = offset + pad_points = points.copy() + pad_points[::2] = pad_points[::2] + x0 + pad_points[1::2] = pad_points[1::2] + y0 + return pad_points + + def __call__(self, results): + image = results['image'] + polygons = results['polys'] + h, w = image.shape[:2] + + if np.random.random_sample() < self.pad_ratio: + image, out_size = self.resize_img(image, keep_ratio=True) + image, offset = self.square_pad(image) + else: + image, out_size = self.resize_img(image, keep_ratio=False) + offset = (0, 0) + results['image'] = image + try: + polygons[:, :, 0::2] = polygons[:, :, 0::2] * out_size[ + 1] / w + offset[0] + polygons[:, :, 1::2] = polygons[:, :, 1::2] * out_size[ + 0] / h + offset[1] + except: + pass + results['polys'] = polygons + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + return repr_str diff --git a/backend/ppocr/data/imaug/fce_targets.py b/backend/ppocr/data/imaug/fce_targets.py new file mode 100644 index 0000000..1818480 --- /dev/null +++ b/backend/ppocr/data/imaug/fce_targets.py @@ -0,0 +1,658 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/datasets/pipelines/textdet_targets/fcenet_targets.py +""" + +import cv2 +import numpy as np +from numpy.fft import fft +from numpy.linalg import norm +import sys + + +class FCENetTargets: + """Generate the ground truth targets of FCENet: Fourier Contour Embedding + for Arbitrary-Shaped Text Detection. + + [https://arxiv.org/abs/2104.10442] + + Args: + fourier_degree (int): The maximum Fourier transform degree k. 
+ resample_step (float): The step size for resampling the text center + line (TCL). It's better not to exceed half of the minimum width. + center_region_shrink_ratio (float): The shrink ratio of text center + region. + level_size_divisors (tuple(int)): The downsample ratio on each level. + level_proportion_range (tuple(tuple(int))): The range of text sizes + assigned to each level. + """ + + def __init__(self, + fourier_degree=5, + resample_step=4.0, + center_region_shrink_ratio=0.3, + level_size_divisors=(8, 16, 32), + level_proportion_range=((0, 0.25), (0.2, 0.65), (0.55, 1.0)), + orientation_thr=2.0, + **kwargs): + + super().__init__() + assert isinstance(level_size_divisors, tuple) + assert isinstance(level_proportion_range, tuple) + assert len(level_size_divisors) == len(level_proportion_range) + self.fourier_degree = fourier_degree + self.resample_step = resample_step + self.center_region_shrink_ratio = center_region_shrink_ratio + self.level_size_divisors = level_size_divisors + self.level_proportion_range = level_proportion_range + + self.orientation_thr = orientation_thr + + def vector_angle(self, vec1, vec2): + if vec1.ndim > 1: + unit_vec1 = vec1 / (norm(vec1, axis=-1) + 1e-8).reshape((-1, 1)) + else: + unit_vec1 = vec1 / (norm(vec1, axis=-1) + 1e-8) + if vec2.ndim > 1: + unit_vec2 = vec2 / (norm(vec2, axis=-1) + 1e-8).reshape((-1, 1)) + else: + unit_vec2 = vec2 / (norm(vec2, axis=-1) + 1e-8) + return np.arccos( + np.clip( + np.sum(unit_vec1 * unit_vec2, axis=-1), -1.0, 1.0)) + + def resample_line(self, line, n): + """Resample n points on a line. + + Args: + line (ndarray): The points composing a line. + n (int): The resampled points number. + + Returns: + resampled_line (ndarray): The points composing the resampled line. + """ + + assert line.ndim == 2 + assert line.shape[0] >= 2 + assert line.shape[1] == 2 + assert isinstance(n, int) + assert n > 0 + + length_list = [ + norm(line[i + 1] - line[i]) for i in range(len(line) - 1) + ] + total_length = sum(length_list) + length_cumsum = np.cumsum([0.0] + length_list) + delta_length = total_length / (float(n) + 1e-8) + + current_edge_ind = 0 + resampled_line = [line[0]] + + for i in range(1, n): + current_line_len = i * delta_length + + while current_line_len >= length_cumsum[current_edge_ind + 1]: + current_edge_ind += 1 + current_edge_end_shift = current_line_len - length_cumsum[ + current_edge_ind] + end_shift_ratio = current_edge_end_shift / length_list[ + current_edge_ind] + current_point = line[current_edge_ind] + (line[current_edge_ind + 1] + - line[current_edge_ind] + ) * end_shift_ratio + resampled_line.append(current_point) + + resampled_line.append(line[-1]) + resampled_line = np.array(resampled_line) + + return resampled_line + + def reorder_poly_edge(self, points): + """Get the respective points composing head edge, tail edge, top + sideline and bottom sideline. + + Args: + points (ndarray): The points composing a text polygon. + + Returns: + head_edge (ndarray): The two points composing the head edge of text + polygon. + tail_edge (ndarray): The two points composing the tail edge of text + polygon. + top_sideline (ndarray): The points composing top curved sideline of + text polygon. + bot_sideline (ndarray): The points composing bottom curved sideline + of text polygon. 
+ """ + + assert points.ndim == 2 + assert points.shape[0] >= 4 + assert points.shape[1] == 2 + + head_inds, tail_inds = self.find_head_tail(points, self.orientation_thr) + head_edge, tail_edge = points[head_inds], points[tail_inds] + + pad_points = np.vstack([points, points]) + if tail_inds[1] < 1: + tail_inds[1] = len(points) + sideline1 = pad_points[head_inds[1]:tail_inds[1]] + sideline2 = pad_points[tail_inds[1]:(head_inds[1] + len(points))] + sideline_mean_shift = np.mean( + sideline1, axis=0) - np.mean( + sideline2, axis=0) + + if sideline_mean_shift[1] > 0: + top_sideline, bot_sideline = sideline2, sideline1 + else: + top_sideline, bot_sideline = sideline1, sideline2 + + return head_edge, tail_edge, top_sideline, bot_sideline + + def find_head_tail(self, points, orientation_thr): + """Find the head edge and tail edge of a text polygon. + + Args: + points (ndarray): The points composing a text polygon. + orientation_thr (float): The threshold for distinguishing between + head edge and tail edge among the horizontal and vertical edges + of a quadrangle. + + Returns: + head_inds (list): The indexes of two points composing head edge. + tail_inds (list): The indexes of two points composing tail edge. + """ + + assert points.ndim == 2 + assert points.shape[0] >= 4 + assert points.shape[1] == 2 + assert isinstance(orientation_thr, float) + + if len(points) > 4: + pad_points = np.vstack([points, points[0]]) + edge_vec = pad_points[1:] - pad_points[:-1] + + theta_sum = [] + adjacent_vec_theta = [] + for i, edge_vec1 in enumerate(edge_vec): + adjacent_ind = [x % len(edge_vec) for x in [i - 1, i + 1]] + adjacent_edge_vec = edge_vec[adjacent_ind] + temp_theta_sum = np.sum( + self.vector_angle(edge_vec1, adjacent_edge_vec)) + temp_adjacent_theta = self.vector_angle(adjacent_edge_vec[0], + adjacent_edge_vec[1]) + theta_sum.append(temp_theta_sum) + adjacent_vec_theta.append(temp_adjacent_theta) + theta_sum_score = np.array(theta_sum) / np.pi + adjacent_theta_score = np.array(adjacent_vec_theta) / np.pi + poly_center = np.mean(points, axis=0) + edge_dist = np.maximum( + norm( + pad_points[1:] - poly_center, axis=-1), + norm( + pad_points[:-1] - poly_center, axis=-1)) + dist_score = edge_dist / np.max(edge_dist) + position_score = np.zeros(len(edge_vec)) + score = 0.5 * theta_sum_score + 0.15 * adjacent_theta_score + score += 0.35 * dist_score + if len(points) % 2 == 0: + position_score[(len(score) // 2 - 1)] += 1 + position_score[-1] += 1 + score += 0.1 * position_score + pad_score = np.concatenate([score, score]) + score_matrix = np.zeros((len(score), len(score) - 3)) + x = np.arange(len(score) - 3) / float(len(score) - 4) + gaussian = 1. / (np.sqrt(2. * np.pi) * 0.5) * np.exp(-np.power( + (x - 0.5) / 0.5, 2.) 
/ 2) + gaussian = gaussian / np.max(gaussian) + for i in range(len(score)): + score_matrix[i, :] = score[i] + pad_score[(i + 2):(i + len( + score) - 1)] * gaussian * 0.3 + + head_start, tail_increment = np.unravel_index(score_matrix.argmax(), + score_matrix.shape) + tail_start = (head_start + tail_increment + 2) % len(points) + head_end = (head_start + 1) % len(points) + tail_end = (tail_start + 1) % len(points) + + if head_end > tail_end: + head_start, tail_start = tail_start, head_start + head_end, tail_end = tail_end, head_end + head_inds = [head_start, head_end] + tail_inds = [tail_start, tail_end] + else: + if self.vector_slope(points[1] - points[0]) + self.vector_slope( + points[3] - points[2]) < self.vector_slope(points[ + 2] - points[1]) + self.vector_slope(points[0] - points[ + 3]): + horizontal_edge_inds = [[0, 1], [2, 3]] + vertical_edge_inds = [[3, 0], [1, 2]] + else: + horizontal_edge_inds = [[3, 0], [1, 2]] + vertical_edge_inds = [[0, 1], [2, 3]] + + vertical_len_sum = norm(points[vertical_edge_inds[0][0]] - points[ + vertical_edge_inds[0][1]]) + norm(points[vertical_edge_inds[1][ + 0]] - points[vertical_edge_inds[1][1]]) + horizontal_len_sum = norm(points[horizontal_edge_inds[0][ + 0]] - points[horizontal_edge_inds[0][1]]) + norm(points[ + horizontal_edge_inds[1][0]] - points[horizontal_edge_inds[1] + [1]]) + + if vertical_len_sum > horizontal_len_sum * orientation_thr: + head_inds = horizontal_edge_inds[0] + tail_inds = horizontal_edge_inds[1] + else: + head_inds = vertical_edge_inds[0] + tail_inds = vertical_edge_inds[1] + + return head_inds, tail_inds + + def resample_sidelines(self, sideline1, sideline2, resample_step): + """Resample two sidelines to be of the same points number according to + step size. + + Args: + sideline1 (ndarray): The points composing a sideline of a text + polygon. + sideline2 (ndarray): The points composing another sideline of a + text polygon. + resample_step (float): The resampled step size. + + Returns: + resampled_line1 (ndarray): The resampled line 1. + resampled_line2 (ndarray): The resampled line 2. + """ + + assert sideline1.ndim == sideline2.ndim == 2 + assert sideline1.shape[1] == sideline2.shape[1] == 2 + assert sideline1.shape[0] >= 2 + assert sideline2.shape[0] >= 2 + assert isinstance(resample_step, float) + + length1 = sum([ + norm(sideline1[i + 1] - sideline1[i]) + for i in range(len(sideline1) - 1) + ]) + length2 = sum([ + norm(sideline2[i + 1] - sideline2[i]) + for i in range(len(sideline2) - 1) + ]) + + total_length = (length1 + length2) / 2 + resample_point_num = max(int(float(total_length) / resample_step), 1) + + resampled_line1 = self.resample_line(sideline1, resample_point_num) + resampled_line2 = self.resample_line(sideline2, resample_point_num) + + return resampled_line1, resampled_line2 + + def generate_center_region_mask(self, img_size, text_polys): + """Generate text center region mask. + + Args: + img_size (tuple): The image size of (height, width). + text_polys (list[list[ndarray]]): The list of text polygons. + + Returns: + center_region_mask (ndarray): The text center region mask. 
+ """ + + assert isinstance(img_size, tuple) + # assert check_argument.is_2dlist(text_polys) + + h, w = img_size + + center_region_mask = np.zeros((h, w), np.uint8) + + center_region_boxes = [] + for poly in text_polys: + # assert len(poly) == 1 + polygon_points = poly.reshape(-1, 2) + _, _, top_line, bot_line = self.reorder_poly_edge(polygon_points) + resampled_top_line, resampled_bot_line = self.resample_sidelines( + top_line, bot_line, self.resample_step) + resampled_bot_line = resampled_bot_line[::-1] + center_line = (resampled_top_line + resampled_bot_line) / 2 + + line_head_shrink_len = norm(resampled_top_line[0] - + resampled_bot_line[0]) / 4.0 + line_tail_shrink_len = norm(resampled_top_line[-1] - + resampled_bot_line[-1]) / 4.0 + head_shrink_num = int(line_head_shrink_len // self.resample_step) + tail_shrink_num = int(line_tail_shrink_len // self.resample_step) + if len(center_line) > head_shrink_num + tail_shrink_num + 2: + center_line = center_line[head_shrink_num:len(center_line) - + tail_shrink_num] + resampled_top_line = resampled_top_line[head_shrink_num:len( + resampled_top_line) - tail_shrink_num] + resampled_bot_line = resampled_bot_line[head_shrink_num:len( + resampled_bot_line) - tail_shrink_num] + + for i in range(0, len(center_line) - 1): + tl = center_line[i] + (resampled_top_line[i] - center_line[i] + ) * self.center_region_shrink_ratio + tr = center_line[i + 1] + (resampled_top_line[i + 1] - + center_line[i + 1] + ) * self.center_region_shrink_ratio + br = center_line[i + 1] + (resampled_bot_line[i + 1] - + center_line[i + 1] + ) * self.center_region_shrink_ratio + bl = center_line[i] + (resampled_bot_line[i] - center_line[i] + ) * self.center_region_shrink_ratio + current_center_box = np.vstack([tl, tr, br, + bl]).astype(np.int32) + center_region_boxes.append(current_center_box) + + cv2.fillPoly(center_region_mask, center_region_boxes, 1) + return center_region_mask + + def resample_polygon(self, polygon, n=400): + """Resample one polygon with n points on its boundary. + + Args: + polygon (list[float]): The input polygon. + n (int): The number of resampled points. + Returns: + resampled_polygon (list[float]): The resampled polygon. + """ + length = [] + + for i in range(len(polygon)): + p1 = polygon[i] + if i == len(polygon) - 1: + p2 = polygon[0] + else: + p2 = polygon[i + 1] + length.append(((p1[0] - p2[0])**2 + (p1[1] - p2[1])**2)**0.5) + + total_length = sum(length) + n_on_each_line = (np.array(length) / (total_length + 1e-8)) * n + n_on_each_line = n_on_each_line.astype(np.int32) + new_polygon = [] + + for i in range(len(polygon)): + num = n_on_each_line[i] + p1 = polygon[i] + if i == len(polygon) - 1: + p2 = polygon[0] + else: + p2 = polygon[i + 1] + + if num == 0: + continue + + dxdy = (p2 - p1) / num + for j in range(num): + point = p1 + dxdy * j + new_polygon.append(point) + + return np.array(new_polygon) + + def normalize_polygon(self, polygon): + """Normalize one polygon so that its start point is at right most. + + Args: + polygon (list[float]): The origin polygon. + Returns: + new_polygon (lost[float]): The polygon with start point at right. 
+ """ + temp_polygon = polygon - polygon.mean(axis=0) + x = np.abs(temp_polygon[:, 0]) + y = temp_polygon[:, 1] + index_x = np.argsort(x) + index_y = np.argmin(y[index_x[:8]]) + index = index_x[index_y] + new_polygon = np.concatenate([polygon[index:], polygon[:index]]) + return new_polygon + + def poly2fourier(self, polygon, fourier_degree): + """Perform Fourier transformation to generate Fourier coefficients ck + from polygon. + + Args: + polygon (ndarray): An input polygon. + fourier_degree (int): The maximum Fourier degree K. + Returns: + c (ndarray(complex)): Fourier coefficients. + """ + points = polygon[:, 0] + polygon[:, 1] * 1j + c_fft = fft(points) / len(points) + c = np.hstack((c_fft[-fourier_degree:], c_fft[:fourier_degree + 1])) + return c + + def clockwise(self, c, fourier_degree): + """Make sure the polygon reconstructed from Fourier coefficients c in + the clockwise direction. + + Args: + polygon (list[float]): The origin polygon. + Returns: + new_polygon (lost[float]): The polygon in clockwise point order. + """ + if np.abs(c[fourier_degree + 1]) > np.abs(c[fourier_degree - 1]): + return c + elif np.abs(c[fourier_degree + 1]) < np.abs(c[fourier_degree - 1]): + return c[::-1] + else: + if np.abs(c[fourier_degree + 2]) > np.abs(c[fourier_degree - 2]): + return c + else: + return c[::-1] + + def cal_fourier_signature(self, polygon, fourier_degree): + """Calculate Fourier signature from input polygon. + + Args: + polygon (ndarray): The input polygon. + fourier_degree (int): The maximum Fourier degree K. + Returns: + fourier_signature (ndarray): An array shaped (2k+1, 2) containing + real part and image part of 2k+1 Fourier coefficients. + """ + resampled_polygon = self.resample_polygon(polygon) + resampled_polygon = self.normalize_polygon(resampled_polygon) + + fourier_coeff = self.poly2fourier(resampled_polygon, fourier_degree) + fourier_coeff = self.clockwise(fourier_coeff, fourier_degree) + + real_part = np.real(fourier_coeff).reshape((-1, 1)) + image_part = np.imag(fourier_coeff).reshape((-1, 1)) + fourier_signature = np.hstack([real_part, image_part]) + + return fourier_signature + + def generate_fourier_maps(self, img_size, text_polys): + """Generate Fourier coefficient maps. + + Args: + img_size (tuple): The image size of (height, width). + text_polys (list[list[ndarray]]): The list of text polygons. + + Returns: + fourier_real_map (ndarray): The Fourier coefficient real part maps. + fourier_image_map (ndarray): The Fourier coefficient image part + maps. 
+ """ + + assert isinstance(img_size, tuple) + + h, w = img_size + k = self.fourier_degree + real_map = np.zeros((k * 2 + 1, h, w), dtype=np.float32) + imag_map = np.zeros((k * 2 + 1, h, w), dtype=np.float32) + + for poly in text_polys: + mask = np.zeros((h, w), dtype=np.uint8) + polygon = np.array(poly).reshape((1, -1, 2)) + cv2.fillPoly(mask, polygon.astype(np.int32), 1) + fourier_coeff = self.cal_fourier_signature(polygon[0], k) + for i in range(-k, k + 1): + if i != 0: + real_map[i + k, :, :] = mask * fourier_coeff[i + k, 0] + ( + 1 - mask) * real_map[i + k, :, :] + imag_map[i + k, :, :] = mask * fourier_coeff[i + k, 1] + ( + 1 - mask) * imag_map[i + k, :, :] + else: + yx = np.argwhere(mask > 0.5) + k_ind = np.ones((len(yx)), dtype=np.int64) * k + y, x = yx[:, 0], yx[:, 1] + real_map[k_ind, y, x] = fourier_coeff[k, 0] - x + imag_map[k_ind, y, x] = fourier_coeff[k, 1] - y + + return real_map, imag_map + + def generate_text_region_mask(self, img_size, text_polys): + """Generate text center region mask and geometry attribute maps. + + Args: + img_size (tuple): The image size (height, width). + text_polys (list[list[ndarray]]): The list of text polygons. + + Returns: + text_region_mask (ndarray): The text region mask. + """ + + assert isinstance(img_size, tuple) + + h, w = img_size + text_region_mask = np.zeros((h, w), dtype=np.uint8) + + for poly in text_polys: + polygon = np.array(poly, dtype=np.int32).reshape((1, -1, 2)) + cv2.fillPoly(text_region_mask, polygon, 1) + + return text_region_mask + + def generate_effective_mask(self, mask_size: tuple, polygons_ignore): + """Generate effective mask by setting the ineffective regions to 0 and + effective regions to 1. + + Args: + mask_size (tuple): The mask size. + polygons_ignore (list[[ndarray]]: The list of ignored text + polygons. + + Returns: + mask (ndarray): The effective mask of (height, width). + """ + + mask = np.ones(mask_size, dtype=np.uint8) + + for poly in polygons_ignore: + instance = poly.reshape(-1, 2).astype(np.int32).reshape(1, -1, 2) + cv2.fillPoly(mask, instance, 0) + + return mask + + def generate_level_targets(self, img_size, text_polys, ignore_polys): + """Generate ground truth target on each level. + + Args: + img_size (list[int]): Shape of input image. + text_polys (list[list[ndarray]]): A list of ground truth polygons. + ignore_polys (list[list[ndarray]]): A list of ignored polygons. + Returns: + level_maps (list(ndarray)): A list of ground target on each level. 
+ """ + h, w = img_size + lv_size_divs = self.level_size_divisors + lv_proportion_range = self.level_proportion_range + lv_text_polys = [[] for i in range(len(lv_size_divs))] + lv_ignore_polys = [[] for i in range(len(lv_size_divs))] + level_maps = [] + for poly in text_polys: + polygon = np.array(poly, dtype=np.int).reshape((1, -1, 2)) + _, _, box_w, box_h = cv2.boundingRect(polygon) + proportion = max(box_h, box_w) / (h + 1e-8) + + for ind, proportion_range in enumerate(lv_proportion_range): + if proportion_range[0] < proportion < proportion_range[1]: + lv_text_polys[ind].append(poly / lv_size_divs[ind]) + + for ignore_poly in ignore_polys: + polygon = np.array(ignore_poly, dtype=np.int).reshape((1, -1, 2)) + _, _, box_w, box_h = cv2.boundingRect(polygon) + proportion = max(box_h, box_w) / (h + 1e-8) + + for ind, proportion_range in enumerate(lv_proportion_range): + if proportion_range[0] < proportion < proportion_range[1]: + lv_ignore_polys[ind].append(ignore_poly / lv_size_divs[ind]) + + for ind, size_divisor in enumerate(lv_size_divs): + current_level_maps = [] + level_img_size = (h // size_divisor, w // size_divisor) + + text_region = self.generate_text_region_mask( + level_img_size, lv_text_polys[ind])[None] + current_level_maps.append(text_region) + + center_region = self.generate_center_region_mask( + level_img_size, lv_text_polys[ind])[None] + current_level_maps.append(center_region) + + effective_mask = self.generate_effective_mask( + level_img_size, lv_ignore_polys[ind])[None] + current_level_maps.append(effective_mask) + + fourier_real_map, fourier_image_maps = self.generate_fourier_maps( + level_img_size, lv_text_polys[ind]) + current_level_maps.append(fourier_real_map) + current_level_maps.append(fourier_image_maps) + + level_maps.append(np.concatenate(current_level_maps)) + + return level_maps + + def generate_targets(self, results): + """Generate the ground truth targets for FCENet. + + Args: + results (dict): The input result dictionary. + + Returns: + results (dict): The output result dictionary. + """ + + assert isinstance(results, dict) + image = results['image'] + polygons = results['polys'] + ignore_tags = results['ignore_tags'] + h, w, _ = image.shape + + polygon_masks = [] + polygon_masks_ignore = [] + for tag, polygon in zip(ignore_tags, polygons): + if tag is True: + polygon_masks_ignore.append(polygon) + else: + polygon_masks.append(polygon) + + level_maps = self.generate_level_targets((h, w), polygon_masks, + polygon_masks_ignore) + + mapping = { + 'p3_maps': level_maps[0], + 'p4_maps': level_maps[1], + 'p5_maps': level_maps[2] + } + for key, value in mapping.items(): + results[key] = value + + return results + + def __call__(self, results): + results = self.generate_targets(results) + return results diff --git a/backend/ppocr/data/imaug/gen_table_mask.py b/backend/ppocr/data/imaug/gen_table_mask.py new file mode 100644 index 0000000..08e35d5 --- /dev/null +++ b/backend/ppocr/data/imaug/gen_table_mask.py @@ -0,0 +1,244 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import sys
+import six
+import cv2
+import numpy as np
+
+
+class GenTableMask(object):
+    """ gen table mask """
+
+    def __init__(self, shrink_h_max, shrink_w_max, mask_type=0, **kwargs):
+        # NOTE: the shrink limits are currently fixed here and override the
+        # constructor arguments.
+        self.shrink_h_max = 5
+        self.shrink_w_max = 5
+        self.mask_type = mask_type
+
+    def projection(self, erosion, h, w, spilt_threshold=0):
+        # Horizontal projection
+        projection_map = np.ones_like(erosion)
+        project_val_array = [0 for _ in range(0, h)]
+
+        for j in range(0, h):
+            for i in range(0, w):
+                if erosion[j, i] == 255:
+                    project_val_array[j] += 1
+        # Get the split points from the projection array
+        start_idx = 0  # index where a text region starts
+        end_idx = 0  # index where a blank region starts
+        in_text = False  # whether we are currently inside a text region
+        box_list = []
+        for i in range(len(project_val_array)):
+            if in_text == False and project_val_array[i] > spilt_threshold:  # entered a text region
+                in_text = True
+                start_idx = i
+            elif project_val_array[i] <= spilt_threshold and in_text == True:  # entered a blank region
+                end_idx = i
+                in_text = False
+                if end_idx - start_idx <= 2:
+                    continue
+                box_list.append((start_idx, end_idx + 1))
+
+        if in_text:
+            box_list.append((start_idx, h - 1))
+        # Draw the projection histogram
+        for j in range(0, h):
+            for i in range(0, project_val_array[j]):
+                projection_map[j, i] = 0
+        return box_list, projection_map
+
+    def projection_cx(self, box_img):
+        box_gray_img = cv2.cvtColor(box_img, cv2.COLOR_BGR2GRAY)
+        h, w = box_gray_img.shape
+        # Binarize the grayscale image
+        ret, thresh1 = cv2.threshold(box_gray_img, 200, 255, cv2.THRESH_BINARY_INV)
+        # Vertical erosion
+        if h < w:
+            kernel = np.ones((2, 1), np.uint8)
+            erode = cv2.erode(thresh1, kernel, iterations=1)
+        else:
+            erode = thresh1
+        # Horizontal dilation
+        kernel = np.ones((1, 5), np.uint8)
+        erosion = cv2.dilate(erode, kernel, iterations=1)
+        # Horizontal projection
+        projection_map = np.ones_like(erosion)
+        project_val_array = [0 for _ in range(0, h)]
+
+        for j in range(0, h):
+            for i in range(0, w):
+                if erosion[j, i] == 255:
+                    project_val_array[j] += 1
+        # Get the split points from the projection array
+        start_idx = 0  # index where a text region starts
+        end_idx = 0  # index where a blank region starts
+        in_text = False  # whether we are currently inside a text region
+        box_list = []
+        spilt_threshold = 0
+        for i in range(len(project_val_array)):
+            if in_text == False and project_val_array[i] > spilt_threshold:  # entered a text region
+                in_text = True
+                start_idx = i
+            elif project_val_array[i] <= spilt_threshold and in_text == True:  # entered a blank region
+                end_idx = i
+                in_text = False
+                if end_idx - start_idx <= 2:
+                    continue
+                box_list.append((start_idx, end_idx + 1))
+
+        if in_text:
+            box_list.append((start_idx, h - 1))
+        # Draw the projection histogram
+        for j in range(0, h):
+            for i in range(0, project_val_array[j]):
+                projection_map[j, i] = 0
+        split_bbox_list = []
+        if len(box_list) > 1:
+            for i, (h_start, h_end) in enumerate(box_list):
+                if i == 0:
+                    h_start = 0
+                if i == len(box_list):
+                    # NOTE: this branch is never taken, since i only ranges
+                    # over 0 .. len(box_list) - 1.
+                    h_end = h
+                word_img = erosion[h_start:h_end + 1, :]
+                word_h, word_w = word_img.shape
+                w_split_list, w_projection_map = self.projection(word_img.T, word_w, word_h)
+                w_start, w_end = w_split_list[0][0], w_split_list[-1][1]
+                if h_start > 0:
+                    h_start -= 1
+                h_end += 1
+                word_img = box_img[h_start:h_end + 1:, w_start:w_end + 1, :]
+                split_bbox_list.append([w_start, h_start, w_end, h_end])
+        else:
+            split_bbox_list.append([0, 0, w, h])
+        return split_bbox_list
+
+    def shrink_bbox(self, bbox):
+        left, top, right, bottom = bbox
+        sh_h = min(max(int((bottom - top) * 0.1), 1), self.shrink_h_max)
+        sh_w = min(max(int((right - left) * 0.1), 1),
self.shrink_w_max) + left_new = left + sh_w + right_new = right - sh_w + top_new = top + sh_h + bottom_new = bottom - sh_h + if left_new >= right_new: + left_new = left + right_new = right + if top_new >= bottom_new: + top_new = top + bottom_new = bottom + return [left_new, top_new, right_new, bottom_new] + + def __call__(self, data): + img = data['image'] + cells = data['cells'] + height, width = img.shape[0:2] + if self.mask_type == 1: + mask_img = np.zeros((height, width), dtype=np.float32) + else: + mask_img = np.zeros((height, width, 3), dtype=np.float32) + cell_num = len(cells) + for cno in range(cell_num): + if "bbox" in cells[cno]: + bbox = cells[cno]['bbox'] + left, top, right, bottom = bbox + box_img = img[top:bottom, left:right, :].copy() + split_bbox_list = self.projection_cx(box_img) + for sno in range(len(split_bbox_list)): + split_bbox_list[sno][0] += left + split_bbox_list[sno][1] += top + split_bbox_list[sno][2] += left + split_bbox_list[sno][3] += top + + for sno in range(len(split_bbox_list)): + left, top, right, bottom = split_bbox_list[sno] + left, top, right, bottom = self.shrink_bbox([left, top, right, bottom]) + if self.mask_type == 1: + mask_img[top:bottom, left:right] = 1.0 + data['mask_img'] = mask_img + else: + mask_img[top:bottom, left:right, :] = (255, 255, 255) + data['image'] = mask_img + return data + +class ResizeTableImage(object): + def __init__(self, max_len, **kwargs): + super(ResizeTableImage, self).__init__() + self.max_len = max_len + + def get_img_bbox(self, cells): + bbox_list = [] + if len(cells) == 0: + return bbox_list + cell_num = len(cells) + for cno in range(cell_num): + if "bbox" in cells[cno]: + bbox = cells[cno]['bbox'] + bbox_list.append(bbox) + return bbox_list + + def resize_img_table(self, img, bbox_list, max_len): + height, width = img.shape[0:2] + ratio = max_len / (max(height, width) * 1.0) + resize_h = int(height * ratio) + resize_w = int(width * ratio) + img_new = cv2.resize(img, (resize_w, resize_h)) + bbox_list_new = [] + for bno in range(len(bbox_list)): + left, top, right, bottom = bbox_list[bno].copy() + left = int(left * ratio) + top = int(top * ratio) + right = int(right * ratio) + bottom = int(bottom * ratio) + bbox_list_new.append([left, top, right, bottom]) + return img_new, bbox_list_new + + def __call__(self, data): + img = data['image'] + if 'cells' not in data: + cells = [] + else: + cells = data['cells'] + bbox_list = self.get_img_bbox(cells) + img_new, bbox_list_new = self.resize_img_table(img, bbox_list, self.max_len) + data['image'] = img_new + cell_num = len(cells) + bno = 0 + for cno in range(cell_num): + if "bbox" in data['cells'][cno]: + data['cells'][cno]['bbox'] = bbox_list_new[bno] + bno += 1 + data['max_len'] = self.max_len + return data + +class PaddingTableImage(object): + def __init__(self, **kwargs): + super(PaddingTableImage, self).__init__() + + def __call__(self, data): + img = data['image'] + max_len = data['max_len'] + padding_img = np.zeros((max_len, max_len, 3), dtype=np.float32) + height, width = img.shape[0:2] + padding_img[0:height, 0:width, :] = img.copy() + data['image'] = padding_img + return data + \ No newline at end of file diff --git a/backend/ppocr/data/imaug/iaa_augment.py b/backend/ppocr/data/imaug/iaa_augment.py new file mode 100644 index 0000000..0aac787 --- /dev/null +++ b/backend/ppocr/data/imaug/iaa_augment.py @@ -0,0 +1,105 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/WenmuZhou/DBNet.pytorch/blob/master/data_loader/modules/iaa_augment.py +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np +import imgaug +import imgaug.augmenters as iaa + + +class AugmenterBuilder(object): + def __init__(self): + pass + + def build(self, args, root=True): + if args is None or len(args) == 0: + return None + elif isinstance(args, list): + if root: + sequence = [self.build(value, root=False) for value in args] + return iaa.Sequential(sequence) + else: + return getattr(iaa, args[0])( + *[self.to_tuple_if_list(a) for a in args[1:]]) + elif isinstance(args, dict): + cls = getattr(iaa, args['type']) + return cls(**{ + k: self.to_tuple_if_list(v) + for k, v in args['args'].items() + }) + else: + raise RuntimeError('unknown augmenter arg: ' + str(args)) + + def to_tuple_if_list(self, obj): + if isinstance(obj, list): + return tuple(obj) + return obj + + +class IaaAugment(): + def __init__(self, augmenter_args=None, **kwargs): + if augmenter_args is None: + augmenter_args = [{ + 'type': 'Fliplr', + 'args': { + 'p': 0.5 + } + }, { + 'type': 'Affine', + 'args': { + 'rotate': [-10, 10] + } + }, { + 'type': 'Resize', + 'args': { + 'size': [0.5, 3] + } + }] + self.augmenter = AugmenterBuilder().build(augmenter_args) + + def __call__(self, data): + image = data['image'] + shape = image.shape + + if self.augmenter: + aug = self.augmenter.to_deterministic() + data['image'] = aug.augment_image(image) + data = self.may_augment_annotation(aug, data, shape) + return data + + def may_augment_annotation(self, aug, data, shape): + if aug is None: + return data + + line_polys = [] + for poly in data['polys']: + new_poly = self.may_augment_poly(aug, shape, poly) + line_polys.append(new_poly) + data['polys'] = np.array(line_polys) + return data + + def may_augment_poly(self, aug, img_shape, poly): + keypoints = [imgaug.Keypoint(p[0], p[1]) for p in poly] + keypoints = aug.augment_keypoints( + [imgaug.KeypointsOnImage( + keypoints, shape=img_shape)])[0].keypoints + poly = [(p.x, p.y) for p in keypoints] + return poly diff --git a/backend/ppocr/data/imaug/label_ops.py b/backend/ppocr/data/imaug/label_ops.py new file mode 100644 index 0000000..c9bc2e7 --- /dev/null +++ b/backend/ppocr/data/imaug/label_ops.py @@ -0,0 +1,1041 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import copy +import numpy as np +import string +from shapely.geometry import LineString, Point, Polygon +import json +import copy + +from ppocr.utils.logging import get_logger + + +class ClsLabelEncode(object): + def __init__(self, label_list, **kwargs): + self.label_list = label_list + + def __call__(self, data): + label = data['label'] + if label not in self.label_list: + return None + label = self.label_list.index(label) + data['label'] = label + return data + + +class DetLabelEncode(object): + def __init__(self, **kwargs): + pass + + def __call__(self, data): + label = data['label'] + label = json.loads(label) + nBox = len(label) + boxes, txts, txt_tags = [], [], [] + for bno in range(0, nBox): + box = label[bno]['points'] + txt = label[bno]['transcription'] + boxes.append(box) + txts.append(txt) + if txt in ['*', '###']: + txt_tags.append(True) + else: + txt_tags.append(False) + if len(boxes) == 0: + return None + boxes = self.expand_points_num(boxes) + boxes = np.array(boxes, dtype=np.float32) + txt_tags = np.array(txt_tags, dtype=np.bool) + + data['polys'] = boxes + data['texts'] = txts + data['ignore_tags'] = txt_tags + return data + + def order_points_clockwise(self, pts): + rect = np.zeros((4, 2), dtype="float32") + s = pts.sum(axis=1) + rect[0] = pts[np.argmin(s)] + rect[2] = pts[np.argmax(s)] + diff = np.diff(pts, axis=1) + rect[1] = pts[np.argmin(diff)] + rect[3] = pts[np.argmax(diff)] + return rect + + def expand_points_num(self, boxes): + max_points_num = 0 + for box in boxes: + if len(box) > max_points_num: + max_points_num = len(box) + ex_boxes = [] + for box in boxes: + ex_box = box + [box[-1]] * (max_points_num - len(box)) + ex_boxes.append(ex_box) + return ex_boxes + + +class BaseRecLabelEncode(object): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False): + + self.max_text_len = max_text_length + self.beg_str = "sos" + self.end_str = "eos" + self.lower = False + + if character_dict_path is None: + logger = get_logger() + logger.warning( + "The character_dict_path is None, model can only recognize number and lower letters" + ) + self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" + dict_character = list(self.character_str) + self.lower = True + else: + self.character_str = [] + with open(character_dict_path, "rb") as fin: + lines = fin.readlines() + for line in lines: + line = line.decode('utf-8').strip("\n").strip("\r\n") + self.character_str.append(line) + if use_space_char: + self.character_str.append(" ") + dict_character = list(self.character_str) + dict_character = self.add_special_char(dict_character) + self.dict = {} + for i, char in enumerate(dict_character): + self.dict[char] = i + self.character = dict_character + + def add_special_char(self, dict_character): + return dict_character + + def encode(self, text): + """convert text-label into text-index. + input: + text: text labels of each image. [batch_size] + + output: + text: concatenated text index for CTCLoss. + [sum(text_lengths)] = [text_index_0 + text_index_1 + ... + text_index_(n - 1)] + length: length of each text. 
[batch_size] + """ + if len(text) == 0 or len(text) > self.max_text_len: + return None + if self.lower: + text = text.lower() + text_list = [] + for char in text: + if char not in self.dict: + # logger = get_logger() + # logger.warning('{} is not in dict'.format(char)) + continue + text_list.append(self.dict[char]) + if len(text_list) == 0: + return None + return text_list + + +class NRTRLabelEncode(BaseRecLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + **kwargs): + + super(NRTRLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + + def __call__(self, data): + text = data['label'] + text = self.encode(text) + if text is None: + return None + if len(text) >= self.max_text_len - 1: + return None + data['length'] = np.array(len(text)) + text.insert(0, 2) + text.append(3) + text = text + [0] * (self.max_text_len - len(text)) + data['label'] = np.array(text) + return data + + def add_special_char(self, dict_character): + dict_character = ['blank', '', '', ''] + dict_character + return dict_character + + +class CTCLabelEncode(BaseRecLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + **kwargs): + super(CTCLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + + def __call__(self, data): + text = data['label'] + text = self.encode(text) + if text is None: + return None + data['length'] = np.array(len(text)) + text = text + [0] * (self.max_text_len - len(text)) + data['label'] = np.array(text) + + label = [0] * len(self.character) + for x in text: + label[x] += 1 + data['label_ace'] = np.array(label) + return data + + def add_special_char(self, dict_character): + dict_character = ['blank'] + dict_character + return dict_character + + +class E2ELabelEncodeTest(BaseRecLabelEncode): + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + **kwargs): + super(E2ELabelEncodeTest, self).__init__( + max_text_length, character_dict_path, use_space_char) + + def __call__(self, data): + import json + padnum = len(self.dict) + label = data['label'] + label = json.loads(label) + nBox = len(label) + boxes, txts, txt_tags = [], [], [] + for bno in range(0, nBox): + box = label[bno]['points'] + txt = label[bno]['transcription'] + boxes.append(box) + txts.append(txt) + if txt in ['*', '###']: + txt_tags.append(True) + else: + txt_tags.append(False) + boxes = np.array(boxes, dtype=np.float32) + txt_tags = np.array(txt_tags, dtype=np.bool) + data['polys'] = boxes + data['ignore_tags'] = txt_tags + temp_texts = [] + for text in txts: + text = text.lower() + text = self.encode(text) + if text is None: + return None + text = text + [padnum] * (self.max_text_len - len(text) + ) # use 36 to pad + temp_texts.append(text) + data['texts'] = np.array(temp_texts) + return data + + +class E2ELabelEncodeTrain(object): + def __init__(self, **kwargs): + pass + + def __call__(self, data): + import json + label = data['label'] + label = json.loads(label) + nBox = len(label) + boxes, txts, txt_tags = [], [], [] + for bno in range(0, nBox): + box = label[bno]['points'] + txt = label[bno]['transcription'] + boxes.append(box) + txts.append(txt) + if txt in ['*', '###']: + txt_tags.append(True) + else: + txt_tags.append(False) + boxes = np.array(boxes, dtype=np.float32) + txt_tags = np.array(txt_tags, dtype=np.bool) 
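+        # NOTE: np.bool is a deprecated NumPy alias (removed in NumPy >= 1.24);
+        # if it raises an AttributeError here, np.bool_ or plain bool is the
+        # usual drop-in replacement.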
+ + data['polys'] = boxes + data['texts'] = txts + data['ignore_tags'] = txt_tags + return data + + +class KieLabelEncode(object): + def __init__(self, character_dict_path, norm=10, directed=False, **kwargs): + super(KieLabelEncode, self).__init__() + self.dict = dict({'': 0}) + with open(character_dict_path, 'r', encoding='utf-8') as fr: + idx = 1 + for line in fr: + char = line.strip() + self.dict[char] = idx + idx += 1 + self.norm = norm + self.directed = directed + + def compute_relation(self, boxes): + """Compute relation between every two boxes.""" + x1s, y1s = boxes[:, 0:1], boxes[:, 1:2] + x2s, y2s = boxes[:, 4:5], boxes[:, 5:6] + ws, hs = x2s - x1s + 1, np.maximum(y2s - y1s + 1, 1) + dxs = (x1s[:, 0][None] - x1s) / self.norm + dys = (y1s[:, 0][None] - y1s) / self.norm + xhhs, xwhs = hs[:, 0][None] / hs, ws[:, 0][None] / hs + whs = ws / hs + np.zeros_like(xhhs) + relations = np.stack([dxs, dys, whs, xhhs, xwhs], -1) + bboxes = np.concatenate([x1s, y1s, x2s, y2s], -1).astype(np.float32) + return relations, bboxes + + def pad_text_indices(self, text_inds): + """Pad text index to same length.""" + max_len = 300 + recoder_len = max([len(text_ind) for text_ind in text_inds]) + padded_text_inds = -np.ones((len(text_inds), max_len), np.int32) + for idx, text_ind in enumerate(text_inds): + padded_text_inds[idx, :len(text_ind)] = np.array(text_ind) + return padded_text_inds, recoder_len + + def list_to_numpy(self, ann_infos): + """Convert bboxes, relations, texts and labels to ndarray.""" + boxes, text_inds = ann_infos['points'], ann_infos['text_inds'] + boxes = np.array(boxes, np.int32) + relations, bboxes = self.compute_relation(boxes) + + labels = ann_infos.get('labels', None) + if labels is not None: + labels = np.array(labels, np.int32) + edges = ann_infos.get('edges', None) + if edges is not None: + labels = labels[:, None] + edges = np.array(edges) + edges = (edges[:, None] == edges[None, :]).astype(np.int32) + if self.directed: + edges = (edges & labels == 1).astype(np.int32) + np.fill_diagonal(edges, -1) + labels = np.concatenate([labels, edges], -1) + padded_text_inds, recoder_len = self.pad_text_indices(text_inds) + max_num = 300 + temp_bboxes = np.zeros([max_num, 4]) + h, _ = bboxes.shape + temp_bboxes[:h, :] = bboxes + + temp_relations = np.zeros([max_num, max_num, 5]) + temp_relations[:h, :h, :] = relations + + temp_padded_text_inds = np.zeros([max_num, max_num]) + temp_padded_text_inds[:h, :] = padded_text_inds + + temp_labels = np.zeros([max_num, max_num]) + temp_labels[:h, :h + 1] = labels + + tag = np.array([h, recoder_len]) + return dict( + image=ann_infos['image'], + points=temp_bboxes, + relations=temp_relations, + texts=temp_padded_text_inds, + labels=temp_labels, + tag=tag) + + def convert_canonical(self, points_x, points_y): + + assert len(points_x) == 4 + assert len(points_y) == 4 + + points = [Point(points_x[i], points_y[i]) for i in range(4)] + + polygon = Polygon([(p.x, p.y) for p in points]) + min_x, min_y, _, _ = polygon.bounds + points_to_lefttop = [ + LineString([points[i], Point(min_x, min_y)]) for i in range(4) + ] + distances = np.array([line.length for line in points_to_lefttop]) + sort_dist_idx = np.argsort(distances) + lefttop_idx = sort_dist_idx[0] + + if lefttop_idx == 0: + point_orders = [0, 1, 2, 3] + elif lefttop_idx == 1: + point_orders = [1, 2, 3, 0] + elif lefttop_idx == 2: + point_orders = [2, 3, 0, 1] + else: + point_orders = [3, 0, 1, 2] + + sorted_points_x = [points_x[i] for i in point_orders] + sorted_points_y = [points_y[j] for j in 
point_orders] + + return sorted_points_x, sorted_points_y + + def sort_vertex(self, points_x, points_y): + + assert len(points_x) == 4 + assert len(points_y) == 4 + + x = np.array(points_x) + y = np.array(points_y) + center_x = np.sum(x) * 0.25 + center_y = np.sum(y) * 0.25 + + x_arr = np.array(x - center_x) + y_arr = np.array(y - center_y) + + angle = np.arctan2(y_arr, x_arr) * 180.0 / np.pi + sort_idx = np.argsort(angle) + + sorted_points_x, sorted_points_y = [], [] + for i in range(4): + sorted_points_x.append(points_x[sort_idx[i]]) + sorted_points_y.append(points_y[sort_idx[i]]) + + return self.convert_canonical(sorted_points_x, sorted_points_y) + + def __call__(self, data): + import json + label = data['label'] + annotations = json.loads(label) + boxes, texts, text_inds, labels, edges = [], [], [], [], [] + for ann in annotations: + box = ann['points'] + x_list = [box[i][0] for i in range(4)] + y_list = [box[i][1] for i in range(4)] + sorted_x_list, sorted_y_list = self.sort_vertex(x_list, y_list) + sorted_box = [] + for x, y in zip(sorted_x_list, sorted_y_list): + sorted_box.append(x) + sorted_box.append(y) + boxes.append(sorted_box) + text = ann['transcription'] + texts.append(ann['transcription']) + text_ind = [self.dict[c] for c in text if c in self.dict] + text_inds.append(text_ind) + labels.append(ann['label']) + edges.append(ann.get('edge', 0)) + ann_infos = dict( + image=data['image'], + points=boxes, + texts=texts, + text_inds=text_inds, + edges=edges, + labels=labels) + + return self.list_to_numpy(ann_infos) + + +class AttnLabelEncode(BaseRecLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + **kwargs): + super(AttnLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + + def add_special_char(self, dict_character): + self.beg_str = "sos" + self.end_str = "eos" + dict_character = [self.beg_str] + dict_character + [self.end_str] + return dict_character + + def __call__(self, data): + text = data['label'] + text = self.encode(text) + if text is None: + return None + if len(text) >= self.max_text_len: + return None + data['length'] = np.array(len(text)) + text = [0] + text + [len(self.character) - 1] + [0] * (self.max_text_len + - len(text) - 2) + data['label'] = np.array(text) + return data + + def get_ignored_tokens(self): + beg_idx = self.get_beg_end_flag_idx("beg") + end_idx = self.get_beg_end_flag_idx("end") + return [beg_idx, end_idx] + + def get_beg_end_flag_idx(self, beg_or_end): + if beg_or_end == "beg": + idx = np.array(self.dict[self.beg_str]) + elif beg_or_end == "end": + idx = np.array(self.dict[self.end_str]) + else: + assert False, "Unsupport type %s in get_beg_end_flag_idx" \ + % beg_or_end + return idx + + +class SEEDLabelEncode(BaseRecLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + **kwargs): + super(SEEDLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + + def add_special_char(self, dict_character): + self.padding = "padding" + self.end_str = "eos" + self.unknown = "unknown" + dict_character = dict_character + [ + self.end_str, self.padding, self.unknown + ] + return dict_character + + def __call__(self, data): + text = data['label'] + text = self.encode(text) + if text is None: + return None + if len(text) >= self.max_text_len: + return None + data['length'] = 
np.array(len(text)) + 1 # conclude eos + text = text + [len(self.character) - 3] + [len(self.character) - 2] * ( + self.max_text_len - len(text) - 1) + data['label'] = np.array(text) + return data + + +class SRNLabelEncode(BaseRecLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length=25, + character_dict_path=None, + use_space_char=False, + **kwargs): + super(SRNLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + + def add_special_char(self, dict_character): + dict_character = dict_character + [self.beg_str, self.end_str] + return dict_character + + def __call__(self, data): + text = data['label'] + text = self.encode(text) + char_num = len(self.character) + if text is None: + return None + if len(text) > self.max_text_len: + return None + data['length'] = np.array(len(text)) + text = text + [char_num - 1] * (self.max_text_len - len(text)) + data['label'] = np.array(text) + return data + + def get_ignored_tokens(self): + beg_idx = self.get_beg_end_flag_idx("beg") + end_idx = self.get_beg_end_flag_idx("end") + return [beg_idx, end_idx] + + def get_beg_end_flag_idx(self, beg_or_end): + if beg_or_end == "beg": + idx = np.array(self.dict[self.beg_str]) + elif beg_or_end == "end": + idx = np.array(self.dict[self.end_str]) + else: + assert False, "Unsupport type %s in get_beg_end_flag_idx" \ + % beg_or_end + return idx + + +class TableLabelEncode(object): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + max_elem_length, + max_cell_num, + character_dict_path, + span_weight=1.0, + **kwargs): + self.max_text_length = max_text_length + self.max_elem_length = max_elem_length + self.max_cell_num = max_cell_num + list_character, list_elem = self.load_char_elem_dict( + character_dict_path) + list_character = self.add_special_char(list_character) + list_elem = self.add_special_char(list_elem) + self.dict_character = {} + for i, char in enumerate(list_character): + self.dict_character[char] = i + self.dict_elem = {} + for i, elem in enumerate(list_elem): + self.dict_elem[elem] = i + self.span_weight = span_weight + + def load_char_elem_dict(self, character_dict_path): + list_character = [] + list_elem = [] + with open(character_dict_path, "rb") as fin: + lines = fin.readlines() + substr = lines[0].decode('utf-8').strip("\r\n").split("\t") + character_num = int(substr[0]) + elem_num = int(substr[1]) + for cno in range(1, 1 + character_num): + character = lines[cno].decode('utf-8').strip("\r\n") + list_character.append(character) + for eno in range(1 + character_num, 1 + character_num + elem_num): + elem = lines[eno].decode('utf-8').strip("\r\n") + list_elem.append(elem) + return list_character, list_elem + + def add_special_char(self, list_character): + self.beg_str = "sos" + self.end_str = "eos" + list_character = [self.beg_str] + list_character + [self.end_str] + return list_character + + def get_span_idx_list(self): + span_idx_list = [] + for elem in self.dict_elem: + if 'span' in elem: + span_idx_list.append(self.dict_elem[elem]) + return span_idx_list + + def __call__(self, data): + cells = data['cells'] + structure = data['structure']['tokens'] + structure = self.encode(structure, 'elem') + if structure is None: + return None + elem_num = len(structure) + structure = [0] + structure + [len(self.dict_elem) - 1] + structure = structure + [0] * (self.max_elem_length + 2 - len(structure) + ) + structure = np.array(structure) + data['structure'] = structure + elem_char_idx1 
= self.dict_elem[''] + elem_char_idx2 = self.dict_elem[' 0: + span_weight = len(td_idx_list) * 1.0 / len(span_idx_list) + span_weight = min(max(span_weight, 1.0), self.span_weight) + for cno in range(len(cells)): + if 'bbox' in cells[cno]: + bbox = cells[cno]['bbox'].copy() + bbox[0] = bbox[0] * 1.0 / img_width + bbox[1] = bbox[1] * 1.0 / img_height + bbox[2] = bbox[2] * 1.0 / img_width + bbox[3] = bbox[3] * 1.0 / img_height + td_idx = td_idx_list[cno] + bbox_list[td_idx] = bbox + bbox_list_mask[td_idx] = 1.0 + cand_span_idx = td_idx + 1 + if cand_span_idx < (self.max_elem_length + 2): + if structure[cand_span_idx] in span_idx_list: + structure_mask[cand_span_idx] = span_weight + + data['bbox_list'] = bbox_list + data['bbox_list_mask'] = bbox_list_mask + data['structure_mask'] = structure_mask + char_beg_idx = self.get_beg_end_flag_idx('beg', 'char') + char_end_idx = self.get_beg_end_flag_idx('end', 'char') + elem_beg_idx = self.get_beg_end_flag_idx('beg', 'elem') + elem_end_idx = self.get_beg_end_flag_idx('end', 'elem') + data['sp_tokens'] = np.array([ + char_beg_idx, char_end_idx, elem_beg_idx, elem_end_idx, + elem_char_idx1, elem_char_idx2, self.max_text_length, + self.max_elem_length, self.max_cell_num, elem_num + ]) + return data + + def encode(self, text, char_or_elem): + """convert text-label into text-index. + """ + if char_or_elem == "char": + max_len = self.max_text_length + current_dict = self.dict_character + else: + max_len = self.max_elem_length + current_dict = self.dict_elem + if len(text) > max_len: + return None + if len(text) == 0: + if char_or_elem == "char": + return [self.dict_character['space']] + else: + return None + text_list = [] + for char in text: + if char not in current_dict: + return None + text_list.append(current_dict[char]) + if len(text_list) == 0: + if char_or_elem == "char": + return [self.dict_character['space']] + else: + return None + return text_list + + def get_ignored_tokens(self, char_or_elem): + beg_idx = self.get_beg_end_flag_idx("beg", char_or_elem) + end_idx = self.get_beg_end_flag_idx("end", char_or_elem) + return [beg_idx, end_idx] + + def get_beg_end_flag_idx(self, beg_or_end, char_or_elem): + if char_or_elem == "char": + if beg_or_end == "beg": + idx = np.array(self.dict_character[self.beg_str]) + elif beg_or_end == "end": + idx = np.array(self.dict_character[self.end_str]) + else: + assert False, "Unsupport type %s in get_beg_end_flag_idx of char" \ + % beg_or_end + elif char_or_elem == "elem": + if beg_or_end == "beg": + idx = np.array(self.dict_elem[self.beg_str]) + elif beg_or_end == "end": + idx = np.array(self.dict_elem[self.end_str]) + else: + assert False, "Unsupport type %s in get_beg_end_flag_idx of elem" \ + % beg_or_end + else: + assert False, "Unsupport type %s in char_or_elem" \ + % char_or_elem + return idx + + +class SARLabelEncode(BaseRecLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + **kwargs): + super(SARLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + + def add_special_char(self, dict_character): + beg_end_str = "" + unknown_str = "" + padding_str = "" + dict_character = dict_character + [unknown_str] + self.unknown_idx = len(dict_character) - 1 + dict_character = dict_character + [beg_end_str] + self.start_idx = len(dict_character) - 1 + self.end_idx = len(dict_character) - 1 + dict_character = dict_character + [padding_str] + self.padding_idx = len(dict_character) - 
1 + + return dict_character + + def __call__(self, data): + text = data['label'] + text = self.encode(text) + if text is None: + return None + if len(text) >= self.max_text_len - 1: + return None + data['length'] = np.array(len(text)) + target = [self.start_idx] + text + [self.end_idx] + padded_text = [self.padding_idx for _ in range(self.max_text_len)] + + padded_text[:len(target)] = target + data['label'] = np.array(padded_text) + return data + + def get_ignored_tokens(self): + return [self.padding_idx] + + +class PRENLabelEncode(BaseRecLabelEncode): + def __init__(self, + max_text_length, + character_dict_path, + use_space_char=False, + **kwargs): + super(PRENLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + + def add_special_char(self, dict_character): + padding_str = '' # 0 + end_str = '' # 1 + unknown_str = '' # 2 + + dict_character = [padding_str, end_str, unknown_str] + dict_character + self.padding_idx = 0 + self.end_idx = 1 + self.unknown_idx = 2 + + return dict_character + + def encode(self, text): + if len(text) == 0 or len(text) >= self.max_text_len: + return None + if self.lower: + text = text.lower() + text_list = [] + for char in text: + if char not in self.dict: + text_list.append(self.unknown_idx) + else: + text_list.append(self.dict[char]) + text_list.append(self.end_idx) + if len(text_list) < self.max_text_len: + text_list += [self.padding_idx] * ( + self.max_text_len - len(text_list)) + return text_list + + def __call__(self, data): + text = data['label'] + encoded_text = self.encode(text) + if encoded_text is None: + return None + data['label'] = np.array(encoded_text) + return data + + +class VQATokenLabelEncode(object): + """ + Label encode for NLP VQA methods + """ + + def __init__(self, + class_path, + contains_re=False, + add_special_ids=False, + algorithm='LayoutXLM', + infer_mode=False, + ocr_engine=None, + **kwargs): + super(VQATokenLabelEncode, self).__init__() + from paddlenlp.transformers import LayoutXLMTokenizer, LayoutLMTokenizer, LayoutLMv2Tokenizer + from ppocr.utils.utility import load_vqa_bio_label_maps + tokenizer_dict = { + 'LayoutXLM': { + 'class': LayoutXLMTokenizer, + 'pretrained_model': 'layoutxlm-base-uncased' + }, + 'LayoutLM': { + 'class': LayoutLMTokenizer, + 'pretrained_model': 'layoutlm-base-uncased' + }, + 'LayoutLMv2': { + 'class': LayoutLMv2Tokenizer, + 'pretrained_model': 'layoutlmv2-base-uncased' + } + } + self.contains_re = contains_re + tokenizer_config = tokenizer_dict[algorithm] + self.tokenizer = tokenizer_config['class'].from_pretrained( + tokenizer_config['pretrained_model']) + self.label2id_map, id2label_map = load_vqa_bio_label_maps(class_path) + self.add_special_ids = add_special_ids + self.infer_mode = infer_mode + self.ocr_engine = ocr_engine + + def __call__(self, data): + # load bbox and label info + ocr_info = self._load_ocr_info(data) + + height, width, _ = data['image'].shape + + words_list = [] + bbox_list = [] + input_ids_list = [] + token_type_ids_list = [] + segment_offset_id = [] + gt_label_list = [] + + entities = [] + + # for re + train_re = self.contains_re and not self.infer_mode + if train_re: + relations = [] + id2label = {} + entity_id_to_index_map = {} + empty_entity = set() + + data['ocr_info'] = copy.deepcopy(ocr_info) + + for info in ocr_info: + if train_re: + # for re + if len(info["text"]) == 0: + empty_entity.add(info["id"]) + continue + id2label[info["id"]] = info["label"] + relations.extend([tuple(sorted(l)) for l in info["linking"]]) + # smooth_box + bbox = 
self._smooth_box(info["bbox"], height, width) + + text = info["text"] + encode_res = self.tokenizer.encode( + text, pad_to_max_seq_len=False, return_attention_mask=True) + + if not self.add_special_ids: + # TODO: use tok.all_special_ids to remove + encode_res["input_ids"] = encode_res["input_ids"][1:-1] + encode_res["token_type_ids"] = encode_res["token_type_ids"][1: + -1] + encode_res["attention_mask"] = encode_res["attention_mask"][1: + -1] + # parse label + if not self.infer_mode: + label = info['label'] + gt_label = self._parse_label(label, encode_res) + + # construct entities for re + if train_re: + if gt_label[0] != self.label2id_map["O"]: + entity_id_to_index_map[info["id"]] = len(entities) + label = label.upper() + entities.append({ + "start": len(input_ids_list), + "end": + len(input_ids_list) + len(encode_res["input_ids"]), + "label": label.upper(), + }) + else: + entities.append({ + "start": len(input_ids_list), + "end": len(input_ids_list) + len(encode_res["input_ids"]), + "label": 'O', + }) + input_ids_list.extend(encode_res["input_ids"]) + token_type_ids_list.extend(encode_res["token_type_ids"]) + bbox_list.extend([bbox] * len(encode_res["input_ids"])) + words_list.append(text) + segment_offset_id.append(len(input_ids_list)) + if not self.infer_mode: + gt_label_list.extend(gt_label) + + data['input_ids'] = input_ids_list + data['token_type_ids'] = token_type_ids_list + data['bbox'] = bbox_list + data['attention_mask'] = [1] * len(input_ids_list) + data['labels'] = gt_label_list + data['segment_offset_id'] = segment_offset_id + data['tokenizer_params'] = dict( + padding_side=self.tokenizer.padding_side, + pad_token_type_id=self.tokenizer.pad_token_type_id, + pad_token_id=self.tokenizer.pad_token_id) + data['entities'] = entities + + if train_re: + data['relations'] = relations + data['id2label'] = id2label + data['empty_entity'] = empty_entity + data['entity_id_to_index_map'] = entity_id_to_index_map + return data + + def _load_ocr_info(self, data): + def trans_poly_to_bbox(poly): + x1 = np.min([p[0] for p in poly]) + x2 = np.max([p[0] for p in poly]) + y1 = np.min([p[1] for p in poly]) + y2 = np.max([p[1] for p in poly]) + return [x1, y1, x2, y2] + + if self.infer_mode: + ocr_result = self.ocr_engine.ocr(data['image'], cls=False) + ocr_info = [] + for res in ocr_result: + ocr_info.append({ + "text": res[1][0], + "bbox": trans_poly_to_bbox(res[0]), + "poly": res[0], + }) + return ocr_info + else: + info = data['label'] + # read text info + info_dict = json.loads(info) + return info_dict["ocr_info"] + + def _smooth_box(self, bbox, height, width): + bbox[0] = int(bbox[0] * 1000.0 / width) + bbox[2] = int(bbox[2] * 1000.0 / width) + bbox[1] = int(bbox[1] * 1000.0 / height) + bbox[3] = int(bbox[3] * 1000.0 / height) + return bbox + + def _parse_label(self, label, encode_res): + gt_label = [] + if label.lower() == "other": + gt_label.extend([0] * len(encode_res["input_ids"])) + else: + gt_label.append(self.label2id_map[("b-" + label).upper()]) + gt_label.extend([self.label2id_map[("i-" + label).upper()]] * + (len(encode_res["input_ids"]) - 1)) + return gt_label + + +class MultiLabelEncode(BaseRecLabelEncode): + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + **kwargs): + super(MultiLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + + self.ctc_encode = CTCLabelEncode(max_text_length, character_dict_path, + use_space_char, **kwargs) + self.sar_encode = SARLabelEncode(max_text_length, character_dict_path, 
+ use_space_char, **kwargs) + + def __call__(self, data): + + data_ctc = copy.deepcopy(data) + data_sar = copy.deepcopy(data) + data_out = dict() + data_out['img_path'] = data.get('img_path', None) + data_out['image'] = data['image'] + ctc = self.ctc_encode.__call__(data_ctc) + sar = self.sar_encode.__call__(data_sar) + if ctc is None or sar is None: + return None + data_out['label_ctc'] = ctc['label'] + data_out['label_sar'] = sar['label'] + data_out['length'] = ctc['length'] + return data_out diff --git a/backend/ppocr/data/imaug/make_border_map.py b/backend/ppocr/data/imaug/make_border_map.py new file mode 100644 index 0000000..abab383 --- /dev/null +++ b/backend/ppocr/data/imaug/make_border_map.py @@ -0,0 +1,173 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/WenmuZhou/DBNet.pytorch/blob/master/data_loader/modules/make_border_map.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np +import cv2 + +np.seterr(divide='ignore', invalid='ignore') +import pyclipper +from shapely.geometry import Polygon +import sys +import warnings + +warnings.simplefilter("ignore") + +__all__ = ['MakeBorderMap'] + + +class MakeBorderMap(object): + def __init__(self, + shrink_ratio=0.4, + thresh_min=0.3, + thresh_max=0.7, + **kwargs): + self.shrink_ratio = shrink_ratio + self.thresh_min = thresh_min + self.thresh_max = thresh_max + + def __call__(self, data): + + img = data['image'] + text_polys = data['polys'] + ignore_tags = data['ignore_tags'] + + canvas = np.zeros(img.shape[:2], dtype=np.float32) + mask = np.zeros(img.shape[:2], dtype=np.float32) + + for i in range(len(text_polys)): + if ignore_tags[i]: + continue + self.draw_border_map(text_polys[i], canvas, mask=mask) + canvas = canvas * (self.thresh_max - self.thresh_min) + self.thresh_min + + data['threshold_map'] = canvas + data['threshold_mask'] = mask + return data + + def draw_border_map(self, polygon, canvas, mask): + polygon = np.array(polygon) + assert polygon.ndim == 2 + assert polygon.shape[1] == 2 + + polygon_shape = Polygon(polygon) + if polygon_shape.area <= 0: + return + distance = polygon_shape.area * ( + 1 - np.power(self.shrink_ratio, 2)) / polygon_shape.length + subject = [tuple(l) for l in polygon] + padding = pyclipper.PyclipperOffset() + padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON) + + padded_polygon = np.array(padding.Execute(distance)[0]) + cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0) + + xmin = padded_polygon[:, 0].min() + xmax = padded_polygon[:, 0].max() + ymin = padded_polygon[:, 1].min() + ymax = padded_polygon[:, 1].max() + width = xmax - xmin + 1 + height = ymax - ymin + 1 + + polygon[:, 0] = polygon[:, 0] - xmin + polygon[:, 1] = polygon[:, 1] - ymin + + xs = np.broadcast_to( + np.linspace( + 0, width - 1, num=width).reshape(1, width), 
(height, width)) + ys = np.broadcast_to( + np.linspace( + 0, height - 1, num=height).reshape(height, 1), (height, width)) + + distance_map = np.zeros( + (polygon.shape[0], height, width), dtype=np.float32) + for i in range(polygon.shape[0]): + j = (i + 1) % polygon.shape[0] + absolute_distance = self._distance(xs, ys, polygon[i], polygon[j]) + distance_map[i] = np.clip(absolute_distance / distance, 0, 1) + distance_map = distance_map.min(axis=0) + + xmin_valid = min(max(0, xmin), canvas.shape[1] - 1) + xmax_valid = min(max(0, xmax), canvas.shape[1] - 1) + ymin_valid = min(max(0, ymin), canvas.shape[0] - 1) + ymax_valid = min(max(0, ymax), canvas.shape[0] - 1) + canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1] = np.fmax( + 1 - distance_map[ymin_valid - ymin:ymax_valid - ymax + height, + xmin_valid - xmin:xmax_valid - xmax + width], + canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1]) + + def _distance(self, xs, ys, point_1, point_2): + ''' + compute the distance from point to a line + ys: coordinates in the first axis + xs: coordinates in the second axis + point_1, point_2: (x, y), the end of the line + ''' + height, width = xs.shape[:2] + square_distance_1 = np.square(xs - point_1[0]) + np.square(ys - point_1[ + 1]) + square_distance_2 = np.square(xs - point_2[0]) + np.square(ys - point_2[ + 1]) + square_distance = np.square(point_1[0] - point_2[0]) + np.square( + point_1[1] - point_2[1]) + + cosin = (square_distance - square_distance_1 - square_distance_2) / ( + 2 * np.sqrt(square_distance_1 * square_distance_2)) + square_sin = 1 - np.square(cosin) + square_sin = np.nan_to_num(square_sin) + result = np.sqrt(square_distance_1 * square_distance_2 * square_sin / + square_distance) + + result[cosin < + 0] = np.sqrt(np.fmin(square_distance_1, square_distance_2))[cosin + < 0] + # self.extend_line(point_1, point_2, result) + return result + + def extend_line(self, point_1, point_2, result, shrink_ratio): + ex_point_1 = (int( + round(point_1[0] + (point_1[0] - point_2[0]) * (1 + shrink_ratio))), + int( + round(point_1[1] + (point_1[1] - point_2[1]) * ( + 1 + shrink_ratio)))) + cv2.line( + result, + tuple(ex_point_1), + tuple(point_1), + 4096.0, + 1, + lineType=cv2.LINE_AA, + shift=0) + ex_point_2 = (int( + round(point_2[0] + (point_2[0] - point_1[0]) * (1 + shrink_ratio))), + int( + round(point_2[1] + (point_2[1] - point_1[1]) * ( + 1 + shrink_ratio)))) + cv2.line( + result, + tuple(ex_point_2), + tuple(point_2), + 4096.0, + 1, + lineType=cv2.LINE_AA, + shift=0) + return ex_point_1, ex_point_2 diff --git a/backend/ppocr/data/imaug/make_pse_gt.py b/backend/ppocr/data/imaug/make_pse_gt.py new file mode 100644 index 0000000..255d076 --- /dev/null +++ b/backend/ppocr/data/imaug/make_pse_gt.py @@ -0,0 +1,106 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
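+#
+# Minimal usage sketch of the MakePseGt op defined below (the input dict is
+# assumed to already carry 'image', 'polys' and 'ignore_tags', as produced by
+# the detection label decoders in this package):
+#
+#     make_gt = MakePseGt(kernel_num=7, size=640, min_shrink_ratio=0.4)
+#     data = make_gt({'image': img, 'polys': polys, 'ignore_tags': tags})
+#     # data now additionally holds 'gt_kernels', 'gt_text' and 'mask'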
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import cv2 +import numpy as np +import pyclipper +from shapely.geometry import Polygon + +__all__ = ['MakePseGt'] + + +class MakePseGt(object): + def __init__(self, kernel_num=7, size=640, min_shrink_ratio=0.4, **kwargs): + self.kernel_num = kernel_num + self.min_shrink_ratio = min_shrink_ratio + self.size = size + + def __call__(self, data): + + image = data['image'] + text_polys = data['polys'] + ignore_tags = data['ignore_tags'] + + h, w, _ = image.shape + short_edge = min(h, w) + if short_edge < self.size: + # keep short_size >= self.size + scale = self.size / short_edge + image = cv2.resize(image, dsize=None, fx=scale, fy=scale) + text_polys *= scale + + gt_kernels = [] + for i in range(1, self.kernel_num + 1): + # s1->sn, from big to small + rate = 1.0 - (1.0 - self.min_shrink_ratio) / (self.kernel_num - 1 + ) * i + text_kernel, ignore_tags = self.generate_kernel( + image.shape[0:2], rate, text_polys, ignore_tags) + gt_kernels.append(text_kernel) + + training_mask = np.ones(image.shape[0:2], dtype='uint8') + for i in range(text_polys.shape[0]): + if ignore_tags[i]: + cv2.fillPoly(training_mask, + text_polys[i].astype(np.int32)[np.newaxis, :, :], + 0) + + gt_kernels = np.array(gt_kernels) + gt_kernels[gt_kernels > 0] = 1 + + data['image'] = image + data['polys'] = text_polys + data['gt_kernels'] = gt_kernels[0:] + data['gt_text'] = gt_kernels[0] + data['mask'] = training_mask.astype('float32') + return data + + def generate_kernel(self, + img_size, + shrink_ratio, + text_polys, + ignore_tags=None): + """ + Refer to part of the code: + https://github.com/open-mmlab/mmocr/blob/main/mmocr/datasets/pipelines/textdet_targets/base_textdet_targets.py + """ + + h, w = img_size + text_kernel = np.zeros((h, w), dtype=np.float32) + for i, poly in enumerate(text_polys): + polygon = Polygon(poly) + distance = polygon.area * (1 - shrink_ratio * shrink_ratio) / ( + polygon.length + 1e-6) + subject = [tuple(l) for l in poly] + pco = pyclipper.PyclipperOffset() + pco.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON) + shrinked = np.array(pco.Execute(-distance)) + + if len(shrinked) == 0 or shrinked.size == 0: + if ignore_tags is not None: + ignore_tags[i] = True + continue + try: + shrinked = np.array(shrinked[0]).reshape(-1, 2) + except: + if ignore_tags is not None: + ignore_tags[i] = True + continue + cv2.fillPoly(text_kernel, [shrinked.astype(np.int32)], i + 1) + return text_kernel, ignore_tags diff --git a/backend/ppocr/data/imaug/make_shrink_map.py b/backend/ppocr/data/imaug/make_shrink_map.py new file mode 100644 index 0000000..6c65c20 --- /dev/null +++ b/backend/ppocr/data/imaug/make_shrink_map.py @@ -0,0 +1,123 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
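MakePseGt above builds kernel_num progressively shrunken text kernels: the i-th kernel uses rate = 1 - (1 - min_shrink_ratio) / (kernel_num - 1) * i, and each polygon is then offset inward by area * (1 - rate^2) / perimeter with pyclipper. The snippet below only prints the rate schedule so the big-to-small ordering is visible; the parameters mirror the defaults of the class and the output is listed approximately because of floating point.

def pse_shrink_rates(kernel_num=7, min_shrink_ratio=0.4):
    """Shrink rate used for kernel i (i = 1 .. kernel_num), largest kernel first."""
    return [1.0 - (1.0 - min_shrink_ratio) / (kernel_num - 1) * i
            for i in range(1, kernel_num + 1)]

print(pse_shrink_rates())
# roughly [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3] with the defaults above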
+""" +This code is refer from: +https://github.com/WenmuZhou/DBNet.pytorch/blob/master/data_loader/modules/make_shrink_map.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np +import cv2 +from shapely.geometry import Polygon +import pyclipper + +__all__ = ['MakeShrinkMap'] + + +class MakeShrinkMap(object): + r''' + Making binary mask from detection data with ICDAR format. + Typically following the process of class `MakeICDARData`. + ''' + + def __init__(self, min_text_size=8, shrink_ratio=0.4, **kwargs): + self.min_text_size = min_text_size + self.shrink_ratio = shrink_ratio + + def __call__(self, data): + image = data['image'] + text_polys = data['polys'] + ignore_tags = data['ignore_tags'] + + h, w = image.shape[:2] + text_polys, ignore_tags = self.validate_polygons(text_polys, + ignore_tags, h, w) + gt = np.zeros((h, w), dtype=np.float32) + mask = np.ones((h, w), dtype=np.float32) + for i in range(len(text_polys)): + polygon = text_polys[i] + height = max(polygon[:, 1]) - min(polygon[:, 1]) + width = max(polygon[:, 0]) - min(polygon[:, 0]) + if ignore_tags[i] or min(height, width) < self.min_text_size: + cv2.fillPoly(mask, + polygon.astype(np.int32)[np.newaxis, :, :], 0) + ignore_tags[i] = True + else: + polygon_shape = Polygon(polygon) + subject = [tuple(l) for l in polygon] + padding = pyclipper.PyclipperOffset() + padding.AddPath(subject, pyclipper.JT_ROUND, + pyclipper.ET_CLOSEDPOLYGON) + shrinked = [] + + # Increase the shrink ratio every time we get multiple polygon returned back + possible_ratios = np.arange(self.shrink_ratio, 1, + self.shrink_ratio) + np.append(possible_ratios, 1) + # print(possible_ratios) + for ratio in possible_ratios: + # print(f"Change shrink ratio to {ratio}") + distance = polygon_shape.area * ( + 1 - np.power(ratio, 2)) / polygon_shape.length + shrinked = padding.Execute(-distance) + if len(shrinked) == 1: + break + + if shrinked == []: + cv2.fillPoly(mask, + polygon.astype(np.int32)[np.newaxis, :, :], 0) + ignore_tags[i] = True + continue + + for each_shirnk in shrinked: + shirnk = np.array(each_shirnk).reshape(-1, 2) + cv2.fillPoly(gt, [shirnk.astype(np.int32)], 1) + + data['shrink_map'] = gt + data['shrink_mask'] = mask + return data + + def validate_polygons(self, polygons, ignore_tags, h, w): + ''' + polygons (numpy.array, required): of shape (num_instances, num_points, 2) + ''' + if len(polygons) == 0: + return polygons, ignore_tags + assert len(polygons) == len(ignore_tags) + for polygon in polygons: + polygon[:, 0] = np.clip(polygon[:, 0], 0, w - 1) + polygon[:, 1] = np.clip(polygon[:, 1], 0, h - 1) + + for i in range(len(polygons)): + area = self.polygon_area(polygons[i]) + if abs(area) < 1: + ignore_tags[i] = True + if area > 0: + polygons[i] = polygons[i][::-1, :] + return polygons, ignore_tags + + def polygon_area(self, polygon): + """ + compute polygon area + """ + area = 0 + q = polygon[-1] + for p in polygon: + area += p[0] * q[1] - p[1] * q[0] + q = p + return area / 2.0 diff --git a/backend/ppocr/data/imaug/operators.py b/backend/ppocr/data/imaug/operators.py new file mode 100644 index 0000000..0973651 --- /dev/null +++ b/backend/ppocr/data/imaug/operators.py @@ -0,0 +1,468 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import sys +import six +import cv2 +import numpy as np +import math + + +class DecodeImage(object): + """ decode image """ + + def __init__(self, + img_mode='RGB', + channel_first=False, + ignore_orientation=False, + **kwargs): + self.img_mode = img_mode + self.channel_first = channel_first + self.ignore_orientation = ignore_orientation + + def __call__(self, data): + img = data['image'] + if six.PY2: + assert type(img) is str and len( + img) > 0, "invalid input 'img' in DecodeImage" + else: + assert type(img) is bytes and len( + img) > 0, "invalid input 'img' in DecodeImage" + img = np.frombuffer(img, dtype='uint8') + if self.ignore_orientation: + img = cv2.imdecode(img, cv2.IMREAD_IGNORE_ORIENTATION | + cv2.IMREAD_COLOR) + else: + img = cv2.imdecode(img, 1) + if img is None: + return None + if self.img_mode == 'GRAY': + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + elif self.img_mode == 'RGB': + assert img.shape[2] == 3, 'invalid shape of image[%s]' % (img.shape) + img = img[:, :, ::-1] + + if self.channel_first: + img = img.transpose((2, 0, 1)) + + data['image'] = img + return data + + +class NRTRDecodeImage(object): + """ decode image """ + + def __init__(self, img_mode='RGB', channel_first=False, **kwargs): + self.img_mode = img_mode + self.channel_first = channel_first + + def __call__(self, data): + img = data['image'] + if six.PY2: + assert type(img) is str and len( + img) > 0, "invalid input 'img' in DecodeImage" + else: + assert type(img) is bytes and len( + img) > 0, "invalid input 'img' in DecodeImage" + img = np.frombuffer(img, dtype='uint8') + + img = cv2.imdecode(img, 1) + + if img is None: + return None + if self.img_mode == 'GRAY': + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + elif self.img_mode == 'RGB': + assert img.shape[2] == 3, 'invalid shape of image[%s]' % (img.shape) + img = img[:, :, ::-1] + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + if self.channel_first: + img = img.transpose((2, 0, 1)) + data['image'] = img + return data + + +class NormalizeImage(object): + """ normalize image such as substract mean, divide std + """ + + def __init__(self, scale=None, mean=None, std=None, order='chw', **kwargs): + if isinstance(scale, str): + scale = eval(scale) + self.scale = np.float32(scale if scale is not None else 1.0 / 255.0) + mean = mean if mean is not None else [0.485, 0.456, 0.406] + std = std if std is not None else [0.229, 0.224, 0.225] + + shape = (3, 1, 1) if order == 'chw' else (1, 1, 3) + self.mean = np.array(mean).reshape(shape).astype('float32') + self.std = np.array(std).reshape(shape).astype('float32') + + def __call__(self, data): + img = data['image'] + from PIL import Image + if isinstance(img, Image.Image): + img = np.array(img) + assert isinstance(img, + np.ndarray), "invalid input 'img' in NormalizeImage" + data['image'] = ( + img.astype('float32') * self.scale - self.mean) / self.std + return data + + +class ToCHWImage(object): + """ convert hwc image to chw image + """ + + def 
__init__(self, **kwargs): + pass + + def __call__(self, data): + img = data['image'] + from PIL import Image + if isinstance(img, Image.Image): + img = np.array(img) + data['image'] = img.transpose((2, 0, 1)) + return data + + +class Fasttext(object): + def __init__(self, path="None", **kwargs): + import fasttext + self.fast_model = fasttext.load_model(path) + + def __call__(self, data): + label = data['label'] + fast_label = self.fast_model[label] + data['fast_label'] = fast_label + return data + + +class KeepKeys(object): + def __init__(self, keep_keys, **kwargs): + self.keep_keys = keep_keys + + def __call__(self, data): + data_list = [] + for key in self.keep_keys: + data_list.append(data[key]) + return data_list + + +class Pad(object): + def __init__(self, size=None, size_div=32, **kwargs): + if size is not None and not isinstance(size, (int, list, tuple)): + raise TypeError("Type of target_size is invalid. Now is {}".format( + type(size))) + if isinstance(size, int): + size = [size, size] + self.size = size + self.size_div = size_div + + def __call__(self, data): + + img = data['image'] + img_h, img_w = img.shape[0], img.shape[1] + if self.size: + resize_h2, resize_w2 = self.size + assert ( + img_h < resize_h2 and img_w < resize_w2 + ), '(h, w) of target size should be greater than (img_h, img_w)' + else: + resize_h2 = max( + int(math.ceil(img.shape[0] / self.size_div) * self.size_div), + self.size_div) + resize_w2 = max( + int(math.ceil(img.shape[1] / self.size_div) * self.size_div), + self.size_div) + img = cv2.copyMakeBorder( + img, + 0, + resize_h2 - img_h, + 0, + resize_w2 - img_w, + cv2.BORDER_CONSTANT, + value=0) + data['image'] = img + return data + + +class Resize(object): + def __init__(self, size=(640, 640), **kwargs): + self.size = size + + def resize_image(self, img): + resize_h, resize_w = self.size + ori_h, ori_w = img.shape[:2] # (h, w, c) + ratio_h = float(resize_h) / ori_h + ratio_w = float(resize_w) / ori_w + img = cv2.resize(img, (int(resize_w), int(resize_h))) + return img, [ratio_h, ratio_w] + + def __call__(self, data): + img = data['image'] + if 'polys' in data: + text_polys = data['polys'] + + img_resize, [ratio_h, ratio_w] = self.resize_image(img) + if 'polys' in data: + new_boxes = [] + for box in text_polys: + new_box = [] + for cord in box: + new_box.append([cord[0] * ratio_w, cord[1] * ratio_h]) + new_boxes.append(new_box) + data['polys'] = np.array(new_boxes, dtype=np.float32) + data['image'] = img_resize + return data + + +class DetResizeForTest(object): + def __init__(self, **kwargs): + super(DetResizeForTest, self).__init__() + self.resize_type = 0 + if 'image_shape' in kwargs: + self.image_shape = kwargs['image_shape'] + self.resize_type = 1 + elif 'limit_side_len' in kwargs: + self.limit_side_len = kwargs['limit_side_len'] + self.limit_type = kwargs.get('limit_type', 'min') + elif 'resize_long' in kwargs: + self.resize_type = 2 + self.resize_long = kwargs.get('resize_long', 960) + else: + self.limit_side_len = 736 + self.limit_type = 'min' + + def __call__(self, data): + img = data['image'] + src_h, src_w, _ = img.shape + + if self.resize_type == 0: + # img, shape = self.resize_image_type0(img) + img, [ratio_h, ratio_w] = self.resize_image_type0(img) + elif self.resize_type == 2: + img, [ratio_h, ratio_w] = self.resize_image_type2(img) + else: + # img, shape = self.resize_image_type1(img) + img, [ratio_h, ratio_w] = self.resize_image_type1(img) + data['image'] = img + data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w]) + return data + + def 
resize_image_type1(self, img): + resize_h, resize_w = self.image_shape + ori_h, ori_w = img.shape[:2] # (h, w, c) + ratio_h = float(resize_h) / ori_h + ratio_w = float(resize_w) / ori_w + img = cv2.resize(img, (int(resize_w), int(resize_h))) + # return img, np.array([ori_h, ori_w]) + return img, [ratio_h, ratio_w] + + def resize_image_type0(self, img): + """ + resize image to a size multiple of 32 which is required by the network + args: + img(array): array with shape [h, w, c] + return(tuple): + img, (ratio_h, ratio_w) + """ + limit_side_len = self.limit_side_len + h, w, c = img.shape + + # limit the max side + if self.limit_type == 'max': + if max(h, w) > limit_side_len: + if h > w: + ratio = float(limit_side_len) / h + else: + ratio = float(limit_side_len) / w + else: + ratio = 1. + elif self.limit_type == 'min': + if min(h, w) < limit_side_len: + if h < w: + ratio = float(limit_side_len) / h + else: + ratio = float(limit_side_len) / w + else: + ratio = 1. + elif self.limit_type == 'resize_long': + ratio = float(limit_side_len) / max(h, w) + else: + raise Exception('not support limit type, image ') + resize_h = int(h * ratio) + resize_w = int(w * ratio) + + resize_h = max(int(round(resize_h / 32) * 32), 32) + resize_w = max(int(round(resize_w / 32) * 32), 32) + + try: + if int(resize_w) <= 0 or int(resize_h) <= 0: + return None, (None, None) + img = cv2.resize(img, (int(resize_w), int(resize_h))) + except: + print(img.shape, resize_w, resize_h) + sys.exit(0) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + return img, [ratio_h, ratio_w] + + def resize_image_type2(self, img): + h, w, _ = img.shape + + resize_w = w + resize_h = h + + if resize_h > resize_w: + ratio = float(self.resize_long) / resize_h + else: + ratio = float(self.resize_long) / resize_w + + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + img = cv2.resize(img, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + + return img, [ratio_h, ratio_w] + + +class E2EResizeForTest(object): + def __init__(self, **kwargs): + super(E2EResizeForTest, self).__init__() + self.max_side_len = kwargs['max_side_len'] + self.valid_set = kwargs['valid_set'] + + def __call__(self, data): + img = data['image'] + src_h, src_w, _ = img.shape + if self.valid_set == 'totaltext': + im_resized, [ratio_h, ratio_w] = self.resize_image_for_totaltext( + img, max_side_len=self.max_side_len) + else: + im_resized, (ratio_h, ratio_w) = self.resize_image( + img, max_side_len=self.max_side_len) + data['image'] = im_resized + data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w]) + return data + + def resize_image_for_totaltext(self, im, max_side_len=512): + + h, w, _ = im.shape + resize_w = w + resize_h = h + ratio = 1.25 + if h * ratio > max_side_len: + ratio = float(max_side_len) / resize_h + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(im, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + return im, (ratio_h, ratio_w) + + def resize_image(self, im, max_side_len=512): + """ + resize image to a size multiple of max_stride which is required by the network + :param im: the resized 
image + :param max_side_len: limit of max image size to avoid out of memory in gpu + :return: the resized image and the resize ratio + """ + h, w, _ = im.shape + + resize_w = w + resize_h = h + + # Fix the longer side + if resize_h > resize_w: + ratio = float(max_side_len) / resize_h + else: + ratio = float(max_side_len) / resize_w + + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(im, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + + return im, (ratio_h, ratio_w) + + +class KieResize(object): + def __init__(self, **kwargs): + super(KieResize, self).__init__() + self.max_side, self.min_side = kwargs['img_scale'][0], kwargs[ + 'img_scale'][1] + + def __call__(self, data): + img = data['image'] + points = data['points'] + src_h, src_w, _ = img.shape + im_resized, scale_factor, [ratio_h, ratio_w + ], [new_h, new_w] = self.resize_image(img) + resize_points = self.resize_boxes(img, points, scale_factor) + data['ori_image'] = img + data['ori_boxes'] = points + data['points'] = resize_points + data['image'] = im_resized + data['shape'] = np.array([new_h, new_w]) + return data + + def resize_image(self, img): + norm_img = np.zeros([1024, 1024, 3], dtype='float32') + scale = [512, 1024] + h, w = img.shape[:2] + max_long_edge = max(scale) + max_short_edge = min(scale) + scale_factor = min(max_long_edge / max(h, w), + max_short_edge / min(h, w)) + resize_w, resize_h = int(w * float(scale_factor) + 0.5), int(h * float( + scale_factor) + 0.5) + max_stride = 32 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(img, (resize_w, resize_h)) + new_h, new_w = im.shape[:2] + w_scale = new_w / w + h_scale = new_h / h + scale_factor = np.array( + [w_scale, h_scale, w_scale, h_scale], dtype=np.float32) + norm_img[:new_h, :new_w, :] = im + return norm_img, scale_factor, [h_scale, w_scale], [new_h, new_w] + + def resize_boxes(self, im, points, scale_factor): + points = points * scale_factor + img_shape = im.shape[:2] + points[:, 0::2] = np.clip(points[:, 0::2], 0, img_shape[1]) + points[:, 1::2] = np.clip(points[:, 1::2], 0, img_shape[0]) + return points diff --git a/backend/ppocr/data/imaug/pg_process.py b/backend/ppocr/data/imaug/pg_process.py new file mode 100644 index 0000000..5303106 --- /dev/null +++ b/backend/ppocr/data/imaug/pg_process.py @@ -0,0 +1,906 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
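DetResizeForTest's default (type-0) path, shown above, scales the image so the limiting side respects limit_side_len and then snaps both sides to multiples of 32 as the detection backbone expects, returning the ratios needed to map predicted boxes back to the source image. The helper below reproduces only that arithmetic, without cv2; the function name and the sample shape are illustrative assumptions.

def det_resize_shape(h, w, limit_side_len=736, limit_type='min'):
    """Target (resize_h, resize_w, ratio_h, ratio_w) following the type-0 branch above."""
    if limit_type == 'max':
        ratio = float(limit_side_len) / max(h, w) if max(h, w) > limit_side_len else 1.0
    else:  # 'min': upscale so the short side reaches limit_side_len
        ratio = float(limit_side_len) / min(h, w) if min(h, w) < limit_side_len else 1.0
    resize_h, resize_w = int(h * ratio), int(w * ratio)
    resize_h = max(int(round(resize_h / 32) * 32), 32)   # snap to the 32-pixel grid
    resize_w = max(int(round(resize_w / 32) * 32), 32)
    return resize_h, resize_w, resize_h / float(h), resize_w / float(w)

print(det_resize_shape(540, 960))   # (736, 1312, ...): both sides are multiples of 32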
+ +import math +import cv2 +import numpy as np + +__all__ = ['PGProcessTrain'] + + +class PGProcessTrain(object): + def __init__(self, + character_dict_path, + max_text_length, + max_text_nums, + tcl_len, + batch_size=14, + min_crop_size=24, + min_text_size=4, + max_text_size=512, + **kwargs): + self.tcl_len = tcl_len + self.max_text_length = max_text_length + self.max_text_nums = max_text_nums + self.batch_size = batch_size + self.min_crop_size = min_crop_size + self.min_text_size = min_text_size + self.max_text_size = max_text_size + self.Lexicon_Table = self.get_dict(character_dict_path) + self.pad_num = len(self.Lexicon_Table) + self.img_id = 0 + + def get_dict(self, character_dict_path): + character_str = "" + with open(character_dict_path, "rb") as fin: + lines = fin.readlines() + for line in lines: + line = line.decode('utf-8').strip("\n").strip("\r\n") + character_str += line + dict_character = list(character_str) + return dict_character + + def quad_area(self, poly): + """ + compute area of a polygon + :param poly: + :return: + """ + edge = [(poly[1][0] - poly[0][0]) * (poly[1][1] + poly[0][1]), + (poly[2][0] - poly[1][0]) * (poly[2][1] + poly[1][1]), + (poly[3][0] - poly[2][0]) * (poly[3][1] + poly[2][1]), + (poly[0][0] - poly[3][0]) * (poly[0][1] + poly[3][1])] + return np.sum(edge) / 2. + + def gen_quad_from_poly(self, poly): + """ + Generate min area quad from poly. + """ + point_num = poly.shape[0] + min_area_quad = np.zeros((4, 2), dtype=np.float32) + rect = cv2.minAreaRect(poly.astype( + np.int32)) # (center (x,y), (width, height), angle of rotation) + box = np.array(cv2.boxPoints(rect)) + + first_point_idx = 0 + min_dist = 1e4 + for i in range(4): + dist = np.linalg.norm(box[(i + 0) % 4] - poly[0]) + \ + np.linalg.norm(box[(i + 1) % 4] - poly[point_num // 2 - 1]) + \ + np.linalg.norm(box[(i + 2) % 4] - poly[point_num // 2]) + \ + np.linalg.norm(box[(i + 3) % 4] - poly[-1]) + if dist < min_dist: + min_dist = dist + first_point_idx = i + for i in range(4): + min_area_quad[i] = box[(first_point_idx + i) % 4] + + return min_area_quad + + def check_and_validate_polys(self, polys, tags, im_size): + """ + check so that the text poly is in the same direction, + and also filter some invalid polygons + :param polys: + :param tags: + :return: + """ + (h, w) = im_size + if polys.shape[0] == 0: + return polys, np.array([]), np.array([]) + polys[:, :, 0] = np.clip(polys[:, :, 0], 0, w - 1) + polys[:, :, 1] = np.clip(polys[:, :, 1], 0, h - 1) + + validated_polys = [] + validated_tags = [] + hv_tags = [] + for poly, tag in zip(polys, tags): + quad = self.gen_quad_from_poly(poly) + p_area = self.quad_area(quad) + if abs(p_area) < 1: + print('invalid poly') + continue + if p_area > 0: + if tag == False: + print('poly in wrong direction') + tag = True # reversed cases should be ignore + poly = poly[(0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, + 1), :] + quad = quad[(0, 3, 2, 1), :] + + len_w = np.linalg.norm(quad[0] - quad[1]) + np.linalg.norm(quad[3] - + quad[2]) + len_h = np.linalg.norm(quad[0] - quad[3]) + np.linalg.norm(quad[1] - + quad[2]) + hv_tag = 1 + + if len_w * 2.0 < len_h: + hv_tag = 0 + + validated_polys.append(poly) + validated_tags.append(tag) + hv_tags.append(hv_tag) + return np.array(validated_polys), np.array(validated_tags), np.array( + hv_tags) + + def crop_area(self, + im, + polys, + tags, + hv_tags, + txts, + crop_background=False, + max_tries=25): + """ + make random crop from the input image + :param im: + :param polys: [b,4,2] + :param tags: + :param 
crop_background: + :param max_tries: 50 -> 25 + :return: + """ + h, w, _ = im.shape + pad_h = h // 10 + pad_w = w // 10 + h_array = np.zeros((h + pad_h * 2), dtype=np.int32) + w_array = np.zeros((w + pad_w * 2), dtype=np.int32) + for poly in polys: + poly = np.round(poly, decimals=0).astype(np.int32) + minx = np.min(poly[:, 0]) + maxx = np.max(poly[:, 0]) + w_array[minx + pad_w:maxx + pad_w] = 1 + miny = np.min(poly[:, 1]) + maxy = np.max(poly[:, 1]) + h_array[miny + pad_h:maxy + pad_h] = 1 + # ensure the cropped area not across a text + h_axis = np.where(h_array == 0)[0] + w_axis = np.where(w_array == 0)[0] + if len(h_axis) == 0 or len(w_axis) == 0: + return im, polys, tags, hv_tags, txts + for i in range(max_tries): + xx = np.random.choice(w_axis, size=2) + xmin = np.min(xx) - pad_w + xmax = np.max(xx) - pad_w + xmin = np.clip(xmin, 0, w - 1) + xmax = np.clip(xmax, 0, w - 1) + yy = np.random.choice(h_axis, size=2) + ymin = np.min(yy) - pad_h + ymax = np.max(yy) - pad_h + ymin = np.clip(ymin, 0, h - 1) + ymax = np.clip(ymax, 0, h - 1) + if xmax - xmin < self.min_crop_size or \ + ymax - ymin < self.min_crop_size: + continue + if polys.shape[0] != 0: + poly_axis_in_area = (polys[:, :, 0] >= xmin) & (polys[:, :, 0] <= xmax) \ + & (polys[:, :, 1] >= ymin) & (polys[:, :, 1] <= ymax) + selected_polys = np.where( + np.sum(poly_axis_in_area, axis=1) == 4)[0] + else: + selected_polys = [] + if len(selected_polys) == 0: + # no text in this area + if crop_background: + txts_tmp = [] + for selected_poly in selected_polys: + txts_tmp.append(txts[selected_poly]) + txts = txts_tmp + return im[ymin: ymax + 1, xmin: xmax + 1, :], \ + polys[selected_polys], tags[selected_polys], hv_tags[selected_polys], txts + else: + continue + im = im[ymin:ymax + 1, xmin:xmax + 1, :] + polys = polys[selected_polys] + tags = tags[selected_polys] + hv_tags = hv_tags[selected_polys] + txts_tmp = [] + for selected_poly in selected_polys: + txts_tmp.append(txts[selected_poly]) + txts = txts_tmp + polys[:, :, 0] -= xmin + polys[:, :, 1] -= ymin + return im, polys, tags, hv_tags, txts + + return im, polys, tags, hv_tags, txts + + def fit_and_gather_tcl_points_v2(self, + min_area_quad, + poly, + max_h, + max_w, + fixed_point_num=64, + img_id=0, + reference_height=3): + """ + Find the center point of poly as key_points, then fit and gather. 
+ """ + key_point_xys = [] + point_num = poly.shape[0] + for idx in range(point_num // 2): + center_point = (poly[idx] + poly[point_num - 1 - idx]) / 2.0 + key_point_xys.append(center_point) + + tmp_image = np.zeros( + shape=( + max_h, + max_w, ), dtype='float32') + cv2.polylines(tmp_image, [np.array(key_point_xys).astype('int32')], + False, 1.0) + ys, xs = np.where(tmp_image > 0) + xy_text = np.array(list(zip(xs, ys)), dtype='float32') + + left_center_pt = ( + (min_area_quad[0] - min_area_quad[1]) / 2.0).reshape(1, 2) + right_center_pt = ( + (min_area_quad[1] - min_area_quad[2]) / 2.0).reshape(1, 2) + proj_unit_vec = (right_center_pt - left_center_pt) / ( + np.linalg.norm(right_center_pt - left_center_pt) + 1e-6) + proj_unit_vec_tile = np.tile(proj_unit_vec, + (xy_text.shape[0], 1)) # (n, 2) + left_center_pt_tile = np.tile(left_center_pt, + (xy_text.shape[0], 1)) # (n, 2) + xy_text_to_left_center = xy_text - left_center_pt_tile + proj_value = np.sum(xy_text_to_left_center * proj_unit_vec_tile, axis=1) + xy_text = xy_text[np.argsort(proj_value)] + + # convert to np and keep the num of point not greater then fixed_point_num + pos_info = np.array(xy_text).reshape(-1, 2)[:, ::-1] # xy-> yx + point_num = len(pos_info) + if point_num > fixed_point_num: + keep_ids = [ + int((point_num * 1.0 / fixed_point_num) * x) + for x in range(fixed_point_num) + ] + pos_info = pos_info[keep_ids, :] + + keep = int(min(len(pos_info), fixed_point_num)) + if np.random.rand() < 0.2 and reference_height >= 3: + dl = (np.random.rand(keep) - 0.5) * reference_height * 0.3 + random_float = np.array([1, 0]).reshape([1, 2]) * dl.reshape( + [keep, 1]) + pos_info += random_float + pos_info[:, 0] = np.clip(pos_info[:, 0], 0, max_h - 1) + pos_info[:, 1] = np.clip(pos_info[:, 1], 0, max_w - 1) + + # padding to fixed length + pos_l = np.zeros((self.tcl_len, 3), dtype=np.int32) + pos_l[:, 0] = np.ones((self.tcl_len, )) * img_id + pos_m = np.zeros((self.tcl_len, 1), dtype=np.float32) + pos_l[:keep, 1:] = np.round(pos_info).astype(np.int32) + pos_m[:keep] = 1.0 + return pos_l, pos_m + + def generate_direction_map(self, poly_quads, n_char, direction_map): + """ + """ + width_list = [] + height_list = [] + for quad in poly_quads: + quad_w = (np.linalg.norm(quad[0] - quad[1]) + + np.linalg.norm(quad[2] - quad[3])) / 2.0 + quad_h = (np.linalg.norm(quad[0] - quad[3]) + + np.linalg.norm(quad[2] - quad[1])) / 2.0 + width_list.append(quad_w) + height_list.append(quad_h) + norm_width = max(sum(width_list) / n_char, 1.0) + average_height = max(sum(height_list) / len(height_list), 1.0) + k = 1 + for quad in poly_quads: + direct_vector_full = ( + (quad[1] + quad[2]) - (quad[0] + quad[3])) / 2.0 + direct_vector = direct_vector_full / ( + np.linalg.norm(direct_vector_full) + 1e-6) * norm_width + direction_label = tuple( + map(float, + [direct_vector[0], direct_vector[1], 1.0 / average_height])) + cv2.fillPoly(direction_map, + quad.round().astype(np.int32)[np.newaxis, :, :], + direction_label) + k += 1 + return direction_map + + def calculate_average_height(self, poly_quads): + """ + """ + height_list = [] + for quad in poly_quads: + quad_h = (np.linalg.norm(quad[0] - quad[3]) + + np.linalg.norm(quad[2] - quad[1])) / 2.0 + height_list.append(quad_h) + average_height = max(sum(height_list) / len(height_list), 1.0) + return average_height + + def generate_tcl_ctc_label(self, + h, + w, + polys, + tags, + text_strs, + ds_ratio, + tcl_ratio=0.3, + shrink_ratio_of_width=0.15): + """ + Generate polygon. 
+ """ + score_map_big = np.zeros( + ( + h, + w, ), dtype=np.float32) + h, w = int(h * ds_ratio), int(w * ds_ratio) + polys = polys * ds_ratio + + score_map = np.zeros( + ( + h, + w, ), dtype=np.float32) + score_label_map = np.zeros( + ( + h, + w, ), dtype=np.float32) + tbo_map = np.zeros((h, w, 5), dtype=np.float32) + training_mask = np.ones( + ( + h, + w, ), dtype=np.float32) + direction_map = np.ones((h, w, 3)) * np.array([0, 0, 1]).reshape( + [1, 1, 3]).astype(np.float32) + + label_idx = 0 + score_label_map_text_label_list = [] + pos_list, pos_mask, label_list = [], [], [] + for poly_idx, poly_tag in enumerate(zip(polys, tags)): + poly = poly_tag[0] + tag = poly_tag[1] + + # generate min_area_quad + min_area_quad, center_point = self.gen_min_area_quad_from_poly(poly) + min_area_quad_h = 0.5 * ( + np.linalg.norm(min_area_quad[0] - min_area_quad[3]) + + np.linalg.norm(min_area_quad[1] - min_area_quad[2])) + min_area_quad_w = 0.5 * ( + np.linalg.norm(min_area_quad[0] - min_area_quad[1]) + + np.linalg.norm(min_area_quad[2] - min_area_quad[3])) + + if min(min_area_quad_h, min_area_quad_w) < self.min_text_size * ds_ratio \ + or min(min_area_quad_h, min_area_quad_w) > self.max_text_size * ds_ratio: + continue + + if tag: + cv2.fillPoly(training_mask, + poly.astype(np.int32)[np.newaxis, :, :], 0.15) + else: + text_label = text_strs[poly_idx] + text_label = self.prepare_text_label(text_label, + self.Lexicon_Table) + + text_label_index_list = [[self.Lexicon_Table.index(c_)] + for c_ in text_label + if c_ in self.Lexicon_Table] + if len(text_label_index_list) < 1: + continue + + tcl_poly = self.poly2tcl(poly, tcl_ratio) + tcl_quads = self.poly2quads(tcl_poly) + poly_quads = self.poly2quads(poly) + + stcl_quads, quad_index = self.shrink_poly_along_width( + tcl_quads, + shrink_ratio_of_width=shrink_ratio_of_width, + expand_height_ratio=1.0 / tcl_ratio) + + cv2.fillPoly(score_map, + np.round(stcl_quads).astype(np.int32), 1.0) + cv2.fillPoly(score_map_big, + np.round(stcl_quads / ds_ratio).astype(np.int32), + 1.0) + + for idx, quad in enumerate(stcl_quads): + quad_mask = np.zeros((h, w), dtype=np.float32) + quad_mask = cv2.fillPoly( + quad_mask, + np.round(quad[np.newaxis, :, :]).astype(np.int32), 1.0) + tbo_map = self.gen_quad_tbo(poly_quads[quad_index[idx]], + quad_mask, tbo_map) + + # score label map and score_label_map_text_label_list for refine + if label_idx == 0: + text_pos_list_ = [[len(self.Lexicon_Table)], ] + score_label_map_text_label_list.append(text_pos_list_) + + label_idx += 1 + cv2.fillPoly(score_label_map, + np.round(poly_quads).astype(np.int32), label_idx) + score_label_map_text_label_list.append(text_label_index_list) + + # direction info, fix-me + n_char = len(text_label_index_list) + direction_map = self.generate_direction_map(poly_quads, n_char, + direction_map) + + # pos info + average_shrink_height = self.calculate_average_height( + stcl_quads) + pos_l, pos_m = self.fit_and_gather_tcl_points_v2( + min_area_quad, + poly, + max_h=h, + max_w=w, + fixed_point_num=64, + img_id=self.img_id, + reference_height=average_shrink_height) + + label_l = text_label_index_list + if len(text_label_index_list) < 2: + continue + + pos_list.append(pos_l) + pos_mask.append(pos_m) + label_list.append(label_l) + + # use big score_map for smooth tcl lines + score_map_big_resized = cv2.resize( + score_map_big, dsize=None, fx=ds_ratio, fy=ds_ratio) + score_map = np.array(score_map_big_resized > 1e-3, dtype='float32') + + return score_map, score_label_map, tbo_map, direction_map, training_mask, \ + 
pos_list, pos_mask, label_list, score_label_map_text_label_list + + def adjust_point(self, poly): + """ + adjust point order. + """ + point_num = poly.shape[0] + if point_num == 4: + len_1 = np.linalg.norm(poly[0] - poly[1]) + len_2 = np.linalg.norm(poly[1] - poly[2]) + len_3 = np.linalg.norm(poly[2] - poly[3]) + len_4 = np.linalg.norm(poly[3] - poly[0]) + + if (len_1 + len_3) * 1.5 < (len_2 + len_4): + poly = poly[[1, 2, 3, 0], :] + + elif point_num > 4: + vector_1 = poly[0] - poly[1] + vector_2 = poly[1] - poly[2] + cos_theta = np.dot(vector_1, vector_2) / ( + np.linalg.norm(vector_1) * np.linalg.norm(vector_2) + 1e-6) + theta = np.arccos(np.round(cos_theta, decimals=4)) + + if abs(theta) > (70 / 180 * math.pi): + index = list(range(1, point_num)) + [0] + poly = poly[np.array(index), :] + return poly + + def gen_min_area_quad_from_poly(self, poly): + """ + Generate min area quad from poly. + """ + point_num = poly.shape[0] + min_area_quad = np.zeros((4, 2), dtype=np.float32) + if point_num == 4: + min_area_quad = poly + center_point = np.sum(poly, axis=0) / 4 + else: + rect = cv2.minAreaRect(poly.astype( + np.int32)) # (center (x,y), (width, height), angle of rotation) + center_point = rect[0] + box = np.array(cv2.boxPoints(rect)) + + first_point_idx = 0 + min_dist = 1e4 + for i in range(4): + dist = np.linalg.norm(box[(i + 0) % 4] - poly[0]) + \ + np.linalg.norm(box[(i + 1) % 4] - poly[point_num // 2 - 1]) + \ + np.linalg.norm(box[(i + 2) % 4] - poly[point_num // 2]) + \ + np.linalg.norm(box[(i + 3) % 4] - poly[-1]) + if dist < min_dist: + min_dist = dist + first_point_idx = i + + for i in range(4): + min_area_quad[i] = box[(first_point_idx + i) % 4] + + return min_area_quad, center_point + + def shrink_quad_along_width(self, + quad, + begin_width_ratio=0., + end_width_ratio=1.): + """ + Generate shrink_quad_along_width. + """ + ratio_pair = np.array( + [[begin_width_ratio], [end_width_ratio]], dtype=np.float32) + p0_1 = quad[0] + (quad[1] - quad[0]) * ratio_pair + p3_2 = quad[3] + (quad[2] - quad[3]) * ratio_pair + return np.array([p0_1[0], p0_1[1], p3_2[1], p3_2[0]]) + + def shrink_poly_along_width(self, + quads, + shrink_ratio_of_width, + expand_height_ratio=1.0): + """ + shrink poly with given length. + """ + upper_edge_list = [] + + def get_cut_info(edge_len_list, cut_len): + for idx, edge_len in enumerate(edge_len_list): + cut_len -= edge_len + if cut_len <= 0.000001: + ratio = (cut_len + edge_len_list[idx]) / edge_len_list[idx] + return idx, ratio + + for quad in quads: + upper_edge_len = np.linalg.norm(quad[0] - quad[1]) + upper_edge_list.append(upper_edge_len) + + # length of left edge and right edge. 
+ left_length = np.linalg.norm(quads[0][0] - quads[0][ + 3]) * expand_height_ratio + right_length = np.linalg.norm(quads[-1][1] - quads[-1][ + 2]) * expand_height_ratio + + shrink_length = min(left_length, right_length, + sum(upper_edge_list)) * shrink_ratio_of_width + # shrinking length + upper_len_left = shrink_length + upper_len_right = sum(upper_edge_list) - shrink_length + + left_idx, left_ratio = get_cut_info(upper_edge_list, upper_len_left) + left_quad = self.shrink_quad_along_width( + quads[left_idx], begin_width_ratio=left_ratio, end_width_ratio=1) + right_idx, right_ratio = get_cut_info(upper_edge_list, upper_len_right) + right_quad = self.shrink_quad_along_width( + quads[right_idx], begin_width_ratio=0, end_width_ratio=right_ratio) + + out_quad_list = [] + if left_idx == right_idx: + out_quad_list.append( + [left_quad[0], right_quad[1], right_quad[2], left_quad[3]]) + else: + out_quad_list.append(left_quad) + for idx in range(left_idx + 1, right_idx): + out_quad_list.append(quads[idx]) + out_quad_list.append(right_quad) + + return np.array(out_quad_list), list(range(left_idx, right_idx + 1)) + + def prepare_text_label(self, label_str, Lexicon_Table): + """ + Prepare text lablel by given Lexicon_Table. + """ + if len(Lexicon_Table) == 36: + return label_str.lower() + else: + return label_str + + def vector_angle(self, A, B): + """ + Calculate the angle between vector AB and x-axis positive direction. + """ + AB = np.array([B[1] - A[1], B[0] - A[0]]) + return np.arctan2(*AB) + + def theta_line_cross_point(self, theta, point): + """ + Calculate the line through given point and angle in ax + by + c =0 form. + """ + x, y = point + cos = np.cos(theta) + sin = np.sin(theta) + return [sin, -cos, cos * y - sin * x] + + def line_cross_two_point(self, A, B): + """ + Calculate the line through given point A and B in ax + by + c =0 form. + """ + angle = self.vector_angle(A, B) + return self.theta_line_cross_point(angle, A) + + def average_angle(self, poly): + """ + Calculate the average angle between left and right edge in given poly. + """ + p0, p1, p2, p3 = poly + angle30 = self.vector_angle(p3, p0) + angle21 = self.vector_angle(p2, p1) + return (angle30 + angle21) / 2 + + def line_cross_point(self, line1, line2): + """ + line1 and line2 in 0=ax+by+c form, compute the cross point of line1 and line2 + """ + a1, b1, c1 = line1 + a2, b2, c2 = line2 + d = a1 * b2 - a2 * b1 + + if d == 0: + print('Cross point does not exist') + return np.array([0, 0], dtype=np.float32) + else: + x = (b1 * c2 - b2 * c1) / d + y = (a2 * c1 - a1 * c2) / d + + return np.array([x, y], dtype=np.float32) + + def quad2tcl(self, poly, ratio): + """ + Generate center line by poly clock-wise point. (4, 2) + """ + ratio_pair = np.array( + [[0.5 - ratio / 2], [0.5 + ratio / 2]], dtype=np.float32) + p0_3 = poly[0] + (poly[3] - poly[0]) * ratio_pair + p1_2 = poly[1] + (poly[2] - poly[1]) * ratio_pair + return np.array([p0_3[0], p1_2[0], p1_2[1], p0_3[1]]) + + def poly2tcl(self, poly, ratio): + """ + Generate center line by poly clock-wise point. + """ + ratio_pair = np.array( + [[0.5 - ratio / 2], [0.5 + ratio / 2]], dtype=np.float32) + tcl_poly = np.zeros_like(poly) + point_num = poly.shape[0] + + for idx in range(point_num // 2): + point_pair = poly[idx] + (poly[point_num - 1 - idx] - poly[idx] + ) * ratio_pair + tcl_poly[idx] = point_pair[0] + tcl_poly[point_num - 1 - idx] = point_pair[1] + return tcl_poly + + def gen_quad_tbo(self, quad, tcl_mask, tbo_map): + """ + Generate tbo_map for give quad. 
+ """ + # upper and lower line function: ax + by + c = 0; + up_line = self.line_cross_two_point(quad[0], quad[1]) + lower_line = self.line_cross_two_point(quad[3], quad[2]) + + quad_h = 0.5 * (np.linalg.norm(quad[0] - quad[3]) + + np.linalg.norm(quad[1] - quad[2])) + quad_w = 0.5 * (np.linalg.norm(quad[0] - quad[1]) + + np.linalg.norm(quad[2] - quad[3])) + + # average angle of left and right line. + angle = self.average_angle(quad) + + xy_in_poly = np.argwhere(tcl_mask == 1) + for y, x in xy_in_poly: + point = (x, y) + line = self.theta_line_cross_point(angle, point) + cross_point_upper = self.line_cross_point(up_line, line) + cross_point_lower = self.line_cross_point(lower_line, line) + ##FIX, offset reverse + upper_offset_x, upper_offset_y = cross_point_upper - point + lower_offset_x, lower_offset_y = cross_point_lower - point + tbo_map[y, x, 0] = upper_offset_y + tbo_map[y, x, 1] = upper_offset_x + tbo_map[y, x, 2] = lower_offset_y + tbo_map[y, x, 3] = lower_offset_x + tbo_map[y, x, 4] = 1.0 / max(min(quad_h, quad_w), 1.0) * 2 + return tbo_map + + def poly2quads(self, poly): + """ + Split poly into quads. + """ + quad_list = [] + point_num = poly.shape[0] + + # point pair + point_pair_list = [] + for idx in range(point_num // 2): + point_pair = [poly[idx], poly[point_num - 1 - idx]] + point_pair_list.append(point_pair) + + quad_num = point_num // 2 - 1 + for idx in range(quad_num): + # reshape and adjust to clock-wise + quad_list.append((np.array(point_pair_list)[[idx, idx + 1]] + ).reshape(4, 2)[[0, 2, 3, 1]]) + + return np.array(quad_list) + + def rotate_im_poly(self, im, text_polys): + """ + rotate image with 90 / 180 / 270 degre + """ + im_w, im_h = im.shape[1], im.shape[0] + dst_im = im.copy() + dst_polys = [] + rand_degree_ratio = np.random.rand() + rand_degree_cnt = 1 + if rand_degree_ratio > 0.5: + rand_degree_cnt = 3 + for i in range(rand_degree_cnt): + dst_im = np.rot90(dst_im) + rot_degree = -90 * rand_degree_cnt + rot_angle = rot_degree * math.pi / 180.0 + n_poly = text_polys.shape[0] + cx, cy = 0.5 * im_w, 0.5 * im_h + ncx, ncy = 0.5 * dst_im.shape[1], 0.5 * dst_im.shape[0] + for i in range(n_poly): + wordBB = text_polys[i] + poly = [] + for j in range(4): # 16->4 + sx, sy = wordBB[j][0], wordBB[j][1] + dx = math.cos(rot_angle) * (sx - cx) - math.sin(rot_angle) * ( + sy - cy) + ncx + dy = math.sin(rot_angle) * (sx - cx) + math.cos(rot_angle) * ( + sy - cy) + ncy + poly.append([dx, dy]) + dst_polys.append(poly) + return dst_im, np.array(dst_polys, dtype=np.float32) + + def __call__(self, data): + input_size = 512 + im = data['image'] + text_polys = data['polys'] + text_tags = data['ignore_tags'] + text_strs = data['texts'] + h, w, _ = im.shape + text_polys, text_tags, hv_tags = self.check_and_validate_polys( + text_polys, text_tags, (h, w)) + if text_polys.shape[0] <= 0: + return None + # set aspect ratio and keep area fix + asp_scales = np.arange(1.0, 1.55, 0.1) + asp_scale = np.random.choice(asp_scales) + if np.random.rand() < 0.5: + asp_scale = 1.0 / asp_scale + asp_scale = math.sqrt(asp_scale) + + asp_wx = asp_scale + asp_hy = 1.0 / asp_scale + im = cv2.resize(im, dsize=None, fx=asp_wx, fy=asp_hy) + text_polys[:, :, 0] *= asp_wx + text_polys[:, :, 1] *= asp_hy + + h, w, _ = im.shape + if max(h, w) > 2048: + rd_scale = 2048.0 / max(h, w) + im = cv2.resize(im, dsize=None, fx=rd_scale, fy=rd_scale) + text_polys *= rd_scale + h, w, _ = im.shape + if min(h, w) < 16: + return None + + # no background + im, text_polys, text_tags, hv_tags, text_strs = self.crop_area( + im, + 
text_polys, + text_tags, + hv_tags, + text_strs, + crop_background=False) + + if text_polys.shape[0] == 0: + return None + # # continue for all ignore case + if np.sum((text_tags * 1.0)) >= text_tags.size: + return None + new_h, new_w, _ = im.shape + if (new_h is None) or (new_w is None): + return None + # resize image + std_ratio = float(input_size) / max(new_w, new_h) + rand_scales = np.array( + [0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0, 1.0, 1.0, 1.0, 1.0]) + rz_scale = std_ratio * np.random.choice(rand_scales) + im = cv2.resize(im, dsize=None, fx=rz_scale, fy=rz_scale) + text_polys[:, :, 0] *= rz_scale + text_polys[:, :, 1] *= rz_scale + + # add gaussian blur + if np.random.rand() < 0.1 * 0.5: + ks = np.random.permutation(5)[0] + 1 + ks = int(ks / 2) * 2 + 1 + im = cv2.GaussianBlur(im, ksize=(ks, ks), sigmaX=0, sigmaY=0) + # add brighter + if np.random.rand() < 0.1 * 0.5: + im = im * (1.0 + np.random.rand() * 0.5) + im = np.clip(im, 0.0, 255.0) + # add darker + if np.random.rand() < 0.1 * 0.5: + im = im * (1.0 - np.random.rand() * 0.5) + im = np.clip(im, 0.0, 255.0) + + # Padding the im to [input_size, input_size] + new_h, new_w, _ = im.shape + if min(new_w, new_h) < input_size * 0.5: + return None + im_padded = np.ones((input_size, input_size, 3), dtype=np.float32) + im_padded[:, :, 2] = 0.485 * 255 + im_padded[:, :, 1] = 0.456 * 255 + im_padded[:, :, 0] = 0.406 * 255 + + # Random the start position + del_h = input_size - new_h + del_w = input_size - new_w + sh, sw = 0, 0 + if del_h > 1: + sh = int(np.random.rand() * del_h) + if del_w > 1: + sw = int(np.random.rand() * del_w) + + # Padding + im_padded[sh:sh + new_h, sw:sw + new_w, :] = im.copy() + text_polys[:, :, 0] += sw + text_polys[:, :, 1] += sh + + score_map, score_label_map, border_map, direction_map, training_mask, \ + pos_list, pos_mask, label_list, score_label_map_text_label = self.generate_tcl_ctc_label(input_size, + input_size, + text_polys, + text_tags, + text_strs, 0.25) + if len(label_list) <= 0: # eliminate negative samples + return None + pos_list_temp = np.zeros([64, 3]) + pos_mask_temp = np.zeros([64, 1]) + label_list_temp = np.zeros([self.max_text_length, 1]) + self.pad_num + + for i, label in enumerate(label_list): + n = len(label) + if n > self.max_text_length: + label_list[i] = label[:self.max_text_length] + continue + while n < self.max_text_length: + label.append([self.pad_num]) + n += 1 + + for i in range(len(label_list)): + label_list[i] = np.array(label_list[i]) + + if len(pos_list) <= 0 or len(pos_list) > self.max_text_nums: + return None + for __ in range(self.max_text_nums - len(pos_list), 0, -1): + pos_list.append(pos_list_temp) + pos_mask.append(pos_mask_temp) + label_list.append(label_list_temp) + + if self.img_id == self.batch_size - 1: + self.img_id = 0 + else: + self.img_id += 1 + + im_padded[:, :, 2] -= 0.485 * 255 + im_padded[:, :, 1] -= 0.456 * 255 + im_padded[:, :, 0] -= 0.406 * 255 + im_padded[:, :, 2] /= (255.0 * 0.229) + im_padded[:, :, 1] /= (255.0 * 0.224) + im_padded[:, :, 0] /= (255.0 * 0.225) + im_padded = im_padded.transpose((2, 0, 1)) + images = im_padded[::-1, :, :] + tcl_maps = score_map[np.newaxis, :, :] + tcl_label_maps = score_label_map[np.newaxis, :, :] + border_maps = border_map.transpose((2, 0, 1)) + direction_maps = direction_map.transpose((2, 0, 1)) + training_masks = training_mask[np.newaxis, :, :] + pos_list = np.array(pos_list) + pos_mask = np.array(pos_mask) + label_list = np.array(label_list) + data['images'] = images + data['tcl_maps'] = tcl_maps + 
data['tcl_label_maps'] = tcl_label_maps + data['border_maps'] = border_maps + data['direction_maps'] = direction_maps + data['training_masks'] = training_masks + data['label_list'] = label_list + data['pos_list'] = pos_list + data['pos_mask'] = pos_mask + return data diff --git a/backend/ppocr/data/imaug/randaugment.py b/backend/ppocr/data/imaug/randaugment.py new file mode 100644 index 0000000..56f114d --- /dev/null +++ b/backend/ppocr/data/imaug/randaugment.py @@ -0,0 +1,143 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from PIL import Image, ImageEnhance, ImageOps +import numpy as np +import random +import six + + +class RawRandAugment(object): + def __init__(self, + num_layers=2, + magnitude=5, + fillcolor=(128, 128, 128), + **kwargs): + self.num_layers = num_layers + self.magnitude = magnitude + self.max_level = 10 + + abso_level = self.magnitude / self.max_level + self.level_map = { + "shearX": 0.3 * abso_level, + "shearY": 0.3 * abso_level, + "translateX": 150.0 / 331 * abso_level, + "translateY": 150.0 / 331 * abso_level, + "rotate": 30 * abso_level, + "color": 0.9 * abso_level, + "posterize": int(4.0 * abso_level), + "solarize": 256.0 * abso_level, + "contrast": 0.9 * abso_level, + "sharpness": 0.9 * abso_level, + "brightness": 0.9 * abso_level, + "autocontrast": 0, + "equalize": 0, + "invert": 0 + } + + # from https://stackoverflow.com/questions/5252170/ + # specify-image-filling-color-when-rotating-in-python-with-pil-and-setting-expand + def rotate_with_fill(img, magnitude): + rot = img.convert("RGBA").rotate(magnitude) + return Image.composite(rot, + Image.new("RGBA", rot.size, (128, ) * 4), + rot).convert(img.mode) + + rnd_ch_op = random.choice + + self.func = { + "shearX": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, magnitude * rnd_ch_op([-1, 1]), 0, 0, 1, 0), + Image.BICUBIC, + fillcolor=fillcolor), + "shearY": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, 0, 0, magnitude * rnd_ch_op([-1, 1]), 1, 0), + Image.BICUBIC, + fillcolor=fillcolor), + "translateX": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, 0, magnitude * img.size[0] * rnd_ch_op([-1, 1]), 0, 1, 0), + fillcolor=fillcolor), + "translateY": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, 0, 0, 0, 1, magnitude * img.size[1] * rnd_ch_op([-1, 1])), + fillcolor=fillcolor), + "rotate": lambda img, magnitude: rotate_with_fill(img, magnitude), + "color": lambda img, magnitude: ImageEnhance.Color(img).enhance( + 1 + magnitude * rnd_ch_op([-1, 1])), + "posterize": lambda img, magnitude: + ImageOps.posterize(img, magnitude), + "solarize": lambda img, magnitude: + ImageOps.solarize(img, magnitude), + "contrast": lambda img, magnitude: + ImageEnhance.Contrast(img).enhance( + 1 + magnitude * rnd_ch_op([-1, 
1])), + "sharpness": lambda img, magnitude: + ImageEnhance.Sharpness(img).enhance( + 1 + magnitude * rnd_ch_op([-1, 1])), + "brightness": lambda img, magnitude: + ImageEnhance.Brightness(img).enhance( + 1 + magnitude * rnd_ch_op([-1, 1])), + "autocontrast": lambda img, magnitude: + ImageOps.autocontrast(img), + "equalize": lambda img, magnitude: ImageOps.equalize(img), + "invert": lambda img, magnitude: ImageOps.invert(img) + } + + def __call__(self, img): + avaiable_op_names = list(self.level_map.keys()) + for layer_num in range(self.num_layers): + op_name = np.random.choice(avaiable_op_names) + img = self.func[op_name](img, self.level_map[op_name]) + return img + + +class RandAugment(RawRandAugment): + """ RandAugment wrapper to auto fit different img types """ + + def __init__(self, prob=0.5, *args, **kwargs): + self.prob = prob + if six.PY2: + super(RandAugment, self).__init__(*args, **kwargs) + else: + super().__init__(*args, **kwargs) + + def __call__(self, data): + if np.random.rand() > self.prob: + return data + img = data['image'] + if not isinstance(img, Image.Image): + img = np.ascontiguousarray(img) + img = Image.fromarray(img) + + if six.PY2: + img = super(RandAugment, self).__call__(img) + else: + img = super().__call__(img) + + if isinstance(img, Image.Image): + img = np.asarray(img) + data['image'] = img + return data diff --git a/backend/ppocr/data/imaug/random_crop_data.py b/backend/ppocr/data/imaug/random_crop_data.py new file mode 100644 index 0000000..64aa110 --- /dev/null +++ b/backend/ppocr/data/imaug/random_crop_data.py @@ -0,0 +1,234 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +This code is refer from: +https://github.com/WenmuZhou/DBNet.pytorch/blob/master/data_loader/modules/random_crop_data.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np +import cv2 +import random + + +def is_poly_in_rect(poly, x, y, w, h): + poly = np.array(poly) + if poly[:, 0].min() < x or poly[:, 0].max() > x + w: + return False + if poly[:, 1].min() < y or poly[:, 1].max() > y + h: + return False + return True + + +def is_poly_outside_rect(poly, x, y, w, h): + poly = np.array(poly) + if poly[:, 0].max() < x or poly[:, 0].min() > x + w: + return True + if poly[:, 1].max() < y or poly[:, 1].min() > y + h: + return True + return False + + +def split_regions(axis): + regions = [] + min_axis = 0 + for i in range(1, axis.shape[0]): + if axis[i] != axis[i - 1] + 1: + region = axis[min_axis:i] + min_axis = i + regions.append(region) + return regions + + +def random_select(axis, max_size): + xx = np.random.choice(axis, size=2) + xmin = np.min(xx) + xmax = np.max(xx) + xmin = np.clip(xmin, 0, max_size - 1) + xmax = np.clip(xmax, 0, max_size - 1) + return xmin, xmax + + +def region_wise_random_select(regions, max_size): + selected_index = list(np.random.choice(len(regions), 2)) + selected_values = [] + for index in selected_index: + axis = regions[index] + xx = int(np.random.choice(axis, size=1)) + selected_values.append(xx) + xmin = min(selected_values) + xmax = max(selected_values) + return xmin, xmax + + +def crop_area(im, text_polys, min_crop_side_ratio, max_tries): + h, w, _ = im.shape + h_array = np.zeros(h, dtype=np.int32) + w_array = np.zeros(w, dtype=np.int32) + for points in text_polys: + points = np.round(points, decimals=0).astype(np.int32) + minx = np.min(points[:, 0]) + maxx = np.max(points[:, 0]) + w_array[minx:maxx] = 1 + miny = np.min(points[:, 1]) + maxy = np.max(points[:, 1]) + h_array[miny:maxy] = 1 + # ensure the cropped area not across a text + h_axis = np.where(h_array == 0)[0] + w_axis = np.where(w_array == 0)[0] + + if len(h_axis) == 0 or len(w_axis) == 0: + return 0, 0, w, h + + h_regions = split_regions(h_axis) + w_regions = split_regions(w_axis) + + for i in range(max_tries): + if len(w_regions) > 1: + xmin, xmax = region_wise_random_select(w_regions, w) + else: + xmin, xmax = random_select(w_axis, w) + if len(h_regions) > 1: + ymin, ymax = region_wise_random_select(h_regions, h) + else: + ymin, ymax = random_select(h_axis, h) + + if xmax - xmin < min_crop_side_ratio * w or ymax - ymin < min_crop_side_ratio * h: + # area too small + continue + num_poly_in_rect = 0 + for poly in text_polys: + if not is_poly_outside_rect(poly, xmin, ymin, xmax - xmin, + ymax - ymin): + num_poly_in_rect += 1 + break + + if num_poly_in_rect > 0: + return xmin, ymin, xmax - xmin, ymax - ymin + + return 0, 0, w, h + + +class EastRandomCropData(object): + def __init__(self, + size=(640, 640), + max_tries=10, + min_crop_side_ratio=0.1, + keep_ratio=True, + **kwargs): + self.size = size + self.max_tries = max_tries + self.min_crop_side_ratio = min_crop_side_ratio + self.keep_ratio = keep_ratio + + def __call__(self, data): + img = data['image'] + text_polys = data['polys'] + ignore_tags = data['ignore_tags'] + texts = data['texts'] + all_care_polys = [ + text_polys[i] for i, tag in enumerate(ignore_tags) if not tag + ] + # 计算crop区域 + crop_x, crop_y, crop_w, crop_h = crop_area( + img, all_care_polys, self.min_crop_side_ratio, self.max_tries) + # crop 图片 
保持比例填充 + scale_w = self.size[0] / crop_w + scale_h = self.size[1] / crop_h + scale = min(scale_w, scale_h) + h = int(crop_h * scale) + w = int(crop_w * scale) + if self.keep_ratio: + padimg = np.zeros((self.size[1], self.size[0], img.shape[2]), + img.dtype) + padimg[:h, :w] = cv2.resize( + img[crop_y:crop_y + crop_h, crop_x:crop_x + crop_w], (w, h)) + img = padimg + else: + img = cv2.resize( + img[crop_y:crop_y + crop_h, crop_x:crop_x + crop_w], + tuple(self.size)) + # crop 文本框 + text_polys_crop = [] + ignore_tags_crop = [] + texts_crop = [] + for poly, text, tag in zip(text_polys, texts, ignore_tags): + poly = ((poly - (crop_x, crop_y)) * scale).tolist() + if not is_poly_outside_rect(poly, 0, 0, w, h): + text_polys_crop.append(poly) + ignore_tags_crop.append(tag) + texts_crop.append(text) + data['image'] = img + data['polys'] = np.array(text_polys_crop) + data['ignore_tags'] = ignore_tags_crop + data['texts'] = texts_crop + return data + + +class RandomCropImgMask(object): + def __init__(self, size, main_key, crop_keys, p=3 / 8, **kwargs): + self.size = size + self.main_key = main_key + self.crop_keys = crop_keys + self.p = p + + def __call__(self, data): + image = data['image'] + + h, w = image.shape[0:2] + th, tw = self.size + if w == tw and h == th: + return data + + mask = data[self.main_key] + if np.max(mask) > 0 and random.random() > self.p: + # make sure to crop the text region + tl = np.min(np.where(mask > 0), axis=1) - (th, tw) + tl[tl < 0] = 0 + br = np.max(np.where(mask > 0), axis=1) - (th, tw) + br[br < 0] = 0 + + br[0] = min(br[0], h - th) + br[1] = min(br[1], w - tw) + + i = random.randint(tl[0], br[0]) if tl[0] < br[0] else 0 + j = random.randint(tl[1], br[1]) if tl[1] < br[1] else 0 + else: + i = random.randint(0, h - th) if h - th > 0 else 0 + j = random.randint(0, w - tw) if w - tw > 0 else 0 + + # return i, j, th, tw + for k in data: + if k in self.crop_keys: + if len(data[k].shape) == 3: + if np.argmin(data[k].shape) == 0: + img = data[k][:, i:i + th, j:j + tw] + if img.shape[1] != img.shape[2]: + a = 1 + elif np.argmin(data[k].shape) == 2: + img = data[k][i:i + th, j:j + tw, :] + if img.shape[1] != img.shape[0]: + a = 1 + else: + img = data[k] + else: + img = data[k][i:i + th, j:j + tw] + if img.shape[0] != img.shape[1]: + a = 1 + data[k] = img + return data diff --git a/backend/ppocr/data/imaug/rec_img_aug.py b/backend/ppocr/data/imaug/rec_img_aug.py new file mode 100644 index 0000000..7483dff --- /dev/null +++ b/backend/ppocr/data/imaug/rec_img_aug.py @@ -0,0 +1,601 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
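EastRandomCropData, defined above, avoids slicing through text by first building per-row and per-column occupancy arrays from the polygons and then sampling crop bounds only from the zero runs that crop_area/split_regions discover. The self-contained snippet below redoes that bookkeeping on a tiny example so the zero-run grouping is easy to inspect; the box extents are invented and the helper is a simplified counterpart of split_regions, not the function itself.

import numpy as np

def zero_runs(occupied):
    """Group the indices where occupied == 0 into consecutive runs (cf. split_regions)."""
    free = np.where(occupied == 0)[0]
    runs, start = [], 0
    for i in range(1, len(free)):
        if free[i] != free[i - 1] + 1:
            runs.append(free[start:i])
            start = i
    if len(free):
        runs.append(free[start:])
    return runs

w_array = np.zeros(40, dtype=np.int32)
for x0, x1 in [(5, 12), (20, 28)]:     # x-extents of two hypothetical text boxes
    w_array[x0:x1] = 1                 # columns covered by text must not be cut
print([(int(r[0]), int(r[-1])) for r in zero_runs(w_array)])
# -> [(0, 4), (12, 19), (28, 39)]: free column ranges from which crop x-bounds are drawn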
+ +import math +import cv2 +import numpy as np +import random +import copy +from PIL import Image +from .text_image_aug import tia_perspective, tia_stretch, tia_distort + + +class RecAug(object): + def __init__(self, use_tia=True, aug_prob=0.4, **kwargs): + self.use_tia = use_tia + self.aug_prob = aug_prob + + def __call__(self, data): + img = data['image'] + img = warp(img, 10, self.use_tia, self.aug_prob) + data['image'] = img + return data + + +class RecConAug(object): + def __init__(self, + prob=0.5, + image_shape=(32, 320, 3), + max_text_length=25, + ext_data_num=1, + **kwargs): + self.ext_data_num = ext_data_num + self.prob = prob + self.max_text_length = max_text_length + self.image_shape = image_shape + self.max_wh_ratio = self.image_shape[1] / self.image_shape[0] + + def merge_ext_data(self, data, ext_data): + ori_w = round(data['image'].shape[1] / data['image'].shape[0] * + self.image_shape[0]) + ext_w = round(ext_data['image'].shape[1] / ext_data['image'].shape[0] * + self.image_shape[0]) + data['image'] = cv2.resize(data['image'], (ori_w, self.image_shape[0])) + ext_data['image'] = cv2.resize(ext_data['image'], + (ext_w, self.image_shape[0])) + data['image'] = np.concatenate( + [data['image'], ext_data['image']], axis=1) + data["label"] += ext_data["label"] + return data + + def __call__(self, data): + rnd_num = random.random() + if rnd_num > self.prob: + return data + for idx, ext_data in enumerate(data["ext_data"]): + if len(data["label"]) + len(ext_data[ + "label"]) > self.max_text_length: + break + concat_ratio = data['image'].shape[1] / data['image'].shape[ + 0] + ext_data['image'].shape[1] / ext_data['image'].shape[0] + if concat_ratio > self.max_wh_ratio: + break + data = self.merge_ext_data(data, ext_data) + data.pop("ext_data") + return data + + +class ClsResizeImg(object): + def __init__(self, image_shape, **kwargs): + self.image_shape = image_shape + + def __call__(self, data): + img = data['image'] + norm_img, _ = resize_norm_img(img, self.image_shape) + data['image'] = norm_img + return data + + +class NRTRRecResizeImg(object): + def __init__(self, image_shape, resize_type, padding=False, **kwargs): + self.image_shape = image_shape + self.resize_type = resize_type + self.padding = padding + + def __call__(self, data): + img = data['image'] + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + image_shape = self.image_shape + if self.padding: + imgC, imgH, imgW = image_shape + # todo: change to 0 and modified image shape + h = img.shape[0] + w = img.shape[1] + ratio = w / float(h) + if math.ceil(imgH * ratio) > imgW: + resized_w = imgW + else: + resized_w = int(math.ceil(imgH * ratio)) + resized_image = cv2.resize(img, (resized_w, imgH)) + norm_img = np.expand_dims(resized_image, -1) + norm_img = norm_img.transpose((2, 0, 1)) + resized_image = norm_img.astype(np.float32) / 128. - 1. + padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32) + padding_im[:, :, 0:resized_w] = resized_image + data['image'] = padding_im + return data + if self.resize_type == 'PIL': + image_pil = Image.fromarray(np.uint8(img)) + img = image_pil.resize(self.image_shape, Image.ANTIALIAS) + img = np.array(img) + if self.resize_type == 'OpenCV': + img = cv2.resize(img, self.image_shape) + norm_img = np.expand_dims(img, -1) + norm_img = norm_img.transpose((2, 0, 1)) + data['image'] = norm_img.astype(np.float32) / 128. - 1. 
+ return data + + +class RecResizeImg(object): + def __init__(self, + image_shape, + infer_mode=False, + character_dict_path='./ppocr/utils/ppocr_keys_v1.txt', + padding=True, + **kwargs): + self.image_shape = image_shape + self.infer_mode = infer_mode + self.character_dict_path = character_dict_path + self.padding = padding + + def __call__(self, data): + img = data['image'] + if self.infer_mode and self.character_dict_path is not None: + norm_img, valid_ratio = resize_norm_img_chinese(img, + self.image_shape) + else: + norm_img, valid_ratio = resize_norm_img(img, self.image_shape, + self.padding) + data['image'] = norm_img + data['valid_ratio'] = valid_ratio + return data + + +class SRNRecResizeImg(object): + def __init__(self, image_shape, num_heads, max_text_length, **kwargs): + self.image_shape = image_shape + self.num_heads = num_heads + self.max_text_length = max_text_length + + def __call__(self, data): + img = data['image'] + norm_img = resize_norm_img_srn(img, self.image_shape) + data['image'] = norm_img + [encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] = \ + srn_other_inputs(self.image_shape, self.num_heads, self.max_text_length) + + data['encoder_word_pos'] = encoder_word_pos + data['gsrm_word_pos'] = gsrm_word_pos + data['gsrm_slf_attn_bias1'] = gsrm_slf_attn_bias1 + data['gsrm_slf_attn_bias2'] = gsrm_slf_attn_bias2 + return data + + +class SARRecResizeImg(object): + def __init__(self, image_shape, width_downsample_ratio=0.25, **kwargs): + self.image_shape = image_shape + self.width_downsample_ratio = width_downsample_ratio + + def __call__(self, data): + img = data['image'] + norm_img, resize_shape, pad_shape, valid_ratio = resize_norm_img_sar( + img, self.image_shape, self.width_downsample_ratio) + data['image'] = norm_img + data['resized_shape'] = resize_shape + data['pad_shape'] = pad_shape + data['valid_ratio'] = valid_ratio + return data + + +class PRENResizeImg(object): + def __init__(self, image_shape, **kwargs): + """ + Accroding to original paper's realization, it's a hard resize method here. + So maybe you should optimize it to fit for your task better. + """ + self.dst_h, self.dst_w = image_shape + + def __call__(self, data): + img = data['image'] + resized_img = cv2.resize( + img, (self.dst_w, self.dst_h), interpolation=cv2.INTER_LINEAR) + resized_img = resized_img.transpose((2, 0, 1)) / 255 + resized_img -= 0.5 + resized_img /= 0.5 + data['image'] = resized_img.astype(np.float32) + return data + + +def resize_norm_img_sar(img, image_shape, width_downsample_ratio=0.25): + imgC, imgH, imgW_min, imgW_max = image_shape + h = img.shape[0] + w = img.shape[1] + valid_ratio = 1.0 + # make sure new_width is an integral multiple of width_divisor. 
+ width_divisor = int(1 / width_downsample_ratio) + # resize + ratio = w / float(h) + resize_w = math.ceil(imgH * ratio) + if resize_w % width_divisor != 0: + resize_w = round(resize_w / width_divisor) * width_divisor + if imgW_min is not None: + resize_w = max(imgW_min, resize_w) + if imgW_max is not None: + valid_ratio = min(1.0, 1.0 * resize_w / imgW_max) + resize_w = min(imgW_max, resize_w) + resized_image = cv2.resize(img, (resize_w, imgH)) + resized_image = resized_image.astype('float32') + # norm + if image_shape[0] == 1: + resized_image = resized_image / 255 + resized_image = resized_image[np.newaxis, :] + else: + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + resize_shape = resized_image.shape + padding_im = -1.0 * np.ones((imgC, imgH, imgW_max), dtype=np.float32) + padding_im[:, :, 0:resize_w] = resized_image + pad_shape = padding_im.shape + + return padding_im, resize_shape, pad_shape, valid_ratio + + +def resize_norm_img(img, image_shape, padding=True): + imgC, imgH, imgW = image_shape + h = img.shape[0] + w = img.shape[1] + if not padding: + resized_image = cv2.resize( + img, (imgW, imgH), interpolation=cv2.INTER_LINEAR) + resized_w = imgW + else: + ratio = w / float(h) + if math.ceil(imgH * ratio) > imgW: + resized_w = imgW + else: + resized_w = int(math.ceil(imgH * ratio)) + resized_image = cv2.resize(img, (resized_w, imgH)) + resized_image = resized_image.astype('float32') + if image_shape[0] == 1: + resized_image = resized_image / 255 + resized_image = resized_image[np.newaxis, :] + else: + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32) + padding_im[:, :, 0:resized_w] = resized_image + valid_ratio = min(1.0, float(resized_w / imgW)) + return padding_im, valid_ratio + + +def resize_norm_img_chinese(img, image_shape): + imgC, imgH, imgW = image_shape + # todo: change to 0 and modified image shape + max_wh_ratio = imgW * 1.0 / imgH + h, w = img.shape[0], img.shape[1] + ratio = w * 1.0 / h + max_wh_ratio = max(max_wh_ratio, ratio) + imgW = int(imgH * max_wh_ratio) + if math.ceil(imgH * ratio) > imgW: + resized_w = imgW + else: + resized_w = int(math.ceil(imgH * ratio)) + resized_image = cv2.resize(img, (resized_w, imgH)) + resized_image = resized_image.astype('float32') + if image_shape[0] == 1: + resized_image = resized_image / 255 + resized_image = resized_image[np.newaxis, :] + else: + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32) + padding_im[:, :, 0:resized_w] = resized_image + valid_ratio = min(1.0, float(resized_w / imgW)) + return padding_im, valid_ratio + + +def resize_norm_img_srn(img, image_shape): + imgC, imgH, imgW = image_shape + + img_black = np.zeros((imgH, imgW)) + im_hei = img.shape[0] + im_wid = img.shape[1] + + if im_wid <= im_hei * 1: + img_new = cv2.resize(img, (imgH * 1, imgH)) + elif im_wid <= im_hei * 2: + img_new = cv2.resize(img, (imgH * 2, imgH)) + elif im_wid <= im_hei * 3: + img_new = cv2.resize(img, (imgH * 3, imgH)) + else: + img_new = cv2.resize(img, (imgW, imgH)) + + img_np = np.asarray(img_new) + img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY) + img_black[:, 0:img_np.shape[1]] = img_np + img_black = img_black[:, :, np.newaxis] + + row, col, c = img_black.shape + c = 1 + + return np.reshape(img_black, (c, row, col)).astype(np.float32) + + +def 
srn_other_inputs(image_shape, num_heads, max_text_length): + + imgC, imgH, imgW = image_shape + feature_dim = int((imgH / 8) * (imgW / 8)) + + encoder_word_pos = np.array(range(0, feature_dim)).reshape( + (feature_dim, 1)).astype('int64') + gsrm_word_pos = np.array(range(0, max_text_length)).reshape( + (max_text_length, 1)).astype('int64') + + gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length)) + gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape( + [1, max_text_length, max_text_length]) + gsrm_slf_attn_bias1 = np.tile(gsrm_slf_attn_bias1, + [num_heads, 1, 1]) * [-1e9] + + gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data, -1).reshape( + [1, max_text_length, max_text_length]) + gsrm_slf_attn_bias2 = np.tile(gsrm_slf_attn_bias2, + [num_heads, 1, 1]) * [-1e9] + + return [ + encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, + gsrm_slf_attn_bias2 + ] + + +def flag(): + """ + flag + """ + return 1 if random.random() > 0.5000001 else -1 + + +def cvtColor(img): + """ + cvtColor + """ + hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) + delta = 0.001 * random.random() * flag() + hsv[:, :, 2] = hsv[:, :, 2] * (1 + delta) + new_img = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR) + return new_img + + +def blur(img): + """ + blur + """ + h, w, _ = img.shape + if h > 10 and w > 10: + return cv2.GaussianBlur(img, (5, 5), 1) + else: + return img + + +def jitter(img): + """ + jitter + """ + w, h, _ = img.shape + if h > 10 and w > 10: + thres = min(w, h) + s = int(random.random() * thres * 0.01) + src_img = img.copy() + for i in range(s): + img[i:, i:, :] = src_img[:w - i, :h - i, :] + return img + else: + return img + + +def add_gasuss_noise(image, mean=0, var=0.1): + """ + Gasuss noise + """ + + noise = np.random.normal(mean, var**0.5, image.shape) + out = image + 0.5 * noise + out = np.clip(out, 0, 255) + out = np.uint8(out) + return out + + +def get_crop(image): + """ + random crop + """ + h, w, _ = image.shape + top_min = 1 + top_max = 8 + top_crop = int(random.randint(top_min, top_max)) + top_crop = min(top_crop, h - 1) + crop_img = image.copy() + ratio = random.randint(0, 1) + if ratio: + crop_img = crop_img[top_crop:h, :, :] + else: + crop_img = crop_img[0:h - top_crop, :, :] + return crop_img + + +class Config: + """ + Config + """ + + def __init__(self, use_tia): + self.anglex = random.random() * 30 + self.angley = random.random() * 15 + self.anglez = random.random() * 10 + self.fov = 42 + self.r = 0 + self.shearx = random.random() * 0.3 + self.sheary = random.random() * 0.05 + self.borderMode = cv2.BORDER_REPLICATE + self.use_tia = use_tia + + def make(self, w, h, ang): + """ + make + """ + self.anglex = random.random() * 5 * flag() + self.angley = random.random() * 5 * flag() + self.anglez = -1 * random.random() * int(ang) * flag() + self.fov = 42 + self.r = 0 + self.shearx = 0 + self.sheary = 0 + self.borderMode = cv2.BORDER_REPLICATE + self.w = w + self.h = h + + self.perspective = self.use_tia + self.stretch = self.use_tia + self.distort = self.use_tia + + self.crop = True + self.affine = False + self.reverse = True + self.noise = True + self.jitter = True + self.blur = True + self.color = True + + +def rad(x): + """ + rad + """ + return x * np.pi / 180 + + +def get_warpR(config): + """ + get_warpR + """ + anglex, angley, anglez, fov, w, h, r = \ + config.anglex, config.angley, config.anglez, config.fov, config.w, config.h, config.r + if w > 69 and w < 112: + anglex = anglex * 1.5 + + z = np.sqrt(w**2 + h**2) / 2 / np.tan(rad(fov / 2)) + # Homogeneous coordinate 
transformation matrix + rx = np.array([[1, 0, 0, 0], + [0, np.cos(rad(anglex)), -np.sin(rad(anglex)), 0], [ + 0, + -np.sin(rad(anglex)), + np.cos(rad(anglex)), + 0, + ], [0, 0, 0, 1]], np.float32) + ry = np.array([[np.cos(rad(angley)), 0, np.sin(rad(angley)), 0], + [0, 1, 0, 0], [ + -np.sin(rad(angley)), + 0, + np.cos(rad(angley)), + 0, + ], [0, 0, 0, 1]], np.float32) + rz = np.array([[np.cos(rad(anglez)), np.sin(rad(anglez)), 0, 0], + [-np.sin(rad(anglez)), np.cos(rad(anglez)), 0, 0], + [0, 0, 1, 0], [0, 0, 0, 1]], np.float32) + r = rx.dot(ry).dot(rz) + # generate 4 points + pcenter = np.array([h / 2, w / 2, 0, 0], np.float32) + p1 = np.array([0, 0, 0, 0], np.float32) - pcenter + p2 = np.array([w, 0, 0, 0], np.float32) - pcenter + p3 = np.array([0, h, 0, 0], np.float32) - pcenter + p4 = np.array([w, h, 0, 0], np.float32) - pcenter + dst1 = r.dot(p1) + dst2 = r.dot(p2) + dst3 = r.dot(p3) + dst4 = r.dot(p4) + list_dst = np.array([dst1, dst2, dst3, dst4]) + org = np.array([[0, 0], [w, 0], [0, h], [w, h]], np.float32) + dst = np.zeros((4, 2), np.float32) + # Project onto the image plane + dst[:, 0] = list_dst[:, 0] * z / (z - list_dst[:, 2]) + pcenter[0] + dst[:, 1] = list_dst[:, 1] * z / (z - list_dst[:, 2]) + pcenter[1] + + warpR = cv2.getPerspectiveTransform(org, dst) + + dst1, dst2, dst3, dst4 = dst + r1 = int(min(dst1[1], dst2[1])) + r2 = int(max(dst3[1], dst4[1])) + c1 = int(min(dst1[0], dst3[0])) + c2 = int(max(dst2[0], dst4[0])) + + try: + ratio = min(1.0 * h / (r2 - r1), 1.0 * w / (c2 - c1)) + + dx = -c1 + dy = -r1 + T1 = np.float32([[1., 0, dx], [0, 1., dy], [0, 0, 1.0 / ratio]]) + ret = T1.dot(warpR) + except: + ratio = 1.0 + T1 = np.float32([[1., 0, 0], [0, 1., 0], [0, 0, 1.]]) + ret = T1 + return ret, (-r1, -c1), ratio, dst + + +def get_warpAffine(config): + """ + get_warpAffine + """ + anglez = config.anglez + rz = np.array([[np.cos(rad(anglez)), np.sin(rad(anglez)), 0], + [-np.sin(rad(anglez)), np.cos(rad(anglez)), 0]], np.float32) + return rz + + +def warp(img, ang, use_tia=True, prob=0.4): + """ + warp + """ + h, w, _ = img.shape + config = Config(use_tia=use_tia) + config.make(w, h, ang) + new_img = img + + if config.distort: + img_height, img_width = img.shape[0:2] + if random.random() <= prob and img_height >= 20 and img_width >= 20: + new_img = tia_distort(new_img, random.randint(3, 6)) + + if config.stretch: + img_height, img_width = img.shape[0:2] + if random.random() <= prob and img_height >= 20 and img_width >= 20: + new_img = tia_stretch(new_img, random.randint(3, 6)) + + if config.perspective: + if random.random() <= prob: + new_img = tia_perspective(new_img) + + if config.crop: + img_height, img_width = img.shape[0:2] + if random.random() <= prob and img_height >= 20 and img_width >= 20: + new_img = get_crop(new_img) + + if config.blur: + if random.random() <= prob: + new_img = blur(new_img) + if config.color: + if random.random() <= prob: + new_img = cvtColor(new_img) + if config.jitter: + new_img = jitter(new_img) + if config.noise: + if random.random() <= prob: + new_img = add_gasuss_noise(new_img) + if config.reverse: + if random.random() <= prob: + new_img = 255 - new_img + return new_img diff --git a/backend/ppocr/data/imaug/sast_process.py b/backend/ppocr/data/imaug/sast_process.py new file mode 100644 index 0000000..08d03b1 --- /dev/null +++ b/backend/ppocr/data/imaug/sast_process.py @@ -0,0 +1,777 @@ +#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. 
+# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +""" +This part code is refered from: +https://github.com/songdejia/EAST/blob/master/data_utils.py +""" +import math +import cv2 +import numpy as np +import json +import sys +import os + +__all__ = ['SASTProcessTrain'] + + +class SASTProcessTrain(object): + def __init__(self, + image_shape=[512, 512], + min_crop_size=24, + min_crop_side_ratio=0.3, + min_text_size=10, + max_text_size=512, + **kwargs): + self.input_size = image_shape[1] + self.min_crop_size = min_crop_size + self.min_crop_side_ratio = min_crop_side_ratio + self.min_text_size = min_text_size + self.max_text_size = max_text_size + + def quad_area(self, poly): + """ + compute area of a polygon + :param poly: + :return: + """ + edge = [(poly[1][0] - poly[0][0]) * (poly[1][1] + poly[0][1]), + (poly[2][0] - poly[1][0]) * (poly[2][1] + poly[1][1]), + (poly[3][0] - poly[2][0]) * (poly[3][1] + poly[2][1]), + (poly[0][0] - poly[3][0]) * (poly[0][1] + poly[3][1])] + return np.sum(edge) / 2. + + def gen_quad_from_poly(self, poly): + """ + Generate min area quad from poly. + """ + point_num = poly.shape[0] + min_area_quad = np.zeros((4, 2), dtype=np.float32) + if True: + rect = cv2.minAreaRect(poly.astype( + np.int32)) # (center (x,y), (width, height), angle of rotation) + center_point = rect[0] + box = np.array(cv2.boxPoints(rect)) + + first_point_idx = 0 + min_dist = 1e4 + for i in range(4): + dist = np.linalg.norm(box[(i + 0) % 4] - poly[0]) + \ + np.linalg.norm(box[(i + 1) % 4] - poly[point_num // 2 - 1]) + \ + np.linalg.norm(box[(i + 2) % 4] - poly[point_num // 2]) + \ + np.linalg.norm(box[(i + 3) % 4] - poly[-1]) + if dist < min_dist: + min_dist = dist + first_point_idx = i + for i in range(4): + min_area_quad[i] = box[(first_point_idx + i) % 4] + + return min_area_quad + + def check_and_validate_polys(self, polys, tags, xxx_todo_changeme): + """ + check so that the text poly is in the same direction, + and also filter some invalid polygons + :param polys: + :param tags: + :return: + """ + (h, w) = xxx_todo_changeme + if polys.shape[0] == 0: + return polys, np.array([]), np.array([]) + polys[:, :, 0] = np.clip(polys[:, :, 0], 0, w - 1) + polys[:, :, 1] = np.clip(polys[:, :, 1], 0, h - 1) + + validated_polys = [] + validated_tags = [] + hv_tags = [] + for poly, tag in zip(polys, tags): + quad = self.gen_quad_from_poly(poly) + p_area = self.quad_area(quad) + if abs(p_area) < 1: + print('invalid poly') + continue + if p_area > 0: + if tag == False: + print('poly in wrong direction') + tag = True # reversed cases should be ignore + poly = poly[(0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, + 1), :] + quad = quad[(0, 3, 2, 1), :] + + len_w = np.linalg.norm(quad[0] - quad[1]) + np.linalg.norm(quad[3] - + quad[2]) + len_h = np.linalg.norm(quad[0] - quad[3]) + np.linalg.norm(quad[1] - + quad[2]) + hv_tag = 1 + + if len_w * 2.0 < len_h: + hv_tag = 0 + + validated_polys.append(poly) + validated_tags.append(tag) + hv_tags.append(hv_tag) + return np.array(validated_polys), np.array(validated_tags), 
np.array( + hv_tags) + + def crop_area(self, + im, + polys, + tags, + hv_tags, + crop_background=False, + max_tries=25): + """ + make random crop from the input image + :param im: + :param polys: + :param tags: + :param crop_background: + :param max_tries: 50 -> 25 + :return: + """ + h, w, _ = im.shape + pad_h = h // 10 + pad_w = w // 10 + h_array = np.zeros((h + pad_h * 2), dtype=np.int32) + w_array = np.zeros((w + pad_w * 2), dtype=np.int32) + for poly in polys: + poly = np.round(poly, decimals=0).astype(np.int32) + minx = np.min(poly[:, 0]) + maxx = np.max(poly[:, 0]) + w_array[minx + pad_w:maxx + pad_w] = 1 + miny = np.min(poly[:, 1]) + maxy = np.max(poly[:, 1]) + h_array[miny + pad_h:maxy + pad_h] = 1 + # ensure the cropped area not across a text + h_axis = np.where(h_array == 0)[0] + w_axis = np.where(w_array == 0)[0] + if len(h_axis) == 0 or len(w_axis) == 0: + return im, polys, tags, hv_tags + for i in range(max_tries): + xx = np.random.choice(w_axis, size=2) + xmin = np.min(xx) - pad_w + xmax = np.max(xx) - pad_w + xmin = np.clip(xmin, 0, w - 1) + xmax = np.clip(xmax, 0, w - 1) + yy = np.random.choice(h_axis, size=2) + ymin = np.min(yy) - pad_h + ymax = np.max(yy) - pad_h + ymin = np.clip(ymin, 0, h - 1) + ymax = np.clip(ymax, 0, h - 1) + # if xmax - xmin < ARGS.min_crop_side_ratio * w or \ + # ymax - ymin < ARGS.min_crop_side_ratio * h: + if xmax - xmin < self.min_crop_size or \ + ymax - ymin < self.min_crop_size: + # area too small + continue + if polys.shape[0] != 0: + poly_axis_in_area = (polys[:, :, 0] >= xmin) & (polys[:, :, 0] <= xmax) \ + & (polys[:, :, 1] >= ymin) & (polys[:, :, 1] <= ymax) + selected_polys = np.where( + np.sum(poly_axis_in_area, axis=1) == 4)[0] + else: + selected_polys = [] + if len(selected_polys) == 0: + # no text in this area + if crop_background: + return im[ymin : ymax + 1, xmin : xmax + 1, :], \ + polys[selected_polys], tags[selected_polys], hv_tags[selected_polys] + else: + continue + im = im[ymin:ymax + 1, xmin:xmax + 1, :] + polys = polys[selected_polys] + tags = tags[selected_polys] + hv_tags = hv_tags[selected_polys] + polys[:, :, 0] -= xmin + polys[:, :, 1] -= ymin + return im, polys, tags, hv_tags + + return im, polys, tags, hv_tags + + def generate_direction_map(self, poly_quads, direction_map): + """ + """ + width_list = [] + height_list = [] + for quad in poly_quads: + quad_w = (np.linalg.norm(quad[0] - quad[1]) + + np.linalg.norm(quad[2] - quad[3])) / 2.0 + quad_h = (np.linalg.norm(quad[0] - quad[3]) + + np.linalg.norm(quad[2] - quad[1])) / 2.0 + width_list.append(quad_w) + height_list.append(quad_h) + norm_width = max(sum(width_list) / (len(width_list) + 1e-6), 1.0) + average_height = max(sum(height_list) / (len(height_list) + 1e-6), 1.0) + + for quad in poly_quads: + direct_vector_full = ( + (quad[1] + quad[2]) - (quad[0] + quad[3])) / 2.0 + direct_vector = direct_vector_full / ( + np.linalg.norm(direct_vector_full) + 1e-6) * norm_width + direction_label = tuple( + map(float, [ + direct_vector[0], direct_vector[1], 1.0 / (average_height + + 1e-6) + ])) + cv2.fillPoly(direction_map, + quad.round().astype(np.int32)[np.newaxis, :, :], + direction_label) + return direction_map + + def calculate_average_height(self, poly_quads): + """ + """ + height_list = [] + for quad in poly_quads: + quad_h = (np.linalg.norm(quad[0] - quad[3]) + + np.linalg.norm(quad[2] - quad[1])) / 2.0 + height_list.append(quad_h) + average_height = max(sum(height_list) / len(height_list), 1.0) + return average_height + + def generate_tcl_label(self, + hw, + polys, + 
tags, + ds_ratio, + tcl_ratio=0.3, + shrink_ratio_of_width=0.15): + """ + Generate polygon. + """ + h, w = hw + h, w = int(h * ds_ratio), int(w * ds_ratio) + polys = polys * ds_ratio + + score_map = np.zeros( + ( + h, + w, ), dtype=np.float32) + tbo_map = np.zeros((h, w, 5), dtype=np.float32) + training_mask = np.ones( + ( + h, + w, ), dtype=np.float32) + direction_map = np.ones((h, w, 3)) * np.array([0, 0, 1]).reshape( + [1, 1, 3]).astype(np.float32) + + for poly_idx, poly_tag in enumerate(zip(polys, tags)): + poly = poly_tag[0] + tag = poly_tag[1] + + # generate min_area_quad + min_area_quad, center_point = self.gen_min_area_quad_from_poly(poly) + min_area_quad_h = 0.5 * ( + np.linalg.norm(min_area_quad[0] - min_area_quad[3]) + + np.linalg.norm(min_area_quad[1] - min_area_quad[2])) + min_area_quad_w = 0.5 * ( + np.linalg.norm(min_area_quad[0] - min_area_quad[1]) + + np.linalg.norm(min_area_quad[2] - min_area_quad[3])) + + if min(min_area_quad_h, min_area_quad_w) < self.min_text_size * ds_ratio \ + or min(min_area_quad_h, min_area_quad_w) > self.max_text_size * ds_ratio: + continue + + if tag: + # continue + cv2.fillPoly(training_mask, + poly.astype(np.int32)[np.newaxis, :, :], 0.15) + else: + tcl_poly = self.poly2tcl(poly, tcl_ratio) + tcl_quads = self.poly2quads(tcl_poly) + poly_quads = self.poly2quads(poly) + # stcl map + stcl_quads, quad_index = self.shrink_poly_along_width( + tcl_quads, + shrink_ratio_of_width=shrink_ratio_of_width, + expand_height_ratio=1.0 / tcl_ratio) + # generate tcl map + cv2.fillPoly(score_map, + np.round(stcl_quads).astype(np.int32), 1.0) + + # generate tbo map + for idx, quad in enumerate(stcl_quads): + quad_mask = np.zeros((h, w), dtype=np.float32) + quad_mask = cv2.fillPoly( + quad_mask, + np.round(quad[np.newaxis, :, :]).astype(np.int32), 1.0) + tbo_map = self.gen_quad_tbo(poly_quads[quad_index[idx]], + quad_mask, tbo_map) + return score_map, tbo_map, training_mask + + def generate_tvo_and_tco(self, + hw, + polys, + tags, + tcl_ratio=0.3, + ds_ratio=0.25): + """ + Generate tcl map, tvo map and tbo map. 
+ """ + h, w = hw + h, w = int(h * ds_ratio), int(w * ds_ratio) + polys = polys * ds_ratio + poly_mask = np.zeros((h, w), dtype=np.float32) + + tvo_map = np.ones((9, h, w), dtype=np.float32) + tvo_map[0:-1:2] = np.tile(np.arange(0, w), (h, 1)) + tvo_map[1:-1:2] = np.tile(np.arange(0, w), (h, 1)).T + poly_tv_xy_map = np.zeros((8, h, w), dtype=np.float32) + + # tco map + tco_map = np.ones((3, h, w), dtype=np.float32) + tco_map[0] = np.tile(np.arange(0, w), (h, 1)) + tco_map[1] = np.tile(np.arange(0, w), (h, 1)).T + poly_tc_xy_map = np.zeros((2, h, w), dtype=np.float32) + + poly_short_edge_map = np.ones((h, w), dtype=np.float32) + + for poly, poly_tag in zip(polys, tags): + + if poly_tag == True: + continue + + # adjust point order for vertical poly + poly = self.adjust_point(poly) + + # generate min_area_quad + min_area_quad, center_point = self.gen_min_area_quad_from_poly(poly) + min_area_quad_h = 0.5 * ( + np.linalg.norm(min_area_quad[0] - min_area_quad[3]) + + np.linalg.norm(min_area_quad[1] - min_area_quad[2])) + min_area_quad_w = 0.5 * ( + np.linalg.norm(min_area_quad[0] - min_area_quad[1]) + + np.linalg.norm(min_area_quad[2] - min_area_quad[3])) + + # generate tcl map and text, 128 * 128 + tcl_poly = self.poly2tcl(poly, tcl_ratio) + + # generate poly_tv_xy_map + for idx in range(4): + cv2.fillPoly( + poly_tv_xy_map[2 * idx], + np.round(tcl_poly[np.newaxis, :, :]).astype(np.int32), + float(min(max(min_area_quad[idx, 0], 0), w))) + cv2.fillPoly( + poly_tv_xy_map[2 * idx + 1], + np.round(tcl_poly[np.newaxis, :, :]).astype(np.int32), + float(min(max(min_area_quad[idx, 1], 0), h))) + + # generate poly_tc_xy_map + for idx in range(2): + cv2.fillPoly( + poly_tc_xy_map[idx], + np.round(tcl_poly[np.newaxis, :, :]).astype(np.int32), + float(center_point[idx])) + + # generate poly_short_edge_map + cv2.fillPoly( + poly_short_edge_map, + np.round(tcl_poly[np.newaxis, :, :]).astype(np.int32), + float(max(min(min_area_quad_h, min_area_quad_w), 1.0))) + + # generate poly_mask and training_mask + cv2.fillPoly(poly_mask, + np.round(tcl_poly[np.newaxis, :, :]).astype(np.int32), + 1) + + tvo_map *= poly_mask + tvo_map[:8] -= poly_tv_xy_map + tvo_map[-1] /= poly_short_edge_map + tvo_map = tvo_map.transpose((1, 2, 0)) + + tco_map *= poly_mask + tco_map[:2] -= poly_tc_xy_map + tco_map[-1] /= poly_short_edge_map + tco_map = tco_map.transpose((1, 2, 0)) + + return tvo_map, tco_map + + def adjust_point(self, poly): + """ + adjust point order. + """ + point_num = poly.shape[0] + if point_num == 4: + len_1 = np.linalg.norm(poly[0] - poly[1]) + len_2 = np.linalg.norm(poly[1] - poly[2]) + len_3 = np.linalg.norm(poly[2] - poly[3]) + len_4 = np.linalg.norm(poly[3] - poly[0]) + + if (len_1 + len_3) * 1.5 < (len_2 + len_4): + poly = poly[[1, 2, 3, 0], :] + + elif point_num > 4: + vector_1 = poly[0] - poly[1] + vector_2 = poly[1] - poly[2] + cos_theta = np.dot(vector_1, vector_2) / ( + np.linalg.norm(vector_1) * np.linalg.norm(vector_2) + 1e-6) + theta = np.arccos(np.round(cos_theta, decimals=4)) + + if abs(theta) > (70 / 180 * math.pi): + index = list(range(1, point_num)) + [0] + poly = poly[np.array(index), :] + return poly + + def gen_min_area_quad_from_poly(self, poly): + """ + Generate min area quad from poly. 
+ """ + point_num = poly.shape[0] + min_area_quad = np.zeros((4, 2), dtype=np.float32) + if point_num == 4: + min_area_quad = poly + center_point = np.sum(poly, axis=0) / 4 + else: + rect = cv2.minAreaRect(poly.astype( + np.int32)) # (center (x,y), (width, height), angle of rotation) + center_point = rect[0] + box = np.array(cv2.boxPoints(rect)) + + first_point_idx = 0 + min_dist = 1e4 + for i in range(4): + dist = np.linalg.norm(box[(i + 0) % 4] - poly[0]) + \ + np.linalg.norm(box[(i + 1) % 4] - poly[point_num // 2 - 1]) + \ + np.linalg.norm(box[(i + 2) % 4] - poly[point_num // 2]) + \ + np.linalg.norm(box[(i + 3) % 4] - poly[-1]) + if dist < min_dist: + min_dist = dist + first_point_idx = i + + for i in range(4): + min_area_quad[i] = box[(first_point_idx + i) % 4] + + return min_area_quad, center_point + + def shrink_quad_along_width(self, + quad, + begin_width_ratio=0., + end_width_ratio=1.): + """ + Generate shrink_quad_along_width. + """ + ratio_pair = np.array( + [[begin_width_ratio], [end_width_ratio]], dtype=np.float32) + p0_1 = quad[0] + (quad[1] - quad[0]) * ratio_pair + p3_2 = quad[3] + (quad[2] - quad[3]) * ratio_pair + return np.array([p0_1[0], p0_1[1], p3_2[1], p3_2[0]]) + + def shrink_poly_along_width(self, + quads, + shrink_ratio_of_width, + expand_height_ratio=1.0): + """ + shrink poly with given length. + """ + upper_edge_list = [] + + def get_cut_info(edge_len_list, cut_len): + for idx, edge_len in enumerate(edge_len_list): + cut_len -= edge_len + if cut_len <= 0.000001: + ratio = (cut_len + edge_len_list[idx]) / edge_len_list[idx] + return idx, ratio + + for quad in quads: + upper_edge_len = np.linalg.norm(quad[0] - quad[1]) + upper_edge_list.append(upper_edge_len) + + # length of left edge and right edge. + left_length = np.linalg.norm(quads[0][0] - quads[0][ + 3]) * expand_height_ratio + right_length = np.linalg.norm(quads[-1][1] - quads[-1][ + 2]) * expand_height_ratio + + shrink_length = min(left_length, right_length, + sum(upper_edge_list)) * shrink_ratio_of_width + # shrinking length + upper_len_left = shrink_length + upper_len_right = sum(upper_edge_list) - shrink_length + + left_idx, left_ratio = get_cut_info(upper_edge_list, upper_len_left) + left_quad = self.shrink_quad_along_width( + quads[left_idx], begin_width_ratio=left_ratio, end_width_ratio=1) + right_idx, right_ratio = get_cut_info(upper_edge_list, upper_len_right) + right_quad = self.shrink_quad_along_width( + quads[right_idx], begin_width_ratio=0, end_width_ratio=right_ratio) + + out_quad_list = [] + if left_idx == right_idx: + out_quad_list.append( + [left_quad[0], right_quad[1], right_quad[2], left_quad[3]]) + else: + out_quad_list.append(left_quad) + for idx in range(left_idx + 1, right_idx): + out_quad_list.append(quads[idx]) + out_quad_list.append(right_quad) + + return np.array(out_quad_list), list(range(left_idx, right_idx + 1)) + + def vector_angle(self, A, B): + """ + Calculate the angle between vector AB and x-axis positive direction. + """ + AB = np.array([B[1] - A[1], B[0] - A[0]]) + return np.arctan2(*AB) + + def theta_line_cross_point(self, theta, point): + """ + Calculate the line through given point and angle in ax + by + c =0 form. + """ + x, y = point + cos = np.cos(theta) + sin = np.sin(theta) + return [sin, -cos, cos * y - sin * x] + + def line_cross_two_point(self, A, B): + """ + Calculate the line through given point A and B in ax + by + c =0 form. 
+ """ + angle = self.vector_angle(A, B) + return self.theta_line_cross_point(angle, A) + + def average_angle(self, poly): + """ + Calculate the average angle between left and right edge in given poly. + """ + p0, p1, p2, p3 = poly + angle30 = self.vector_angle(p3, p0) + angle21 = self.vector_angle(p2, p1) + return (angle30 + angle21) / 2 + + def line_cross_point(self, line1, line2): + """ + line1 and line2 in 0=ax+by+c form, compute the cross point of line1 and line2 + """ + a1, b1, c1 = line1 + a2, b2, c2 = line2 + d = a1 * b2 - a2 * b1 + + if d == 0: + #print("line1", line1) + #print("line2", line2) + print('Cross point does not exist') + return np.array([0, 0], dtype=np.float32) + else: + x = (b1 * c2 - b2 * c1) / d + y = (a2 * c1 - a1 * c2) / d + + return np.array([x, y], dtype=np.float32) + + def quad2tcl(self, poly, ratio): + """ + Generate center line by poly clock-wise point. (4, 2) + """ + ratio_pair = np.array( + [[0.5 - ratio / 2], [0.5 + ratio / 2]], dtype=np.float32) + p0_3 = poly[0] + (poly[3] - poly[0]) * ratio_pair + p1_2 = poly[1] + (poly[2] - poly[1]) * ratio_pair + return np.array([p0_3[0], p1_2[0], p1_2[1], p0_3[1]]) + + def poly2tcl(self, poly, ratio): + """ + Generate center line by poly clock-wise point. + """ + ratio_pair = np.array( + [[0.5 - ratio / 2], [0.5 + ratio / 2]], dtype=np.float32) + tcl_poly = np.zeros_like(poly) + point_num = poly.shape[0] + + for idx in range(point_num // 2): + point_pair = poly[idx] + (poly[point_num - 1 - idx] - poly[idx] + ) * ratio_pair + tcl_poly[idx] = point_pair[0] + tcl_poly[point_num - 1 - idx] = point_pair[1] + return tcl_poly + + def gen_quad_tbo(self, quad, tcl_mask, tbo_map): + """ + Generate tbo_map for give quad. + """ + # upper and lower line function: ax + by + c = 0; + up_line = self.line_cross_two_point(quad[0], quad[1]) + lower_line = self.line_cross_two_point(quad[3], quad[2]) + + quad_h = 0.5 * (np.linalg.norm(quad[0] - quad[3]) + + np.linalg.norm(quad[1] - quad[2])) + quad_w = 0.5 * (np.linalg.norm(quad[0] - quad[1]) + + np.linalg.norm(quad[2] - quad[3])) + + # average angle of left and right line. + angle = self.average_angle(quad) + + xy_in_poly = np.argwhere(tcl_mask == 1) + for y, x in xy_in_poly: + point = (x, y) + line = self.theta_line_cross_point(angle, point) + cross_point_upper = self.line_cross_point(up_line, line) + cross_point_lower = self.line_cross_point(lower_line, line) + ##FIX, offset reverse + upper_offset_x, upper_offset_y = cross_point_upper - point + lower_offset_x, lower_offset_y = cross_point_lower - point + tbo_map[y, x, 0] = upper_offset_y + tbo_map[y, x, 1] = upper_offset_x + tbo_map[y, x, 2] = lower_offset_y + tbo_map[y, x, 3] = lower_offset_x + tbo_map[y, x, 4] = 1.0 / max(min(quad_h, quad_w), 1.0) * 2 + return tbo_map + + def poly2quads(self, poly): + """ + Split poly into quads. 
+ """ + quad_list = [] + point_num = poly.shape[0] + + # point pair + point_pair_list = [] + for idx in range(point_num // 2): + point_pair = [poly[idx], poly[point_num - 1 - idx]] + point_pair_list.append(point_pair) + + quad_num = point_num // 2 - 1 + for idx in range(quad_num): + # reshape and adjust to clock-wise + quad_list.append((np.array(point_pair_list)[[idx, idx + 1]] + ).reshape(4, 2)[[0, 2, 3, 1]]) + + return np.array(quad_list) + + def __call__(self, data): + im = data['image'] + text_polys = data['polys'] + text_tags = data['ignore_tags'] + if im is None: + return None + if text_polys.shape[0] == 0: + return None + + h, w, _ = im.shape + text_polys, text_tags, hv_tags = self.check_and_validate_polys( + text_polys, text_tags, (h, w)) + + if text_polys.shape[0] == 0: + return None + + #set aspect ratio and keep area fix + asp_scales = np.arange(1.0, 1.55, 0.1) + asp_scale = np.random.choice(asp_scales) + + if np.random.rand() < 0.5: + asp_scale = 1.0 / asp_scale + asp_scale = math.sqrt(asp_scale) + + asp_wx = asp_scale + asp_hy = 1.0 / asp_scale + im = cv2.resize(im, dsize=None, fx=asp_wx, fy=asp_hy) + text_polys[:, :, 0] *= asp_wx + text_polys[:, :, 1] *= asp_hy + + h, w, _ = im.shape + if max(h, w) > 2048: + rd_scale = 2048.0 / max(h, w) + im = cv2.resize(im, dsize=None, fx=rd_scale, fy=rd_scale) + text_polys *= rd_scale + h, w, _ = im.shape + if min(h, w) < 16: + return None + + #no background + im, text_polys, text_tags, hv_tags = self.crop_area(im, \ + text_polys, text_tags, hv_tags, crop_background=False) + + if text_polys.shape[0] == 0: + return None + #continue for all ignore case + if np.sum((text_tags * 1.0)) >= text_tags.size: + return None + new_h, new_w, _ = im.shape + if (new_h is None) or (new_w is None): + return None + #resize image + std_ratio = float(self.input_size) / max(new_w, new_h) + rand_scales = np.array( + [0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0, 1.0, 1.0, 1.0, 1.0]) + rz_scale = std_ratio * np.random.choice(rand_scales) + im = cv2.resize(im, dsize=None, fx=rz_scale, fy=rz_scale) + text_polys[:, :, 0] *= rz_scale + text_polys[:, :, 1] *= rz_scale + + #add gaussian blur + if np.random.rand() < 0.1 * 0.5: + ks = np.random.permutation(5)[0] + 1 + ks = int(ks / 2) * 2 + 1 + im = cv2.GaussianBlur(im, ksize=(ks, ks), sigmaX=0, sigmaY=0) + #add brighter + if np.random.rand() < 0.1 * 0.5: + im = im * (1.0 + np.random.rand() * 0.5) + im = np.clip(im, 0.0, 255.0) + #add darker + if np.random.rand() < 0.1 * 0.5: + im = im * (1.0 - np.random.rand() * 0.5) + im = np.clip(im, 0.0, 255.0) + + # Padding the im to [input_size, input_size] + new_h, new_w, _ = im.shape + if min(new_w, new_h) < self.input_size * 0.5: + return None + + im_padded = np.ones( + (self.input_size, self.input_size, 3), dtype=np.float32) + im_padded[:, :, 2] = 0.485 * 255 + im_padded[:, :, 1] = 0.456 * 255 + im_padded[:, :, 0] = 0.406 * 255 + + # Random the start position + del_h = self.input_size - new_h + del_w = self.input_size - new_w + sh, sw = 0, 0 + if del_h > 1: + sh = int(np.random.rand() * del_h) + if del_w > 1: + sw = int(np.random.rand() * del_w) + + # Padding + im_padded[sh:sh + new_h, sw:sw + new_w, :] = im.copy() + text_polys[:, :, 0] += sw + text_polys[:, :, 1] += sh + + score_map, border_map, training_mask = self.generate_tcl_label( + (self.input_size, self.input_size), text_polys, text_tags, 0.25) + + # SAST head + tvo_map, tco_map = self.generate_tvo_and_tco( + (self.input_size, self.input_size), + text_polys, + text_tags, + tcl_ratio=0.3, + ds_ratio=0.25) + # 
print("test--------tvo_map shape:", tvo_map.shape) + + im_padded[:, :, 2] -= 0.485 * 255 + im_padded[:, :, 1] -= 0.456 * 255 + im_padded[:, :, 0] -= 0.406 * 255 + im_padded[:, :, 2] /= (255.0 * 0.229) + im_padded[:, :, 1] /= (255.0 * 0.224) + im_padded[:, :, 0] /= (255.0 * 0.225) + im_padded = im_padded.transpose((2, 0, 1)) + + data['image'] = im_padded[::-1, :, :] + data['score_map'] = score_map[np.newaxis, :, :] + data['border_map'] = border_map.transpose((2, 0, 1)) + data['training_mask'] = training_mask[np.newaxis, :, :] + data['tvo_map'] = tvo_map.transpose((2, 0, 1)) + data['tco_map'] = tco_map.transpose((2, 0, 1)) + return data diff --git a/backend/ppocr/data/imaug/ssl_img_aug.py b/backend/ppocr/data/imaug/ssl_img_aug.py new file mode 100644 index 0000000..f9ed6ac --- /dev/null +++ b/backend/ppocr/data/imaug/ssl_img_aug.py @@ -0,0 +1,60 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import cv2 +import numpy as np +import random +from PIL import Image + +from .rec_img_aug import resize_norm_img + + +class SSLRotateResize(object): + def __init__(self, + image_shape, + padding=False, + select_all=True, + mode="train", + **kwargs): + self.image_shape = image_shape + self.padding = padding + self.select_all = select_all + self.mode = mode + + def __call__(self, data): + img = data["image"] + + data["image_r90"] = cv2.rotate(img, cv2.ROTATE_90_CLOCKWISE) + data["image_r180"] = cv2.rotate(data["image_r90"], + cv2.ROTATE_90_CLOCKWISE) + data["image_r270"] = cv2.rotate(data["image_r180"], + cv2.ROTATE_90_CLOCKWISE) + + images = [] + for key in ["image", "image_r90", "image_r180", "image_r270"]: + images.append( + resize_norm_img( + data.pop(key), + image_shape=self.image_shape, + padding=self.padding)[0]) + data["image"] = np.stack(images, axis=0) + data["label"] = np.array(list(range(4))) + if not self.select_all: + data["image"] = data["image"][0::2] # just choose 0 and 180 + data["label"] = data["label"][0:2] # label needs to be continuous + if self.mode == "test": + data["image"] = data["image"][0] + data["label"] = data["label"][0] + return data diff --git a/backend/ppocr/data/imaug/text_image_aug/__init__.py b/backend/ppocr/data/imaug/text_image_aug/__init__.py new file mode 100644 index 0000000..bca2626 --- /dev/null +++ b/backend/ppocr/data/imaug/text_image_aug/__init__.py @@ -0,0 +1,17 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .augment import tia_perspective, tia_distort, tia_stretch + +__all__ = ['tia_distort', 'tia_stretch', 'tia_perspective'] diff --git a/backend/ppocr/data/imaug/text_image_aug/augment.py b/backend/ppocr/data/imaug/text_image_aug/augment.py new file mode 100644 index 0000000..2d15dd5 --- /dev/null +++ b/backend/ppocr/data/imaug/text_image_aug/augment.py @@ -0,0 +1,120 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/RubanSeven/Text-Image-Augmentation-python/blob/master/augment.py +""" + +import numpy as np +from .warp_mls import WarpMLS + + +def tia_distort(src, segment=4): + img_h, img_w = src.shape[:2] + + cut = img_w // segment + thresh = cut // 3 + + src_pts = list() + dst_pts = list() + + src_pts.append([0, 0]) + src_pts.append([img_w, 0]) + src_pts.append([img_w, img_h]) + src_pts.append([0, img_h]) + + dst_pts.append([np.random.randint(thresh), np.random.randint(thresh)]) + dst_pts.append( + [img_w - np.random.randint(thresh), np.random.randint(thresh)]) + dst_pts.append( + [img_w - np.random.randint(thresh), img_h - np.random.randint(thresh)]) + dst_pts.append( + [np.random.randint(thresh), img_h - np.random.randint(thresh)]) + + half_thresh = thresh * 0.5 + + for cut_idx in np.arange(1, segment, 1): + src_pts.append([cut * cut_idx, 0]) + src_pts.append([cut * cut_idx, img_h]) + dst_pts.append([ + cut * cut_idx + np.random.randint(thresh) - half_thresh, + np.random.randint(thresh) - half_thresh + ]) + dst_pts.append([ + cut * cut_idx + np.random.randint(thresh) - half_thresh, + img_h + np.random.randint(thresh) - half_thresh + ]) + + trans = WarpMLS(src, src_pts, dst_pts, img_w, img_h) + dst = trans.generate() + + return dst + + +def tia_stretch(src, segment=4): + img_h, img_w = src.shape[:2] + + cut = img_w // segment + thresh = cut * 4 // 5 + + src_pts = list() + dst_pts = list() + + src_pts.append([0, 0]) + src_pts.append([img_w, 0]) + src_pts.append([img_w, img_h]) + src_pts.append([0, img_h]) + + dst_pts.append([0, 0]) + dst_pts.append([img_w, 0]) + dst_pts.append([img_w, img_h]) + dst_pts.append([0, img_h]) + + half_thresh = thresh * 0.5 + + for cut_idx in np.arange(1, segment, 1): + move = np.random.randint(thresh) - half_thresh + src_pts.append([cut * cut_idx, 0]) + src_pts.append([cut * cut_idx, img_h]) + dst_pts.append([cut * cut_idx + move, 0]) + dst_pts.append([cut * cut_idx + move, img_h]) + + trans = WarpMLS(src, src_pts, dst_pts, img_w, img_h) + dst = trans.generate() + + return dst + + +def tia_perspective(src): + img_h, img_w = src.shape[:2] + + thresh = img_h // 2 + + src_pts = list() + dst_pts = list() + + src_pts.append([0, 0]) + src_pts.append([img_w, 0]) + src_pts.append([img_w, img_h]) + src_pts.append([0, img_h]) + + dst_pts.append([0, np.random.randint(thresh)]) + dst_pts.append([img_w, np.random.randint(thresh)]) + dst_pts.append([img_w, img_h - np.random.randint(thresh)]) + dst_pts.append([0, img_h - np.random.randint(thresh)]) + + trans = 
WarpMLS(src, src_pts, dst_pts, img_w, img_h) + dst = trans.generate() + + return dst \ No newline at end of file diff --git a/backend/ppocr/data/imaug/text_image_aug/warp_mls.py b/backend/ppocr/data/imaug/text_image_aug/warp_mls.py new file mode 100644 index 0000000..75de111 --- /dev/null +++ b/backend/ppocr/data/imaug/text_image_aug/warp_mls.py @@ -0,0 +1,168 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/RubanSeven/Text-Image-Augmentation-python/blob/master/warp_mls.py +""" + +import numpy as np + + +class WarpMLS: + def __init__(self, src, src_pts, dst_pts, dst_w, dst_h, trans_ratio=1.): + self.src = src + self.src_pts = src_pts + self.dst_pts = dst_pts + self.pt_count = len(self.dst_pts) + self.dst_w = dst_w + self.dst_h = dst_h + self.trans_ratio = trans_ratio + self.grid_size = 100 + self.rdx = np.zeros((self.dst_h, self.dst_w)) + self.rdy = np.zeros((self.dst_h, self.dst_w)) + + @staticmethod + def __bilinear_interp(x, y, v11, v12, v21, v22): + return (v11 * (1 - y) + v12 * y) * (1 - x) + (v21 * + (1 - y) + v22 * y) * x + + def generate(self): + self.calc_delta() + return self.gen_img() + + def calc_delta(self): + w = np.zeros(self.pt_count, dtype=np.float32) + + if self.pt_count < 2: + return + + i = 0 + while 1: + if self.dst_w <= i < self.dst_w + self.grid_size - 1: + i = self.dst_w - 1 + elif i >= self.dst_w: + break + + j = 0 + while 1: + if self.dst_h <= j < self.dst_h + self.grid_size - 1: + j = self.dst_h - 1 + elif j >= self.dst_h: + break + + sw = 0 + swp = np.zeros(2, dtype=np.float32) + swq = np.zeros(2, dtype=np.float32) + new_pt = np.zeros(2, dtype=np.float32) + cur_pt = np.array([i, j], dtype=np.float32) + + k = 0 + for k in range(self.pt_count): + if i == self.dst_pts[k][0] and j == self.dst_pts[k][1]: + break + + w[k] = 1. 
/ ( + (i - self.dst_pts[k][0]) * (i - self.dst_pts[k][0]) + + (j - self.dst_pts[k][1]) * (j - self.dst_pts[k][1])) + + sw += w[k] + swp = swp + w[k] * np.array(self.dst_pts[k]) + swq = swq + w[k] * np.array(self.src_pts[k]) + + if k == self.pt_count - 1: + pstar = 1 / sw * swp + qstar = 1 / sw * swq + + miu_s = 0 + for k in range(self.pt_count): + if i == self.dst_pts[k][0] and j == self.dst_pts[k][1]: + continue + pt_i = self.dst_pts[k] - pstar + miu_s += w[k] * np.sum(pt_i * pt_i) + + cur_pt -= pstar + cur_pt_j = np.array([-cur_pt[1], cur_pt[0]]) + + for k in range(self.pt_count): + if i == self.dst_pts[k][0] and j == self.dst_pts[k][1]: + continue + + pt_i = self.dst_pts[k] - pstar + pt_j = np.array([-pt_i[1], pt_i[0]]) + + tmp_pt = np.zeros(2, dtype=np.float32) + tmp_pt[0] = np.sum(pt_i * cur_pt) * self.src_pts[k][0] - \ + np.sum(pt_j * cur_pt) * self.src_pts[k][1] + tmp_pt[1] = -np.sum(pt_i * cur_pt_j) * self.src_pts[k][0] + \ + np.sum(pt_j * cur_pt_j) * self.src_pts[k][1] + tmp_pt *= (w[k] / miu_s) + new_pt += tmp_pt + + new_pt += qstar + else: + new_pt = self.src_pts[k] + + self.rdx[j, i] = new_pt[0] - i + self.rdy[j, i] = new_pt[1] - j + + j += self.grid_size + i += self.grid_size + + def gen_img(self): + src_h, src_w = self.src.shape[:2] + dst = np.zeros_like(self.src, dtype=np.float32) + + for i in np.arange(0, self.dst_h, self.grid_size): + for j in np.arange(0, self.dst_w, self.grid_size): + ni = i + self.grid_size + nj = j + self.grid_size + w = h = self.grid_size + if ni >= self.dst_h: + ni = self.dst_h - 1 + h = ni - i + 1 + if nj >= self.dst_w: + nj = self.dst_w - 1 + w = nj - j + 1 + + di = np.reshape(np.arange(h), (-1, 1)) + dj = np.reshape(np.arange(w), (1, -1)) + delta_x = self.__bilinear_interp( + di / h, dj / w, self.rdx[i, j], self.rdx[i, nj], + self.rdx[ni, j], self.rdx[ni, nj]) + delta_y = self.__bilinear_interp( + di / h, dj / w, self.rdy[i, j], self.rdy[i, nj], + self.rdy[ni, j], self.rdy[ni, nj]) + nx = j + dj + delta_x * self.trans_ratio + ny = i + di + delta_y * self.trans_ratio + nx = np.clip(nx, 0, src_w - 1) + ny = np.clip(ny, 0, src_h - 1) + nxi = np.array(np.floor(nx), dtype=np.int32) + nyi = np.array(np.floor(ny), dtype=np.int32) + nxi1 = np.array(np.ceil(nx), dtype=np.int32) + nyi1 = np.array(np.ceil(ny), dtype=np.int32) + + if len(self.src.shape) == 3: + x = np.tile(np.expand_dims(ny - nyi, axis=-1), (1, 1, 3)) + y = np.tile(np.expand_dims(nx - nxi, axis=-1), (1, 1, 3)) + else: + x = ny - nyi + y = nx - nxi + dst[i:i + h, j:j + w] = self.__bilinear_interp( + x, y, self.src[nyi, nxi], self.src[nyi, nxi1], + self.src[nyi1, nxi], self.src[nyi1, nxi1]) + + dst = np.clip(dst, 0, 255) + dst = np.array(dst, dtype=np.uint8) + + return dst diff --git a/backend/ppocr/data/imaug/vqa/__init__.py b/backend/ppocr/data/imaug/vqa/__init__.py new file mode 100644 index 0000000..a5025e7 --- /dev/null +++ b/backend/ppocr/data/imaug/vqa/__init__.py @@ -0,0 +1,19 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
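The TIA helpers above (tia_distort, tia_stretch, tia_perspective) all drive the same WarpMLS transform: choose control points on the source image, jitter them, and call generate(). A minimal sketch in the style of tia_perspective; the import path is an assumption about the local package layout:

    import numpy as np
    from ppocr.data.imaug.text_image_aug.warp_mls import WarpMLS  # path is an assumption

    img_h, img_w = 32, 100
    src = np.random.randint(0, 255, (img_h, img_w, 3), dtype=np.uint8)

    # source corners and vertically jittered targets, as in tia_perspective
    thresh = img_h // 2
    src_pts = [[0, 0], [img_w, 0], [img_w, img_h], [0, img_h]]
    dst_pts = [[0, np.random.randint(thresh)],
               [img_w, np.random.randint(thresh)],
               [img_w, img_h - np.random.randint(thresh)],
               [0, img_h - np.random.randint(thresh)]]

    dst = WarpMLS(src, src_pts, dst_pts, img_w, img_h).generate()  # uint8, same (h, w, 3) shape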
+ +from .token import VQATokenPad, VQASerTokenChunk, VQAReTokenChunk, VQAReTokenRelation + +__all__ = [ + 'VQATokenPad', 'VQASerTokenChunk', 'VQAReTokenChunk', 'VQAReTokenRelation' +] diff --git a/backend/ppocr/data/imaug/vqa/token/__init__.py b/backend/ppocr/data/imaug/vqa/token/__init__.py new file mode 100644 index 0000000..7c11566 --- /dev/null +++ b/backend/ppocr/data/imaug/vqa/token/__init__.py @@ -0,0 +1,17 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .vqa_token_chunk import VQASerTokenChunk, VQAReTokenChunk +from .vqa_token_pad import VQATokenPad +from .vqa_token_relation import VQAReTokenRelation diff --git a/backend/ppocr/data/imaug/vqa/token/vqa_token_chunk.py b/backend/ppocr/data/imaug/vqa/token/vqa_token_chunk.py new file mode 100644 index 0000000..1fa949e --- /dev/null +++ b/backend/ppocr/data/imaug/vqa/token/vqa_token_chunk.py @@ -0,0 +1,122 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
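The chunking transforms defined just below slide a max_seq_len window over long sequences; a toy sketch of that slicing (values illustrative):

    max_seq_len = 4
    input_ids = list(range(10))

    chunks = []
    for index in range(0, len(input_ids), max_seq_len):
        chunks.append(input_ids[index:index + max_seq_len])
    # chunks == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
    # VQASerTokenChunk / VQAReTokenChunk apply this per key and return only the first window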
+ +from collections import defaultdict + + +class VQASerTokenChunk(object): + def __init__(self, max_seq_len=512, infer_mode=False, **kwargs): + self.max_seq_len = max_seq_len + self.infer_mode = infer_mode + + def __call__(self, data): + encoded_inputs_all = [] + seq_len = len(data['input_ids']) + for index in range(0, seq_len, self.max_seq_len): + chunk_beg = index + chunk_end = min(index + self.max_seq_len, seq_len) + encoded_inputs_example = {} + for key in data: + if key in [ + 'label', 'input_ids', 'labels', 'token_type_ids', + 'bbox', 'attention_mask' + ]: + if self.infer_mode and key == 'labels': + encoded_inputs_example[key] = data[key] + else: + encoded_inputs_example[key] = data[key][chunk_beg: + chunk_end] + else: + encoded_inputs_example[key] = data[key] + + encoded_inputs_all.append(encoded_inputs_example) + if len(encoded_inputs_all) == 0: + return None + return encoded_inputs_all[0] + + +class VQAReTokenChunk(object): + def __init__(self, + max_seq_len=512, + entities_labels=None, + infer_mode=False, + **kwargs): + self.max_seq_len = max_seq_len + self.entities_labels = { + 'HEADER': 0, + 'QUESTION': 1, + 'ANSWER': 2 + } if entities_labels is None else entities_labels + self.infer_mode = infer_mode + + def __call__(self, data): + # prepare data + entities = data.pop('entities') + relations = data.pop('relations') + encoded_inputs_all = [] + for index in range(0, len(data["input_ids"]), self.max_seq_len): + item = {} + for key in data: + if key in [ + 'label', 'input_ids', 'labels', 'token_type_ids', + 'bbox', 'attention_mask' + ]: + if self.infer_mode and key == 'labels': + item[key] = data[key] + else: + item[key] = data[key][index:index + self.max_seq_len] + else: + item[key] = data[key] + # select entity in current chunk + entities_in_this_span = [] + global_to_local_map = {} # + for entity_id, entity in enumerate(entities): + if (index <= entity["start"] < index + self.max_seq_len and + index <= entity["end"] < index + self.max_seq_len): + entity["start"] = entity["start"] - index + entity["end"] = entity["end"] - index + global_to_local_map[entity_id] = len(entities_in_this_span) + entities_in_this_span.append(entity) + + # select relations in current chunk + relations_in_this_span = [] + for relation in relations: + if (index <= relation["start_index"] < index + self.max_seq_len + and index <= relation["end_index"] < + index + self.max_seq_len): + relations_in_this_span.append({ + "head": global_to_local_map[relation["head"]], + "tail": global_to_local_map[relation["tail"]], + "start_index": relation["start_index"] - index, + "end_index": relation["end_index"] - index, + }) + item.update({ + "entities": self.reformat(entities_in_this_span), + "relations": self.reformat(relations_in_this_span), + }) + if len(item['entities']) > 0: + item['entities']['label'] = [ + self.entities_labels[x] for x in item['entities']['label'] + ] + encoded_inputs_all.append(item) + if len(encoded_inputs_all) == 0: + return None + return encoded_inputs_all[0] + + def reformat(self, data): + new_data = defaultdict(list) + for item in data: + for k, v in item.items(): + new_data[k].append(v) + return new_data diff --git a/backend/ppocr/data/imaug/vqa/token/vqa_token_pad.py b/backend/ppocr/data/imaug/vqa/token/vqa_token_pad.py new file mode 100644 index 0000000..8e5a20f --- /dev/null +++ b/backend/ppocr/data/imaug/vqa/token/vqa_token_pad.py @@ -0,0 +1,104 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle
+import numpy as np
+
+
+class VQATokenPad(object):
+    def __init__(self,
+                 max_seq_len=512,
+                 pad_to_max_seq_len=True,
+                 return_attention_mask=True,
+                 return_token_type_ids=True,
+                 truncation_strategy="longest_first",
+                 return_overflowing_tokens=False,
+                 return_special_tokens_mask=False,
+                 infer_mode=False,
+                 **kwargs):
+        self.max_seq_len = max_seq_len
+        self.pad_to_max_seq_len = pad_to_max_seq_len
+        self.return_attention_mask = return_attention_mask
+        self.return_token_type_ids = return_token_type_ids
+        self.truncation_strategy = truncation_strategy
+        self.return_overflowing_tokens = return_overflowing_tokens
+        self.return_special_tokens_mask = return_special_tokens_mask
+        self.pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index
+        self.infer_mode = infer_mode
+
+    def __call__(self, data):
+        needs_to_be_padded = self.pad_to_max_seq_len and len(data[
+            "input_ids"]) < self.max_seq_len
+
+        if needs_to_be_padded:
+            if 'tokenizer_params' in data:
+                tokenizer_params = data.pop('tokenizer_params')
+            else:
+                tokenizer_params = dict(
+                    padding_side='right', pad_token_type_id=0, pad_token_id=1)
+
+            difference = self.max_seq_len - len(data["input_ids"])
+            if tokenizer_params['padding_side'] == 'right':
+                if self.return_attention_mask:
+                    data["attention_mask"] = [1] * len(data[
+                        "input_ids"]) + [0] * difference
+                if self.return_token_type_ids:
+                    data["token_type_ids"] = (
+                        data["token_type_ids"] +
+                        [tokenizer_params['pad_token_type_id']] * difference)
+                if self.return_special_tokens_mask:
+                    data["special_tokens_mask"] = data[
+                        "special_tokens_mask"] + [1] * difference
+                data["input_ids"] = data["input_ids"] + [
+                    tokenizer_params['pad_token_id']
+                ] * difference
+                if not self.infer_mode:
+                    data["labels"] = data[
+                        "labels"] + [self.pad_token_label_id] * difference
+                data["bbox"] = data["bbox"] + [[0, 0, 0, 0]] * difference
+            elif tokenizer_params['padding_side'] == 'left':
+                if self.return_attention_mask:
+                    data["attention_mask"] = [0] * difference + [
+                        1
+                    ] * len(data["input_ids"])
+                if self.return_token_type_ids:
+                    data["token_type_ids"] = (
+                        [tokenizer_params['pad_token_type_id']] * difference +
+                        data["token_type_ids"])
+                if self.return_special_tokens_mask:
+                    data["special_tokens_mask"] = [
+                        1
+                    ] * difference + data["special_tokens_mask"]
+                data["input_ids"] = [tokenizer_params['pad_token_id']
+                                     ] * difference + data["input_ids"]
+                if not self.infer_mode:
+                    data["labels"] = [self.pad_token_label_id
+                                      ] * difference + data["labels"]
+                data["bbox"] = [[0, 0, 0, 0]] * difference + data["bbox"]
+        else:
+            if self.return_attention_mask:
+                data["attention_mask"] = [1] * len(data["input_ids"])
+
+        for key in data:
+            if key in [
+                    'input_ids', 'labels', 'token_type_ids', 'bbox',
+                    'attention_mask'
+            ]:
+                if self.infer_mode:
+                    if key != 'labels':
+                        length = min(len(data[key]), self.max_seq_len)
+                        data[key] = data[key][:length]
+                    else:
+                        continue
+                data[key] = np.array(data[key], dtype='int64')
+        return data
diff --git
a/backend/ppocr/data/imaug/vqa/token/vqa_token_relation.py b/backend/ppocr/data/imaug/vqa/token/vqa_token_relation.py new file mode 100644 index 0000000..293988f --- /dev/null +++ b/backend/ppocr/data/imaug/vqa/token/vqa_token_relation.py @@ -0,0 +1,67 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class VQAReTokenRelation(object): + def __init__(self, **kwargs): + pass + + def __call__(self, data): + """ + build relations + """ + entities = data['entities'] + relations = data['relations'] + id2label = data.pop('id2label') + empty_entity = data.pop('empty_entity') + entity_id_to_index_map = data.pop('entity_id_to_index_map') + + relations = list(set(relations)) + relations = [ + rel for rel in relations + if rel[0] not in empty_entity and rel[1] not in empty_entity + ] + kv_relations = [] + for rel in relations: + pair = [id2label[rel[0]], id2label[rel[1]]] + if pair == ["question", "answer"]: + kv_relations.append({ + "head": entity_id_to_index_map[rel[0]], + "tail": entity_id_to_index_map[rel[1]] + }) + elif pair == ["answer", "question"]: + kv_relations.append({ + "head": entity_id_to_index_map[rel[1]], + "tail": entity_id_to_index_map[rel[0]] + }) + else: + continue + relations = sorted( + [{ + "head": rel["head"], + "tail": rel["tail"], + "start_index": self.get_relation_span(rel, entities)[0], + "end_index": self.get_relation_span(rel, entities)[1], + } for rel in kv_relations], + key=lambda x: x["head"], ) + + data['relations'] = relations + return data + + def get_relation_span(self, rel, entities): + bound = [] + for entity_index in [rel["head"], rel["tail"]]: + bound.append(entities[entity_index]["start"]) + bound.append(entities[entity_index]["end"]) + return min(bound), max(bound) diff --git a/backend/ppocr/data/lmdb_dataset.py b/backend/ppocr/data/lmdb_dataset.py new file mode 100644 index 0000000..e1b4980 --- /dev/null +++ b/backend/ppocr/data/lmdb_dataset.py @@ -0,0 +1,118 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
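LMDBDataSet below walks data_dir for leaf directories, opens each as an LMDB environment, and reads 1-based samples through num-samples / image-%09d / label-%09d keys. A minimal sketch of that key layout, reading one record directly with the lmdb package (the path and index are placeholders, not part of this commit):

# Hedged sketch of the record layout LMDBDataSet expects (placeholder path, toy index).
import lmdb

env = lmdb.open('path/to/one_lmdb_dir', max_readers=32, readonly=True,
                lock=False, readahead=False, meminit=False)
with env.begin(write=False) as txn:
    num_samples = int(txn.get('num-samples'.encode()))
    # sample indices are 1-based, which is why dataset_traversal() adds 1 to every file index
    img_bytes = txn.get('image-%09d'.encode() % 1)
    label = txn.get('label-%09d'.encode() % 1).decode('utf-8')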
+import numpy as np +import os +from paddle.io import Dataset +import lmdb +import cv2 + +from .imaug import transform, create_operators + + +class LMDBDataSet(Dataset): + def __init__(self, config, mode, logger, seed=None): + super(LMDBDataSet, self).__init__() + + global_config = config['Global'] + dataset_config = config[mode]['dataset'] + loader_config = config[mode]['loader'] + batch_size = loader_config['batch_size_per_card'] + data_dir = dataset_config['data_dir'] + self.do_shuffle = loader_config['shuffle'] + + self.lmdb_sets = self.load_hierarchical_lmdb_dataset(data_dir) + logger.info("Initialize indexs of datasets:%s" % data_dir) + self.data_idx_order_list = self.dataset_traversal() + if self.do_shuffle: + np.random.shuffle(self.data_idx_order_list) + self.ops = create_operators(dataset_config['transforms'], global_config) + + ratio_list = dataset_config.get("ratio_list", [1.0]) + self.need_reset = True in [x < 1 for x in ratio_list] + + def load_hierarchical_lmdb_dataset(self, data_dir): + lmdb_sets = {} + dataset_idx = 0 + for dirpath, dirnames, filenames in os.walk(data_dir + '/'): + if not dirnames: + env = lmdb.open( + dirpath, + max_readers=32, + readonly=True, + lock=False, + readahead=False, + meminit=False) + txn = env.begin(write=False) + num_samples = int(txn.get('num-samples'.encode())) + lmdb_sets[dataset_idx] = {"dirpath":dirpath, "env":env, \ + "txn":txn, "num_samples":num_samples} + dataset_idx += 1 + return lmdb_sets + + def dataset_traversal(self): + lmdb_num = len(self.lmdb_sets) + total_sample_num = 0 + for lno in range(lmdb_num): + total_sample_num += self.lmdb_sets[lno]['num_samples'] + data_idx_order_list = np.zeros((total_sample_num, 2)) + beg_idx = 0 + for lno in range(lmdb_num): + tmp_sample_num = self.lmdb_sets[lno]['num_samples'] + end_idx = beg_idx + tmp_sample_num + data_idx_order_list[beg_idx:end_idx, 0] = lno + data_idx_order_list[beg_idx:end_idx, 1] \ + = list(range(tmp_sample_num)) + data_idx_order_list[beg_idx:end_idx, 1] += 1 + beg_idx = beg_idx + tmp_sample_num + return data_idx_order_list + + def get_img_data(self, value): + """get_img_data""" + if not value: + return None + imgdata = np.frombuffer(value, dtype='uint8') + if imgdata is None: + return None + imgori = cv2.imdecode(imgdata, 1) + if imgori is None: + return None + return imgori + + def get_lmdb_sample_info(self, txn, index): + label_key = 'label-%09d'.encode() % index + label = txn.get(label_key) + if label is None: + return None + label = label.decode('utf-8') + img_key = 'image-%09d'.encode() % index + imgbuf = txn.get(img_key) + return imgbuf, label + + def __getitem__(self, idx): + lmdb_idx, file_idx = self.data_idx_order_list[idx] + lmdb_idx = int(lmdb_idx) + file_idx = int(file_idx) + sample_info = self.get_lmdb_sample_info(self.lmdb_sets[lmdb_idx]['txn'], + file_idx) + if sample_info is None: + return self.__getitem__(np.random.randint(self.__len__())) + img, label = sample_info + data = {'image': img, 'label': label} + outs = transform(data, self.ops) + if outs is None: + return self.__getitem__(np.random.randint(self.__len__())) + return outs + + def __len__(self): + return self.data_idx_order_list.shape[0] diff --git a/backend/ppocr/data/pgnet_dataset.py b/backend/ppocr/data/pgnet_dataset.py new file mode 100644 index 0000000..6f80179 --- /dev/null +++ b/backend/ppocr/data/pgnet_dataset.py @@ -0,0 +1,106 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import os +from paddle.io import Dataset +from .imaug import transform, create_operators +import random + + +class PGDataSet(Dataset): + def __init__(self, config, mode, logger, seed=None): + super(PGDataSet, self).__init__() + + self.logger = logger + self.seed = seed + self.mode = mode + global_config = config['Global'] + dataset_config = config[mode]['dataset'] + loader_config = config[mode]['loader'] + + self.delimiter = dataset_config.get('delimiter', '\t') + label_file_list = dataset_config.pop('label_file_list') + data_source_num = len(label_file_list) + ratio_list = dataset_config.get("ratio_list", [1.0]) + if isinstance(ratio_list, (float, int)): + ratio_list = [float(ratio_list)] * int(data_source_num) + assert len( + ratio_list + ) == data_source_num, "The length of ratio_list should be the same as the file_list." + self.data_dir = dataset_config['data_dir'] + self.do_shuffle = loader_config['shuffle'] + + logger.info("Initialize indexs of datasets:%s" % label_file_list) + self.data_lines = self.get_image_info_list(label_file_list, ratio_list) + self.data_idx_order_list = list(range(len(self.data_lines))) + if mode.lower() == "train": + self.shuffle_data_random() + + self.ops = create_operators(dataset_config['transforms'], global_config) + + self.need_reset = True in [x < 1 for x in ratio_list] + + def shuffle_data_random(self): + if self.do_shuffle: + random.seed(self.seed) + random.shuffle(self.data_lines) + return + + def get_image_info_list(self, file_list, ratio_list): + if isinstance(file_list, str): + file_list = [file_list] + data_lines = [] + for idx, file in enumerate(file_list): + with open(file, "rb") as f: + lines = f.readlines() + if self.mode == "train" or ratio_list[idx] < 1.0: + random.seed(self.seed) + lines = random.sample(lines, + round(len(lines) * ratio_list[idx])) + data_lines.extend(lines) + return data_lines + + def __getitem__(self, idx): + file_idx = self.data_idx_order_list[idx] + data_line = self.data_lines[file_idx] + img_id = 0 + try: + data_line = data_line.decode('utf-8') + substr = data_line.strip("\n").split(self.delimiter) + file_name = substr[0] + label = substr[1] + img_path = os.path.join(self.data_dir, file_name) + if self.mode.lower() == 'eval': + try: + img_id = int(data_line.split(".")[0][7:]) + except: + img_id = 0 + data = {'img_path': img_path, 'label': label, 'img_id': img_id} + if not os.path.exists(img_path): + raise Exception("{} does not exist!".format(img_path)) + with open(data['img_path'], 'rb') as f: + img = f.read() + data['image'] = img + outs = transform(data, self.ops) + except Exception as e: + self.logger.error( + "When parsing line {}, error happened with msg: {}".format( + self.data_idx_order_list[idx], e)) + outs = None + if outs is None: + return self.__getitem__(np.random.randint(self.__len__())) + return outs + + def __len__(self): + return len(self.data_idx_order_list) diff --git a/backend/ppocr/data/pubtab_dataset.py 
b/backend/ppocr/data/pubtab_dataset.py new file mode 100644 index 0000000..671cda7 --- /dev/null +++ b/backend/ppocr/data/pubtab_dataset.py @@ -0,0 +1,114 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import os +import random +from paddle.io import Dataset +import json + +from .imaug import transform, create_operators + + +class PubTabDataSet(Dataset): + def __init__(self, config, mode, logger, seed=None): + super(PubTabDataSet, self).__init__() + self.logger = logger + + global_config = config['Global'] + dataset_config = config[mode]['dataset'] + loader_config = config[mode]['loader'] + + label_file_path = dataset_config.pop('label_file_path') + + self.data_dir = dataset_config['data_dir'] + self.do_shuffle = loader_config['shuffle'] + self.do_hard_select = False + if 'hard_select' in loader_config: + self.do_hard_select = loader_config['hard_select'] + self.hard_prob = loader_config['hard_prob'] + if self.do_hard_select: + self.img_select_prob = self.load_hard_select_prob() + self.table_select_type = None + if 'table_select_type' in loader_config: + self.table_select_type = loader_config['table_select_type'] + self.table_select_prob = loader_config['table_select_prob'] + + self.seed = seed + logger.info("Initialize indexs of datasets:%s" % label_file_path) + with open(label_file_path, "rb") as f: + self.data_lines = f.readlines() + self.data_idx_order_list = list(range(len(self.data_lines))) + if mode.lower() == "train": + self.shuffle_data_random() + self.ops = create_operators(dataset_config['transforms'], global_config) + + ratio_list = dataset_config.get("ratio_list", [1.0]) + self.need_reset = True in [x < 1 for x in ratio_list] + + def shuffle_data_random(self): + if self.do_shuffle: + random.seed(self.seed) + random.shuffle(self.data_lines) + return + + def __getitem__(self, idx): + try: + data_line = self.data_lines[idx] + data_line = data_line.decode('utf-8').strip("\n") + info = json.loads(data_line) + file_name = info['filename'] + select_flag = True + if self.do_hard_select: + prob = self.img_select_prob[file_name] + if prob < random.uniform(0, 1): + select_flag = False + + if self.table_select_type: + structure = info['html']['structure']['tokens'].copy() + structure_str = ''.join(structure) + table_type = "simple" + if 'colspan' in structure_str or 'rowspan' in structure_str: + table_type = "complex" + if table_type == "complex": + if self.table_select_prob < random.uniform(0, 1): + select_flag = False + + if select_flag: + cells = info['html']['cells'].copy() + structure = info['html']['structure'].copy() + img_path = os.path.join(self.data_dir, file_name) + data = { + 'img_path': img_path, + 'cells': cells, + 'structure': structure + } + if not os.path.exists(img_path): + raise Exception("{} does not exist!".format(img_path)) + with open(data['img_path'], 'rb') as f: + img = f.read() + data['image'] = img + outs = transform(data, self.ops) + else: + outs = None + except Exception as e: 
+ self.logger.error( + "When parsing line {}, error happened with msg: {}".format( + data_line, e)) + outs = None + if outs is None: + return self.__getitem__(np.random.randint(self.__len__())) + return outs + + def __len__(self): + return len(self.data_idx_order_list) diff --git a/backend/ppocr/data/simple_dataset.py b/backend/ppocr/data/simple_dataset.py new file mode 100644 index 0000000..b5da9b8 --- /dev/null +++ b/backend/ppocr/data/simple_dataset.py @@ -0,0 +1,151 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import os +import json +import random +import traceback +from paddle.io import Dataset +from .imaug import transform, create_operators + + +class SimpleDataSet(Dataset): + def __init__(self, config, mode, logger, seed=None): + super(SimpleDataSet, self).__init__() + self.logger = logger + self.mode = mode.lower() + + global_config = config['Global'] + dataset_config = config[mode]['dataset'] + loader_config = config[mode]['loader'] + + self.delimiter = dataset_config.get('delimiter', '\t') + label_file_list = dataset_config.pop('label_file_list') + data_source_num = len(label_file_list) + ratio_list = dataset_config.get("ratio_list", [1.0]) + if isinstance(ratio_list, (float, int)): + ratio_list = [float(ratio_list)] * int(data_source_num) + + assert len( + ratio_list + ) == data_source_num, "The length of ratio_list should be the same as the file_list." 
+ self.data_dir = dataset_config['data_dir'] + self.do_shuffle = loader_config['shuffle'] + self.seed = seed + logger.info("Initialize indexs of datasets:%s" % label_file_list) + self.data_lines = self.get_image_info_list(label_file_list, ratio_list) + self.data_idx_order_list = list(range(len(self.data_lines))) + if self.mode == "train" and self.do_shuffle: + self.shuffle_data_random() + self.ops = create_operators(dataset_config['transforms'], global_config) + self.ext_op_transform_idx = dataset_config.get("ext_op_transform_idx", + 2) + self.need_reset = True in [x < 1 for x in ratio_list] + + def get_image_info_list(self, file_list, ratio_list): + if isinstance(file_list, str): + file_list = [file_list] + data_lines = [] + for idx, file in enumerate(file_list): + with open(file, "rb") as f: + lines = f.readlines() + if self.mode == "train" or ratio_list[idx] < 1.0: + random.seed(self.seed) + lines = random.sample(lines, + round(len(lines) * ratio_list[idx])) + data_lines.extend(lines) + return data_lines + + def shuffle_data_random(self): + random.seed(self.seed) + random.shuffle(self.data_lines) + return + + def _try_parse_filename_list(self, file_name): + # multiple images -> one gt label + if len(file_name) > 0 and file_name[0] == "[": + try: + info = json.loads(file_name) + file_name = random.choice(info) + except: + pass + return file_name + + def get_ext_data(self): + ext_data_num = 0 + for op in self.ops: + if hasattr(op, 'ext_data_num'): + ext_data_num = getattr(op, 'ext_data_num') + break + load_data_ops = self.ops[:self.ext_op_transform_idx] + ext_data = [] + + while len(ext_data) < ext_data_num: + file_idx = self.data_idx_order_list[np.random.randint(self.__len__( + ))] + data_line = self.data_lines[file_idx] + data_line = data_line.decode('utf-8') + substr = data_line.strip("\n").split(self.delimiter) + file_name = substr[0] + file_name = self._try_parse_filename_list(file_name) + label = substr[1] + img_path = os.path.join(self.data_dir, file_name) + data = {'img_path': img_path, 'label': label} + if not os.path.exists(img_path): + continue + with open(data['img_path'], 'rb') as f: + img = f.read() + data['image'] = img + data = transform(data, load_data_ops) + + if data is None: + continue + if 'polys' in data.keys(): + if data['polys'].shape[1] != 4: + continue + ext_data.append(data) + return ext_data + + def __getitem__(self, idx): + file_idx = self.data_idx_order_list[idx] + data_line = self.data_lines[file_idx] + try: + data_line = data_line.decode('utf-8') + substr = data_line.strip("\n").split(self.delimiter) + file_name = substr[0] + file_name = self._try_parse_filename_list(file_name) + label = substr[1] + img_path = os.path.join(self.data_dir, file_name) + data = {'img_path': img_path, 'label': label} + if not os.path.exists(img_path): + raise Exception("{} does not exist!".format(img_path)) + with open(data['img_path'], 'rb') as f: + img = f.read() + data['image'] = img + data['ext_data'] = self.get_ext_data() + outs = transform(data, self.ops) + except: + self.logger.error( + "When parsing line {}, error happened with msg: {}".format( + data_line, traceback.format_exc())) + outs = None + if outs is None: + # during evaluation, we should fix the idx to get same results for many times of evaluation. 
+ rnd_idx = np.random.randint(self.__len__( + )) if self.mode == "train" else (idx + 1) % self.__len__() + return self.__getitem__(rnd_idx) + return outs + + def __len__(self): + return len(self.data_idx_order_list) diff --git a/backend/ppocr/losses/__init__.py b/backend/ppocr/losses/__init__.py new file mode 100755 index 0000000..de8419b --- /dev/null +++ b/backend/ppocr/losses/__init__.py @@ -0,0 +1,71 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import paddle +import paddle.nn as nn + +# basic_loss +from .basic_loss import LossFromOutput + +# det loss +from .det_db_loss import DBLoss +from .det_east_loss import EASTLoss +from .det_sast_loss import SASTLoss +from .det_pse_loss import PSELoss +from .det_fce_loss import FCELoss + +# rec loss +from .rec_ctc_loss import CTCLoss +from .rec_att_loss import AttentionLoss +from .rec_srn_loss import SRNLoss +from .rec_nrtr_loss import NRTRLoss +from .rec_sar_loss import SARLoss +from .rec_aster_loss import AsterLoss +from .rec_pren_loss import PRENLoss +from .rec_multi_loss import MultiLoss + +# cls loss +from .cls_loss import ClsLoss + +# e2e loss +from .e2e_pg_loss import PGLoss +from .kie_sdmgr_loss import SDMGRLoss + +# basic loss function +from .basic_loss import DistanceLoss + +# combined loss function +from .combined_loss import CombinedLoss + +# table loss +from .table_att_loss import TableAttentionLoss + +# vqa token loss +from .vqa_token_layoutlm_loss import VQASerTokenLayoutLMLoss + + +def build_loss(config): + support_dict = [ + 'DBLoss', 'PSELoss', 'EASTLoss', 'SASTLoss', 'FCELoss', 'CTCLoss', + 'ClsLoss', 'AttentionLoss', 'SRNLoss', 'PGLoss', 'CombinedLoss', + 'NRTRLoss', 'TableAttentionLoss', 'SARLoss', 'AsterLoss', 'SDMGRLoss', + 'VQASerTokenLayoutLMLoss', 'LossFromOutput', 'PRENLoss', 'MultiLoss' + ] + config = copy.deepcopy(config) + module_name = config.pop('name') + assert module_name in support_dict, Exception('loss only support {}'.format( + support_dict)) + module_class = eval(module_name)(**config) + return module_class diff --git a/backend/ppocr/losses/ace_loss.py b/backend/ppocr/losses/ace_loss.py new file mode 100644 index 0000000..915b99e --- /dev/null +++ b/backend/ppocr/losses/ace_loss.py @@ -0,0 +1,52 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
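ACELoss below aggregates per-step class probabilities over the sequence axis and compares them with per-class character counts, so it only needs predictions of shape [B, N, num_classes] plus the label lengths and counts carried in the batch. A toy call might look like this (shapes and values are assumptions inferred from the implementation, not from a real config):

# Hedged sketch of ACELoss inputs (toy shapes; batch slots 2 and 3 follow __call__ below).
import paddle

loss_fn = ACELoss()
B, N, C = 2, 25, 37                       # batch size, sequence length, class count (assumed)
predicts = paddle.randn([B, N, C])
lengths = paddle.to_tensor([10., 12.])    # batch[2]: valid label length per sample
counts = paddle.zeros([B, C])             # batch[3]: per-class counts; class 0 is rewritten inside
out = loss_fn(predicts, [None, None, lengths, counts])
print(out['loss_ace'].shape)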
+ +# This code is refer from: https://github.com/viig99/LS-ACELoss + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn + + +class ACELoss(nn.Layer): + def __init__(self, **kwargs): + super().__init__() + self.loss_func = nn.CrossEntropyLoss( + weight=None, + ignore_index=0, + reduction='none', + soft_label=True, + axis=-1) + + def __call__(self, predicts, batch): + if isinstance(predicts, (list, tuple)): + predicts = predicts[-1] + + B, N = predicts.shape[:2] + div = paddle.to_tensor([N]).astype('float32') + + predicts = nn.functional.softmax(predicts, axis=-1) + aggregation_preds = paddle.sum(predicts, axis=1) + aggregation_preds = paddle.divide(aggregation_preds, div) + + length = batch[2].astype("float32") + batch = batch[3].astype("float32") + batch[:, 0] = paddle.subtract(div, length) + batch = paddle.divide(batch, div) + + loss = self.loss_func(aggregation_preds, batch) + return {"loss_ace": loss} diff --git a/backend/ppocr/losses/basic_loss.py b/backend/ppocr/losses/basic_loss.py new file mode 100644 index 0000000..2df96ea --- /dev/null +++ b/backend/ppocr/losses/basic_loss.py @@ -0,0 +1,155 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
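basic_loss.py below collects the reusable building blocks (CELoss, KLJSLoss, DMLLoss, DistanceLoss, LossFromOutput) that the distillation losses compose. As a toy illustration of the detection-style DML path, where use_log=False routes two probability maps through the JS divergence (random tensors stand in for model outputs):

# Hedged sketch: DMLLoss with use_log=False falls through to KLJSLoss(mode="js") below.
import paddle

dml = DMLLoss(act=None, use_log=False)
p1 = paddle.uniform([2, 1, 8, 8], min=0.01, max=0.99)   # stand-ins for two predicted maps
p2 = paddle.uniform([2, 1, 8, 8], min=0.01, max=0.99)
print(dml(p1, p2))   # JS divergence between the maps, mean-reduced over the spatial axes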
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddle.nn import L1Loss +from paddle.nn import MSELoss as L2Loss +from paddle.nn import SmoothL1Loss + + +class CELoss(nn.Layer): + def __init__(self, epsilon=None): + super().__init__() + if epsilon is not None and (epsilon <= 0 or epsilon >= 1): + epsilon = None + self.epsilon = epsilon + + def _labelsmoothing(self, target, class_num): + if target.shape[-1] != class_num: + one_hot_target = F.one_hot(target, class_num) + else: + one_hot_target = target + soft_target = F.label_smooth(one_hot_target, epsilon=self.epsilon) + soft_target = paddle.reshape(soft_target, shape=[-1, class_num]) + return soft_target + + def forward(self, x, label): + loss_dict = {} + if self.epsilon is not None: + class_num = x.shape[-1] + label = self._labelsmoothing(label, class_num) + x = -F.log_softmax(x, axis=-1) + loss = paddle.sum(x * label, axis=-1) + else: + if label.shape[-1] == x.shape[-1]: + label = F.softmax(label, axis=-1) + soft_label = True + else: + soft_label = False + loss = F.cross_entropy(x, label=label, soft_label=soft_label) + return loss + + +class KLJSLoss(object): + def __init__(self, mode='kl'): + assert mode in ['kl', 'js', 'KL', 'JS' + ], "mode can only be one of ['kl', 'js', 'KL', 'JS']" + self.mode = mode + + def __call__(self, p1, p2, reduction="mean"): + + loss = paddle.multiply(p2, paddle.log((p2 + 1e-5) / (p1 + 1e-5) + 1e-5)) + + if self.mode.lower() == "js": + loss += paddle.multiply( + p1, paddle.log((p1 + 1e-5) / (p2 + 1e-5) + 1e-5)) + loss *= 0.5 + if reduction == "mean": + loss = paddle.mean(loss, axis=[1, 2]) + elif reduction == "none" or reduction is None: + return loss + else: + loss = paddle.sum(loss, axis=[1, 2]) + + return loss + + +class DMLLoss(nn.Layer): + """ + DMLLoss + """ + + def __init__(self, act=None, use_log=False): + super().__init__() + if act is not None: + assert act in ["softmax", "sigmoid"] + if act == "softmax": + self.act = nn.Softmax(axis=-1) + elif act == "sigmoid": + self.act = nn.Sigmoid() + else: + self.act = None + + self.use_log = use_log + self.jskl_loss = KLJSLoss(mode="js") + + def _kldiv(self, x, target): + eps = 1.0e-10 + loss = target * (paddle.log(target + eps) - x) + # batch mean loss + loss = paddle.sum(loss) / loss.shape[0] + return loss + + def forward(self, out1, out2): + if self.act is not None: + out1 = self.act(out1) + 1e-10 + out2 = self.act(out2) + 1e-10 + if self.use_log: + # for recognition distillation, log is needed for feature map + log_out1 = paddle.log(out1) + log_out2 = paddle.log(out2) + loss = ( + self._kldiv(log_out1, out2) + self._kldiv(log_out2, out1)) / 2.0 + else: + # for detection distillation log is not needed + loss = self.jskl_loss(out1, out2) + return loss + + +class DistanceLoss(nn.Layer): + """ + DistanceLoss: + mode: loss mode + """ + + def __init__(self, mode="l2", **kargs): + super().__init__() + assert mode in ["l1", "l2", "smooth_l1"] + if mode == "l1": + self.loss_func = nn.L1Loss(**kargs) + elif mode == "l2": + self.loss_func = nn.MSELoss(**kargs) + elif mode == "smooth_l1": + self.loss_func = nn.SmoothL1Loss(**kargs) + + def forward(self, x, y): + return self.loss_func(x, y) + + +class LossFromOutput(nn.Layer): + def __init__(self, key='loss', reduction='none'): + super().__init__() + self.key = key + self.reduction = reduction + + def forward(self, predicts, batch): + loss = predicts[self.key] + if self.reduction == 'mean': + loss = paddle.mean(loss) + elif self.reduction == 'sum': + loss = paddle.sum(loss) + return 
{'loss': loss} diff --git a/backend/ppocr/losses/center_loss.py b/backend/ppocr/losses/center_loss.py new file mode 100644 index 0000000..f62b8af --- /dev/null +++ b/backend/ppocr/losses/center_loss.py @@ -0,0 +1,88 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +# This code is refer from: https://github.com/KaiyangZhou/pytorch-center-loss + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import os +import pickle + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class CenterLoss(nn.Layer): + """ + Reference: Wen et al. A Discriminative Feature Learning Approach for Deep Face Recognition. ECCV 2016. + """ + + def __init__(self, num_classes=6625, feat_dim=96, center_file_path=None): + super().__init__() + self.num_classes = num_classes + self.feat_dim = feat_dim + self.centers = paddle.randn( + shape=[self.num_classes, self.feat_dim]).astype("float64") + + if center_file_path is not None: + assert os.path.exists( + center_file_path + ), f"center path({center_file_path}) must exist when it is not None." + with open(center_file_path, 'rb') as f: + char_dict = pickle.load(f) + for key in char_dict.keys(): + self.centers[key] = paddle.to_tensor(char_dict[key]) + + def __call__(self, predicts, batch): + assert isinstance(predicts, (list, tuple)) + features, predicts = predicts + + feats_reshape = paddle.reshape( + features, [-1, features.shape[-1]]).astype("float64") + label = paddle.argmax(predicts, axis=2) + label = paddle.reshape(label, [label.shape[0] * label.shape[1]]) + + batch_size = feats_reshape.shape[0] + + #calc l2 distance between feats and centers + square_feat = paddle.sum(paddle.square(feats_reshape), + axis=1, + keepdim=True) + square_feat = paddle.expand(square_feat, [batch_size, self.num_classes]) + + square_center = paddle.sum(paddle.square(self.centers), + axis=1, + keepdim=True) + square_center = paddle.expand( + square_center, [self.num_classes, batch_size]).astype("float64") + square_center = paddle.transpose(square_center, [1, 0]) + + distmat = paddle.add(square_feat, square_center) + feat_dot_center = paddle.matmul(feats_reshape, + paddle.transpose(self.centers, [1, 0])) + distmat = distmat - 2.0 * feat_dot_center + + #generate the mask + classes = paddle.arange(self.num_classes).astype("int64") + label = paddle.expand( + paddle.unsqueeze(label, 1), (batch_size, self.num_classes)) + mask = paddle.equal( + paddle.expand(classes, [batch_size, self.num_classes]), + label).astype("float64") + dist = paddle.multiply(distmat, mask) + + loss = paddle.sum(paddle.clip(dist, min=1e-12, max=1e+12)) / batch_size + return {'loss_center': loss} diff --git a/backend/ppocr/losses/cls_loss.py b/backend/ppocr/losses/cls_loss.py new file mode 100755 index 0000000..abc5e5b --- /dev/null +++ b/backend/ppocr/losses/cls_loss.py @@ -0,0 +1,30 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from paddle import nn
+
+
+class ClsLoss(nn.Layer):
+    def __init__(self, **kwargs):
+        super(ClsLoss, self).__init__()
+        self.loss_func = nn.CrossEntropyLoss(reduction='mean')
+
+    def forward(self, predicts, batch):
+        label = batch[1].astype("int64")
+        loss = self.loss_func(input=predicts, label=label)
+        return {'loss': loss}
diff --git a/backend/ppocr/losses/combined_loss.py b/backend/ppocr/losses/combined_loss.py
new file mode 100644
index 0000000..f4cdee8
--- /dev/null
+++ b/backend/ppocr/losses/combined_loss.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+
+from .rec_ctc_loss import CTCLoss
+from .center_loss import CenterLoss
+from .ace_loss import ACELoss
+from .rec_sar_loss import SARLoss
+
+from .distillation_loss import DistillationCTCLoss
+from .distillation_loss import DistillationSARLoss
+from .distillation_loss import DistillationDMLLoss
+from .distillation_loss import DistillationDistanceLoss, DistillationDBLoss, DistillationDilaDBLoss
+
+
+class CombinedLoss(nn.Layer):
+    """
+    CombinedLoss:
+        a combination of loss functions
+    """
+
+    def __init__(self, loss_config_list=None):
+        super().__init__()
+        self.loss_func = []
+        self.loss_weight = []
+        assert isinstance(loss_config_list, list), (
+            'operator config should be a list')
+        for config in loss_config_list:
+            assert isinstance(config,
+                              dict) and len(config) == 1, "yaml format error"
+            name = list(config)[0]
+            param = config[name]
+            assert "weight" in param, "weight must be in param, but param just contains {}".format(
+                param.keys())
+            self.loss_weight.append(param.pop("weight"))
+            self.loss_func.append(eval(name)(**param))
+
+    def forward(self, input, batch, **kargs):
+        loss_dict = {}
+        loss_all = 0.
+ for idx, loss_func in enumerate(self.loss_func): + loss = loss_func(input, batch, **kargs) + if isinstance(loss, paddle.Tensor): + loss = {"loss_{}_{}".format(str(loss), idx): loss} + + weight = self.loss_weight[idx] + + loss = {key: loss[key] * weight for key in loss} + + if "loss" in loss: + loss_all += loss["loss"] + else: + loss_all += paddle.add_n(list(loss.values())) + loss_dict.update(loss) + loss_dict["loss"] = loss_all + return loss_dict diff --git a/backend/ppocr/losses/det_basic_loss.py b/backend/ppocr/losses/det_basic_loss.py new file mode 100644 index 0000000..61ea579 --- /dev/null +++ b/backend/ppocr/losses/det_basic_loss.py @@ -0,0 +1,153 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/WenmuZhou/DBNet.pytorch/blob/master/models/losses/basic_loss.py +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +import paddle +from paddle import nn +import paddle.nn.functional as F + + +class BalanceLoss(nn.Layer): + def __init__(self, + balance_loss=True, + main_loss_type='DiceLoss', + negative_ratio=3, + return_origin=False, + eps=1e-6, + **kwargs): + """ + The BalanceLoss for Differentiable Binarization text detection + args: + balance_loss (bool): whether balance loss or not, default is True + main_loss_type (str): can only be one of ['CrossEntropy','DiceLoss', + 'Euclidean','BCELoss', 'MaskL1Loss'], default is 'DiceLoss'. + negative_ratio (int|float): float, default is 3. + return_origin (bool): whether return unbalanced loss or not, default is False. + eps (float): default is 1e-6. + """ + super(BalanceLoss, self).__init__() + self.balance_loss = balance_loss + self.main_loss_type = main_loss_type + self.negative_ratio = negative_ratio + self.return_origin = return_origin + self.eps = eps + + if self.main_loss_type == "CrossEntropy": + self.loss = nn.CrossEntropyLoss() + elif self.main_loss_type == "Euclidean": + self.loss = nn.MSELoss() + elif self.main_loss_type == "DiceLoss": + self.loss = DiceLoss(self.eps) + elif self.main_loss_type == "BCELoss": + self.loss = BCELoss(reduction='none') + elif self.main_loss_type == "MaskL1Loss": + self.loss = MaskL1Loss(self.eps) + else: + loss_type = [ + 'CrossEntropy', 'DiceLoss', 'Euclidean', 'BCELoss', 'MaskL1Loss' + ] + raise Exception( + "main_loss_type in BalanceLoss() can only be one of {}".format( + loss_type)) + + def forward(self, pred, gt, mask=None): + """ + The BalanceLoss for Differentiable Binarization text detection + args: + pred (variable): predicted feature maps. + gt (variable): ground truth feature maps. + mask (variable): masked maps. 
+ return: (variable) balanced loss + """ + positive = gt * mask + negative = (1 - gt) * mask + + positive_count = int(positive.sum()) + negative_count = int( + min(negative.sum(), positive_count * self.negative_ratio)) + loss = self.loss(pred, gt, mask=mask) + + if not self.balance_loss: + return loss + + positive_loss = positive * loss + negative_loss = negative * loss + negative_loss = paddle.reshape(negative_loss, shape=[-1]) + if negative_count > 0: + sort_loss = negative_loss.sort(descending=True) + negative_loss = sort_loss[:negative_count] + # negative_loss, _ = paddle.topk(negative_loss, k=negative_count_int) + balance_loss = (positive_loss.sum() + negative_loss.sum()) / ( + positive_count + negative_count + self.eps) + else: + balance_loss = positive_loss.sum() / (positive_count + self.eps) + if self.return_origin: + return balance_loss, loss + + return balance_loss + + +class DiceLoss(nn.Layer): + def __init__(self, eps=1e-6): + super(DiceLoss, self).__init__() + self.eps = eps + + def forward(self, pred, gt, mask, weights=None): + """ + DiceLoss function. + """ + + assert pred.shape == gt.shape + assert pred.shape == mask.shape + if weights is not None: + assert weights.shape == mask.shape + mask = weights * mask + intersection = paddle.sum(pred * gt * mask) + + union = paddle.sum(pred * mask) + paddle.sum(gt * mask) + self.eps + loss = 1 - 2.0 * intersection / union + assert loss <= 1 + return loss + + +class MaskL1Loss(nn.Layer): + def __init__(self, eps=1e-6): + super(MaskL1Loss, self).__init__() + self.eps = eps + + def forward(self, pred, gt, mask): + """ + Mask L1 Loss + """ + loss = (paddle.abs(pred - gt) * mask).sum() / (mask.sum() + self.eps) + loss = paddle.mean(loss) + return loss + + +class BCELoss(nn.Layer): + def __init__(self, reduction='mean'): + super(BCELoss, self).__init__() + self.reduction = reduction + + def forward(self, input, label, mask=None, weight=None, name=None): + loss = F.binary_cross_entropy(input, label, reduction=self.reduction) + return loss diff --git a/backend/ppocr/losses/det_db_loss.py b/backend/ppocr/losses/det_db_loss.py new file mode 100755 index 0000000..708ffbd --- /dev/null +++ b/backend/ppocr/losses/det_db_loss.py @@ -0,0 +1,76 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
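DBLoss below reads the three predicted channels (shrink, threshold, binary) out of predicts['maps'] and pairs them with four ground-truth maps passed as labels[1:]. A toy call showing that layout (shapes and random values are assumptions for illustration only):

# Hedged sketch of the DBLoss input layout (channel order per forward() below; toy tensors).
import paddle

db_loss = DBLoss(alpha=5, beta=10, ohem_ratio=3)
predicts = {'maps': paddle.rand([1, 3, 160, 160])}   # [shrink, threshold, binary]
labels = [
    None,                                            # labels[0] is not used by this loss
    paddle.rand([1, 160, 160]),                      # label_threshold_map
    paddle.ones([1, 160, 160]),                      # label_threshold_mask
    (paddle.rand([1, 160, 160]) > 0.5).astype('float32'),  # label_shrink_map
    paddle.ones([1, 160, 160]),                      # label_shrink_mask
]
print(db_loss(predicts, labels)['loss'])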
+""" +This code is refer from: +https://github.com/WenmuZhou/DBNet.pytorch/blob/master/models/losses/DB_loss.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import nn + +from .det_basic_loss import BalanceLoss, MaskL1Loss, DiceLoss + + +class DBLoss(nn.Layer): + """ + Differentiable Binarization (DB) Loss Function + args: + param (dict): the super paramter for DB Loss + """ + + def __init__(self, + balance_loss=True, + main_loss_type='DiceLoss', + alpha=5, + beta=10, + ohem_ratio=3, + eps=1e-6, + **kwargs): + super(DBLoss, self).__init__() + self.alpha = alpha + self.beta = beta + self.dice_loss = DiceLoss(eps=eps) + self.l1_loss = MaskL1Loss(eps=eps) + self.bce_loss = BalanceLoss( + balance_loss=balance_loss, + main_loss_type=main_loss_type, + negative_ratio=ohem_ratio) + + def forward(self, predicts, labels): + predict_maps = predicts['maps'] + label_threshold_map, label_threshold_mask, label_shrink_map, label_shrink_mask = labels[ + 1:] + shrink_maps = predict_maps[:, 0, :, :] + threshold_maps = predict_maps[:, 1, :, :] + binary_maps = predict_maps[:, 2, :, :] + + loss_shrink_maps = self.bce_loss(shrink_maps, label_shrink_map, + label_shrink_mask) + loss_threshold_maps = self.l1_loss(threshold_maps, label_threshold_map, + label_threshold_mask) + loss_binary_maps = self.dice_loss(binary_maps, label_shrink_map, + label_shrink_mask) + loss_shrink_maps = self.alpha * loss_shrink_maps + loss_threshold_maps = self.beta * loss_threshold_maps + + loss_all = loss_shrink_maps + loss_threshold_maps \ + + loss_binary_maps + losses = {'loss': loss_all, \ + "loss_shrink_maps": loss_shrink_maps, \ + "loss_threshold_maps": loss_threshold_maps, \ + "loss_binary_maps": loss_binary_maps} + return losses diff --git a/backend/ppocr/losses/det_east_loss.py b/backend/ppocr/losses/det_east_loss.py new file mode 100644 index 0000000..bcf5372 --- /dev/null +++ b/backend/ppocr/losses/det_east_loss.py @@ -0,0 +1,63 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +from .det_basic_loss import DiceLoss + + +class EASTLoss(nn.Layer): + """ + """ + + def __init__(self, + eps=1e-6, + **kwargs): + super(EASTLoss, self).__init__() + self.dice_loss = DiceLoss(eps=eps) + + def forward(self, predicts, labels): + l_score, l_geo, l_mask = labels[1:] + f_score = predicts['f_score'] + f_geo = predicts['f_geo'] + + dice_loss = self.dice_loss(f_score, l_score, l_mask) + + #smoooth_l1_loss + channels = 8 + l_geo_split = paddle.split( + l_geo, num_or_sections=channels + 1, axis=1) + f_geo_split = paddle.split(f_geo, num_or_sections=channels, axis=1) + smooth_l1 = 0 + for i in range(0, channels): + geo_diff = l_geo_split[i] - f_geo_split[i] + abs_geo_diff = paddle.abs(geo_diff) + smooth_l1_sign = paddle.less_than(abs_geo_diff, l_score) + smooth_l1_sign = paddle.cast(smooth_l1_sign, dtype='float32') + in_loss = abs_geo_diff * abs_geo_diff * smooth_l1_sign + \ + (abs_geo_diff - 0.5) * (1.0 - smooth_l1_sign) + out_loss = l_geo_split[-1] / channels * in_loss * l_score + smooth_l1 += out_loss + smooth_l1_loss = paddle.mean(smooth_l1 * l_score) + + dice_loss = dice_loss * 0.01 + total_loss = dice_loss + smooth_l1_loss + losses = {"loss":total_loss, \ + "dice_loss":dice_loss,\ + "smooth_l1_loss":smooth_l1_loss} + return losses diff --git a/backend/ppocr/losses/det_fce_loss.py b/backend/ppocr/losses/det_fce_loss.py new file mode 100644 index 0000000..d7dfb5a --- /dev/null +++ b/backend/ppocr/losses/det_fce_loss.py @@ -0,0 +1,227 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/losses/fce_loss.py +""" + +import numpy as np +from paddle import nn +import paddle +import paddle.nn.functional as F +from functools import partial + + +def multi_apply(func, *args, **kwargs): + pfunc = partial(func, **kwargs) if kwargs else func + map_results = map(pfunc, *args) + return tuple(map(list, zip(*map_results))) + + +class FCELoss(nn.Layer): + """The class for implementing FCENet loss + FCENet(CVPR2021): Fourier Contour Embedding for Arbitrary-shaped + Text Detection + + [https://arxiv.org/abs/2104.10442] + + Args: + fourier_degree (int) : The maximum Fourier transform degree k. + num_sample (int) : The sampling points number of regression + loss. If it is too small, fcenet tends to be overfitting. + ohem_ratio (float): the negative/positive ratio in OHEM. 
+ """ + + def __init__(self, fourier_degree, num_sample, ohem_ratio=3.): + super().__init__() + self.fourier_degree = fourier_degree + self.num_sample = num_sample + self.ohem_ratio = ohem_ratio + + def forward(self, preds, labels): + assert isinstance(preds, dict) + preds = preds['levels'] + + p3_maps, p4_maps, p5_maps = labels[1:] + assert p3_maps[0].shape[0] == 4 * self.fourier_degree + 5,\ + 'fourier degree not equal in FCEhead and FCEtarget' + + # to tensor + gts = [p3_maps, p4_maps, p5_maps] + for idx, maps in enumerate(gts): + gts[idx] = paddle.to_tensor(np.stack(maps)) + + losses = multi_apply(self.forward_single, preds, gts) + + loss_tr = paddle.to_tensor(0.).astype('float32') + loss_tcl = paddle.to_tensor(0.).astype('float32') + loss_reg_x = paddle.to_tensor(0.).astype('float32') + loss_reg_y = paddle.to_tensor(0.).astype('float32') + loss_all = paddle.to_tensor(0.).astype('float32') + + for idx, loss in enumerate(losses): + loss_all += sum(loss) + if idx == 0: + loss_tr += sum(loss) + elif idx == 1: + loss_tcl += sum(loss) + elif idx == 2: + loss_reg_x += sum(loss) + else: + loss_reg_y += sum(loss) + + results = dict( + loss=loss_all, + loss_text=loss_tr, + loss_center=loss_tcl, + loss_reg_x=loss_reg_x, + loss_reg_y=loss_reg_y, ) + return results + + def forward_single(self, pred, gt): + cls_pred = paddle.transpose(pred[0], (0, 2, 3, 1)) + reg_pred = paddle.transpose(pred[1], (0, 2, 3, 1)) + gt = paddle.transpose(gt, (0, 2, 3, 1)) + + k = 2 * self.fourier_degree + 1 + tr_pred = paddle.reshape(cls_pred[:, :, :, :2], (-1, 2)) + tcl_pred = paddle.reshape(cls_pred[:, :, :, 2:], (-1, 2)) + x_pred = paddle.reshape(reg_pred[:, :, :, 0:k], (-1, k)) + y_pred = paddle.reshape(reg_pred[:, :, :, k:2 * k], (-1, k)) + + tr_mask = gt[:, :, :, :1].reshape([-1]) + tcl_mask = gt[:, :, :, 1:2].reshape([-1]) + train_mask = gt[:, :, :, 2:3].reshape([-1]) + x_map = paddle.reshape(gt[:, :, :, 3:3 + k], (-1, k)) + y_map = paddle.reshape(gt[:, :, :, 3 + k:], (-1, k)) + + tr_train_mask = (train_mask * tr_mask).astype('bool') + tr_train_mask2 = paddle.concat( + [tr_train_mask.unsqueeze(1), tr_train_mask.unsqueeze(1)], axis=1) + # tr loss + loss_tr = self.ohem(tr_pred, tr_mask, train_mask) + # tcl loss + loss_tcl = paddle.to_tensor(0.).astype('float32') + tr_neg_mask = tr_train_mask.logical_not() + tr_neg_mask2 = paddle.concat( + [tr_neg_mask.unsqueeze(1), tr_neg_mask.unsqueeze(1)], axis=1) + if tr_train_mask.sum().item() > 0: + loss_tcl_pos = F.cross_entropy( + tcl_pred.masked_select(tr_train_mask2).reshape([-1, 2]), + tcl_mask.masked_select(tr_train_mask).astype('int64')) + loss_tcl_neg = F.cross_entropy( + tcl_pred.masked_select(tr_neg_mask2).reshape([-1, 2]), + tcl_mask.masked_select(tr_neg_mask).astype('int64')) + loss_tcl = loss_tcl_pos + 0.5 * loss_tcl_neg + + # regression loss + loss_reg_x = paddle.to_tensor(0.).astype('float32') + loss_reg_y = paddle.to_tensor(0.).astype('float32') + if tr_train_mask.sum().item() > 0: + weight = (tr_mask.masked_select(tr_train_mask.astype('bool')) + .astype('float32') + tcl_mask.masked_select( + tr_train_mask.astype('bool')).astype('float32')) / 2 + weight = weight.reshape([-1, 1]) + + ft_x, ft_y = self.fourier2poly(x_map, y_map) + ft_x_pre, ft_y_pre = self.fourier2poly(x_pred, y_pred) + + dim = ft_x.shape[1] + + tr_train_mask3 = paddle.concat( + [tr_train_mask.unsqueeze(1) for i in range(dim)], axis=1) + + loss_reg_x = paddle.mean(weight * F.smooth_l1_loss( + ft_x_pre.masked_select(tr_train_mask3).reshape([-1, dim]), + 
ft_x.masked_select(tr_train_mask3).reshape([-1, dim]), + reduction='none')) + loss_reg_y = paddle.mean(weight * F.smooth_l1_loss( + ft_y_pre.masked_select(tr_train_mask3).reshape([-1, dim]), + ft_y.masked_select(tr_train_mask3).reshape([-1, dim]), + reduction='none')) + + return loss_tr, loss_tcl, loss_reg_x, loss_reg_y + + def ohem(self, predict, target, train_mask): + + pos = (target * train_mask).astype('bool') + neg = ((1 - target) * train_mask).astype('bool') + + pos2 = paddle.concat([pos.unsqueeze(1), pos.unsqueeze(1)], axis=1) + neg2 = paddle.concat([neg.unsqueeze(1), neg.unsqueeze(1)], axis=1) + + n_pos = pos.astype('float32').sum() + + if n_pos.item() > 0: + loss_pos = F.cross_entropy( + predict.masked_select(pos2).reshape([-1, 2]), + target.masked_select(pos).astype('int64'), + reduction='sum') + loss_neg = F.cross_entropy( + predict.masked_select(neg2).reshape([-1, 2]), + target.masked_select(neg).astype('int64'), + reduction='none') + n_neg = min( + int(neg.astype('float32').sum().item()), + int(self.ohem_ratio * n_pos.astype('float32'))) + else: + loss_pos = paddle.to_tensor(0.) + loss_neg = F.cross_entropy( + predict.masked_select(neg2).reshape([-1, 2]), + target.masked_select(neg).astype('int64'), + reduction='none') + n_neg = 100 + if len(loss_neg) > n_neg: + loss_neg, _ = paddle.topk(loss_neg, n_neg) + + return (loss_pos + loss_neg.sum()) / (n_pos + n_neg).astype('float32') + + def fourier2poly(self, real_maps, imag_maps): + """Transform Fourier coefficient maps to polygon maps. + + Args: + real_maps (tensor): A map composed of the real parts of the + Fourier coefficients, whose shape is (-1, 2k+1) + imag_maps (tensor):A map composed of the imag parts of the + Fourier coefficients, whose shape is (-1, 2k+1) + + Returns + x_maps (tensor): A map composed of the x value of the polygon + represented by n sample points (xn, yn), whose shape is (-1, n) + y_maps (tensor): A map composed of the y value of the polygon + represented by n sample points (xn, yn), whose shape is (-1, n) + """ + + k_vect = paddle.arange( + -self.fourier_degree, self.fourier_degree + 1, + dtype='float32').reshape([-1, 1]) + i_vect = paddle.arange( + 0, self.num_sample, dtype='float32').reshape([1, -1]) + + transform_matrix = 2 * np.pi / self.num_sample * paddle.matmul(k_vect, + i_vect) + + x1 = paddle.einsum('ak, kn-> an', real_maps, + paddle.cos(transform_matrix)) + x2 = paddle.einsum('ak, kn-> an', imag_maps, + paddle.sin(transform_matrix)) + y1 = paddle.einsum('ak, kn-> an', real_maps, + paddle.sin(transform_matrix)) + y2 = paddle.einsum('ak, kn-> an', imag_maps, + paddle.cos(transform_matrix)) + + x_maps = x1 - x2 + y_maps = y1 + y2 + + return x_maps, y_maps diff --git a/backend/ppocr/losses/det_pse_loss.py b/backend/ppocr/losses/det_pse_loss.py new file mode 100644 index 0000000..6b31343 --- /dev/null +++ b/backend/ppocr/losses/det_pse_loss.py @@ -0,0 +1,149 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
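PSELoss below upsamples the network output by 4x inside forward(), treats channel 0 as the text map and the remaining channels as progressively shrunk kernels, and applies OHEM to the text term. A toy call (the channel count and resolutions are assumptions; ground-truth maps must already be at the upsampled resolution):

# Hedged sketch of PSELoss inputs (toy shapes; forward() below interpolates maps by 4x).
import paddle

pse_loss = PSELoss(alpha=0.7, ohem_ratio=3, reduction='mean')
outputs = {'maps': paddle.rand([1, 7, 40, 40])}    # 1 text channel + 6 kernel channels (assumed)
labels = [
    None,
    (paddle.rand([1, 160, 160]) > 0.5).astype('float32'),      # gt_texts
    (paddle.rand([1, 6, 160, 160]) > 0.5).astype('float32'),   # gt_kernels
    paddle.ones([1, 160, 160]),                                # training_masks
]
print(pse_loss(outputs, labels)['loss'])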
+""" +This code is refer from: +https://github.com/whai362/PSENet/blob/python3/models/head/psenet_head.py +""" + +import paddle +from paddle import nn +from paddle.nn import functional as F +import numpy as np +from ppocr.utils.iou import iou + + +class PSELoss(nn.Layer): + def __init__(self, + alpha, + ohem_ratio=3, + kernel_sample_mask='pred', + reduction='sum', + eps=1e-6, + **kwargs): + """Implement PSE Loss. + """ + super(PSELoss, self).__init__() + assert reduction in ['sum', 'mean', 'none'] + self.alpha = alpha + self.ohem_ratio = ohem_ratio + self.kernel_sample_mask = kernel_sample_mask + self.reduction = reduction + self.eps = eps + + def forward(self, outputs, labels): + predicts = outputs['maps'] + predicts = F.interpolate(predicts, scale_factor=4) + + texts = predicts[:, 0, :, :] + kernels = predicts[:, 1:, :, :] + gt_texts, gt_kernels, training_masks = labels[1:] + + # text loss + selected_masks = self.ohem_batch(texts, gt_texts, training_masks) + + loss_text = self.dice_loss(texts, gt_texts, selected_masks) + iou_text = iou((texts > 0).astype('int64'), + gt_texts, + training_masks, + reduce=False) + losses = dict(loss_text=loss_text, iou_text=iou_text) + + # kernel loss + loss_kernels = [] + if self.kernel_sample_mask == 'gt': + selected_masks = gt_texts * training_masks + elif self.kernel_sample_mask == 'pred': + selected_masks = ( + F.sigmoid(texts) > 0.5).astype('float32') * training_masks + + for i in range(kernels.shape[1]): + kernel_i = kernels[:, i, :, :] + gt_kernel_i = gt_kernels[:, i, :, :] + loss_kernel_i = self.dice_loss(kernel_i, gt_kernel_i, + selected_masks) + loss_kernels.append(loss_kernel_i) + loss_kernels = paddle.mean(paddle.stack(loss_kernels, axis=1), axis=1) + iou_kernel = iou((kernels[:, -1, :, :] > 0).astype('int64'), + gt_kernels[:, -1, :, :], + training_masks * gt_texts, + reduce=False) + losses.update(dict(loss_kernels=loss_kernels, iou_kernel=iou_kernel)) + loss = self.alpha * loss_text + (1 - self.alpha) * loss_kernels + losses['loss'] = loss + if self.reduction == 'sum': + losses = {x: paddle.sum(v) for x, v in losses.items()} + elif self.reduction == 'mean': + losses = {x: paddle.mean(v) for x, v in losses.items()} + return losses + + def dice_loss(self, input, target, mask): + input = F.sigmoid(input) + + input = input.reshape([input.shape[0], -1]) + target = target.reshape([target.shape[0], -1]) + mask = mask.reshape([mask.shape[0], -1]) + + input = input * mask + target = target * mask + + a = paddle.sum(input * target, 1) + b = paddle.sum(input * input, 1) + self.eps + c = paddle.sum(target * target, 1) + self.eps + d = (2 * a) / (b + c) + return 1 - d + + def ohem_single(self, score, gt_text, training_mask, ohem_ratio=3): + pos_num = int(paddle.sum((gt_text > 0.5).astype('float32'))) - int( + paddle.sum( + paddle.logical_and((gt_text > 0.5), (training_mask <= 0.5)) + .astype('float32'))) + + if pos_num == 0: + selected_mask = training_mask + selected_mask = selected_mask.reshape( + [1, selected_mask.shape[0], selected_mask.shape[1]]).astype( + 'float32') + return selected_mask + + neg_num = int(paddle.sum((gt_text <= 0.5).astype('float32'))) + neg_num = int(min(pos_num * ohem_ratio, neg_num)) + + if neg_num == 0: + selected_mask = training_mask + selected_mask = selected_mask.reshape( + [1, selected_mask.shape[0], selected_mask.shape[1]]).astype( + 'float32') + return selected_mask + + neg_score = paddle.masked_select(score, gt_text <= 0.5) + neg_score_sorted = paddle.sort(-neg_score) + threshold = -neg_score_sorted[neg_num - 1] + + 
selected_mask = paddle.logical_and( + paddle.logical_or((score >= threshold), (gt_text > 0.5)), + (training_mask > 0.5)) + selected_mask = selected_mask.reshape( + [1, selected_mask.shape[0], selected_mask.shape[1]]).astype( + 'float32') + return selected_mask + + def ohem_batch(self, scores, gt_texts, training_masks, ohem_ratio=3): + selected_masks = [] + for i in range(scores.shape[0]): + selected_masks.append( + self.ohem_single(scores[i, :, :], gt_texts[i, :, :], + training_masks[i, :, :], ohem_ratio)) + + selected_masks = paddle.concat(selected_masks, 0).astype('float32') + return selected_masks diff --git a/backend/ppocr/losses/det_sast_loss.py b/backend/ppocr/losses/det_sast_loss.py new file mode 100644 index 0000000..2e0c756 --- /dev/null +++ b/backend/ppocr/losses/det_sast_loss.py @@ -0,0 +1,121 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +from .det_basic_loss import DiceLoss +import numpy as np + + +class SASTLoss(nn.Layer): + """ + """ + + def __init__(self, eps=1e-6, **kwargs): + super(SASTLoss, self).__init__() + self.dice_loss = DiceLoss(eps=eps) + + def forward(self, predicts, labels): + """ + tcl_pos: N x 128 x 3 + tcl_mask: N x 128 x 1 + tcl_label: N x X list or LoDTensor + """ + + f_score = predicts['f_score'] + f_border = predicts['f_border'] + f_tvo = predicts['f_tvo'] + f_tco = predicts['f_tco'] + + l_score, l_border, l_mask, l_tvo, l_tco = labels[1:] + + #score_loss + intersection = paddle.sum(f_score * l_score * l_mask) + union = paddle.sum(f_score * l_mask) + paddle.sum(l_score * l_mask) + score_loss = 1.0 - 2 * intersection / (union + 1e-5) + + #border loss + l_border_split, l_border_norm = paddle.split( + l_border, num_or_sections=[4, 1], axis=1) + f_border_split = f_border + border_ex_shape = l_border_norm.shape * np.array([1, 4, 1, 1]) + l_border_norm_split = paddle.expand( + x=l_border_norm, shape=border_ex_shape) + l_border_score = paddle.expand(x=l_score, shape=border_ex_shape) + l_border_mask = paddle.expand(x=l_mask, shape=border_ex_shape) + + border_diff = l_border_split - f_border_split + abs_border_diff = paddle.abs(border_diff) + border_sign = abs_border_diff < 1.0 + border_sign = paddle.cast(border_sign, dtype='float32') + border_sign.stop_gradient = True + border_in_loss = 0.5 * abs_border_diff * abs_border_diff * border_sign + \ + (abs_border_diff - 0.5) * (1.0 - border_sign) + border_out_loss = l_border_norm_split * border_in_loss + border_loss = paddle.sum(border_out_loss * l_border_score * l_border_mask) / \ + (paddle.sum(l_border_score * l_border_mask) + 1e-5) + + #tvo_loss + l_tvo_split, l_tvo_norm = paddle.split( + l_tvo, num_or_sections=[8, 1], axis=1) + f_tvo_split = f_tvo + tvo_ex_shape = l_tvo_norm.shape * np.array([1, 8, 1, 1]) + l_tvo_norm_split = paddle.expand(x=l_tvo_norm, shape=tvo_ex_shape) + l_tvo_score = 
paddle.expand(x=l_score, shape=tvo_ex_shape) + l_tvo_mask = paddle.expand(x=l_mask, shape=tvo_ex_shape) + # + tvo_geo_diff = l_tvo_split - f_tvo_split + abs_tvo_geo_diff = paddle.abs(tvo_geo_diff) + tvo_sign = abs_tvo_geo_diff < 1.0 + tvo_sign = paddle.cast(tvo_sign, dtype='float32') + tvo_sign.stop_gradient = True + tvo_in_loss = 0.5 * abs_tvo_geo_diff * abs_tvo_geo_diff * tvo_sign + \ + (abs_tvo_geo_diff - 0.5) * (1.0 - tvo_sign) + tvo_out_loss = l_tvo_norm_split * tvo_in_loss + tvo_loss = paddle.sum(tvo_out_loss * l_tvo_score * l_tvo_mask) / \ + (paddle.sum(l_tvo_score * l_tvo_mask) + 1e-5) + + #tco_loss + l_tco_split, l_tco_norm = paddle.split( + l_tco, num_or_sections=[2, 1], axis=1) + f_tco_split = f_tco + tco_ex_shape = l_tco_norm.shape * np.array([1, 2, 1, 1]) + l_tco_norm_split = paddle.expand(x=l_tco_norm, shape=tco_ex_shape) + l_tco_score = paddle.expand(x=l_score, shape=tco_ex_shape) + l_tco_mask = paddle.expand(x=l_mask, shape=tco_ex_shape) + + tco_geo_diff = l_tco_split - f_tco_split + abs_tco_geo_diff = paddle.abs(tco_geo_diff) + tco_sign = abs_tco_geo_diff < 1.0 + tco_sign = paddle.cast(tco_sign, dtype='float32') + tco_sign.stop_gradient = True + tco_in_loss = 0.5 * abs_tco_geo_diff * abs_tco_geo_diff * tco_sign + \ + (abs_tco_geo_diff - 0.5) * (1.0 - tco_sign) + tco_out_loss = l_tco_norm_split * tco_in_loss + tco_loss = paddle.sum(tco_out_loss * l_tco_score * l_tco_mask) / \ + (paddle.sum(l_tco_score * l_tco_mask) + 1e-5) + + # total loss + tvo_lw, tco_lw = 1.5, 1.5 + score_lw, border_lw = 1.0, 1.0 + total_loss = score_loss * score_lw + border_loss * border_lw + \ + tvo_loss * tvo_lw + tco_loss * tco_lw + + losses = {'loss':total_loss, "score_loss":score_loss,\ + "border_loss":border_loss, 'tvo_loss':tvo_loss, 'tco_loss':tco_loss} + return losses diff --git a/backend/ppocr/losses/distillation_loss.py b/backend/ppocr/losses/distillation_loss.py new file mode 100644 index 0000000..565b066 --- /dev/null +++ b/backend/ppocr/losses/distillation_loss.py @@ -0,0 +1,324 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import paddle +import paddle.nn as nn +import numpy as np +import cv2 + +from .rec_ctc_loss import CTCLoss +from .rec_sar_loss import SARLoss +from .basic_loss import DMLLoss +from .basic_loss import DistanceLoss +from .det_db_loss import DBLoss +from .det_basic_loss import BalanceLoss, MaskL1Loss, DiceLoss + + +def _sum_loss(loss_dict): + if "loss" in loss_dict.keys(): + return loss_dict + else: + loss_dict["loss"] = 0. 
+ for k, value in loss_dict.items(): + if k == "loss": + continue + else: + loss_dict["loss"] += value + return loss_dict + + +class DistillationDMLLoss(DMLLoss): + """ + """ + + def __init__(self, + model_name_pairs=[], + act=None, + use_log=False, + key=None, + multi_head=False, + dis_head='ctc', + maps_name=None, + name="dml"): + super().__init__(act=act, use_log=use_log) + assert isinstance(model_name_pairs, list) + self.key = key + self.multi_head = multi_head + self.dis_head = dis_head + self.model_name_pairs = self._check_model_name_pairs(model_name_pairs) + self.name = name + self.maps_name = self._check_maps_name(maps_name) + + def _check_model_name_pairs(self, model_name_pairs): + if not isinstance(model_name_pairs, list): + return [] + elif isinstance(model_name_pairs[0], list) and isinstance( + model_name_pairs[0][0], str): + return model_name_pairs + else: + return [model_name_pairs] + + def _check_maps_name(self, maps_name): + if maps_name is None: + return None + elif type(maps_name) == str: + return [maps_name] + elif type(maps_name) == list: + return [maps_name] + else: + return None + + def _slice_out(self, outs): + new_outs = {} + for k in self.maps_name: + if k == "thrink_maps": + new_outs[k] = outs[:, 0, :, :] + elif k == "threshold_maps": + new_outs[k] = outs[:, 1, :, :] + elif k == "binary_maps": + new_outs[k] = outs[:, 2, :, :] + else: + continue + return new_outs + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + + if self.maps_name is None: + if self.multi_head: + loss = super().forward(out1[self.dis_head], + out2[self.dis_head]) + else: + loss = super().forward(out1, out2) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}_{}_{}".format(key, pair[0], pair[1], + idx)] = loss[key] + else: + loss_dict["{}_{}".format(self.name, idx)] = loss + else: + outs1 = self._slice_out(out1) + outs2 = self._slice_out(out2) + for _c, k in enumerate(outs1.keys()): + loss = super().forward(outs1[k], outs2[k]) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}_{}_{}_{}".format(key, pair[ + 0], pair[1], self.maps_name, idx)] = loss[key] + else: + loss_dict["{}_{}_{}".format(self.name, self.maps_name[ + _c], idx)] = loss + + loss_dict = _sum_loss(loss_dict) + + return loss_dict + + +class DistillationCTCLoss(CTCLoss): + def __init__(self, + model_name_list=[], + key=None, + multi_head=False, + name="loss_ctc"): + super().__init__() + self.model_name_list = model_name_list + self.key = key + self.name = name + self.multi_head = multi_head + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, model_name in enumerate(self.model_name_list): + out = predicts[model_name] + if self.key is not None: + out = out[self.key] + if self.multi_head: + assert 'ctc' in out, 'multi head has multi out' + loss = super().forward(out['ctc'], batch[:2] + batch[3:]) + else: + loss = super().forward(out, batch) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}_{}".format(self.name, model_name, + idx)] = loss[key] + else: + loss_dict["{}_{}".format(self.name, model_name)] = loss + return loss_dict + + +class DistillationSARLoss(SARLoss): + def __init__(self, + model_name_list=[], + key=None, + multi_head=False, + name="loss_sar", + **kwargs): + ignore_index = kwargs.get('ignore_index', 92) + super().__init__(ignore_index=ignore_index) + 
self.model_name_list = model_name_list + self.key = key + self.name = name + self.multi_head = multi_head + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, model_name in enumerate(self.model_name_list): + out = predicts[model_name] + if self.key is not None: + out = out[self.key] + if self.multi_head: + assert 'sar' in out, 'multi head has multi out' + loss = super().forward(out['sar'], batch[:1] + batch[2:]) + else: + loss = super().forward(out, batch) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}_{}".format(self.name, model_name, + idx)] = loss[key] + else: + loss_dict["{}_{}".format(self.name, model_name)] = loss + return loss_dict + + +class DistillationDBLoss(DBLoss): + def __init__(self, + model_name_list=[], + balance_loss=True, + main_loss_type='DiceLoss', + alpha=5, + beta=10, + ohem_ratio=3, + eps=1e-6, + name="db", + **kwargs): + super().__init__() + self.model_name_list = model_name_list + self.name = name + self.key = None + + def forward(self, predicts, batch): + loss_dict = {} + for idx, model_name in enumerate(self.model_name_list): + out = predicts[model_name] + if self.key is not None: + out = out[self.key] + loss = super().forward(out, batch) + + if isinstance(loss, dict): + for key in loss.keys(): + if key == "loss": + continue + name = "{}_{}_{}".format(self.name, model_name, key) + loss_dict[name] = loss[key] + else: + loss_dict["{}_{}".format(self.name, model_name)] = loss + + loss_dict = _sum_loss(loss_dict) + return loss_dict + + +class DistillationDilaDBLoss(DBLoss): + def __init__(self, + model_name_pairs=[], + key=None, + balance_loss=True, + main_loss_type='DiceLoss', + alpha=5, + beta=10, + ohem_ratio=3, + eps=1e-6, + name="dila_dbloss"): + super().__init__() + self.model_name_pairs = model_name_pairs + self.name = name + self.key = key + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + stu_outs = predicts[pair[0]] + tch_outs = predicts[pair[1]] + if self.key is not None: + stu_preds = stu_outs[self.key] + tch_preds = tch_outs[self.key] + + stu_shrink_maps = stu_preds[:, 0, :, :] + stu_binary_maps = stu_preds[:, 2, :, :] + + # dilation to teacher prediction + dilation_w = np.array([[1, 1], [1, 1]]) + th_shrink_maps = tch_preds[:, 0, :, :] + th_shrink_maps = th_shrink_maps.numpy() > 0.3 # thresh = 0.3 + dilate_maps = np.zeros_like(th_shrink_maps).astype(np.float32) + for i in range(th_shrink_maps.shape[0]): + dilate_maps[i] = cv2.dilate( + th_shrink_maps[i, :, :].astype(np.uint8), dilation_w) + th_shrink_maps = paddle.to_tensor(dilate_maps) + + label_threshold_map, label_threshold_mask, label_shrink_map, label_shrink_mask = batch[ + 1:] + + # calculate the shrink map loss + bce_loss = self.alpha * self.bce_loss( + stu_shrink_maps, th_shrink_maps, label_shrink_mask) + loss_binary_maps = self.dice_loss(stu_binary_maps, th_shrink_maps, + label_shrink_mask) + + # k = f"{self.name}_{pair[0]}_{pair[1]}" + k = "{}_{}_{}".format(self.name, pair[0], pair[1]) + loss_dict[k] = bce_loss + loss_binary_maps + + loss_dict = _sum_loss(loss_dict) + return loss_dict + + +class DistillationDistanceLoss(DistanceLoss): + """ + """ + + def __init__(self, + mode="l2", + model_name_pairs=[], + key=None, + name="loss_distance", + **kargs): + super().__init__(mode=mode, **kargs) + assert isinstance(model_name_pairs, list) + self.key = key + self.model_name_pairs = model_name_pairs + self.name = name + "_l2" + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, 
pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + loss = super().forward(out1, out2) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}_{}".format(self.name, key, idx)] = loss[ + key] + else: + loss_dict["{}_{}_{}_{}".format(self.name, pair[0], pair[1], + idx)] = loss + return loss_dict diff --git a/backend/ppocr/losses/e2e_pg_loss.py b/backend/ppocr/losses/e2e_pg_loss.py new file mode 100644 index 0000000..10a8ed0 --- /dev/null +++ b/backend/ppocr/losses/e2e_pg_loss.py @@ -0,0 +1,140 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import nn +import paddle + +from .det_basic_loss import DiceLoss +from ppocr.utils.e2e_utils.extract_batchsize import pre_process + + +class PGLoss(nn.Layer): + def __init__(self, + tcl_bs, + max_text_length, + max_text_nums, + pad_num, + eps=1e-6, + **kwargs): + super(PGLoss, self).__init__() + self.tcl_bs = tcl_bs + self.max_text_nums = max_text_nums + self.max_text_length = max_text_length + self.pad_num = pad_num + self.dice_loss = DiceLoss(eps=eps) + + def border_loss(self, f_border, l_border, l_score, l_mask): + l_border_split, l_border_norm = paddle.tensor.split( + l_border, num_or_sections=[4, 1], axis=1) + f_border_split = f_border + b, c, h, w = l_border_norm.shape + l_border_norm_split = paddle.expand( + x=l_border_norm, shape=[b, 4 * c, h, w]) + b, c, h, w = l_score.shape + l_border_score = paddle.expand(x=l_score, shape=[b, 4 * c, h, w]) + b, c, h, w = l_mask.shape + l_border_mask = paddle.expand(x=l_mask, shape=[b, 4 * c, h, w]) + border_diff = l_border_split - f_border_split + abs_border_diff = paddle.abs(border_diff) + border_sign = abs_border_diff < 1.0 + border_sign = paddle.cast(border_sign, dtype='float32') + border_sign.stop_gradient = True + border_in_loss = 0.5 * abs_border_diff * abs_border_diff * border_sign + \ + (abs_border_diff - 0.5) * (1.0 - border_sign) + border_out_loss = l_border_norm_split * border_in_loss + border_loss = paddle.sum(border_out_loss * l_border_score * l_border_mask) / \ + (paddle.sum(l_border_score * l_border_mask) + 1e-5) + return border_loss + + def direction_loss(self, f_direction, l_direction, l_score, l_mask): + l_direction_split, l_direction_norm = paddle.tensor.split( + l_direction, num_or_sections=[2, 1], axis=1) + f_direction_split = f_direction + b, c, h, w = l_direction_norm.shape + l_direction_norm_split = paddle.expand( + x=l_direction_norm, shape=[b, 2 * c, h, w]) + b, c, h, w = l_score.shape + l_direction_score = paddle.expand(x=l_score, shape=[b, 2 * c, h, w]) + b, c, h, w = l_mask.shape + l_direction_mask = paddle.expand(x=l_mask, shape=[b, 2 * c, h, w]) + direction_diff = l_direction_split - f_direction_split + abs_direction_diff = paddle.abs(direction_diff) + 
direction_sign = abs_direction_diff < 1.0 + direction_sign = paddle.cast(direction_sign, dtype='float32') + direction_sign.stop_gradient = True + direction_in_loss = 0.5 * abs_direction_diff * abs_direction_diff * direction_sign + \ + (abs_direction_diff - 0.5) * (1.0 - direction_sign) + direction_out_loss = l_direction_norm_split * direction_in_loss + direction_loss = paddle.sum(direction_out_loss * l_direction_score * l_direction_mask) / \ + (paddle.sum(l_direction_score * l_direction_mask) + 1e-5) + return direction_loss + + def ctcloss(self, f_char, tcl_pos, tcl_mask, tcl_label, label_t): + f_char = paddle.transpose(f_char, [0, 2, 3, 1]) + tcl_pos = paddle.reshape(tcl_pos, [-1, 3]) + tcl_pos = paddle.cast(tcl_pos, dtype=int) + f_tcl_char = paddle.gather_nd(f_char, tcl_pos) + f_tcl_char = paddle.reshape(f_tcl_char, + [-1, 64, 37]) # len(Lexicon_Table)+1 + f_tcl_char_fg, f_tcl_char_bg = paddle.split(f_tcl_char, [36, 1], axis=2) + f_tcl_char_bg = f_tcl_char_bg * tcl_mask + (1.0 - tcl_mask) * 20.0 + b, c, l = tcl_mask.shape + tcl_mask_fg = paddle.expand(x=tcl_mask, shape=[b, c, 36 * l]) + tcl_mask_fg.stop_gradient = True + f_tcl_char_fg = f_tcl_char_fg * tcl_mask_fg + (1.0 - tcl_mask_fg) * ( + -20.0) + f_tcl_char_mask = paddle.concat([f_tcl_char_fg, f_tcl_char_bg], axis=2) + f_tcl_char_ld = paddle.transpose(f_tcl_char_mask, (1, 0, 2)) + N, B, _ = f_tcl_char_ld.shape + input_lengths = paddle.to_tensor([N] * B, dtype='int64') + cost = paddle.nn.functional.ctc_loss( + log_probs=f_tcl_char_ld, + labels=tcl_label, + input_lengths=input_lengths, + label_lengths=label_t, + blank=self.pad_num, + reduction='none') + cost = cost.mean() + return cost + + def forward(self, predicts, labels): + images, tcl_maps, tcl_label_maps, border_maps \ + , direction_maps, training_masks, label_list, pos_list, pos_mask = labels + # for all the batch_size + pos_list, pos_mask, label_list, label_t = pre_process( + label_list, pos_list, pos_mask, self.max_text_length, + self.max_text_nums, self.pad_num, self.tcl_bs) + + f_score, f_border, f_direction, f_char = predicts['f_score'], predicts['f_border'], predicts['f_direction'], \ + predicts['f_char'] + score_loss = self.dice_loss(f_score, tcl_maps, training_masks) + border_loss = self.border_loss(f_border, border_maps, tcl_maps, + training_masks) + direction_loss = self.direction_loss(f_direction, direction_maps, + tcl_maps, training_masks) + ctc_loss = self.ctcloss(f_char, pos_list, pos_mask, label_list, label_t) + loss_all = score_loss + border_loss + direction_loss + 5 * ctc_loss + + losses = { + 'loss': loss_all, + "score_loss": score_loss, + "border_loss": border_loss, + "direction_loss": direction_loss, + "ctc_loss": ctc_loss + } + return losses diff --git a/backend/ppocr/losses/kie_sdmgr_loss.py b/backend/ppocr/losses/kie_sdmgr_loss.py new file mode 100644 index 0000000..745671f --- /dev/null +++ b/backend/ppocr/losses/kie_sdmgr_loss.py @@ -0,0 +1,115 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
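SASTLoss and PGLoss above both build their border/geometry terms from the same element-wise smooth-L1 form, gated by a `*_sign` mask whose gradient is stopped. A standalone sketch of just that term (NumPy, for illustration only; the real losses additionally weight it by the norm, score and mask maps):

import numpy as np

def smooth_l1_term(diff):
    # 0.5 * d^2 where |d| < 1, |d| - 0.5 elsewhere
    abs_diff = np.abs(diff)
    inside = (abs_diff < 1.0).astype(np.float32)
    return 0.5 * abs_diff * abs_diff * inside + (abs_diff - 0.5) * (1.0 - inside)

# smooth_l1_term(np.array([-2.0, -0.5, 0.0, 0.5, 2.0]))
# -> [1.5, 0.125, 0.0, 0.125, 1.5]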
+ +# reference from : https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/kie/losses/sdmgr_loss.py + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import nn +import paddle + + +class SDMGRLoss(nn.Layer): + def __init__(self, node_weight=1.0, edge_weight=1.0, ignore=0): + super().__init__() + self.loss_node = nn.CrossEntropyLoss(ignore_index=ignore) + self.loss_edge = nn.CrossEntropyLoss(ignore_index=-1) + self.node_weight = node_weight + self.edge_weight = edge_weight + self.ignore = ignore + + def pre_process(self, gts, tag): + gts, tag = gts.numpy(), tag.numpy().tolist() + temp_gts = [] + batch = len(tag) + for i in range(batch): + num, recoder_len = tag[i][0], tag[i][1] + temp_gts.append( + paddle.to_tensor( + gts[i, :num, :num + 1], dtype='int64')) + return temp_gts + + def accuracy(self, pred, target, topk=1, thresh=None): + """Calculate accuracy according to the prediction and target. + + Args: + pred (torch.Tensor): The model prediction, shape (N, num_class) + target (torch.Tensor): The target of each prediction, shape (N, ) + topk (int | tuple[int], optional): If the predictions in ``topk`` + matches the target, the predictions will be regarded as + correct ones. Defaults to 1. + thresh (float, optional): If not None, predictions with scores under + this threshold are considered incorrect. Default to None. + + Returns: + float | tuple[float]: If the input ``topk`` is a single integer, + the function will return a single float as accuracy. If + ``topk`` is a tuple containing multiple integers, the + function will return a tuple containing accuracies of + each ``topk`` number. + """ + assert isinstance(topk, (int, tuple)) + if isinstance(topk, int): + topk = (topk, ) + return_single = True + else: + return_single = False + + maxk = max(topk) + if pred.shape[0] == 0: + accu = [pred.new_tensor(0.) 
for i in range(len(topk))] + return accu[0] if return_single else accu + pred_value, pred_label = paddle.topk(pred, maxk, axis=1) + pred_label = pred_label.transpose( + [1, 0]) # transpose to shape (maxk, N) + correct = paddle.equal(pred_label, + (target.reshape([1, -1]).expand_as(pred_label))) + res = [] + for k in topk: + correct_k = paddle.sum(correct[:k].reshape([-1]).astype('float32'), + axis=0, + keepdim=True) + res.append( + paddle.multiply(correct_k, + paddle.to_tensor(100.0 / pred.shape[0]))) + return res[0] if return_single else res + + def forward(self, pred, batch): + node_preds, edge_preds = pred + gts, tag = batch[4], batch[5] + gts = self.pre_process(gts, tag) + node_gts, edge_gts = [], [] + for gt in gts: + node_gts.append(gt[:, 0]) + edge_gts.append(gt[:, 1:].reshape([-1])) + node_gts = paddle.concat(node_gts) + edge_gts = paddle.concat(edge_gts) + + node_valids = paddle.nonzero(node_gts != self.ignore).reshape([-1]) + edge_valids = paddle.nonzero(edge_gts != -1).reshape([-1]) + loss_node = self.loss_node(node_preds, node_gts) + loss_edge = self.loss_edge(edge_preds, edge_gts) + loss = self.node_weight * loss_node + self.edge_weight * loss_edge + return dict( + loss=loss, + loss_node=loss_node, + loss_edge=loss_edge, + acc_node=self.accuracy( + paddle.gather(node_preds, node_valids), + paddle.gather(node_gts, node_valids)), + acc_edge=self.accuracy( + paddle.gather(edge_preds, edge_valids), + paddle.gather(edge_gts, edge_valids))) diff --git a/backend/ppocr/losses/rec_aster_loss.py b/backend/ppocr/losses/rec_aster_loss.py new file mode 100644 index 0000000..fbb99d2 --- /dev/null +++ b/backend/ppocr/losses/rec_aster_loss.py @@ -0,0 +1,99 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn + + +class CosineEmbeddingLoss(nn.Layer): + def __init__(self, margin=0.): + super(CosineEmbeddingLoss, self).__init__() + self.margin = margin + self.epsilon = 1e-12 + + def forward(self, x1, x2, target): + similarity = paddle.fluid.layers.reduce_sum( + x1 * x2, dim=-1) / (paddle.norm( + x1, axis=-1) * paddle.norm( + x2, axis=-1) + self.epsilon) + one_list = paddle.full_like(target, fill_value=1) + out = paddle.fluid.layers.reduce_mean( + paddle.where( + paddle.equal(target, one_list), 1. 
- similarity, + paddle.maximum( + paddle.zeros_like(similarity), similarity - self.margin))) + + return out + + +class AsterLoss(nn.Layer): + def __init__(self, + weight=None, + size_average=True, + ignore_index=-100, + sequence_normalize=False, + sample_normalize=True, + **kwargs): + super(AsterLoss, self).__init__() + self.weight = weight + self.size_average = size_average + self.ignore_index = ignore_index + self.sequence_normalize = sequence_normalize + self.sample_normalize = sample_normalize + self.loss_sem = CosineEmbeddingLoss() + self.is_cosin_loss = True + self.loss_func_rec = nn.CrossEntropyLoss(weight=None, reduction='none') + + def forward(self, predicts, batch): + targets = batch[1].astype("int64") + label_lengths = batch[2].astype('int64') + sem_target = batch[3].astype('float32') + embedding_vectors = predicts['embedding_vectors'] + rec_pred = predicts['rec_pred'] + + if not self.is_cosin_loss: + sem_loss = paddle.sum(self.loss_sem(embedding_vectors, sem_target)) + else: + label_target = paddle.ones([embedding_vectors.shape[0]]) + sem_loss = paddle.sum( + self.loss_sem(embedding_vectors, sem_target, label_target)) + + # rec loss + batch_size, def_max_length = targets.shape[0], targets.shape[1] + + mask = paddle.zeros([batch_size, def_max_length]) + for i in range(batch_size): + mask[i, :label_lengths[i]] = 1 + mask = paddle.cast(mask, "float32") + max_length = max(label_lengths) + assert max_length == rec_pred.shape[1] + targets = targets[:, :max_length] + mask = mask[:, :max_length] + rec_pred = paddle.reshape(rec_pred, [-1, rec_pred.shape[2]]) + input = nn.functional.log_softmax(rec_pred, axis=1) + targets = paddle.reshape(targets, [-1, 1]) + mask = paddle.reshape(mask, [-1, 1]) + output = -paddle.index_sample(input, index=targets) * mask + output = paddle.sum(output) + if self.sequence_normalize: + output = output / paddle.sum(mask) + if self.sample_normalize: + output = output / batch_size + + loss = output + sem_loss * 0.1 + return {'loss': loss} diff --git a/backend/ppocr/losses/rec_att_loss.py b/backend/ppocr/losses/rec_att_loss.py new file mode 100644 index 0000000..6e2f674 --- /dev/null +++ b/backend/ppocr/losses/rec_att_loss.py @@ -0,0 +1,39 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
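The CosineEmbeddingLoss above (used by AsterLoss for the semantic branch) reduces to 1 - cos(x1, x2) where the target is 1 and max(0, cos(x1, x2) - margin) otherwise, averaged over the batch. A NumPy sketch of the same computation:

import numpy as np

def cosine_embedding_loss_np(x1, x2, target, margin=0.0, eps=1e-12):
    cos = np.sum(x1 * x2, axis=-1) / (
        np.linalg.norm(x1, axis=-1) * np.linalg.norm(x2, axis=-1) + eps)
    per_sample = np.where(target == 1, 1.0 - cos,
                          np.maximum(0.0, cos - margin))
    return per_sample.mean()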
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn + + +class AttentionLoss(nn.Layer): + def __init__(self, **kwargs): + super(AttentionLoss, self).__init__() + self.loss_func = nn.CrossEntropyLoss(weight=None, reduction='none') + + def forward(self, predicts, batch): + targets = batch[1].astype("int64") + label_lengths = batch[2].astype('int64') + batch_size, num_steps, num_classes = predicts.shape[0], predicts.shape[ + 1], predicts.shape[2] + assert len(targets.shape) == len(list(predicts.shape)) - 1, \ + "The target's shape and inputs's shape is [N, d] and [N, num_steps]" + + inputs = paddle.reshape(predicts, [-1, predicts.shape[-1]]) + targets = paddle.reshape(targets, [-1]) + + return {'loss': paddle.sum(self.loss_func(inputs, targets))} diff --git a/backend/ppocr/losses/rec_ctc_loss.py b/backend/ppocr/losses/rec_ctc_loss.py new file mode 100755 index 0000000..502fc8c --- /dev/null +++ b/backend/ppocr/losses/rec_ctc_loss.py @@ -0,0 +1,45 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn + + +class CTCLoss(nn.Layer): + def __init__(self, use_focal_loss=False, **kwargs): + super(CTCLoss, self).__init__() + self.loss_func = nn.CTCLoss(blank=0, reduction='none') + self.use_focal_loss = use_focal_loss + + def forward(self, predicts, batch): + if isinstance(predicts, (list, tuple)): + predicts = predicts[-1] + predicts = predicts.transpose((1, 0, 2)) + N, B, _ = predicts.shape + preds_lengths = paddle.to_tensor( + [N] * B, dtype='int64', place=paddle.CPUPlace()) + labels = batch[1].astype("int32") + label_lengths = batch[2].astype('int64') + loss = self.loss_func(predicts, labels, preds_lengths, label_lengths) + if self.use_focal_loss: + weight = paddle.exp(-loss) + weight = paddle.subtract(paddle.to_tensor([1.0]), weight) + weight = paddle.square(weight) + loss = paddle.multiply(loss, weight) + loss = loss.mean() + return {'loss': loss} diff --git a/backend/ppocr/losses/rec_enhanced_ctc_loss.py b/backend/ppocr/losses/rec_enhanced_ctc_loss.py new file mode 100644 index 0000000..b57be64 --- /dev/null +++ b/backend/ppocr/losses/rec_enhanced_ctc_loss.py @@ -0,0 +1,70 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
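When use_focal_loss=True, the CTCLoss above re-weights each sample's CTC cost by (1 - exp(-loss))^2 before averaging, so nearly-solved samples contribute almost nothing. A sketch of just that weighting, applied to per-sample losses:

import numpy as np

def focal_weighted_mean(per_sample_ctc):
    per_sample_ctc = np.asarray(per_sample_ctc, dtype=np.float32)
    weight = (1.0 - np.exp(-per_sample_ctc)) ** 2
    return float((per_sample_ctc * weight).mean())

# focal_weighted_mean([0.1, 3.0]) -> ~1.355
# (the easy sample contributes ~0.0009, the hard one ~2.71)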
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +from .ace_loss import ACELoss +from .center_loss import CenterLoss +from .rec_ctc_loss import CTCLoss + + +class EnhancedCTCLoss(nn.Layer): + def __init__(self, + use_focal_loss=False, + use_ace_loss=False, + ace_loss_weight=0.1, + use_center_loss=False, + center_loss_weight=0.05, + num_classes=6625, + feat_dim=96, + init_center=False, + center_file_path=None, + **kwargs): + super(EnhancedCTCLoss, self).__init__() + self.ctc_loss_func = CTCLoss(use_focal_loss=use_focal_loss) + + self.use_ace_loss = False + if use_ace_loss: + self.use_ace_loss = use_ace_loss + self.ace_loss_func = ACELoss() + self.ace_loss_weight = ace_loss_weight + + self.use_center_loss = False + if use_center_loss: + self.use_center_loss = use_center_loss + self.center_loss_func = CenterLoss( + num_classes=num_classes, + feat_dim=feat_dim, + init_center=init_center, + center_file_path=center_file_path) + self.center_loss_weight = center_loss_weight + + def __call__(self, predicts, batch): + loss = self.ctc_loss_func(predicts, batch)["loss"] + + if self.use_center_loss: + center_loss = self.center_loss_func( + predicts, batch)["loss_center"] * self.center_loss_weight + loss = loss + center_loss + + if self.use_ace_loss: + ace_loss = self.ace_loss_func( + predicts, batch)["loss_ace"] * self.ace_loss_weight + loss = loss + ace_loss + + return {'enhanced_ctc_loss': loss} diff --git a/backend/ppocr/losses/rec_multi_loss.py b/backend/ppocr/losses/rec_multi_loss.py new file mode 100644 index 0000000..09f007a --- /dev/null +++ b/backend/ppocr/losses/rec_multi_loss.py @@ -0,0 +1,58 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
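A hypothetical instantiation of the EnhancedCTCLoss above, using only the focal CTC term; the import path and the usage comment are assumptions, and the extra terms are controlled purely by the flags shown in the constructor:

from ppocr.losses.rec_enhanced_ctc_loss import EnhancedCTCLoss  # path assumed

loss_fn = EnhancedCTCLoss(use_focal_loss=True)     # plain focal CTC
# enabling the extra terms only changes the weighting of the summed loss:
#   use_center_loss=True -> + center_loss_weight * center loss (default 0.05)
#   use_ace_loss=True    -> + ace_loss_weight    * ACE loss    (default 0.1)
# loss = loss_fn(predicts, batch)['enhanced_ctc_loss']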
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn + +from .rec_ctc_loss import CTCLoss +from .rec_sar_loss import SARLoss + + +class MultiLoss(nn.Layer): + def __init__(self, **kwargs): + super().__init__() + self.loss_funcs = {} + self.loss_list = kwargs.pop('loss_config_list') + self.weight_1 = kwargs.get('weight_1', 1.0) + self.weight_2 = kwargs.get('weight_2', 1.0) + self.gtc_loss = kwargs.get('gtc_loss', 'sar') + for loss_info in self.loss_list: + for name, param in loss_info.items(): + if param is not None: + kwargs.update(param) + loss = eval(name)(**kwargs) + self.loss_funcs[name] = loss + + def forward(self, predicts, batch): + self.total_loss = {} + total_loss = 0.0 + # batch [image, label_ctc, label_sar, length, valid_ratio] + for name, loss_func in self.loss_funcs.items(): + if name == 'CTCLoss': + loss = loss_func(predicts['ctc'], + batch[:2] + batch[3:])['loss'] * self.weight_1 + elif name == 'SARLoss': + loss = loss_func(predicts['sar'], + batch[:1] + batch[2:])['loss'] * self.weight_2 + else: + raise NotImplementedError( + '{} is not supported in MultiLoss yet'.format(name)) + self.total_loss[name] = loss + total_loss += loss + self.total_loss['loss'] = total_loss + return self.total_loss diff --git a/backend/ppocr/losses/rec_nrtr_loss.py b/backend/ppocr/losses/rec_nrtr_loss.py new file mode 100644 index 0000000..200a6d0 --- /dev/null +++ b/backend/ppocr/losses/rec_nrtr_loss.py @@ -0,0 +1,30 @@ +import paddle +from paddle import nn +import paddle.nn.functional as F + + +class NRTRLoss(nn.Layer): + def __init__(self, smoothing=True, **kwargs): + super(NRTRLoss, self).__init__() + self.loss_func = nn.CrossEntropyLoss(reduction='mean', ignore_index=0) + self.smoothing = smoothing + + def forward(self, pred, batch): + pred = pred.reshape([-1, pred.shape[2]]) + max_len = batch[2].max() + tgt = batch[1][:, 1:2 + max_len] + tgt = tgt.reshape([-1]) + if self.smoothing: + eps = 0.1 + n_class = pred.shape[1] + one_hot = F.one_hot(tgt, pred.shape[1]) + one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / (n_class - 1) + log_prb = F.log_softmax(pred, axis=1) + non_pad_mask = paddle.not_equal( + tgt, paddle.zeros( + tgt.shape, dtype=tgt.dtype)) + loss = -(one_hot * log_prb).sum(axis=1) + loss = loss.masked_select(non_pad_mask).mean() + else: + loss = self.loss_func(pred, tgt) + return {'loss': loss} diff --git a/backend/ppocr/losses/rec_pren_loss.py b/backend/ppocr/losses/rec_pren_loss.py new file mode 100644 index 0000000..7bc53d2 --- /dev/null +++ b/backend/ppocr/losses/rec_pren_loss.py @@ -0,0 +1,30 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
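The smoothing branch of NRTRLoss above spreads eps = 0.1 of the probability mass over the non-target classes and drops padding positions (index 0) from the mean. The same computation in NumPy, assuming already-flattened logits and integer targets:

import numpy as np

def smoothed_ce_np(logits, targets, eps=0.1, pad_id=0):
    # logits: (num_tokens, n_class), targets: (num_tokens,) int
    n_class = logits.shape[1]
    one_hot = np.eye(n_class, dtype=np.float32)[targets]
    one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / (n_class - 1)
    log_prb = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
    loss = -(one_hot * log_prb).sum(axis=1)
    return loss[targets != pad_id].mean()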
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import nn + + +class PRENLoss(nn.Layer): + def __init__(self, **kwargs): + super(PRENLoss, self).__init__() + # note: 0 is padding idx + self.loss_func = nn.CrossEntropyLoss(reduction='mean', ignore_index=0) + + def forward(self, predicts, batch): + loss = self.loss_func(predicts, batch[1].astype('int64')) + return {'loss': loss} diff --git a/backend/ppocr/losses/rec_sar_loss.py b/backend/ppocr/losses/rec_sar_loss.py new file mode 100644 index 0000000..a4f83f0 --- /dev/null +++ b/backend/ppocr/losses/rec_sar_loss.py @@ -0,0 +1,29 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn + + +class SARLoss(nn.Layer): + def __init__(self, **kwargs): + super(SARLoss, self).__init__() + ignore_index = kwargs.get('ignore_index', 92) # 6626 + self.loss_func = paddle.nn.loss.CrossEntropyLoss( + reduction="mean", ignore_index=ignore_index) + + def forward(self, predicts, batch): + predict = predicts[:, : + -1, :] # ignore last index of outputs to be in same seq_len with targets + label = batch[1].astype( + "int64")[:, 1:] # ignore first index of target in loss calculation + batch_size, num_steps, num_classes = predict.shape[0], predict.shape[ + 1], predict.shape[2] + assert len(label.shape) == len(list(predict.shape)) - 1, \ + "The target's shape and inputs's shape is [N, d] and [N, num_steps]" + + inputs = paddle.reshape(predict, [-1, num_classes]) + targets = paddle.reshape(label, [-1]) + loss = self.loss_func(inputs, targets) + return {'loss': loss} diff --git a/backend/ppocr/losses/rec_srn_loss.py b/backend/ppocr/losses/rec_srn_loss.py new file mode 100644 index 0000000..7d5b65e --- /dev/null +++ b/backend/ppocr/losses/rec_srn_loss.py @@ -0,0 +1,47 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
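SARLoss above assumes the decoder emits one step per label token including the leading start symbol, so predictions drop the last step and labels drop the first token before cross-entropy with ignore_index 92 (the padding id). A minimal shape sketch, all sizes assumed:

import paddle

N, T, C = 2, 6, 93                       # batch, seq_len, num_classes (assumed)
predicts = paddle.randn([N, T, C])
labels = paddle.randint(0, C, shape=[N, T]).astype('int64')

predict = predicts[:, :-1, :]            # ignore last output step
label = labels[:, 1:]                    # ignore leading <start> token
loss = paddle.nn.CrossEntropyLoss(ignore_index=92)(
    predict.reshape([-1, C]), label.reshape([-1]))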
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn + + +class SRNLoss(nn.Layer): + def __init__(self, **kwargs): + super(SRNLoss, self).__init__() + self.loss_func = paddle.nn.loss.CrossEntropyLoss(reduction="sum") + + def forward(self, predicts, batch): + predict = predicts['predict'] + word_predict = predicts['word_out'] + gsrm_predict = predicts['gsrm_out'] + label = batch[1] + + casted_label = paddle.cast(x=label, dtype='int64') + casted_label = paddle.reshape(x=casted_label, shape=[-1, 1]) + + cost_word = self.loss_func(word_predict, label=casted_label) + cost_gsrm = self.loss_func(gsrm_predict, label=casted_label) + cost_vsfd = self.loss_func(predict, label=casted_label) + + cost_word = paddle.reshape(x=paddle.sum(cost_word), shape=[1]) + cost_gsrm = paddle.reshape(x=paddle.sum(cost_gsrm), shape=[1]) + cost_vsfd = paddle.reshape(x=paddle.sum(cost_vsfd), shape=[1]) + + sum_cost = cost_word * 3.0 + cost_vsfd + cost_gsrm * 0.15 + + return {'loss': sum_cost, 'word_loss': cost_word, 'img_loss': cost_vsfd} diff --git a/backend/ppocr/losses/table_att_loss.py b/backend/ppocr/losses/table_att_loss.py new file mode 100644 index 0000000..d7fd99e --- /dev/null +++ b/backend/ppocr/losses/table_att_loss.py @@ -0,0 +1,109 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +from paddle.nn import functional as F +from paddle import fluid + +class TableAttentionLoss(nn.Layer): + def __init__(self, structure_weight, loc_weight, use_giou=False, giou_weight=1.0, **kwargs): + super(TableAttentionLoss, self).__init__() + self.loss_func = nn.CrossEntropyLoss(weight=None, reduction='none') + self.structure_weight = structure_weight + self.loc_weight = loc_weight + self.use_giou = use_giou + self.giou_weight = giou_weight + + def giou_loss(self, preds, bbox, eps=1e-7, reduction='mean'): + ''' + :param preds:[[x1,y1,x2,y2], [x1,y1,x2,y2],,,] + :param bbox:[[x1,y1,x2,y2], [x1,y1,x2,y2],,,] + :return: loss + ''' + ix1 = fluid.layers.elementwise_max(preds[:, 0], bbox[:, 0]) + iy1 = fluid.layers.elementwise_max(preds[:, 1], bbox[:, 1]) + ix2 = fluid.layers.elementwise_min(preds[:, 2], bbox[:, 2]) + iy2 = fluid.layers.elementwise_min(preds[:, 3], bbox[:, 3]) + + iw = fluid.layers.clip(ix2 - ix1 + 1e-3, 0., 1e10) + ih = fluid.layers.clip(iy2 - iy1 + 1e-3, 0., 1e10) + + # overlap + inters = iw * ih + + # union + uni = (preds[:, 2] - preds[:, 0] + 1e-3) * (preds[:, 3] - preds[:, 1] + 1e-3 + ) + (bbox[:, 2] - bbox[:, 0] + 1e-3) * ( + bbox[:, 3] - bbox[:, 1] + 1e-3) - inters + eps + + # ious + ious = inters / uni + + ex1 = fluid.layers.elementwise_min(preds[:, 0], bbox[:, 0]) + ey1 = fluid.layers.elementwise_min(preds[:, 1], bbox[:, 1]) + ex2 = fluid.layers.elementwise_max(preds[:, 2], bbox[:, 2]) + ey2 = fluid.layers.elementwise_max(preds[:, 3], bbox[:, 3]) + ew = fluid.layers.clip(ex2 - ex1 + 1e-3, 0., 1e10) + eh = fluid.layers.clip(ey2 - ey1 + 1e-3, 0., 1e10) + + # enclose erea + enclose = ew * eh + eps + giou = ious - (enclose - uni) / enclose + + loss = 1 - giou + + if reduction == 'mean': + loss = paddle.mean(loss) + elif reduction == 'sum': + loss = paddle.sum(loss) + else: + raise NotImplementedError + return loss + + def forward(self, predicts, batch): + structure_probs = predicts['structure_probs'] + structure_targets = batch[1].astype("int64") + structure_targets = structure_targets[:, 1:] + if len(batch) == 6: + structure_mask = batch[5].astype("int64") + structure_mask = structure_mask[:, 1:] + structure_mask = paddle.reshape(structure_mask, [-1]) + structure_probs = paddle.reshape(structure_probs, [-1, structure_probs.shape[-1]]) + structure_targets = paddle.reshape(structure_targets, [-1]) + structure_loss = self.loss_func(structure_probs, structure_targets) + + if len(batch) == 6: + structure_loss = structure_loss * structure_mask + +# structure_loss = paddle.sum(structure_loss) * self.structure_weight + structure_loss = paddle.mean(structure_loss) * self.structure_weight + + loc_preds = predicts['loc_preds'] + loc_targets = batch[2].astype("float32") + loc_targets_mask = batch[4].astype("float32") + loc_targets = loc_targets[:, 1:, :] + loc_targets_mask = loc_targets_mask[:, 1:, :] + loc_loss = F.mse_loss(loc_preds * loc_targets_mask, loc_targets) * self.loc_weight + if self.use_giou: + loc_loss_giou = self.giou_loss(loc_preds * loc_targets_mask, loc_targets) * self.giou_weight + total_loss = structure_loss + loc_loss + loc_loss_giou + return {'loss':total_loss, "structure_loss":structure_loss, "loc_loss":loc_loss, "loc_loss_giou":loc_loss_giou} + else: + total_loss = structure_loss + loc_loss + return {'loss':total_loss, "structure_loss":structure_loss, "loc_loss":loc_loss} \ No newline at end of file diff 
--git a/backend/ppocr/losses/vqa_token_layoutlm_loss.py b/backend/ppocr/losses/vqa_token_layoutlm_loss.py new file mode 100755 index 0000000..244893d --- /dev/null +++ b/backend/ppocr/losses/vqa_token_layoutlm_loss.py @@ -0,0 +1,42 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import nn + + +class VQASerTokenLayoutLMLoss(nn.Layer): + def __init__(self, num_classes): + super().__init__() + self.loss_class = nn.CrossEntropyLoss() + self.num_classes = num_classes + self.ignore_index = self.loss_class.ignore_index + + def forward(self, predicts, batch): + labels = batch[1] + attention_mask = batch[4] + if attention_mask is not None: + active_loss = attention_mask.reshape([-1, ]) == 1 + active_outputs = predicts.reshape( + [-1, self.num_classes])[active_loss] + active_labels = labels.reshape([-1, ])[active_loss] + loss = self.loss_class(active_outputs, active_labels) + else: + loss = self.loss_class( + predicts.reshape([-1, self.num_classes]), + labels.reshape([-1, ])) + return {'loss': loss} diff --git a/backend/ppocr/metrics/__init__.py b/backend/ppocr/metrics/__init__.py new file mode 100644 index 0000000..c244066 --- /dev/null +++ b/backend/ppocr/metrics/__init__.py @@ -0,0 +1,47 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
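VQASerTokenLayoutLMLoss above scores only the tokens whose attention mask is 1, so padded positions never reach the cross-entropy. The selection step, sketched in NumPy with toy shapes (all sizes here are assumptions):

import numpy as np

num_classes = 5
logits = np.random.randn(2, 4, num_classes)           # (batch, seq_len, classes)
labels = np.random.randint(0, num_classes, (2, 4))
attention_mask = np.array([[1, 1, 1, 0],
                           [1, 1, 0, 0]])

active = attention_mask.reshape(-1) == 1
active_logits = logits.reshape(-1, num_classes)[active]   # (5, num_classes)
active_labels = labels.reshape(-1)[active]                 # (5,)
# cross-entropy is then taken over these 5 active positions only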
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import copy + +__all__ = ["build_metric"] + +from .det_metric import DetMetric, DetFCEMetric +from .rec_metric import RecMetric +from .cls_metric import ClsMetric +from .e2e_metric import E2EMetric +from .distillation_metric import DistillationMetric +from .table_metric import TableMetric +from .kie_metric import KIEMetric +from .vqa_token_ser_metric import VQASerTokenMetric +from .vqa_token_re_metric import VQAReTokenMetric + + +def build_metric(config): + support_dict = [ + "DetMetric", "DetFCEMetric", "RecMetric", "ClsMetric", "E2EMetric", + "DistillationMetric", "TableMetric", 'KIEMetric', 'VQASerTokenMetric', + 'VQAReTokenMetric' + ] + + config = copy.deepcopy(config) + module_name = config.pop("name") + assert module_name in support_dict, Exception( + "metric only support {}".format(support_dict)) + module_class = eval(module_name)(**config) + return module_class diff --git a/backend/ppocr/metrics/cls_metric.py b/backend/ppocr/metrics/cls_metric.py new file mode 100644 index 0000000..6c07751 --- /dev/null +++ b/backend/ppocr/metrics/cls_metric.py @@ -0,0 +1,46 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class ClsMetric(object): + def __init__(self, main_indicator='acc', **kwargs): + self.main_indicator = main_indicator + self.eps = 1e-5 + self.reset() + + def __call__(self, pred_label, *args, **kwargs): + preds, labels = pred_label + correct_num = 0 + all_num = 0 + for (pred, pred_conf), (target, _) in zip(preds, labels): + if pred == target: + correct_num += 1 + all_num += 1 + self.correct_num += correct_num + self.all_num += all_num + return {'acc': correct_num / (all_num + self.eps), } + + def get_metric(self): + """ + return metrics { + 'acc': 0 + } + """ + acc = self.correct_num / (self.all_num + self.eps) + self.reset() + return {'acc': acc} + + def reset(self): + self.correct_num = 0 + self.all_num = 0 diff --git a/backend/ppocr/metrics/det_metric.py b/backend/ppocr/metrics/det_metric.py new file mode 100644 index 0000000..dca94c0 --- /dev/null +++ b/backend/ppocr/metrics/det_metric.py @@ -0,0 +1,154 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
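ClsMetric above keeps running counts of correct and total samples. A toy accumulation, assuming the module path below and that predictions and labels arrive as (value, confidence) pairs as produced by the cls post-process:

from ppocr.metrics.cls_metric import ClsMetric   # path assumed

metric = ClsMetric()
preds = [('0', 0.98), ('180', 0.71)]
labels = [('0', 1.0), ('0', 1.0)]
print(metric((preds, labels)))   # {'acc': ~0.5} for this batch
print(metric.get_metric())       # {'acc': ~0.5} over everything seen, then resets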
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +__all__ = ['DetMetric', 'DetFCEMetric'] + +from .eval_det_iou import DetectionIoUEvaluator + + +class DetMetric(object): + def __init__(self, main_indicator='hmean', **kwargs): + self.evaluator = DetectionIoUEvaluator() + self.main_indicator = main_indicator + self.reset() + + def __call__(self, preds, batch, **kwargs): + ''' + batch: a list produced by dataloaders. + image: np.ndarray of shape (N, C, H, W). + ratio_list: np.ndarray of shape(N,2) + polygons: np.ndarray of shape (N, K, 4, 2), the polygons of objective regions. + ignore_tags: np.ndarray of shape (N, K), indicates whether a region is ignorable or not. + preds: a list of dict produced by post process + points: np.ndarray of shape (N, K, 4, 2), the polygons of objective regions. + ''' + gt_polyons_batch = batch[2] + ignore_tags_batch = batch[3] + for pred, gt_polyons, ignore_tags in zip(preds, gt_polyons_batch, + ignore_tags_batch): + # prepare gt + gt_info_list = [{ + 'points': gt_polyon, + 'text': '', + 'ignore': ignore_tag + } for gt_polyon, ignore_tag in zip(gt_polyons, ignore_tags)] + # prepare det + det_info_list = [{ + 'points': det_polyon, + 'text': '' + } for det_polyon in pred['points']] + result = self.evaluator.evaluate_image(gt_info_list, det_info_list) + self.results.append(result) + + def get_metric(self): + """ + return metrics { + 'precision': 0, + 'recall': 0, + 'hmean': 0 + } + """ + + metrics = self.evaluator.combine_results(self.results) + self.reset() + return metrics + + def reset(self): + self.results = [] # clear results + + +class DetFCEMetric(object): + def __init__(self, main_indicator='hmean', **kwargs): + self.evaluator = DetectionIoUEvaluator() + self.main_indicator = main_indicator + self.reset() + + def __call__(self, preds, batch, **kwargs): + ''' + batch: a list produced by dataloaders. + image: np.ndarray of shape (N, C, H, W). + ratio_list: np.ndarray of shape(N,2) + polygons: np.ndarray of shape (N, K, 4, 2), the polygons of objective regions. + ignore_tags: np.ndarray of shape (N, K), indicates whether a region is ignorable or not. + preds: a list of dict produced by post process + points: np.ndarray of shape (N, K, 4, 2), the polygons of objective regions. 
+ ''' + gt_polyons_batch = batch[2] + ignore_tags_batch = batch[3] + + for pred, gt_polyons, ignore_tags in zip(preds, gt_polyons_batch, + ignore_tags_batch): + # prepare gt + gt_info_list = [{ + 'points': gt_polyon, + 'text': '', + 'ignore': ignore_tag + } for gt_polyon, ignore_tag in zip(gt_polyons, ignore_tags)] + # prepare det + det_info_list = [{ + 'points': det_polyon, + 'text': '', + 'score': score + } for det_polyon, score in zip(pred['points'], pred['scores'])] + + for score_thr in self.results.keys(): + det_info_list_thr = [ + det_info for det_info in det_info_list + if det_info['score'] >= score_thr + ] + result = self.evaluator.evaluate_image(gt_info_list, + det_info_list_thr) + self.results[score_thr].append(result) + + def get_metric(self): + """ + return metrics {'heman':0, + 'thr 0.3':'precision: 0 recall: 0 hmean: 0', + 'thr 0.4':'precision: 0 recall: 0 hmean: 0', + 'thr 0.5':'precision: 0 recall: 0 hmean: 0', + 'thr 0.6':'precision: 0 recall: 0 hmean: 0', + 'thr 0.7':'precision: 0 recall: 0 hmean: 0', + 'thr 0.8':'precision: 0 recall: 0 hmean: 0', + 'thr 0.9':'precision: 0 recall: 0 hmean: 0', + } + """ + metrics = {} + hmean = 0 + for score_thr in self.results.keys(): + metric = self.evaluator.combine_results(self.results[score_thr]) + # for key, value in metric.items(): + # metrics['{}_{}'.format(key, score_thr)] = value + metric_str = 'precision:{:.5f} recall:{:.5f} hmean:{:.5f}'.format( + metric['precision'], metric['recall'], metric['hmean']) + metrics['thr {}'.format(score_thr)] = metric_str + hmean = max(hmean, metric['hmean']) + metrics['hmean'] = hmean + + self.reset() + return metrics + + def reset(self): + self.results = { + 0.3: [], + 0.4: [], + 0.5: [], + 0.6: [], + 0.7: [], + 0.8: [], + 0.9: [] + } # clear results diff --git a/backend/ppocr/metrics/distillation_metric.py b/backend/ppocr/metrics/distillation_metric.py new file mode 100644 index 0000000..c440ceb --- /dev/null +++ b/backend/ppocr/metrics/distillation_metric.py @@ -0,0 +1,73 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
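DetMetric above matches predicted and ground-truth polygons at IoU > 0.5 (the evaluator default). A toy run with one ground-truth box and one shifted detection whose IoU is about 0.68, assuming DetMetric is importable from the module above:

from ppocr.metrics.det_metric import DetMetric   # path assumed

gt_box = [[0, 0], [10, 0], [10, 10], [0, 10]]
det_box = [[1, 1], [11, 1], [11, 11], [1, 11]]

metric = DetMetric()
batch = [None, None, [[gt_box]], [[False]]]      # [..., polygons, ignore_tags]
preds = [{'points': [det_box]}]
metric(preds, batch)
print(metric.get_metric())   # should report precision = recall = hmean = 1.0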
+ +import importlib +import copy + +from .rec_metric import RecMetric +from .det_metric import DetMetric +from .e2e_metric import E2EMetric +from .cls_metric import ClsMetric + + +class DistillationMetric(object): + def __init__(self, + key=None, + base_metric_name=None, + main_indicator=None, + **kwargs): + self.main_indicator = main_indicator + self.key = key + self.main_indicator = main_indicator + self.base_metric_name = base_metric_name + self.kwargs = kwargs + self.metrics = None + + def _init_metrcis(self, preds): + self.metrics = dict() + mod = importlib.import_module(__name__) + for key in preds: + self.metrics[key] = getattr(mod, self.base_metric_name)( + main_indicator=self.main_indicator, **self.kwargs) + self.metrics[key].reset() + + def __call__(self, preds, batch, **kwargs): + assert isinstance(preds, dict) + if self.metrics is None: + self._init_metrcis(preds) + output = dict() + for key in preds: + self.metrics[key].__call__(preds[key], batch, **kwargs) + + def get_metric(self): + """ + return metrics { + 'acc': 0, + 'norm_edit_dis': 0, + } + """ + output = dict() + for key in self.metrics: + metric = self.metrics[key].get_metric() + # main indicator + if key == self.key: + output.update(metric) + else: + for sub_key in metric: + output["{}_{}".format(key, sub_key)] = metric[sub_key] + return output + + def reset(self): + for key in self.metrics: + self.metrics[key].reset() diff --git a/backend/ppocr/metrics/e2e_metric.py b/backend/ppocr/metrics/e2e_metric.py new file mode 100644 index 0000000..2f8ba3b --- /dev/null +++ b/backend/ppocr/metrics/e2e_metric.py @@ -0,0 +1,86 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
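DistillationMetric above lazily builds one base metric per sub-model key found in the predictions and reports the metric of `key` as the top-level numbers, prefixing all the others. A hypothetical setup for a Student/Teacher recognition pair (names and values are illustrative assumptions):

metric = DistillationMetric(key='Student',
                            base_metric_name='RecMetric',
                            main_indicator='acc')
# preds must be a dict such as {'Student': ..., 'Teacher': ...};
# get_metric() then returns {'acc': ..., 'norm_edit_dis': ...,
#                            'Teacher_acc': ..., 'Teacher_norm_edit_dis': ...}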
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +__all__ = ['E2EMetric'] + +from ppocr.utils.e2e_metric.Deteval import get_socre_A, get_socre_B, combine_results +from ppocr.utils.e2e_utils.extract_textpoint_slow import get_dict + + +class E2EMetric(object): + def __init__(self, + mode, + gt_mat_dir, + character_dict_path, + main_indicator='f_score_e2e', + **kwargs): + self.mode = mode + self.gt_mat_dir = gt_mat_dir + self.label_list = get_dict(character_dict_path) + self.max_index = len(self.label_list) + self.main_indicator = main_indicator + self.reset() + + def __call__(self, preds, batch, **kwargs): + if self.mode == 'A': + gt_polyons_batch = batch[2] + temp_gt_strs_batch = batch[3][0] + ignore_tags_batch = batch[4] + gt_strs_batch = [] + + for temp_list in temp_gt_strs_batch: + t = "" + for index in temp_list: + if index < self.max_index: + t += self.label_list[index] + gt_strs_batch.append(t) + + for pred, gt_polyons, gt_strs, ignore_tags in zip( + [preds], gt_polyons_batch, [gt_strs_batch], ignore_tags_batch): + # prepare gt + gt_info_list = [{ + 'points': gt_polyon, + 'text': gt_str, + 'ignore': ignore_tag + } for gt_polyon, gt_str, ignore_tag in + zip(gt_polyons, gt_strs, ignore_tags)] + # prepare det + e2e_info_list = [{ + 'points': det_polyon, + 'texts': pred_str + } for det_polyon, pred_str in + zip(pred['points'], pred['texts'])] + + result = get_socre_A(gt_info_list, e2e_info_list) + self.results.append(result) + else: + img_id = batch[5][0] + e2e_info_list = [{ + 'points': det_polyon, + 'texts': pred_str + } for det_polyon, pred_str in zip(preds['points'], preds['texts'])] + result = get_socre_B(self.gt_mat_dir, img_id, e2e_info_list) + self.results.append(result) + + def get_metric(self): + metrics = combine_results(self.results) + self.reset() + return metrics + + def reset(self): + self.results = [] # clear results diff --git a/backend/ppocr/metrics/eval_det_iou.py b/backend/ppocr/metrics/eval_det_iou.py new file mode 100644 index 0000000..bc05e7d --- /dev/null +++ b/backend/ppocr/metrics/eval_det_iou.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +from collections import namedtuple +import numpy as np +from shapely.geometry import Polygon +""" +reference from : +https://github.com/MhLiao/DB/blob/3c32b808d4412680310d3d28eeb6a2d5bf1566c5/concern/icdar2015_eval/detection/iou.py#L8 +""" + + +class DetectionIoUEvaluator(object): + def __init__(self, iou_constraint=0.5, area_precision_constraint=0.5): + self.iou_constraint = iou_constraint + self.area_precision_constraint = area_precision_constraint + + def evaluate_image(self, gt, pred): + def get_union(pD, pG): + return Polygon(pD).union(Polygon(pG)).area + + def get_intersection_over_union(pD, pG): + return get_intersection(pD, pG) / get_union(pD, pG) + + def get_intersection(pD, pG): + return Polygon(pD).intersection(Polygon(pG)).area + + def compute_ap(confList, matchList, numGtCare): + correct = 0 + AP = 0 + if len(confList) > 0: + confList = np.array(confList) + matchList = np.array(matchList) + sorted_ind = np.argsort(-confList) + confList = confList[sorted_ind] + matchList = matchList[sorted_ind] + for n in range(len(confList)): + match = matchList[n] + if match: + correct += 1 + AP += float(correct) / (n + 1) + + if numGtCare > 0: + AP /= numGtCare + + return AP + + perSampleMetrics = {} + + matchedSum = 0 + + Rectangle = namedtuple('Rectangle', 'xmin ymin xmax ymax') + + numGlobalCareGt = 0 + numGlobalCareDet = 0 + + 
arrGlobalConfidences = [] + arrGlobalMatches = [] + + recall = 0 + precision = 0 + hmean = 0 + + detMatched = 0 + + iouMat = np.empty([1, 1]) + + gtPols = [] + detPols = [] + + gtPolPoints = [] + detPolPoints = [] + + # Array of Ground Truth Polygons' keys marked as don't Care + gtDontCarePolsNum = [] + # Array of Detected Polygons' matched with a don't Care GT + detDontCarePolsNum = [] + + pairs = [] + detMatchedNums = [] + + arrSampleConfidences = [] + arrSampleMatch = [] + + evaluationLog = "" + + # print(len(gt)) + for n in range(len(gt)): + points = gt[n]['points'] + # transcription = gt[n]['text'] + dontCare = gt[n]['ignore'] + # points = Polygon(points) + # points = points.buffer(0) + if not Polygon(points).is_valid or not Polygon(points).is_simple: + continue + + gtPol = points + gtPols.append(gtPol) + gtPolPoints.append(points) + if dontCare: + gtDontCarePolsNum.append(len(gtPols) - 1) + + evaluationLog += "GT polygons: " + str(len(gtPols)) + ( + " (" + str(len(gtDontCarePolsNum)) + " don't care)\n" + if len(gtDontCarePolsNum) > 0 else "\n") + + for n in range(len(pred)): + points = pred[n]['points'] + # points = Polygon(points) + # points = points.buffer(0) + if not Polygon(points).is_valid or not Polygon(points).is_simple: + continue + + detPol = points + detPols.append(detPol) + detPolPoints.append(points) + if len(gtDontCarePolsNum) > 0: + for dontCarePol in gtDontCarePolsNum: + dontCarePol = gtPols[dontCarePol] + intersected_area = get_intersection(dontCarePol, detPol) + pdDimensions = Polygon(detPol).area + precision = 0 if pdDimensions == 0 else intersected_area / pdDimensions + if (precision > self.area_precision_constraint): + detDontCarePolsNum.append(len(detPols) - 1) + break + + evaluationLog += "DET polygons: " + str(len(detPols)) + ( + " (" + str(len(detDontCarePolsNum)) + " don't care)\n" + if len(detDontCarePolsNum) > 0 else "\n") + + if len(gtPols) > 0 and len(detPols) > 0: + # Calculate IoU and precision matrixs + outputShape = [len(gtPols), len(detPols)] + iouMat = np.empty(outputShape) + gtRectMat = np.zeros(len(gtPols), np.int8) + detRectMat = np.zeros(len(detPols), np.int8) + for gtNum in range(len(gtPols)): + for detNum in range(len(detPols)): + pG = gtPols[gtNum] + pD = detPols[detNum] + iouMat[gtNum, detNum] = get_intersection_over_union(pD, pG) + + for gtNum in range(len(gtPols)): + for detNum in range(len(detPols)): + if gtRectMat[gtNum] == 0 and detRectMat[ + detNum] == 0 and gtNum not in gtDontCarePolsNum and detNum not in detDontCarePolsNum: + if iouMat[gtNum, detNum] > self.iou_constraint: + gtRectMat[gtNum] = 1 + detRectMat[detNum] = 1 + detMatched += 1 + pairs.append({'gt': gtNum, 'det': detNum}) + detMatchedNums.append(detNum) + evaluationLog += "Match GT #" + \ + str(gtNum) + " with Det #" + str(detNum) + "\n" + + numGtCare = (len(gtPols) - len(gtDontCarePolsNum)) + numDetCare = (len(detPols) - len(detDontCarePolsNum)) + if numGtCare == 0: + recall = float(1) + precision = float(0) if numDetCare > 0 else float(1) + else: + recall = float(detMatched) / numGtCare + precision = 0 if numDetCare == 0 else float(detMatched) / numDetCare + + hmean = 0 if (precision + recall) == 0 else 2.0 * \ + precision * recall / (precision + recall) + + matchedSum += detMatched + numGlobalCareGt += numGtCare + numGlobalCareDet += numDetCare + + perSampleMetrics = { + 'gtCare': numGtCare, + 'detCare': numDetCare, + 'detMatched': detMatched, + } + return perSampleMetrics + + def combine_results(self, results): + numGlobalCareGt = 0 + numGlobalCareDet = 0 + matchedSum = 
0 + for result in results: + numGlobalCareGt += result['gtCare'] + numGlobalCareDet += result['detCare'] + matchedSum += result['detMatched'] + + methodRecall = 0 if numGlobalCareGt == 0 else float( + matchedSum) / numGlobalCareGt + methodPrecision = 0 if numGlobalCareDet == 0 else float( + matchedSum) / numGlobalCareDet + methodHmean = 0 if methodRecall + methodPrecision == 0 else 2 * \ + methodRecall * methodPrecision / ( + methodRecall + methodPrecision) + # print(methodRecall, methodPrecision, methodHmean) + # sys.exit(-1) + methodMetrics = { + 'precision': methodPrecision, + 'recall': methodRecall, + 'hmean': methodHmean + } + + return methodMetrics + + +if __name__ == '__main__': + evaluator = DetectionIoUEvaluator() + gts = [[{ + 'points': [(0, 0), (1, 0), (1, 1), (0, 1)], + 'text': 1234, + 'ignore': False, + }, { + 'points': [(2, 2), (3, 2), (3, 3), (2, 3)], + 'text': 5678, + 'ignore': False, + }]] + preds = [[{ + 'points': [(0.1, 0.1), (1, 0), (1, 1), (0, 1)], + 'text': 123, + 'ignore': False, + }]] + results = [] + for gt, pred in zip(gts, preds): + results.append(evaluator.evaluate_image(gt, pred)) + metrics = evaluator.combine_results(results) + print(metrics) diff --git a/backend/ppocr/metrics/kie_metric.py b/backend/ppocr/metrics/kie_metric.py new file mode 100644 index 0000000..28ab22b --- /dev/null +++ b/backend/ppocr/metrics/kie_metric.py @@ -0,0 +1,71 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
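A hand-checkable sketch (editor's addition, not part of the committed patch) of what combine_results computes: a micro-average over the per-image counts returned by evaluate_image. The counts below are invented.

per_image = [
    {'gtCare': 4, 'detCare': 5, 'detMatched': 3},
    {'gtCare': 6, 'detCare': 4, 'detMatched': 4},
]
matched = sum(r['detMatched'] for r in per_image)       # 7
gt_total = sum(r['gtCare'] for r in per_image)          # 10
det_total = sum(r['detCare'] for r in per_image)        # 9
precision = matched / det_total                         # 7/9  ~= 0.778
recall = matched / gt_total                             # 7/10 =  0.700
hmean = 2 * precision * recall / (precision + recall)   # ~= 0.737
print(precision, recall, hmean)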
+# The code is refer from: https://github.com/open-mmlab/mmocr/blob/main/mmocr/core/evaluation/kie_metric.py + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle + +__all__ = ['KIEMetric'] + + +class KIEMetric(object): + def __init__(self, main_indicator='hmean', **kwargs): + self.main_indicator = main_indicator + self.reset() + self.node = [] + self.gt = [] + + def __call__(self, preds, batch, **kwargs): + nodes, _ = preds + gts, tag = batch[4].squeeze(0), batch[5].tolist()[0] + gts = gts[:tag[0], :1].reshape([-1]) + self.node.append(nodes.numpy()) + self.gt.append(gts) + # result = self.compute_f1_score(nodes, gts) + # self.results.append(result) + + def compute_f1_score(self, preds, gts): + ignores = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 25] + C = preds.shape[1] + classes = np.array(sorted(set(range(C)) - set(ignores))) + hist = np.bincount( + (gts * C).astype('int64') + preds.argmax(1), minlength=C + **2).reshape([C, C]).astype('float32') + diag = np.diag(hist) + recalls = diag / hist.sum(1).clip(min=1) + precisions = diag / hist.sum(0).clip(min=1) + f1 = 2 * recalls * precisions / (recalls + precisions).clip(min=1e-8) + return f1[classes] + + def combine_results(self, results): + node = np.concatenate(self.node, 0) + gts = np.concatenate(self.gt, 0) + results = self.compute_f1_score(node, gts) + data = {'hmean': results.mean()} + return data + + def get_metric(self): + + metrics = self.combine_results(self.results) + self.reset() + return metrics + + def reset(self): + self.results = [] # clear results + self.node = [] + self.gt = [] diff --git a/backend/ppocr/metrics/rec_metric.py b/backend/ppocr/metrics/rec_metric.py new file mode 100644 index 0000000..515b937 --- /dev/null +++ b/backend/ppocr/metrics/rec_metric.py @@ -0,0 +1,76 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
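A small sketch (editor's addition, not part of the committed patch) of the bincount trick used by KIEMetric.compute_f1_score: (ground truth, prediction) pairs are flattened into indices of a C x C confusion matrix in a single call. Values are invented; C = 3.

import numpy as np

C = 3
gts = np.array([0, 1, 2, 1])        # ground-truth class per node
pred_cls = np.array([0, 1, 1, 1])   # argmax of the node logits
hist = np.bincount(gts * C + pred_cls, minlength=C**2).reshape(C, C).astype('float32')
# hist[i, j] counts nodes with ground truth i predicted as class j
recalls = np.diag(hist) / hist.sum(1).clip(min=1)
precisions = np.diag(hist) / hist.sum(0).clip(min=1)
f1 = 2 * recalls * precisions / (recalls + precisions).clip(min=1e-8)
print(hist)   # [[1, 0, 0], [0, 2, 0], [0, 1, 0]]
print(f1)     # [1.0, 0.8, 0.0]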
+ +import Levenshtein +import string + + +class RecMetric(object): + def __init__(self, + main_indicator='acc', + is_filter=False, + ignore_space=True, + **kwargs): + self.main_indicator = main_indicator + self.is_filter = is_filter + self.ignore_space = ignore_space + self.eps = 1e-5 + self.reset() + + def _normalize_text(self, text): + text = ''.join( + filter(lambda x: x in (string.digits + string.ascii_letters), text)) + return text.lower() + + def __call__(self, pred_label, *args, **kwargs): + preds, labels = pred_label + correct_num = 0 + all_num = 0 + norm_edit_dis = 0.0 + for (pred, pred_conf), (target, _) in zip(preds, labels): + if self.ignore_space: + pred = pred.replace(" ", "") + target = target.replace(" ", "") + if self.is_filter: + pred = self._normalize_text(pred) + target = self._normalize_text(target) + norm_edit_dis += Levenshtein.distance(pred, target) / max( + len(pred), len(target), 1) + if pred == target: + correct_num += 1 + all_num += 1 + self.correct_num += correct_num + self.all_num += all_num + self.norm_edit_dis += norm_edit_dis + return { + 'acc': correct_num / (all_num + self.eps), + 'norm_edit_dis': 1 - norm_edit_dis / (all_num + self.eps) + } + + def get_metric(self): + """ + return metrics { + 'acc': 0, + 'norm_edit_dis': 0, + } + """ + acc = 1.0 * self.correct_num / (self.all_num + self.eps) + norm_edit_dis = 1 - self.norm_edit_dis / (self.all_num + self.eps) + self.reset() + return {'acc': acc, 'norm_edit_dis': norm_edit_dis} + + def reset(self): + self.correct_num = 0 + self.all_num = 0 + self.norm_edit_dis = 0 diff --git a/backend/ppocr/metrics/table_metric.py b/backend/ppocr/metrics/table_metric.py new file mode 100644 index 0000000..ca4d647 --- /dev/null +++ b/backend/ppocr/metrics/table_metric.py @@ -0,0 +1,51 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
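A usage sketch (editor's addition, not part of the committed patch) for RecMetric: accuracy is an exact string match and norm_edit_dis is one minus the length-normalized Levenshtein distance. The strings and confidences are invented; the import path is assumed.

from ppocr.metrics.rec_metric import RecMetric  # import path assumed

metric = RecMetric(main_indicator='acc', is_filter=False, ignore_space=True)

preds = [('hello', 0.99), ('w0rld', 0.80)]   # (decoded text, confidence)
labels = [('hello', 1.0), ('world', 1.0)]

print(metric((preds, labels)))  # batch values: acc ~= 0.5, norm_edit_dis ~= 0.9
print(metric.get_metric())      # accumulated values; internal counters are reset afterwards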
+import numpy as np + + +class TableMetric(object): + def __init__(self, main_indicator='acc', **kwargs): + self.main_indicator = main_indicator + self.eps = 1e-5 + self.reset() + + def __call__(self, pred, batch, *args, **kwargs): + structure_probs = pred['structure_probs'].numpy() + structure_labels = batch[1] + correct_num = 0 + all_num = 0 + structure_probs = np.argmax(structure_probs, axis=2) + structure_labels = structure_labels[:, 1:] + batch_size = structure_probs.shape[0] + for bno in range(batch_size): + all_num += 1 + if (structure_probs[bno] == structure_labels[bno]).all(): + correct_num += 1 + self.correct_num += correct_num + self.all_num += all_num + return {'acc': correct_num * 1.0 / (all_num + self.eps), } + + def get_metric(self): + """ + return metrics { + 'acc': 0, + } + """ + acc = 1.0 * self.correct_num / (self.all_num + self.eps) + self.reset() + return {'acc': acc} + + def reset(self): + self.correct_num = 0 + self.all_num = 0 diff --git a/backend/ppocr/metrics/vqa_token_re_metric.py b/backend/ppocr/metrics/vqa_token_re_metric.py new file mode 100644 index 0000000..8a13bc0 --- /dev/null +++ b/backend/ppocr/metrics/vqa_token_re_metric.py @@ -0,0 +1,176 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
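A sketch (editor's addition, not part of the committed patch) of the core check in TableMetric: a sample counts as correct only when every predicted structure token matches the label after its start token is dropped. Shapes and values are invented (batch of 1, sequence length 4, 5 token classes), and plain numpy stands in for the paddle tensors.

import numpy as np

structure_probs = np.random.rand(1, 4, 5)        # (batch, seq_len, num_structure_tokens)
structure_labels = np.array([[0, 2, 2, 4, 1]])   # first column is the start token

pred_tokens = np.argmax(structure_probs, axis=2) # (batch, seq_len)
gt_tokens = structure_labels[:, 1:]              # aligned with the predictions
print((pred_tokens[0] == gt_tokens[0]).all())    # exact sequence match -> counted as correct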
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle + +__all__ = ['KIEMetric'] + + +class VQAReTokenMetric(object): + def __init__(self, main_indicator='hmean', **kwargs): + self.main_indicator = main_indicator + self.reset() + + def __call__(self, preds, batch, **kwargs): + pred_relations, relations, entities = preds + self.pred_relations_list.extend(pred_relations) + self.relations_list.extend(relations) + self.entities_list.extend(entities) + + def get_metric(self): + gt_relations = [] + for b in range(len(self.relations_list)): + rel_sent = [] + for head, tail in zip(self.relations_list[b]["head"], + self.relations_list[b]["tail"]): + rel = {} + rel["head_id"] = head + rel["head"] = (self.entities_list[b]["start"][rel["head_id"]], + self.entities_list[b]["end"][rel["head_id"]]) + rel["head_type"] = self.entities_list[b]["label"][rel[ + "head_id"]] + + rel["tail_id"] = tail + rel["tail"] = (self.entities_list[b]["start"][rel["tail_id"]], + self.entities_list[b]["end"][rel["tail_id"]]) + rel["tail_type"] = self.entities_list[b]["label"][rel[ + "tail_id"]] + + rel["type"] = 1 + rel_sent.append(rel) + gt_relations.append(rel_sent) + re_metrics = self.re_score( + self.pred_relations_list, gt_relations, mode="boundaries") + metrics = { + "precision": re_metrics["ALL"]["p"], + "recall": re_metrics["ALL"]["r"], + "hmean": re_metrics["ALL"]["f1"], + } + self.reset() + return metrics + + def reset(self): + self.pred_relations_list = [] + self.relations_list = [] + self.entities_list = [] + + def re_score(self, pred_relations, gt_relations, mode="strict"): + """Evaluate RE predictions + + Args: + pred_relations (list) : list of list of predicted relations (several relations in each sentence) + gt_relations (list) : list of list of ground truth relations + + rel = { "head": (start_idx (inclusive), end_idx (exclusive)), + "tail": (start_idx (inclusive), end_idx (exclusive)), + "head_type": ent_type, + "tail_type": ent_type, + "type": rel_type} + + vocab (Vocab) : dataset vocabulary + mode (str) : in 'strict' or 'boundaries'""" + + assert mode in ["strict", "boundaries"] + + relation_types = [v for v in [0, 1] if not v == 0] + scores = { + rel: { + "tp": 0, + "fp": 0, + "fn": 0 + } + for rel in relation_types + ["ALL"] + } + + # Count GT relations and Predicted relations + n_sents = len(gt_relations) + n_rels = sum([len([rel for rel in sent]) for sent in gt_relations]) + n_found = sum([len([rel for rel in sent]) for sent in pred_relations]) + + # Count TP, FP and FN per type + for pred_sent, gt_sent in zip(pred_relations, gt_relations): + for rel_type in relation_types: + # strict mode takes argument types into account + if mode == "strict": + pred_rels = {(rel["head"], rel["head_type"], rel["tail"], + rel["tail_type"]) + for rel in pred_sent + if rel["type"] == rel_type} + gt_rels = {(rel["head"], rel["head_type"], rel["tail"], + rel["tail_type"]) + for rel in gt_sent if rel["type"] == rel_type} + + # boundaries mode only takes argument spans into account + elif mode == "boundaries": + pred_rels = {(rel["head"], rel["tail"]) + for rel in pred_sent + if rel["type"] == rel_type} + gt_rels = {(rel["head"], rel["tail"]) + for rel in gt_sent if rel["type"] == rel_type} + + scores[rel_type]["tp"] += len(pred_rels & gt_rels) + scores[rel_type]["fp"] += len(pred_rels - gt_rels) + scores[rel_type]["fn"] += len(gt_rels - pred_rels) + + # Compute per entity Precision / Recall / F1 + for rel_type in 
scores.keys(): + if scores[rel_type]["tp"]: + scores[rel_type]["p"] = scores[rel_type]["tp"] / ( + scores[rel_type]["fp"] + scores[rel_type]["tp"]) + scores[rel_type]["r"] = scores[rel_type]["tp"] / ( + scores[rel_type]["fn"] + scores[rel_type]["tp"]) + else: + scores[rel_type]["p"], scores[rel_type]["r"] = 0, 0 + + if not scores[rel_type]["p"] + scores[rel_type]["r"] == 0: + scores[rel_type]["f1"] = ( + 2 * scores[rel_type]["p"] * scores[rel_type]["r"] / + (scores[rel_type]["p"] + scores[rel_type]["r"])) + else: + scores[rel_type]["f1"] = 0 + + # Compute micro F1 Scores + tp = sum([scores[rel_type]["tp"] for rel_type in relation_types]) + fp = sum([scores[rel_type]["fp"] for rel_type in relation_types]) + fn = sum([scores[rel_type]["fn"] for rel_type in relation_types]) + + if tp: + precision = tp / (tp + fp) + recall = tp / (tp + fn) + f1 = 2 * precision * recall / (precision + recall) + + else: + precision, recall, f1 = 0, 0, 0 + + scores["ALL"]["p"] = precision + scores["ALL"]["r"] = recall + scores["ALL"]["f1"] = f1 + scores["ALL"]["tp"] = tp + scores["ALL"]["fp"] = fp + scores["ALL"]["fn"] = fn + + # Compute Macro F1 Scores + scores["ALL"]["Macro_f1"] = np.mean( + [scores[ent_type]["f1"] for ent_type in relation_types]) + scores["ALL"]["Macro_p"] = np.mean( + [scores[ent_type]["p"] for ent_type in relation_types]) + scores["ALL"]["Macro_r"] = np.mean( + [scores[ent_type]["r"] for ent_type in relation_types]) + + return scores diff --git a/backend/ppocr/metrics/vqa_token_ser_metric.py b/backend/ppocr/metrics/vqa_token_ser_metric.py new file mode 100644 index 0000000..286d8ad --- /dev/null +++ b/backend/ppocr/metrics/vqa_token_ser_metric.py @@ -0,0 +1,47 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle + +__all__ = ['KIEMetric'] + + +class VQASerTokenMetric(object): + def __init__(self, main_indicator='hmean', **kwargs): + self.main_indicator = main_indicator + self.reset() + + def __call__(self, preds, batch, **kwargs): + preds, labels = preds + self.pred_list.extend(preds) + self.gt_list.extend(labels) + + def get_metric(self): + from seqeval.metrics import f1_score, precision_score, recall_score + metrics = { + "precision": precision_score(self.gt_list, self.pred_list), + "recall": recall_score(self.gt_list, self.pred_list), + "hmean": f1_score(self.gt_list, self.pred_list), + } + self.reset() + return metrics + + def reset(self): + self.pred_list = [] + self.gt_list = [] diff --git a/backend/ppocr/modeling/architectures/__init__.py b/backend/ppocr/modeling/architectures/__init__.py new file mode 100755 index 0000000..e9a01cf --- /dev/null +++ b/backend/ppocr/modeling/architectures/__init__.py @@ -0,0 +1,32 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import importlib + +from .base_model import BaseModel +from .distillation_model import DistillationModel + +__all__ = ['build_model'] + + +def build_model(config): + config = copy.deepcopy(config) + if not "name" in config: + arch = BaseModel(config) + else: + name = config.pop("name") + mod = importlib.import_module(__name__) + arch = getattr(mod, name)(config) + return arch diff --git a/backend/ppocr/modeling/architectures/base_model.py b/backend/ppocr/modeling/architectures/base_model.py new file mode 100644 index 0000000..c6b50d4 --- /dev/null +++ b/backend/ppocr/modeling/architectures/base_model.py @@ -0,0 +1,100 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from paddle import nn +from ppocr.modeling.transforms import build_transform +from ppocr.modeling.backbones import build_backbone +from ppocr.modeling.necks import build_neck +from ppocr.modeling.heads import build_head + +__all__ = ['BaseModel'] + + +class BaseModel(nn.Layer): + def __init__(self, config): + """ + the module for OCR. + args: + config (dict): the super parameters for module. + """ + super(BaseModel, self).__init__() + in_channels = config.get('in_channels', 3) + model_type = config['model_type'] + # build transfrom, + # for rec, transfrom can be TPS,None + # for det and cls, transfrom shoule to be None, + # if you make model differently, you can use transfrom in det and cls + if 'Transform' not in config or config['Transform'] is None: + self.use_transform = False + else: + self.use_transform = True + config['Transform']['in_channels'] = in_channels + self.transform = build_transform(config['Transform']) + in_channels = self.transform.out_channels + + # build backbone, backbone is need for del, rec and cls + config["Backbone"]['in_channels'] = in_channels + self.backbone = build_backbone(config["Backbone"], model_type) + in_channels = self.backbone.out_channels + + # build neck + # for rec, neck can be cnn,rnn or reshape(None) + # for det, neck can be FPN, BIFPN and so on. 
+ # for cls, neck should be none + if 'Neck' not in config or config['Neck'] is None: + self.use_neck = False + else: + self.use_neck = True + config['Neck']['in_channels'] = in_channels + self.neck = build_neck(config['Neck']) + in_channels = self.neck.out_channels + + # # build head, head is need for det, rec and cls + if 'Head' not in config or config['Head'] is None: + self.use_head = False + else: + self.use_head = True + config["Head"]['in_channels'] = in_channels + self.head = build_head(config["Head"]) + + self.return_all_feats = config.get("return_all_feats", False) + + def forward(self, x, data=None): + y = dict() + if self.use_transform: + x = self.transform(x) + x = self.backbone(x) + y["backbone_out"] = x + if self.use_neck: + x = self.neck(x) + y["neck_out"] = x + if self.use_head: + x = self.head(x, targets=data) + # for multi head, save ctc neck out for udml + if isinstance(x, dict) and 'ctc_neck' in x.keys(): + y["neck_out"] = x["ctc_neck"] + y["head_out"] = x + elif isinstance(x, dict): + y.update(x) + else: + y["head_out"] = x + if self.return_all_feats: + if self.training: + return y + else: + return {"head_out": y["head_out"]} + else: + return x diff --git a/backend/ppocr/modeling/architectures/distillation_model.py b/backend/ppocr/modeling/architectures/distillation_model.py new file mode 100644 index 0000000..cce8fd3 --- /dev/null +++ b/backend/ppocr/modeling/architectures/distillation_model.py @@ -0,0 +1,60 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import nn +from ppocr.modeling.transforms import build_transform +from ppocr.modeling.backbones import build_backbone +from ppocr.modeling.necks import build_neck +from ppocr.modeling.heads import build_head +from .base_model import BaseModel +from ppocr.utils.save_load import load_pretrained_params + +__all__ = ['DistillationModel'] + + +class DistillationModel(nn.Layer): + def __init__(self, config): + """ + the module for OCR distillation. + args: + config (dict): the super parameters for module. 
+ """ + super().__init__() + self.model_list = [] + self.model_name_list = [] + for key in config["Models"]: + model_config = config["Models"][key] + freeze_params = False + pretrained = None + if "freeze_params" in model_config: + freeze_params = model_config.pop("freeze_params") + if "pretrained" in model_config: + pretrained = model_config.pop("pretrained") + model = BaseModel(model_config) + if pretrained is not None: + load_pretrained_params(model, pretrained) + if freeze_params: + for param in model.parameters(): + param.trainable = False + self.model_list.append(self.add_sublayer(key, model)) + self.model_name_list.append(key) + + def forward(self, x, data=None): + result_dict = dict() + for idx, model_name in enumerate(self.model_name_list): + result_dict[model_name] = self.model_list[idx](x, data) + return result_dict diff --git a/backend/ppocr/modeling/backbones/__init__.py b/backend/ppocr/modeling/backbones/__init__.py new file mode 100755 index 0000000..072d6e0 --- /dev/null +++ b/backend/ppocr/modeling/backbones/__init__.py @@ -0,0 +1,64 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ["build_backbone"] + + +def build_backbone(config, model_type): + if model_type == "det" or model_type == "table": + from .det_mobilenet_v3 import MobileNetV3 + from .det_resnet_vd import ResNet + from .det_resnet_vd_sast import ResNet_SAST + support_dict = ["MobileNetV3", "ResNet", "ResNet_SAST"] + elif model_type == "rec" or model_type == "cls": + from .rec_mobilenet_v3 import MobileNetV3 + from .rec_resnet_vd import ResNet + from .rec_resnet_fpn import ResNetFPN + from .rec_mv1_enhance import MobileNetV1Enhance + from .rec_nrtr_mtb import MTB + from .rec_resnet_31 import ResNet31 + from .rec_resnet_aster import ResNet_ASTER + from .rec_micronet import MicroNet + from .rec_efficientb3_pren import EfficientNetb3_PREN + from .rec_svtrnet import SVTRNet + support_dict = [ + 'MobileNetV1Enhance', 'MobileNetV3', 'ResNet', 'ResNetFPN', 'MTB', + "ResNet31", "ResNet_ASTER", 'MicroNet', 'EfficientNetb3_PREN', + 'SVTRNet' + ] + elif model_type == "e2e": + from .e2e_resnet_vd_pg import ResNet + support_dict = ['ResNet'] + elif model_type == 'kie': + from .kie_unet_sdmgr import Kie_backbone + support_dict = ['Kie_backbone'] + elif model_type == "table": + from .table_resnet_vd import ResNet + from .table_mobilenet_v3 import MobileNetV3 + support_dict = ["ResNet", "MobileNetV3"] + elif model_type == 'vqa': + from .vqa_layoutlm import LayoutLMForSer, LayoutLMv2ForSer, LayoutLMv2ForRe, LayoutXLMForSer, LayoutXLMForRe + support_dict = [ + "LayoutLMForSer", "LayoutLMv2ForSer", 'LayoutLMv2ForRe', + "LayoutXLMForSer", 'LayoutXLMForRe' + ] + else: + raise NotImplementedError + + module_name = config.pop("name") + assert module_name in support_dict, Exception( + "when model typs is {}, backbone only support {}".format(model_type, + support_dict)) + module_class = eval(module_name)(**config) + return module_class diff --git 
a/backend/ppocr/modeling/backbones/det_mobilenet_v3.py b/backend/ppocr/modeling/backbones/det_mobilenet_v3.py new file mode 100755 index 0000000..05113ea --- /dev/null +++ b/backend/ppocr/modeling/backbones/det_mobilenet_v3.py @@ -0,0 +1,268 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr + +__all__ = ['MobileNetV3'] + + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class MobileNetV3(nn.Layer): + def __init__(self, + in_channels=3, + model_name='large', + scale=0.5, + disable_se=False, + **kwargs): + """ + the MobilenetV3 backbone network for detection module. + Args: + params(dict): the super parameters for build network + """ + super(MobileNetV3, self).__init__() + + self.disable_se = disable_se + + if model_name == "large": + cfg = [ + # k, exp, c, se, nl, s, + [3, 16, 16, False, 'relu', 1], + [3, 64, 24, False, 'relu', 2], + [3, 72, 24, False, 'relu', 1], + [5, 72, 40, True, 'relu', 2], + [5, 120, 40, True, 'relu', 1], + [5, 120, 40, True, 'relu', 1], + [3, 240, 80, False, 'hardswish', 2], + [3, 200, 80, False, 'hardswish', 1], + [3, 184, 80, False, 'hardswish', 1], + [3, 184, 80, False, 'hardswish', 1], + [3, 480, 112, True, 'hardswish', 1], + [3, 672, 112, True, 'hardswish', 1], + [5, 672, 160, True, 'hardswish', 2], + [5, 960, 160, True, 'hardswish', 1], + [5, 960, 160, True, 'hardswish', 1], + ] + cls_ch_squeeze = 960 + elif model_name == "small": + cfg = [ + # k, exp, c, se, nl, s, + [3, 16, 16, True, 'relu', 2], + [3, 72, 24, False, 'relu', 2], + [3, 88, 24, False, 'relu', 1], + [5, 96, 40, True, 'hardswish', 2], + [5, 240, 40, True, 'hardswish', 1], + [5, 240, 40, True, 'hardswish', 1], + [5, 120, 48, True, 'hardswish', 1], + [5, 144, 48, True, 'hardswish', 1], + [5, 288, 96, True, 'hardswish', 2], + [5, 576, 96, True, 'hardswish', 1], + [5, 576, 96, True, 'hardswish', 1], + ] + cls_ch_squeeze = 576 + else: + raise NotImplementedError("mode[" + model_name + + "_model] is not implemented!") + + supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25] + assert scale in supported_scale, \ + "supported scale are {} but input scale is {}".format(supported_scale, scale) + inplanes = 16 + # conv1 + self.conv = ConvBNLayer( + in_channels=in_channels, + out_channels=make_divisible(inplanes * scale), + kernel_size=3, + stride=2, + padding=1, + groups=1, + if_act=True, + act='hardswish') + + self.stages = [] + self.out_channels = [] + block_list = [] + i = 0 + inplanes = make_divisible(inplanes * scale) + for (k, exp, c, se, nl, s) in cfg: + se = se and not self.disable_se + start_idx = 2 if model_name == 'large' else 0 + if s == 2 and i > 
start_idx: + self.out_channels.append(inplanes) + self.stages.append(nn.Sequential(*block_list)) + block_list = [] + block_list.append( + ResidualUnit( + in_channels=inplanes, + mid_channels=make_divisible(scale * exp), + out_channels=make_divisible(scale * c), + kernel_size=k, + stride=s, + use_se=se, + act=nl)) + inplanes = make_divisible(scale * c) + i += 1 + block_list.append( + ConvBNLayer( + in_channels=inplanes, + out_channels=make_divisible(scale * cls_ch_squeeze), + kernel_size=1, + stride=1, + padding=0, + groups=1, + if_act=True, + act='hardswish')) + self.stages.append(nn.Sequential(*block_list)) + self.out_channels.append(make_divisible(scale * cls_ch_squeeze)) + for i, stage in enumerate(self.stages): + self.add_sublayer(sublayer=stage, name="stage{}".format(i)) + + def forward(self, x): + x = self.conv(x) + out_list = [] + for stage in self.stages: + x = stage(x) + out_list.append(x) + return out_list + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups=1, + if_act=True, + act=None): + super(ConvBNLayer, self).__init__() + self.if_act = if_act + self.act = act + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias_attr=False) + + self.bn = nn.BatchNorm(num_channels=out_channels, act=None) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.if_act: + if self.act == "relu": + x = F.relu(x) + elif self.act == "hardswish": + x = F.hardswish(x) + else: + print("The activation function({}) is selected incorrectly.". + format(self.act)) + exit() + return x + + +class ResidualUnit(nn.Layer): + def __init__(self, + in_channels, + mid_channels, + out_channels, + kernel_size, + stride, + use_se, + act=None): + super(ResidualUnit, self).__init__() + self.if_shortcut = stride == 1 and in_channels == out_channels + self.if_se = use_se + + self.expand_conv = ConvBNLayer( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=1, + stride=1, + padding=0, + if_act=True, + act=act) + self.bottleneck_conv = ConvBNLayer( + in_channels=mid_channels, + out_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + padding=int((kernel_size - 1) // 2), + groups=mid_channels, + if_act=True, + act=act) + if self.if_se: + self.mid_se = SEModule(mid_channels) + self.linear_conv = ConvBNLayer( + in_channels=mid_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + if_act=False, + act=None) + + def forward(self, inputs): + x = self.expand_conv(inputs) + x = self.bottleneck_conv(x) + if self.if_se: + x = self.mid_se(x) + x = self.linear_conv(x) + if self.if_shortcut: + x = paddle.add(inputs, x) + return x + + +class SEModule(nn.Layer): + def __init__(self, in_channels, reduction=4): + super(SEModule, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2D(1) + self.conv1 = nn.Conv2D( + in_channels=in_channels, + out_channels=in_channels // reduction, + kernel_size=1, + stride=1, + padding=0) + self.conv2 = nn.Conv2D( + in_channels=in_channels // reduction, + out_channels=in_channels, + kernel_size=1, + stride=1, + padding=0) + + def forward(self, inputs): + outputs = self.avg_pool(inputs) + outputs = self.conv1(outputs) + outputs = F.relu(outputs) + outputs = self.conv2(outputs) + outputs = F.hardsigmoid(outputs, slope=0.2, offset=0.5) + return inputs * outputs diff --git a/backend/ppocr/modeling/backbones/det_resnet_vd.py 
b/backend/ppocr/modeling/backbones/det_resnet_vd.py new file mode 100644 index 0000000..8c955a4 --- /dev/null +++ b/backend/ppocr/modeling/backbones/det_resnet_vd.py @@ -0,0 +1,351 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F + +from paddle.vision.ops import DeformConv2D +from paddle.regularizer import L2Decay +from paddle.nn.initializer import Normal, Constant, XavierUniform + +__all__ = ["ResNet"] + + +class DeformableConvV2(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + weight_attr=None, + bias_attr=None, + lr_scale=1, + regularizer=None, + skip_quant=False, + dcn_bias_regularizer=L2Decay(0.), + dcn_bias_lr_scale=2.): + super(DeformableConvV2, self).__init__() + self.offset_channel = 2 * kernel_size**2 * groups + self.mask_channel = kernel_size**2 * groups + + if bias_attr: + # in FCOS-DCN head, specifically need learning_rate and regularizer + dcn_bias_attr = ParamAttr( + initializer=Constant(value=0), + regularizer=dcn_bias_regularizer, + learning_rate=dcn_bias_lr_scale) + else: + # in ResNet backbone, do not need bias + dcn_bias_attr = False + self.conv_dcn = DeformConv2D( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2 * dilation, + dilation=dilation, + deformable_groups=groups, + weight_attr=weight_attr, + bias_attr=dcn_bias_attr) + + if lr_scale == 1 and regularizer is None: + offset_bias_attr = ParamAttr(initializer=Constant(0.)) + else: + offset_bias_attr = ParamAttr( + initializer=Constant(0.), + learning_rate=lr_scale, + regularizer=regularizer) + self.conv_offset = nn.Conv2D( + in_channels, + groups * 3 * kernel_size**2, + kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + weight_attr=ParamAttr(initializer=Constant(0.0)), + bias_attr=offset_bias_attr) + if skip_quant: + self.conv_offset.skip_quant = True + + def forward(self, x): + offset_mask = self.conv_offset(x) + offset, mask = paddle.split( + offset_mask, + num_or_sections=[self.offset_channel, self.mask_channel], + axis=1) + mask = F.sigmoid(mask) + y = self.conv_dcn(x, offset, mask=mask) + return y + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + is_dcn=False): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = nn.AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + if not is_dcn: + self._conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias_attr=False) + else: + self._conv = DeformableConvV2( + 
in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=2, #groups, + bias_attr=False) + self._batch_norm = nn.BatchNorm(out_channels, act=act) + + def forward(self, inputs): + if self.is_vd_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + is_dcn=False, ): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act='relu') + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + is_dcn=is_dcn) + self.conv2 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None) + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=1, + is_vd_mode=False if if_first else True) + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, ): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu') + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None) + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + is_vd_mode=False if if_first else True) + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv1) + y = F.relu(y) + return y + + +class ResNet(nn.Layer): + def __init__(self, + in_channels=3, + layers=50, + dcn_stage=None, + out_indices=None, + **kwargs): + super(ResNet, self).__init__() + + self.layers = layers + supported_layers = [18, 34, 50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, + 1024] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + self.dcn_stage = dcn_stage if dcn_stage is not None else [ + False, False, False, False + ] + self.out_indices = out_indices if out_indices is not None else [ + 0, 1, 2, 3 + ] + + self.conv1_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=32, + kernel_size=3, + stride=2, + act='relu') + self.conv1_2 = ConvBNLayer( + in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + act='relu') + self.conv1_3 = ConvBNLayer( + in_channels=32, + out_channels=64, + kernel_size=3, + stride=1, + act='relu') + self.pool2d_max = 
nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.stages = [] + self.out_channels = [] + if layers >= 50: + for block in range(len(depth)): + block_list = [] + shortcut = False + is_dcn = self.dcn_stage[block] + for i in range(depth[block]): + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + is_dcn=is_dcn)) + shortcut = True + block_list.append(bottleneck_block) + if block in self.out_indices: + self.out_channels.append(num_filters[block] * 4) + self.stages.append(nn.Sequential(*block_list)) + else: + for block in range(len(depth)): + block_list = [] + shortcut = False + # is_dcn = self.dcn_stage[block] + for i in range(depth[block]): + basic_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BasicBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block], + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0)) + shortcut = True + block_list.append(basic_block) + if block in self.out_indices: + self.out_channels.append(num_filters[block]) + self.stages.append(nn.Sequential(*block_list)) + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + out = [] + for i, block in enumerate(self.stages): + y = block(y) + if i in self.out_indices: + out.append(y) + return out diff --git a/backend/ppocr/modeling/backbones/det_resnet_vd_sast.py b/backend/ppocr/modeling/backbones/det_resnet_vd_sast.py new file mode 100644 index 0000000..c9376a8 --- /dev/null +++ b/backend/ppocr/modeling/backbones/det_resnet_vd_sast.py @@ -0,0 +1,285 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
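A usage sketch (editor's addition, not part of the committed patch) for the detection ResNet above: the forward pass returns one feature map per stage, and out_channels records the per-stage channel counts so a neck can be wired on top. The input size is invented, the import path is assumed, and a working paddle install is required.

import paddle
from ppocr.modeling.backbones.det_resnet_vd import ResNet  # import path assumed

backbone = ResNet(in_channels=3, layers=50)
print(backbone.out_channels)           # [256, 512, 1024, 2048] for the four stages

x = paddle.rand([1, 3, 640, 640])
feats = backbone(x)
for f in feats:
    print(f.shape)                     # strides 4, 8, 16, 32 relative to the input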
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F + +__all__ = ["ResNet_SAST"] + + +class ConvBNLayer(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + name=None, ): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = nn.AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self._conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = nn.BatchNorm( + out_channels, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + if self.is_vd_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + name=name + "_branch2b") + self.conv2 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv1) + y = F.relu(y) + return y + + +class ResNet_SAST(nn.Layer): + def __init__(self, in_channels=3, layers=50, **kwargs): + super(ResNet_SAST, self).__init__() + + self.layers = layers + supported_layers = 
[18, 34, 50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + # depth = [3, 4, 6, 3] + depth = [3, 4, 6, 3, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + # num_channels = [64, 256, 512, + # 1024] if layers >= 50 else [64, 64, 128, 256] + # num_filters = [64, 128, 256, 512] + num_channels = [64, 256, 512, + 1024, 2048] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512, 512] + + self.conv1_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=32, + kernel_size=3, + stride=2, + act='relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer( + in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + act='relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer( + in_channels=32, + out_channels=64, + kernel_size=3, + stride=1, + act='relu', + name="conv1_3") + self.pool2d_max = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.stages = [] + self.out_channels = [3, 64] + if layers >= 50: + for block in range(len(depth)): + block_list = [] + shortcut = False + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + shortcut = True + block_list.append(bottleneck_block) + self.out_channels.append(num_filters[block] * 4) + self.stages.append(nn.Sequential(*block_list)) + else: + for block in range(len(depth)): + block_list = [] + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BasicBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block], + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + shortcut = True + block_list.append(basic_block) + self.out_channels.append(num_filters[block]) + self.stages.append(nn.Sequential(*block_list)) + + def forward(self, inputs): + out = [inputs] + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + out.append(y) + y = self.pool2d_max(y) + for block in self.stages: + y = block(y) + out.append(y) + return out \ No newline at end of file diff --git a/backend/ppocr/modeling/backbones/e2e_resnet_vd_pg.py b/backend/ppocr/modeling/backbones/e2e_resnet_vd_pg.py new file mode 100644 index 0000000..97afd34 --- /dev/null +++ b/backend/ppocr/modeling/backbones/e2e_resnet_vd_pg.py @@ -0,0 +1,265 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F + +__all__ = ["ResNet"] + + +class ConvBNLayer(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + name=None, ): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = nn.AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self._conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = nn.BatchNorm( + out_channels, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + name=name + "_branch2b") + self.conv2 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=stride, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = 
self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv1) + y = F.relu(y) + return y + + +class ResNet(nn.Layer): + def __init__(self, in_channels=3, layers=50, **kwargs): + super(ResNet, self).__init__() + + self.layers = layers + supported_layers = [18, 34, 50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + # depth = [3, 4, 6, 3] + depth = [3, 4, 6, 3, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, 1024, + 2048] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512, 512] + + self.conv1_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=64, + kernel_size=7, + stride=2, + act='relu', + name="conv1_1") + self.pool2d_max = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.stages = [] + self.out_channels = [3, 64] + # num_filters = [64, 128, 256, 512, 512] + if layers >= 50: + for block in range(len(depth)): + block_list = [] + shortcut = False + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + shortcut = True + block_list.append(bottleneck_block) + self.out_channels.append(num_filters[block] * 4) + self.stages.append(nn.Sequential(*block_list)) + else: + for block in range(len(depth)): + block_list = [] + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BasicBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block], + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + shortcut = True + block_list.append(basic_block) + self.out_channels.append(num_filters[block]) + self.stages.append(nn.Sequential(*block_list)) + + def forward(self, inputs): + out = [inputs] + y = self.conv1_1(inputs) + out.append(y) + y = self.pool2d_max(y) + for block in self.stages: + y = block(y) + out.append(y) + return out diff --git a/backend/ppocr/modeling/backbones/kie_unet_sdmgr.py b/backend/ppocr/modeling/backbones/kie_unet_sdmgr.py new file mode 100644 index 0000000..545e4e7 --- /dev/null +++ b/backend/ppocr/modeling/backbones/kie_unet_sdmgr.py @@ -0,0 +1,186 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +import numpy as np +import cv2 + +__all__ = ["Kie_backbone"] + + +class Encoder(nn.Layer): + def __init__(self, num_channels, num_filters): + super(Encoder, self).__init__() + self.conv1 = nn.Conv2D( + num_channels, + num_filters, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + self.bn1 = nn.BatchNorm(num_filters, act='relu') + + self.conv2 = nn.Conv2D( + num_filters, + num_filters, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + self.bn2 = nn.BatchNorm(num_filters, act='relu') + + self.pool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + + def forward(self, inputs): + x = self.conv1(inputs) + x = self.bn1(x) + x = self.conv2(x) + x = self.bn2(x) + x_pooled = self.pool(x) + return x, x_pooled + + +class Decoder(nn.Layer): + def __init__(self, num_channels, num_filters): + super(Decoder, self).__init__() + + self.conv1 = nn.Conv2D( + num_channels, + num_filters, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + self.bn1 = nn.BatchNorm(num_filters, act='relu') + + self.conv2 = nn.Conv2D( + num_filters, + num_filters, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + self.bn2 = nn.BatchNorm(num_filters, act='relu') + + self.conv0 = nn.Conv2D( + num_channels, + num_filters, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False) + self.bn0 = nn.BatchNorm(num_filters, act='relu') + + def forward(self, inputs_prev, inputs): + x = self.conv0(inputs) + x = self.bn0(x) + x = paddle.nn.functional.interpolate( + x, scale_factor=2, mode='bilinear', align_corners=False) + x = paddle.concat([inputs_prev, x], axis=1) + x = self.conv1(x) + x = self.bn1(x) + x = self.conv2(x) + x = self.bn2(x) + return x + + +class UNet(nn.Layer): + def __init__(self): + super(UNet, self).__init__() + self.down1 = Encoder(num_channels=3, num_filters=16) + self.down2 = Encoder(num_channels=16, num_filters=32) + self.down3 = Encoder(num_channels=32, num_filters=64) + self.down4 = Encoder(num_channels=64, num_filters=128) + self.down5 = Encoder(num_channels=128, num_filters=256) + + self.up1 = Decoder(32, 16) + self.up2 = Decoder(64, 32) + self.up3 = Decoder(128, 64) + self.up4 = Decoder(256, 128) + self.out_channels = 16 + + def forward(self, inputs): + x1, _ = self.down1(inputs) + _, x2 = self.down2(x1) + _, x3 = self.down3(x2) + _, x4 = self.down4(x3) + _, x5 = self.down5(x4) + + x = self.up4(x4, x5) + x = self.up3(x3, x) + x = self.up2(x2, x) + x = self.up1(x1, x) + return x + + +class Kie_backbone(nn.Layer): + def __init__(self, in_channels, **kwargs): + super(Kie_backbone, self).__init__() + self.out_channels = 16 + self.img_feat = UNet() + self.maxpool = nn.MaxPool2D(kernel_size=7) + + def bbox2roi(self, bbox_list): + rois_list = [] + rois_num = [] + for img_id, bboxes in enumerate(bbox_list): + rois_num.append(bboxes.shape[0]) + rois_list.append(bboxes) + rois = paddle.concat(rois_list, 0) + rois_num = paddle.to_tensor(rois_num, dtype='int32') + return rois, rois_num + + def pre_process(self, 
img, relations, texts, gt_bboxes, tag, img_size): + img, relations, texts, gt_bboxes, tag, img_size = img.numpy( + ), relations.numpy(), texts.numpy(), gt_bboxes.numpy(), tag.numpy( + ).tolist(), img_size.numpy() + temp_relations, temp_texts, temp_gt_bboxes = [], [], [] + h, w = int(np.max(img_size[:, 0])), int(np.max(img_size[:, 1])) + img = paddle.to_tensor(img[:, :, :h, :w]) + batch = len(tag) + for i in range(batch): + num, recoder_len = tag[i][0], tag[i][1] + temp_relations.append( + paddle.to_tensor( + relations[i, :num, :num, :], dtype='float32')) + temp_texts.append( + paddle.to_tensor( + texts[i, :num, :recoder_len], dtype='float32')) + temp_gt_bboxes.append( + paddle.to_tensor( + gt_bboxes[i, :num, ...], dtype='float32')) + return img, temp_relations, temp_texts, temp_gt_bboxes + + def forward(self, inputs): + img = inputs[0] + relations, texts, gt_bboxes, tag, img_size = inputs[1], inputs[ + 2], inputs[3], inputs[5], inputs[-1] + img, relations, texts, gt_bboxes = self.pre_process( + img, relations, texts, gt_bboxes, tag, img_size) + x = self.img_feat(img) + boxes, rois_num = self.bbox2roi(gt_bboxes) + feats = paddle.fluid.layers.roi_align( + x, + boxes, + spatial_scale=1.0, + pooled_height=7, + pooled_width=7, + rois_num=rois_num) + feats = self.maxpool(feats).squeeze(-1).squeeze(-1) + return [relations, texts, feats] diff --git a/backend/ppocr/modeling/backbones/rec_efficientb3_pren.py b/backend/ppocr/modeling/backbones/rec_efficientb3_pren.py new file mode 100644 index 0000000..57eef17 --- /dev/null +++ b/backend/ppocr/modeling/backbones/rec_efficientb3_pren.py @@ -0,0 +1,228 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Code is refer from: +https://github.com/RuijieJ/pren/blob/main/Nets/EfficientNet.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +from collections import namedtuple +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +__all__ = ['EfficientNetb3'] + + +class EffB3Params: + @staticmethod + def get_global_params(): + """ + The fllowing are efficientnetb3's arch superparams, but to fit for scene + text recognition task, the resolution(image_size) here is changed + from 300 to 64. 
+ """ + GlobalParams = namedtuple('GlobalParams', [ + 'drop_connect_rate', 'width_coefficient', 'depth_coefficient', + 'depth_divisor', 'image_size' + ]) + global_params = GlobalParams( + drop_connect_rate=0.3, + width_coefficient=1.2, + depth_coefficient=1.4, + depth_divisor=8, + image_size=64) + return global_params + + @staticmethod + def get_block_params(): + BlockParams = namedtuple('BlockParams', [ + 'kernel_size', 'num_repeat', 'input_filters', 'output_filters', + 'expand_ratio', 'id_skip', 'se_ratio', 'stride' + ]) + block_params = [ + BlockParams(3, 1, 32, 16, 1, True, 0.25, 1), + BlockParams(3, 2, 16, 24, 6, True, 0.25, 2), + BlockParams(5, 2, 24, 40, 6, True, 0.25, 2), + BlockParams(3, 3, 40, 80, 6, True, 0.25, 2), + BlockParams(5, 3, 80, 112, 6, True, 0.25, 1), + BlockParams(5, 4, 112, 192, 6, True, 0.25, 2), + BlockParams(3, 1, 192, 320, 6, True, 0.25, 1) + ] + return block_params + + +class EffUtils: + @staticmethod + def round_filters(filters, global_params): + """Calculate and round number of filters based on depth multiplier.""" + multiplier = global_params.width_coefficient + if not multiplier: + return filters + divisor = global_params.depth_divisor + filters *= multiplier + new_filters = int(filters + divisor / 2) // divisor * divisor + if new_filters < 0.9 * filters: + new_filters += divisor + return int(new_filters) + + @staticmethod + def round_repeats(repeats, global_params): + """Round number of filters based on depth multiplier.""" + multiplier = global_params.depth_coefficient + if not multiplier: + return repeats + return int(math.ceil(multiplier * repeats)) + + +class ConvBlock(nn.Layer): + def __init__(self, block_params): + super(ConvBlock, self).__init__() + self.block_args = block_params + self.has_se = (self.block_args.se_ratio is not None) and \ + (0 < self.block_args.se_ratio <= 1) + self.id_skip = block_params.id_skip + + # expansion phase + self.input_filters = self.block_args.input_filters + output_filters = \ + self.block_args.input_filters * self.block_args.expand_ratio + if self.block_args.expand_ratio != 1: + self.expand_conv = nn.Conv2D( + self.input_filters, output_filters, 1, bias_attr=False) + self.bn0 = nn.BatchNorm(output_filters) + + # depthwise conv phase + k = self.block_args.kernel_size + s = self.block_args.stride + self.depthwise_conv = nn.Conv2D( + output_filters, + output_filters, + groups=output_filters, + kernel_size=k, + stride=s, + padding='same', + bias_attr=False) + self.bn1 = nn.BatchNorm(output_filters) + + # squeeze and excitation layer, if desired + if self.has_se: + num_squeezed_channels = max(1, + int(self.block_args.input_filters * + self.block_args.se_ratio)) + self.se_reduce = nn.Conv2D(output_filters, num_squeezed_channels, 1) + self.se_expand = nn.Conv2D(num_squeezed_channels, output_filters, 1) + + # output phase + self.final_oup = self.block_args.output_filters + self.project_conv = nn.Conv2D( + output_filters, self.final_oup, 1, bias_attr=False) + self.bn2 = nn.BatchNorm(self.final_oup) + self.swish = nn.Swish() + + def drop_connect(self, inputs, p, training): + if not training: + return inputs + + batch_size = inputs.shape[0] + keep_prob = 1 - p + random_tensor = keep_prob + random_tensor += paddle.rand([batch_size, 1, 1, 1], dtype=inputs.dtype) + random_tensor = paddle.to_tensor(random_tensor, place=inputs.place) + binary_tensor = paddle.floor(random_tensor) + output = inputs / keep_prob * binary_tensor + return output + + def forward(self, inputs, drop_connect_rate=None): + # expansion and depthwise conv + x = 
inputs + if self.block_args.expand_ratio != 1: + x = self.swish(self.bn0(self.expand_conv(inputs))) + x = self.swish(self.bn1(self.depthwise_conv(x))) + + # squeeze and excitation + if self.has_se: + x_squeezed = F.adaptive_avg_pool2d(x, 1) + x_squeezed = self.se_expand(self.swish(self.se_reduce(x_squeezed))) + x = F.sigmoid(x_squeezed) * x + x = self.bn2(self.project_conv(x)) + + # skip conntection and drop connect + if self.id_skip and self.block_args.stride == 1 and \ + self.input_filters == self.final_oup: + if drop_connect_rate: + x = self.drop_connect( + x, p=drop_connect_rate, training=self.training) + x = x + inputs + return x + + +class EfficientNetb3_PREN(nn.Layer): + def __init__(self, in_channels): + super(EfficientNetb3_PREN, self).__init__() + self.blocks_params = EffB3Params.get_block_params() + self.global_params = EffB3Params.get_global_params() + self.out_channels = [] + # stem + stem_channels = EffUtils.round_filters(32, self.global_params) + self.conv_stem = nn.Conv2D( + in_channels, stem_channels, 3, 2, padding='same', bias_attr=False) + self.bn0 = nn.BatchNorm(stem_channels) + + self.blocks = [] + # to extract three feature maps for fpn based on efficientnetb3 backbone + self.concerned_block_idxes = [7, 17, 25] + concerned_idx = 0 + for i, block_params in enumerate(self.blocks_params): + block_params = block_params._replace( + input_filters=EffUtils.round_filters(block_params.input_filters, + self.global_params), + output_filters=EffUtils.round_filters( + block_params.output_filters, self.global_params), + num_repeat=EffUtils.round_repeats(block_params.num_repeat, + self.global_params)) + self.blocks.append( + self.add_sublayer("{}-0".format(i), ConvBlock(block_params))) + concerned_idx += 1 + if concerned_idx in self.concerned_block_idxes: + self.out_channels.append(block_params.output_filters) + if block_params.num_repeat > 1: + block_params = block_params._replace( + input_filters=block_params.output_filters, stride=1) + for j in range(block_params.num_repeat - 1): + self.blocks.append( + self.add_sublayer('{}-{}'.format(i, j + 1), + ConvBlock(block_params))) + concerned_idx += 1 + if concerned_idx in self.concerned_block_idxes: + self.out_channels.append(block_params.output_filters) + + self.swish = nn.Swish() + + def forward(self, inputs): + outs = [] + + x = self.swish(self.bn0(self.conv_stem(inputs))) + for idx, block in enumerate(self.blocks): + drop_connect_rate = self.global_params.drop_connect_rate + if drop_connect_rate: + drop_connect_rate *= float(idx) / len(self.blocks) + x = block(x, drop_connect_rate=drop_connect_rate) + if idx in self.concerned_block_idxes: + outs.append(x) + return outs diff --git a/backend/ppocr/modeling/backbones/rec_micronet.py b/backend/ppocr/modeling/backbones/rec_micronet.py new file mode 100644 index 0000000..b0ae5a1 --- /dev/null +++ b/backend/ppocr/modeling/backbones/rec_micronet.py @@ -0,0 +1,528 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
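The rec_micronet.py file added here vendors the MicroNet recognition backbone: a stem layer followed by DYMicroBlock units whose widths, group sizes and DYShiftMax activation settings come from the M0/M1/M2/M3 config tables below. A rough smoke-test sketch of driving the backbone directly, assuming the backend directory is on PYTHONPATH so the ppocr package resolves; the 32x320 input size is only an example:

import paddle
from ppocr.modeling.backbones.rec_micronet import MicroNet

# 'M3' picks the largest of the four configs (see the class docstring below).
backbone = MicroNet(mode='M3')
x = paddle.rand([1, 3, 32, 320])            # dummy N, C, H, W text-line image
feat = backbone(x)                          # stacked DYMicroBlocks, then a 2x2 max-pool
print(feat.shape, backbone.out_channels)    # out_channels is make_divisible(432)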
+""" +This code is refer from: +https://github.com/liyunsheng13/micronet/blob/main/backbone/micronet.py +https://github.com/liyunsheng13/micronet/blob/main/backbone/activation.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn + +from ppocr.modeling.backbones.det_mobilenet_v3 import make_divisible + +M0_cfgs = [ + # s, n, c, ks, c1, c2, g1, g2, c3, g3, g4, y1, y2, y3, r + [2, 1, 8, 3, 2, 2, 0, 4, 8, 2, 2, 2, 0, 1, 1], + [2, 1, 12, 3, 2, 2, 0, 8, 12, 4, 4, 2, 2, 1, 1], + [2, 1, 16, 5, 2, 2, 0, 12, 16, 4, 4, 2, 2, 1, 1], + [1, 1, 32, 5, 1, 4, 4, 4, 32, 4, 4, 2, 2, 1, 1], + [2, 1, 64, 5, 1, 4, 8, 8, 64, 8, 8, 2, 2, 1, 1], + [1, 1, 96, 3, 1, 4, 8, 8, 96, 8, 8, 2, 2, 1, 2], + [1, 1, 384, 3, 1, 4, 12, 12, 0, 0, 0, 2, 2, 1, 2], +] +M1_cfgs = [ + # s, n, c, ks, c1, c2, g1, g2, c3, g3, g4 + [2, 1, 8, 3, 2, 2, 0, 6, 8, 2, 2, 2, 0, 1, 1], + [2, 1, 16, 3, 2, 2, 0, 8, 16, 4, 4, 2, 2, 1, 1], + [2, 1, 16, 5, 2, 2, 0, 16, 16, 4, 4, 2, 2, 1, 1], + [1, 1, 32, 5, 1, 6, 4, 4, 32, 4, 4, 2, 2, 1, 1], + [2, 1, 64, 5, 1, 6, 8, 8, 64, 8, 8, 2, 2, 1, 1], + [1, 1, 96, 3, 1, 6, 8, 8, 96, 8, 8, 2, 2, 1, 2], + [1, 1, 576, 3, 1, 6, 12, 12, 0, 0, 0, 2, 2, 1, 2], +] +M2_cfgs = [ + # s, n, c, ks, c1, c2, g1, g2, c3, g3, g4 + [2, 1, 12, 3, 2, 2, 0, 8, 12, 4, 4, 2, 0, 1, 1], + [2, 1, 16, 3, 2, 2, 0, 12, 16, 4, 4, 2, 2, 1, 1], + [1, 1, 24, 3, 2, 2, 0, 16, 24, 4, 4, 2, 2, 1, 1], + [2, 1, 32, 5, 1, 6, 6, 6, 32, 4, 4, 2, 2, 1, 1], + [1, 1, 32, 5, 1, 6, 8, 8, 32, 4, 4, 2, 2, 1, 2], + [1, 1, 64, 5, 1, 6, 8, 8, 64, 8, 8, 2, 2, 1, 2], + [2, 1, 96, 5, 1, 6, 8, 8, 96, 8, 8, 2, 2, 1, 2], + [1, 1, 128, 3, 1, 6, 12, 12, 128, 8, 8, 2, 2, 1, 2], + [1, 1, 768, 3, 1, 6, 16, 16, 0, 0, 0, 2, 2, 1, 2], +] +M3_cfgs = [ + # s, n, c, ks, c1, c2, g1, g2, c3, g3, g4 + [2, 1, 16, 3, 2, 2, 0, 12, 16, 4, 4, 0, 2, 0, 1], + [2, 1, 24, 3, 2, 2, 0, 16, 24, 4, 4, 0, 2, 0, 1], + [1, 1, 24, 3, 2, 2, 0, 24, 24, 4, 4, 0, 2, 0, 1], + [2, 1, 32, 5, 1, 6, 6, 6, 32, 4, 4, 0, 2, 0, 1], + [1, 1, 32, 5, 1, 6, 8, 8, 32, 4, 4, 0, 2, 0, 2], + [1, 1, 64, 5, 1, 6, 8, 8, 48, 8, 8, 0, 2, 0, 2], + [1, 1, 80, 5, 1, 6, 8, 8, 80, 8, 8, 0, 2, 0, 2], + [1, 1, 80, 5, 1, 6, 10, 10, 80, 8, 8, 0, 2, 0, 2], + [1, 1, 120, 5, 1, 6, 10, 10, 120, 10, 10, 0, 2, 0, 2], + [1, 1, 120, 5, 1, 6, 12, 12, 120, 10, 10, 0, 2, 0, 2], + [1, 1, 144, 3, 1, 6, 12, 12, 144, 12, 12, 0, 2, 0, 2], + [1, 1, 432, 3, 1, 3, 12, 12, 0, 0, 0, 0, 2, 0, 2], +] + + +def get_micronet_config(mode): + return eval(mode + '_cfgs') + + +class MaxGroupPooling(nn.Layer): + def __init__(self, channel_per_group=2): + super(MaxGroupPooling, self).__init__() + self.channel_per_group = channel_per_group + + def forward(self, x): + if self.channel_per_group == 1: + return x + # max op + b, c, h, w = x.shape + + # reshape + y = paddle.reshape(x, [b, c // self.channel_per_group, -1, h, w]) + out = paddle.max(y, axis=2) + return out + + +class SpatialSepConvSF(nn.Layer): + def __init__(self, inp, oups, kernel_size, stride): + super(SpatialSepConvSF, self).__init__() + + oup1, oup2 = oups + self.conv = nn.Sequential( + nn.Conv2D( + inp, + oup1, (kernel_size, 1), (stride, 1), (kernel_size // 2, 0), + bias_attr=False, + groups=1), + nn.BatchNorm2D(oup1), + nn.Conv2D( + oup1, + oup1 * oup2, (1, kernel_size), (1, stride), + (0, kernel_size // 2), + bias_attr=False, + groups=oup1), + nn.BatchNorm2D(oup1 * oup2), + ChannelShuffle(oup1), ) + + def forward(self, x): + out = self.conv(x) + return out + + +class 
ChannelShuffle(nn.Layer): + def __init__(self, groups): + super(ChannelShuffle, self).__init__() + self.groups = groups + + def forward(self, x): + b, c, h, w = x.shape + + channels_per_group = c // self.groups + + # reshape + x = paddle.reshape(x, [b, self.groups, channels_per_group, h, w]) + + x = paddle.transpose(x, (0, 2, 1, 3, 4)) + out = paddle.reshape(x, [b, -1, h, w]) + + return out + + +class StemLayer(nn.Layer): + def __init__(self, inp, oup, stride, groups=(4, 4)): + super(StemLayer, self).__init__() + + g1, g2 = groups + self.stem = nn.Sequential( + SpatialSepConvSF(inp, groups, 3, stride), + MaxGroupPooling(2) if g1 * g2 == 2 * oup else nn.ReLU6()) + + def forward(self, x): + out = self.stem(x) + return out + + +class DepthSpatialSepConv(nn.Layer): + def __init__(self, inp, expand, kernel_size, stride): + super(DepthSpatialSepConv, self).__init__() + + exp1, exp2 = expand + + hidden_dim = inp * exp1 + oup = inp * exp1 * exp2 + + self.conv = nn.Sequential( + nn.Conv2D( + inp, + inp * exp1, (kernel_size, 1), (stride, 1), + (kernel_size // 2, 0), + bias_attr=False, + groups=inp), + nn.BatchNorm2D(inp * exp1), + nn.Conv2D( + hidden_dim, + oup, (1, kernel_size), + 1, (0, kernel_size // 2), + bias_attr=False, + groups=hidden_dim), + nn.BatchNorm2D(oup)) + + def forward(self, x): + x = self.conv(x) + return x + + +class GroupConv(nn.Layer): + def __init__(self, inp, oup, groups=2): + super(GroupConv, self).__init__() + self.inp = inp + self.oup = oup + self.groups = groups + self.conv = nn.Sequential( + nn.Conv2D( + inp, oup, 1, 1, 0, bias_attr=False, groups=self.groups[0]), + nn.BatchNorm2D(oup)) + + def forward(self, x): + x = self.conv(x) + return x + + +class DepthConv(nn.Layer): + def __init__(self, inp, oup, kernel_size, stride): + super(DepthConv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2D( + inp, + oup, + kernel_size, + stride, + kernel_size // 2, + bias_attr=False, + groups=inp), + nn.BatchNorm2D(oup)) + + def forward(self, x): + out = self.conv(x) + return out + + +class DYShiftMax(nn.Layer): + def __init__(self, + inp, + oup, + reduction=4, + act_max=1.0, + act_relu=True, + init_a=[0.0, 0.0], + init_b=[0.0, 0.0], + relu_before_pool=False, + g=None, + expansion=False): + super(DYShiftMax, self).__init__() + self.oup = oup + self.act_max = act_max * 2 + self.act_relu = act_relu + self.avg_pool = nn.Sequential(nn.ReLU() if relu_before_pool == True else + nn.Sequential(), nn.AdaptiveAvgPool2D(1)) + + self.exp = 4 if act_relu else 2 + self.init_a = init_a + self.init_b = init_b + + # determine squeeze + squeeze = make_divisible(inp // reduction, 4) + if squeeze < 4: + squeeze = 4 + + self.fc = nn.Sequential( + nn.Linear(inp, squeeze), + nn.ReLU(), nn.Linear(squeeze, oup * self.exp), nn.Hardsigmoid()) + + if g is None: + g = 1 + self.g = g[1] + if self.g != 1 and expansion: + self.g = inp // self.g + + self.gc = inp // self.g + index = paddle.to_tensor([range(inp)]) + index = paddle.reshape(index, [1, inp, 1, 1]) + index = paddle.reshape(index, [1, self.g, self.gc, 1, 1]) + indexgs = paddle.split(index, [1, self.g - 1], axis=1) + indexgs = paddle.concat((indexgs[1], indexgs[0]), axis=1) + indexs = paddle.split(indexgs, [1, self.gc - 1], axis=2) + indexs = paddle.concat((indexs[1], indexs[0]), axis=2) + self.index = paddle.reshape(indexs, [inp]) + self.expansion = expansion + + def forward(self, x): + x_in = x + x_out = x + + b, c, _, _ = x_in.shape + y = self.avg_pool(x_in) + y = paddle.reshape(y, [b, c]) + y = self.fc(y) + y = paddle.reshape(y, [b, self.oup * 
self.exp, 1, 1]) + y = (y - 0.5) * self.act_max + + n2, c2, h2, w2 = x_out.shape + x2 = paddle.to_tensor(x_out.numpy()[:, self.index.numpy(), :, :]) + + if self.exp == 4: + temp = y.shape + a1, b1, a2, b2 = paddle.split(y, temp[1] // self.oup, axis=1) + + a1 = a1 + self.init_a[0] + a2 = a2 + self.init_a[1] + + b1 = b1 + self.init_b[0] + b2 = b2 + self.init_b[1] + + z1 = x_out * a1 + x2 * b1 + z2 = x_out * a2 + x2 * b2 + + out = paddle.maximum(z1, z2) + + elif self.exp == 2: + temp = y.shape + a1, b1 = paddle.split(y, temp[1] // self.oup, axis=1) + a1 = a1 + self.init_a[0] + b1 = b1 + self.init_b[0] + out = x_out * a1 + x2 * b1 + + return out + + +class DYMicroBlock(nn.Layer): + def __init__(self, + inp, + oup, + kernel_size=3, + stride=1, + ch_exp=(2, 2), + ch_per_group=4, + groups_1x1=(1, 1), + depthsep=True, + shuffle=False, + activation_cfg=None): + super(DYMicroBlock, self).__init__() + + self.identity = stride == 1 and inp == oup + + y1, y2, y3 = activation_cfg['dy'] + act_reduction = 8 * activation_cfg['ratio'] + init_a = activation_cfg['init_a'] + init_b = activation_cfg['init_b'] + + t1 = ch_exp + gs1 = ch_per_group + hidden_fft, g1, g2 = groups_1x1 + hidden_dim2 = inp * t1[0] * t1[1] + + if gs1[0] == 0: + self.layers = nn.Sequential( + DepthSpatialSepConv(inp, t1, kernel_size, stride), + DYShiftMax( + hidden_dim2, + hidden_dim2, + act_max=2.0, + act_relu=True if y2 == 2 else False, + init_a=init_a, + reduction=act_reduction, + init_b=init_b, + g=gs1, + expansion=False) if y2 > 0 else nn.ReLU6(), + ChannelShuffle(gs1[1]) if shuffle else nn.Sequential(), + ChannelShuffle(hidden_dim2 // 2) + if shuffle and y2 != 0 else nn.Sequential(), + GroupConv(hidden_dim2, oup, (g1, g2)), + DYShiftMax( + oup, + oup, + act_max=2.0, + act_relu=False, + init_a=[1.0, 0.0], + reduction=act_reduction // 2, + init_b=[0.0, 0.0], + g=(g1, g2), + expansion=False) if y3 > 0 else nn.Sequential(), + ChannelShuffle(g2) if shuffle else nn.Sequential(), + ChannelShuffle(oup // 2) + if shuffle and oup % 2 == 0 and y3 != 0 else nn.Sequential(), ) + elif g2 == 0: + self.layers = nn.Sequential( + GroupConv(inp, hidden_dim2, gs1), + DYShiftMax( + hidden_dim2, + hidden_dim2, + act_max=2.0, + act_relu=False, + init_a=[1.0, 0.0], + reduction=act_reduction, + init_b=[0.0, 0.0], + g=gs1, + expansion=False) if y3 > 0 else nn.Sequential(), ) + else: + self.layers = nn.Sequential( + GroupConv(inp, hidden_dim2, gs1), + DYShiftMax( + hidden_dim2, + hidden_dim2, + act_max=2.0, + act_relu=True if y1 == 2 else False, + init_a=init_a, + reduction=act_reduction, + init_b=init_b, + g=gs1, + expansion=False) if y1 > 0 else nn.ReLU6(), + ChannelShuffle(gs1[1]) if shuffle else nn.Sequential(), + DepthSpatialSepConv(hidden_dim2, (1, 1), kernel_size, stride) + if depthsep else + DepthConv(hidden_dim2, hidden_dim2, kernel_size, stride), + nn.Sequential(), + DYShiftMax( + hidden_dim2, + hidden_dim2, + act_max=2.0, + act_relu=True if y2 == 2 else False, + init_a=init_a, + reduction=act_reduction, + init_b=init_b, + g=gs1, + expansion=True) if y2 > 0 else nn.ReLU6(), + ChannelShuffle(hidden_dim2 // 4) + if shuffle and y1 != 0 and y2 != 0 else nn.Sequential() + if y1 == 0 and y2 == 0 else ChannelShuffle(hidden_dim2 // 2), + GroupConv(hidden_dim2, oup, (g1, g2)), + DYShiftMax( + oup, + oup, + act_max=2.0, + act_relu=False, + init_a=[1.0, 0.0], + reduction=act_reduction // 2 + if oup < hidden_dim2 else act_reduction, + init_b=[0.0, 0.0], + g=(g1, g2), + expansion=False) if y3 > 0 else nn.Sequential(), + ChannelShuffle(g2) if shuffle else 
nn.Sequential(), + ChannelShuffle(oup // 2) + if shuffle and y3 != 0 else nn.Sequential(), ) + + def forward(self, x): + identity = x + out = self.layers(x) + + if self.identity: + out = out + identity + + return out + + +class MicroNet(nn.Layer): + """ + the MicroNet backbone network for recognition module. + Args: + mode(str): {'M0', 'M1', 'M2', 'M3'} + Four models are proposed based on four different computational costs (4M, 6M, 12M, 21M MAdds) + Default: 'M3'. + """ + + def __init__(self, mode='M3', **kwargs): + super(MicroNet, self).__init__() + + self.cfgs = get_micronet_config(mode) + + activation_cfg = {} + if mode == 'M0': + input_channel = 4 + stem_groups = 2, 2 + out_ch = 384 + activation_cfg['init_a'] = 1.0, 1.0 + activation_cfg['init_b'] = 0.0, 0.0 + elif mode == 'M1': + input_channel = 6 + stem_groups = 3, 2 + out_ch = 576 + activation_cfg['init_a'] = 1.0, 1.0 + activation_cfg['init_b'] = 0.0, 0.0 + elif mode == 'M2': + input_channel = 8 + stem_groups = 4, 2 + out_ch = 768 + activation_cfg['init_a'] = 1.0, 1.0 + activation_cfg['init_b'] = 0.0, 0.0 + elif mode == 'M3': + input_channel = 12 + stem_groups = 4, 3 + out_ch = 432 + activation_cfg['init_a'] = 1.0, 0.5 + activation_cfg['init_b'] = 0.0, 0.5 + else: + raise NotImplementedError("mode[" + mode + + "_model] is not implemented!") + + layers = [StemLayer(3, input_channel, stride=2, groups=stem_groups)] + + for idx, val in enumerate(self.cfgs): + s, n, c, ks, c1, c2, g1, g2, c3, g3, g4, y1, y2, y3, r = val + + t1 = (c1, c2) + gs1 = (g1, g2) + gs2 = (c3, g3, g4) + activation_cfg['dy'] = [y1, y2, y3] + activation_cfg['ratio'] = r + + output_channel = c + layers.append( + DYMicroBlock( + input_channel, + output_channel, + kernel_size=ks, + stride=s, + ch_exp=t1, + ch_per_group=gs1, + groups_1x1=gs2, + depthsep=True, + shuffle=True, + activation_cfg=activation_cfg, )) + input_channel = output_channel + for i in range(1, n): + layers.append( + DYMicroBlock( + input_channel, + output_channel, + kernel_size=ks, + stride=1, + ch_exp=t1, + ch_per_group=gs1, + groups_1x1=gs2, + depthsep=True, + shuffle=True, + activation_cfg=activation_cfg, )) + input_channel = output_channel + self.features = nn.Sequential(*layers) + + self.pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + + self.out_channels = make_divisible(out_ch) + + def forward(self, x): + x = self.features(x) + x = self.pool(x) + return x diff --git a/backend/ppocr/modeling/backbones/rec_mobilenet_v3.py b/backend/ppocr/modeling/backbones/rec_mobilenet_v3.py new file mode 100644 index 0000000..917e000 --- /dev/null +++ b/backend/ppocr/modeling/backbones/rec_mobilenet_v3.py @@ -0,0 +1,138 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
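The rec_mobilenet_v3.py file added here adapts the detection MobileNetV3 for recognition: the strided stages use (s, 1) stride tuples, so the feature map is repeatedly halved in height while its width (the reading direction) is preserved until the final 2x2 max-pool. A rough usage sketch, assuming the ppocr package is importable; the small_stride override and the 32x320 input are illustrative values only (with the default small_stride of [2, 2, 2, 2], a 32-pixel-high input would already be reduced to height 1 before the last pool):

import paddle
from ppocr.modeling.backbones.rec_mobilenet_v3 import MobileNetV3

backbone = MobileNetV3(in_channels=3, model_name='small', scale=0.5,
                       small_stride=[1, 2, 2, 2])
x = paddle.rand([1, 3, 32, 320])        # dummy text-line image
y = backbone(x)                         # height collapses to 1, width stays long
print(y.shape, backbone.out_channels)   # roughly [1, 288, 1, 80]; out_channels 288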
+ +from paddle import nn + +from ppocr.modeling.backbones.det_mobilenet_v3 import ResidualUnit, ConvBNLayer, make_divisible + +__all__ = ['MobileNetV3'] + + +class MobileNetV3(nn.Layer): + def __init__(self, + in_channels=3, + model_name='small', + scale=0.5, + large_stride=None, + small_stride=None, + disable_se=False, + **kwargs): + super(MobileNetV3, self).__init__() + self.disable_se = disable_se + if small_stride is None: + small_stride = [2, 2, 2, 2] + if large_stride is None: + large_stride = [1, 2, 2, 2] + + assert isinstance(large_stride, list), "large_stride type must " \ + "be list but got {}".format(type(large_stride)) + assert isinstance(small_stride, list), "small_stride type must " \ + "be list but got {}".format(type(small_stride)) + assert len(large_stride) == 4, "large_stride length must be " \ + "4 but got {}".format(len(large_stride)) + assert len(small_stride) == 4, "small_stride length must be " \ + "4 but got {}".format(len(small_stride)) + + if model_name == "large": + cfg = [ + # k, exp, c, se, nl, s, + [3, 16, 16, False, 'relu', large_stride[0]], + [3, 64, 24, False, 'relu', (large_stride[1], 1)], + [3, 72, 24, False, 'relu', 1], + [5, 72, 40, True, 'relu', (large_stride[2], 1)], + [5, 120, 40, True, 'relu', 1], + [5, 120, 40, True, 'relu', 1], + [3, 240, 80, False, 'hardswish', 1], + [3, 200, 80, False, 'hardswish', 1], + [3, 184, 80, False, 'hardswish', 1], + [3, 184, 80, False, 'hardswish', 1], + [3, 480, 112, True, 'hardswish', 1], + [3, 672, 112, True, 'hardswish', 1], + [5, 672, 160, True, 'hardswish', (large_stride[3], 1)], + [5, 960, 160, True, 'hardswish', 1], + [5, 960, 160, True, 'hardswish', 1], + ] + cls_ch_squeeze = 960 + elif model_name == "small": + cfg = [ + # k, exp, c, se, nl, s, + [3, 16, 16, True, 'relu', (small_stride[0], 1)], + [3, 72, 24, False, 'relu', (small_stride[1], 1)], + [3, 88, 24, False, 'relu', 1], + [5, 96, 40, True, 'hardswish', (small_stride[2], 1)], + [5, 240, 40, True, 'hardswish', 1], + [5, 240, 40, True, 'hardswish', 1], + [5, 120, 48, True, 'hardswish', 1], + [5, 144, 48, True, 'hardswish', 1], + [5, 288, 96, True, 'hardswish', (small_stride[3], 1)], + [5, 576, 96, True, 'hardswish', 1], + [5, 576, 96, True, 'hardswish', 1], + ] + cls_ch_squeeze = 576 + else: + raise NotImplementedError("mode[" + model_name + + "_model] is not implemented!") + + supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25] + assert scale in supported_scale, \ + "supported scales are {} but input scale is {}".format(supported_scale, scale) + + inplanes = 16 + # conv1 + self.conv1 = ConvBNLayer( + in_channels=in_channels, + out_channels=make_divisible(inplanes * scale), + kernel_size=3, + stride=2, + padding=1, + groups=1, + if_act=True, + act='hardswish') + i = 0 + block_list = [] + inplanes = make_divisible(inplanes * scale) + for (k, exp, c, se, nl, s) in cfg: + se = se and not self.disable_se + block_list.append( + ResidualUnit( + in_channels=inplanes, + mid_channels=make_divisible(scale * exp), + out_channels=make_divisible(scale * c), + kernel_size=k, + stride=s, + use_se=se, + act=nl)) + inplanes = make_divisible(scale * c) + i += 1 + self.blocks = nn.Sequential(*block_list) + + self.conv2 = ConvBNLayer( + in_channels=inplanes, + out_channels=make_divisible(scale * cls_ch_squeeze), + kernel_size=1, + stride=1, + padding=0, + groups=1, + if_act=True, + act='hardswish') + + self.pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + self.out_channels = make_divisible(scale * cls_ch_squeeze) + + def forward(self, x): + x = self.conv1(x) + x = 
self.blocks(x) + x = self.conv2(x) + x = self.pool(x) + return x diff --git a/backend/ppocr/modeling/backbones/rec_mv1_enhance.py b/backend/ppocr/modeling/backbones/rec_mv1_enhance.py new file mode 100644 index 0000000..bb6af5e --- /dev/null +++ b/backend/ppocr/modeling/backbones/rec_mv1_enhance.py @@ -0,0 +1,256 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This code is refer from: https://github.com/PaddlePaddle/PaddleClas/blob/develop/ppcls/arch/backbone/legendary_models/pp_lcnet.py + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import numpy as np +import paddle +from paddle import ParamAttr, reshape, transpose +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import KaimingNormal +from paddle.regularizer import L2Decay +from paddle.nn.functional import hardswish, hardsigmoid + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + filter_size, + num_filters, + stride, + padding, + channels=None, + num_groups=1, + act='hard_swish'): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False) + + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class DepthwiseSeparable(nn.Layer): + def __init__(self, + num_channels, + num_filters1, + num_filters2, + num_groups, + stride, + scale, + dw_size=3, + padding=1, + use_se=False): + super(DepthwiseSeparable, self).__init__() + self.use_se = use_se + self._depthwise_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=int(num_filters1 * scale), + filter_size=dw_size, + stride=stride, + padding=padding, + num_groups=int(num_groups * scale)) + if use_se: + self._se = SEModule(int(num_filters1 * scale)) + self._pointwise_conv = ConvBNLayer( + num_channels=int(num_filters1 * scale), + filter_size=1, + num_filters=int(num_filters2 * scale), + stride=1, + padding=0) + + def forward(self, inputs): + y = self._depthwise_conv(inputs) + if self.use_se: + y = self._se(y) + y = self._pointwise_conv(y) + return y + + +class MobileNetV1Enhance(nn.Layer): + def __init__(self, + in_channels=3, + scale=0.5, + last_conv_stride=1, + last_pool_type='max', + **kwargs): + super().__init__() + self.scale = scale + self.block_list = [] + + self.conv1 = ConvBNLayer( + num_channels=3, + filter_size=3, + channels=3, + num_filters=int(32 * scale), + stride=2, + padding=1) + + conv2_1 = DepthwiseSeparable( + num_channels=int(32 * 
scale), + num_filters1=32, + num_filters2=64, + num_groups=32, + stride=1, + scale=scale) + self.block_list.append(conv2_1) + + conv2_2 = DepthwiseSeparable( + num_channels=int(64 * scale), + num_filters1=64, + num_filters2=128, + num_groups=64, + stride=1, + scale=scale) + self.block_list.append(conv2_2) + + conv3_1 = DepthwiseSeparable( + num_channels=int(128 * scale), + num_filters1=128, + num_filters2=128, + num_groups=128, + stride=1, + scale=scale) + self.block_list.append(conv3_1) + + conv3_2 = DepthwiseSeparable( + num_channels=int(128 * scale), + num_filters1=128, + num_filters2=256, + num_groups=128, + stride=(2, 1), + scale=scale) + self.block_list.append(conv3_2) + + conv4_1 = DepthwiseSeparable( + num_channels=int(256 * scale), + num_filters1=256, + num_filters2=256, + num_groups=256, + stride=1, + scale=scale) + self.block_list.append(conv4_1) + + conv4_2 = DepthwiseSeparable( + num_channels=int(256 * scale), + num_filters1=256, + num_filters2=512, + num_groups=256, + stride=(2, 1), + scale=scale) + self.block_list.append(conv4_2) + + for _ in range(5): + conv5 = DepthwiseSeparable( + num_channels=int(512 * scale), + num_filters1=512, + num_filters2=512, + num_groups=512, + stride=1, + dw_size=5, + padding=2, + scale=scale, + use_se=False) + self.block_list.append(conv5) + + conv5_6 = DepthwiseSeparable( + num_channels=int(512 * scale), + num_filters1=512, + num_filters2=1024, + num_groups=512, + stride=(2, 1), + dw_size=5, + padding=2, + scale=scale, + use_se=True) + self.block_list.append(conv5_6) + + conv6 = DepthwiseSeparable( + num_channels=int(1024 * scale), + num_filters1=1024, + num_filters2=1024, + num_groups=1024, + stride=last_conv_stride, + dw_size=5, + padding=2, + use_se=True, + scale=scale) + self.block_list.append(conv6) + + self.block_list = nn.Sequential(*self.block_list) + if last_pool_type == 'avg': + self.pool = nn.AvgPool2D(kernel_size=2, stride=2, padding=0) + else: + self.pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + self.out_channels = int(1024 * scale) + + def forward(self, inputs): + y = self.conv1(inputs) + y = self.block_list(y) + y = self.pool(y) + return y + + +class SEModule(nn.Layer): + def __init__(self, channel, reduction=4): + super(SEModule, self).__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(), + bias_attr=ParamAttr()) + self.conv2 = Conv2D( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(), + bias_attr=ParamAttr()) + + def forward(self, inputs): + outputs = self.avg_pool(inputs) + outputs = self.conv1(outputs) + outputs = F.relu(outputs) + outputs = self.conv2(outputs) + outputs = hardsigmoid(outputs) + return paddle.multiply(x=inputs, y=outputs) diff --git a/backend/ppocr/modeling/backbones/rec_nrtr_mtb.py b/backend/ppocr/modeling/backbones/rec_nrtr_mtb.py new file mode 100644 index 0000000..22e02a6 --- /dev/null +++ b/backend/ppocr/modeling/backbones/rec_nrtr_mtb.py @@ -0,0 +1,48 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle import nn +import paddle + + +class MTB(nn.Layer): + def __init__(self, cnn_num, in_channels): + super(MTB, self).__init__() + self.block = nn.Sequential() + self.out_channels = in_channels + self.cnn_num = cnn_num + if self.cnn_num == 2: + for i in range(self.cnn_num): + self.block.add_sublayer( + 'conv_{}'.format(i), + nn.Conv2D( + in_channels=in_channels + if i == 0 else 32 * (2**(i - 1)), + out_channels=32 * (2**i), + kernel_size=3, + stride=2, + padding=1)) + self.block.add_sublayer('relu_{}'.format(i), nn.ReLU()) + self.block.add_sublayer('bn_{}'.format(i), + nn.BatchNorm2D(32 * (2**i))) + + def forward(self, images): + x = self.block(images) + if self.cnn_num == 2: + # (b, w, h, c) + x = paddle.transpose(x, [0, 3, 2, 1]) + x_shape = paddle.shape(x) + x = paddle.reshape( + x, [x_shape[0], x_shape[1], x_shape[2] * x_shape[3]]) + return x diff --git a/backend/ppocr/modeling/backbones/rec_resnet_31.py b/backend/ppocr/modeling/backbones/rec_resnet_31.py new file mode 100644 index 0000000..9651701 --- /dev/null +++ b/backend/ppocr/modeling/backbones/rec_resnet_31.py @@ -0,0 +1,210 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
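The rec_resnet_31.py file added here ports the ResNet-31 text-recognition backbone from MMOCR (see the reference URLs in its docstring): two stem convolutions, then four residual stages; the first two stages are preceded by 2x2 max-pools, the third by a (2, 1) pool that keeps width, and the last stage's pool is optional via last_stage_pool. A rough usage sketch, assuming the ppocr package is importable and using an arbitrary 48x160 input:

import paddle
from ppocr.modeling.backbones.rec_resnet_31 import ResNet31

backbone = ResNet31(in_channels=3, out_indices=None, last_stage_pool=False)
x = paddle.rand([1, 3, 48, 160])        # dummy text-line image
y = backbone(x)                         # a single tensor, since out_indices is None
print(y.shape, backbone.out_channels)   # 512-channel feature map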
+""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/layers/conv_layer.py +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/backbones/resnet31_ocr.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + +__all__ = ["ResNet31"] + + +def conv3x3(in_channel, out_channel, stride=1): + return nn.Conv2D( + in_channel, + out_channel, + kernel_size=3, + stride=stride, + padding=1, + bias_attr=False) + + +class BasicBlock(nn.Layer): + expansion = 1 + + def __init__(self, in_channels, channels, stride=1, downsample=False): + super().__init__() + self.conv1 = conv3x3(in_channels, channels, stride) + self.bn1 = nn.BatchNorm2D(channels) + self.relu = nn.ReLU() + self.conv2 = conv3x3(channels, channels) + self.bn2 = nn.BatchNorm2D(channels) + self.downsample = downsample + if downsample: + self.downsample = nn.Sequential( + nn.Conv2D( + in_channels, + channels * self.expansion, + 1, + stride, + bias_attr=False), + nn.BatchNorm2D(channels * self.expansion), ) + else: + self.downsample = nn.Sequential() + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNet31(nn.Layer): + ''' + Args: + in_channels (int): Number of channels of input image tensor. + layers (list[int]): List of BasicBlock number for each stage. + channels (list[int]): List of out_channels of Conv2d layer. + out_indices (None | Sequence[int]): Indices of output stages. + last_stage_pool (bool): If True, add `MaxPool2d` layer to last stage. 
+ ''' + + def __init__(self, + in_channels=3, + layers=[1, 2, 5, 3], + channels=[64, 128, 256, 256, 512, 512, 512], + out_indices=None, + last_stage_pool=False): + super(ResNet31, self).__init__() + assert isinstance(in_channels, int) + assert isinstance(last_stage_pool, bool) + + self.out_indices = out_indices + self.last_stage_pool = last_stage_pool + + # conv 1 (Conv Conv) + self.conv1_1 = nn.Conv2D( + in_channels, channels[0], kernel_size=3, stride=1, padding=1) + self.bn1_1 = nn.BatchNorm2D(channels[0]) + self.relu1_1 = nn.ReLU() + + self.conv1_2 = nn.Conv2D( + channels[0], channels[1], kernel_size=3, stride=1, padding=1) + self.bn1_2 = nn.BatchNorm2D(channels[1]) + self.relu1_2 = nn.ReLU() + + # conv 2 (Max-pooling, Residual block, Conv) + self.pool2 = nn.MaxPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self.block2 = self._make_layer(channels[1], channels[2], layers[0]) + self.conv2 = nn.Conv2D( + channels[2], channels[2], kernel_size=3, stride=1, padding=1) + self.bn2 = nn.BatchNorm2D(channels[2]) + self.relu2 = nn.ReLU() + + # conv 3 (Max-pooling, Residual block, Conv) + self.pool3 = nn.MaxPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self.block3 = self._make_layer(channels[2], channels[3], layers[1]) + self.conv3 = nn.Conv2D( + channels[3], channels[3], kernel_size=3, stride=1, padding=1) + self.bn3 = nn.BatchNorm2D(channels[3]) + self.relu3 = nn.ReLU() + + # conv 4 (Max-pooling, Residual block, Conv) + self.pool4 = nn.MaxPool2D( + kernel_size=(2, 1), stride=(2, 1), padding=0, ceil_mode=True) + self.block4 = self._make_layer(channels[3], channels[4], layers[2]) + self.conv4 = nn.Conv2D( + channels[4], channels[4], kernel_size=3, stride=1, padding=1) + self.bn4 = nn.BatchNorm2D(channels[4]) + self.relu4 = nn.ReLU() + + # conv 5 ((Max-pooling), Residual block, Conv) + self.pool5 = None + if self.last_stage_pool: + self.pool5 = nn.MaxPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self.block5 = self._make_layer(channels[4], channels[5], layers[3]) + self.conv5 = nn.Conv2D( + channels[5], channels[5], kernel_size=3, stride=1, padding=1) + self.bn5 = nn.BatchNorm2D(channels[5]) + self.relu5 = nn.ReLU() + + self.out_channels = channels[-1] + + def _make_layer(self, input_channels, output_channels, blocks): + layers = [] + for _ in range(blocks): + downsample = None + if input_channels != output_channels: + downsample = nn.Sequential( + nn.Conv2D( + input_channels, + output_channels, + kernel_size=1, + stride=1, + bias_attr=False), + nn.BatchNorm2D(output_channels), ) + + layers.append( + BasicBlock( + input_channels, output_channels, downsample=downsample)) + input_channels = output_channels + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1_1(x) + x = self.bn1_1(x) + x = self.relu1_1(x) + + x = self.conv1_2(x) + x = self.bn1_2(x) + x = self.relu1_2(x) + + outs = [] + for i in range(4): + layer_index = i + 2 + pool_layer = getattr(self, f'pool{layer_index}') + block_layer = getattr(self, f'block{layer_index}') + conv_layer = getattr(self, f'conv{layer_index}') + bn_layer = getattr(self, f'bn{layer_index}') + relu_layer = getattr(self, f'relu{layer_index}') + + if pool_layer is not None: + x = pool_layer(x) + x = block_layer(x) + x = conv_layer(x) + x = bn_layer(x) + x = relu_layer(x) + + outs.append(x) + + if self.out_indices is not None: + return tuple([outs[i] for i in self.out_indices]) + + return x diff --git a/backend/ppocr/modeling/backbones/rec_resnet_aster.py 
b/backend/ppocr/modeling/backbones/rec_resnet_aster.py new file mode 100644 index 0000000..6a2710d --- /dev/null +++ b/backend/ppocr/modeling/backbones/rec_resnet_aster.py @@ -0,0 +1,143 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/ayumiymk/aster.pytorch/blob/master/lib/models/resnet_aster.py +""" +import paddle +import paddle.nn as nn + +import sys +import math + + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2D( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=1, + bias_attr=False) + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return nn.Conv2D( + in_planes, out_planes, kernel_size=1, stride=stride, bias_attr=False) + + +def get_sinusoid_encoding(n_position, feat_dim, wave_length=10000): + # [n_position] + positions = paddle.arange(0, n_position) + # [feat_dim] + dim_range = paddle.arange(0, feat_dim) + dim_range = paddle.pow(wave_length, 2 * (dim_range // 2) / feat_dim) + # [n_position, feat_dim] + angles = paddle.unsqueeze( + positions, axis=1) / paddle.unsqueeze( + dim_range, axis=0) + angles = paddle.cast(angles, "float32") + angles[:, 0::2] = paddle.sin(angles[:, 0::2]) + angles[:, 1::2] = paddle.cos(angles[:, 1::2]) + return angles + + +class AsterBlock(nn.Layer): + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(AsterBlock, self).__init__() + self.conv1 = conv1x1(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2D(planes) + self.relu = nn.ReLU() + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2D(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + out += residual + out = self.relu(out) + return out + + +class ResNet_ASTER(nn.Layer): + """For aster or crnn""" + + def __init__(self, with_lstm=True, n_group=1, in_channels=3): + super(ResNet_ASTER, self).__init__() + self.with_lstm = with_lstm + self.n_group = n_group + + self.layer0 = nn.Sequential( + nn.Conv2D( + in_channels, + 32, + kernel_size=(3, 3), + stride=1, + padding=1, + bias_attr=False), + nn.BatchNorm2D(32), + nn.ReLU()) + + self.inplanes = 32 + self.layer1 = self._make_layer(32, 3, [2, 2]) # [16, 50] + self.layer2 = self._make_layer(64, 4, [2, 2]) # [8, 25] + self.layer3 = self._make_layer(128, 6, [2, 1]) # [4, 25] + self.layer4 = self._make_layer(256, 6, [2, 1]) # [2, 25] + self.layer5 = self._make_layer(512, 3, [2, 1]) # [1, 25] + + if with_lstm: + self.rnn = nn.LSTM(512, 256, direction="bidirect", num_layers=2) + self.out_channels = 2 * 256 + else: + self.out_channels = 512 + + def _make_layer(self, planes, blocks, stride): + downsample = None + if stride != [1, 1] or self.inplanes != planes: + downsample = nn.Sequential( + 
conv1x1(self.inplanes, planes, stride), nn.BatchNorm2D(planes)) + + layers = [] + layers.append(AsterBlock(self.inplanes, planes, stride, downsample)) + self.inplanes = planes + for _ in range(1, blocks): + layers.append(AsterBlock(self.inplanes, planes)) + return nn.Sequential(*layers) + + def forward(self, x): + x0 = self.layer0(x) + x1 = self.layer1(x0) + x2 = self.layer2(x1) + x3 = self.layer3(x2) + x4 = self.layer4(x3) + x5 = self.layer5(x4) + + cnn_feat = x5.squeeze(2) # [N, c, w] + cnn_feat = paddle.transpose(cnn_feat, perm=[0, 2, 1]) + if self.with_lstm: + rnn_feat, _ = self.rnn(cnn_feat) + return rnn_feat + else: + return cnn_feat diff --git a/backend/ppocr/modeling/backbones/rec_resnet_fpn.py b/backend/ppocr/modeling/backbones/rec_resnet_fpn.py new file mode 100644 index 0000000..a7e876a --- /dev/null +++ b/backend/ppocr/modeling/backbones/rec_resnet_fpn.py @@ -0,0 +1,307 @@ +#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import nn, ParamAttr +from paddle.nn import functional as F +import paddle.fluid as fluid +import paddle +import numpy as np + +__all__ = ["ResNetFPN"] + + +class ResNetFPN(nn.Layer): + def __init__(self, in_channels=1, layers=50, **kwargs): + super(ResNetFPN, self).__init__() + supported_layers = { + 18: { + 'depth': [2, 2, 2, 2], + 'block_class': BasicBlock + }, + 34: { + 'depth': [3, 4, 6, 3], + 'block_class': BasicBlock + }, + 50: { + 'depth': [3, 4, 6, 3], + 'block_class': BottleneckBlock + }, + 101: { + 'depth': [3, 4, 23, 3], + 'block_class': BottleneckBlock + }, + 152: { + 'depth': [3, 8, 36, 3], + 'block_class': BottleneckBlock + } + } + stride_list = [(2, 2), (2, 2), (1, 1), (1, 1)] + num_filters = [64, 128, 256, 512] + self.depth = supported_layers[layers]['depth'] + self.F = [] + self.conv = ConvBNLayer( + in_channels=in_channels, + out_channels=64, + kernel_size=7, + stride=2, + act="relu", + name="conv1") + self.block_list = [] + in_ch = 64 + if layers >= 50: + for block in range(len(self.depth)): + for i in range(self.depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + block_list = self.add_sublayer( + "bottleneckBlock_{}_{}".format(block, i), + BottleneckBlock( + in_channels=in_ch, + out_channels=num_filters[block], + stride=stride_list[block] if i == 0 else 1, + name=conv_name)) + in_ch = num_filters[block] * 4 + self.block_list.append(block_list) + self.F.append(block_list) + else: + for block in range(len(self.depth)): + for i in range(self.depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + if i == 0 and block != 0: + stride = (2, 1) + else: + stride = (1, 1) + basic_block = self.add_sublayer( + conv_name, + BasicBlock( + in_channels=in_ch, + 
out_channels=num_filters[block], + stride=stride_list[block] if i == 0 else 1, + is_first=block == i == 0, + name=conv_name)) + in_ch = basic_block.out_channels + self.block_list.append(basic_block) + out_ch_list = [in_ch // 4, in_ch // 2, in_ch] + self.base_block = [] + self.conv_trans = [] + self.bn_block = [] + for i in [-2, -3]: + in_channels = out_ch_list[i + 1] + out_ch_list[i] + + self.base_block.append( + self.add_sublayer( + "F_{}_base_block_0".format(i), + nn.Conv2D( + in_channels=in_channels, + out_channels=out_ch_list[i], + kernel_size=1, + weight_attr=ParamAttr(trainable=True), + bias_attr=ParamAttr(trainable=True)))) + self.base_block.append( + self.add_sublayer( + "F_{}_base_block_1".format(i), + nn.Conv2D( + in_channels=out_ch_list[i], + out_channels=out_ch_list[i], + kernel_size=3, + padding=1, + weight_attr=ParamAttr(trainable=True), + bias_attr=ParamAttr(trainable=True)))) + self.base_block.append( + self.add_sublayer( + "F_{}_base_block_2".format(i), + nn.BatchNorm( + num_channels=out_ch_list[i], + act="relu", + param_attr=ParamAttr(trainable=True), + bias_attr=ParamAttr(trainable=True)))) + self.base_block.append( + self.add_sublayer( + "F_{}_base_block_3".format(i), + nn.Conv2D( + in_channels=out_ch_list[i], + out_channels=512, + kernel_size=1, + bias_attr=ParamAttr(trainable=True), + weight_attr=ParamAttr(trainable=True)))) + self.out_channels = 512 + + def __call__(self, x): + x = self.conv(x) + fpn_list = [] + F = [] + for i in range(len(self.depth)): + fpn_list.append(np.sum(self.depth[:i + 1])) + + for i, block in enumerate(self.block_list): + x = block(x) + for number in fpn_list: + if i + 1 == number: + F.append(x) + base = F[-1] + + j = 0 + for i, block in enumerate(self.base_block): + if i % 3 == 0 and i < 6: + j = j + 1 + b, c, w, h = F[-j - 1].shape + if [w, h] == list(base.shape[2:]): + base = base + else: + base = self.conv_trans[j - 1](base) + base = self.bn_block[j - 1](base) + base = paddle.concat([base, F[-j - 1]], axis=1) + base = block(base) + return base + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=2 if stride == (1, 1) else kernel_size, + dilation=2 if stride == (1, 1) else 1, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + '.conv2d.output.1.w_0'), + bias_attr=False, ) + + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self.bn = nn.BatchNorm( + num_channels=out_channels, + act=act, + param_attr=ParamAttr(name=name + '.output.1.w_0'), + bias_attr=ParamAttr(name=name + '.output.1.b_0'), + moving_mean_name=bn_name + "_mean", + moving_variance_name=bn_name + "_variance") + + def __call__(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +class ShortCut(nn.Layer): + def __init__(self, in_channels, out_channels, stride, name, is_first=False): + super(ShortCut, self).__init__() + self.use_conv = True + + if in_channels != out_channels or stride != 1 or is_first == True: + if stride == (1, 1): + self.conv = ConvBNLayer( + in_channels, out_channels, 1, 1, name=name) + else: # stride==(2,2) + self.conv = ConvBNLayer( + in_channels, out_channels, 1, stride, name=name) + else: + self.use_conv = False + + def forward(self, x): + if self.use_conv: + x = self.conv(x) + return x + + +class BottleneckBlock(nn.Layer): + def 
__init__(self, in_channels, out_channels, stride, name): + super(BottleneckBlock, self).__init__() + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + name=name + "_branch2b") + + self.conv2 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c") + + self.short = ShortCut( + in_channels=in_channels, + out_channels=out_channels * 4, + stride=stride, + is_first=False, + name=name + "_branch1") + self.out_channels = out_channels * 4 + + def forward(self, x): + y = self.conv0(x) + y = self.conv1(y) + y = self.conv2(y) + y = y + self.short(x) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, in_channels, out_channels, stride, name, is_first): + super(BasicBlock, self).__init__() + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + act='relu', + stride=stride, + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None, + name=name + "_branch2b") + self.short = ShortCut( + in_channels=in_channels, + out_channels=out_channels, + stride=stride, + is_first=is_first, + name=name + "_branch1") + self.out_channels = out_channels + + def forward(self, x): + y = self.conv0(x) + y = self.conv1(y) + y = y + self.short(x) + return F.relu(y) diff --git a/backend/ppocr/modeling/backbones/rec_resnet_vd.py b/backend/ppocr/modeling/backbones/rec_resnet_vd.py new file mode 100644 index 0000000..0187deb --- /dev/null +++ b/backend/ppocr/modeling/backbones/rec_resnet_vd.py @@ -0,0 +1,286 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
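[Editor's note] The FPN blocks above (and the vd-variant blocks that follow) all use the same residual pattern: the main branch is added to a shortcut branch, and ShortCut only inserts a 1x1 ConvBN projection when the channel count or stride changes. A minimal NumPy sketch of that shape bookkeeping, with hypothetical sizes rather than the actual Paddle layers:

import numpy as np

def needs_projection(in_ch, out_ch, stride):
    # Mirrors the condition in ShortCut: project when channels or stride differ.
    return in_ch != out_ch or stride != (1, 1)

# Hypothetical bottleneck: in_ch=256, mid_ch=128 (so out_ch=512), stride (2, 2).
in_ch, mid_ch, stride = 256, 128, (2, 2)
out_ch = mid_ch * 4                       # BottleneckBlock expands channels by 4
x = np.random.rand(1, in_ch, 16, 64)      # N, C, H, W

# Main branch 1x1 -> 3x3 (strided) -> 1x1, reduced here to shape arithmetic only.
h, w = x.shape[2] // stride[0], x.shape[3] // stride[1]
main = np.zeros((1, out_ch, h, w))

# Shortcut branch: identity when shapes match, otherwise a strided 1x1 projection.
short = np.zeros((1, out_ch, h, w)) if needs_projection(in_ch, out_ch, stride) else x

y = np.maximum(main + short, 0.0)         # residual add followed by ReLU
print(y.shape)                            # (1, 512, 8, 32)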
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F + +__all__ = ["ResNet"] + + +class ConvBNLayer(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + name=None, ): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = nn.AvgPool2D( + kernel_size=stride, stride=stride, padding=0, ceil_mode=True) + self._conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1 if is_vd_mode else stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = nn.BatchNorm( + out_channels, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + if self.is_vd_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + name=name + "_branch2b") + self.conv2 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=stride, + is_vd_mode=not if_first and stride[0] != 1, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + is_vd_mode=not if_first and stride[0] != 1, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv1) + y = F.relu(y) + return y + + +class ResNet(nn.Layer): + def __init__(self, in_channels=3, layers=50, **kwargs): + super(ResNet, self).__init__() + + 
self.layers = layers + supported_layers = [18, 34, 50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, + 1024] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + self.conv1_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=32, + kernel_size=3, + stride=1, + act='relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer( + in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + act='relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer( + in_channels=32, + out_channels=64, + kernel_size=3, + stride=1, + act='relu', + name="conv1_3") + self.pool2d_max = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152, 200] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + + if i == 0 and block != 0: + stride = (2, 1) + else: + stride = (1, 1) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + out_channels=num_filters[block], + stride=stride, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + shortcut = True + self.block_list.append(bottleneck_block) + self.out_channels = num_filters[block] * 4 + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + if i == 0 and block != 0: + stride = (2, 1) + else: + stride = (1, 1) + + basic_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BasicBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block], + out_channels=num_filters[block], + stride=stride, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + shortcut = True + self.block_list.append(basic_block) + self.out_channels = num_filters[block] + self.out_pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + y = self.out_pool(y) + return y diff --git a/backend/ppocr/modeling/backbones/rec_svtrnet.py b/backend/ppocr/modeling/backbones/rec_svtrnet.py new file mode 100644 index 0000000..c57bf46 --- /dev/null +++ b/backend/ppocr/modeling/backbones/rec_svtrnet.py @@ -0,0 +1,584 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
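[Editor's note] One detail worth calling out in the rec_resnet_vd ResNet above: every stage after the first downsamples with stride (2, 1), so only the height shrinks while the width (the reading direction of the text line) stays dense until the final 2x2 max-pool. A quick sketch of the resulting feature-map sizes for a hypothetical 32x320 recognition input:

def conv_out(size, kernel, stride, padding):
    # Standard conv/pool output-size formula: floor((size + 2p - k) / s) + 1
    return (size + 2 * padding - kernel) // stride + 1

h, w = 32, 320                                     # hypothetical input (H, W)
h, w = conv_out(h, 3, 2, 1), conv_out(w, 3, 2, 1)  # pool2d_max, stride 2 -> 16 x 160
for _ in range(3):                                 # stages res3-res5 use stride (2, 1)
    h, w = conv_out(h, 3, 2, 1), conv_out(w, 3, 1, 1)
print(h, w)                                        # 2 160 -- height shrinks, width kept
h, w = conv_out(h, 2, 2, 0), conv_out(w, 2, 2, 0)  # final out_pool, kernel 2, stride 2
print(h, w)                                        # 1 80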
+ +from paddle import ParamAttr +from paddle.nn.initializer import KaimingNormal +import numpy as np +import paddle +import paddle.nn as nn +from paddle.nn.initializer import TruncatedNormal, Constant, Normal + +trunc_normal_ = TruncatedNormal(std=.02) +normal_ = Normal +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob) + shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=0, + bias_attr=False, + groups=1, + act=nn.GELU): + super().__init__() + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform()), + bias_attr=bias_attr) + self.norm = nn.BatchNorm2D(out_channels) + self.act = act() + + def forward(self, inputs): + out = self.conv(inputs) + out = self.norm(out) + out = self.act(out) + return out + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class ConvMixer(nn.Layer): + def __init__( + self, + dim, + num_heads=8, + HW=[8, 25], + local_k=[3, 3], ): + super().__init__() + self.HW = HW + self.dim = dim + self.local_mixer = nn.Conv2D( + dim, + dim, + local_k, + 1, [local_k[0] // 2, local_k[1] // 2], + groups=num_heads, + weight_attr=ParamAttr(initializer=KaimingNormal())) + + def forward(self, x): + h = self.HW[0] + w = self.HW[1] + x = x.transpose([0, 2, 1]).reshape([0, self.dim, h, w]) + x = self.local_mixer(x) + x = x.flatten(2).transpose([0, 2, 1]) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + mixer='Global', + HW=[8, 25], + local_k=[7, 11], + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + 
self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.HW = HW + if HW is not None: + H = HW[0] + W = HW[1] + self.N = H * W + self.C = dim + if mixer == 'Local' and HW is not None: + hk = local_k[0] + wk = local_k[1] + mask = paddle.ones([H * W, H + hk - 1, W + wk - 1], dtype='float32') + for h in range(0, H): + for w in range(0, W): + mask[h * W + w, h:h + hk, w:w + wk] = 0. + mask_paddle = mask[:, hk // 2:H + hk // 2, wk // 2:W + wk // + 2].flatten(1) + mask_inf = paddle.full([H * W, H * W], '-inf', dtype='float32') + mask = paddle.where(mask_paddle < 1, mask_paddle, mask_inf) + self.mask = mask.unsqueeze([0, 1]) + self.mixer = mixer + + def forward(self, x): + if self.HW is not None: + N = self.N + C = self.C + else: + _, N, C = x.shape + qkv = self.qkv(x).reshape((0, N, 3, self.num_heads, C // + self.num_heads)).transpose((2, 0, 3, 1, 4)) + q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) + if self.mixer == 'Local': + attn += self.mask + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((0, N, C)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mixer='Global', + local_mixer=[7, 11], + HW=[8, 25], + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer='nn.LayerNorm', + epsilon=1e-6, + prenorm=True): + super().__init__() + if isinstance(norm_layer, str): + self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) + else: + self.norm1 = norm_layer(dim) + if mixer == 'Global' or mixer == 'Local': + self.mixer = Attention( + dim, + num_heads=num_heads, + mixer=mixer, + HW=HW, + local_k=local_mixer, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + elif mixer == 'Conv': + self.mixer = ConvMixer( + dim, num_heads=num_heads, HW=HW, local_k=local_mixer) + else: + raise TypeError("The mixer must be one of [Global, Local, Conv]") + + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + if isinstance(norm_layer, str): + self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) + else: + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp_ratio = mlp_ratio + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + self.prenorm = prenorm + + def forward(self, x): + if self.prenorm: + x = self.norm1(x + self.drop_path(self.mixer(x))) + x = self.norm2(x + self.drop_path(self.mlp(x))) + else: + x = x + self.drop_path(self.mixer(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + + def __init__(self, + img_size=[32, 100], + in_channels=3, + embed_dim=768, + sub_num=2): + super().__init__() + num_patches = (img_size[1] // (2 ** sub_num)) * \ + (img_size[0] // (2 ** sub_num)) + self.img_size = img_size + self.num_patches = num_patches + self.embed_dim = embed_dim + self.norm = None + if sub_num == 2: + self.proj = nn.Sequential( + ConvBNLayer( + in_channels=in_channels, + out_channels=embed_dim // 2, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, + bias_attr=None), + ConvBNLayer( + in_channels=embed_dim // 2, + out_channels=embed_dim, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, + bias_attr=None)) + if sub_num == 3: + self.proj = nn.Sequential( + ConvBNLayer( + in_channels=in_channels, + out_channels=embed_dim // 4, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, + bias_attr=None), + ConvBNLayer( + in_channels=embed_dim // 4, + out_channels=embed_dim // 2, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, + bias_attr=None), + ConvBNLayer( + in_channels=embed_dim // 2, + out_channels=embed_dim, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, + bias_attr=None)) + + def forward(self, x): + B, C, H, W = x.shape + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
+ x = self.proj(x).flatten(2).transpose((0, 2, 1)) + return x + + +class SubSample(nn.Layer): + def __init__(self, + in_channels, + out_channels, + types='Pool', + stride=[2, 1], + sub_norm='nn.LayerNorm', + act=None): + super().__init__() + self.types = types + if types == 'Pool': + self.avgpool = nn.AvgPool2D( + kernel_size=[3, 5], stride=stride, padding=[1, 2]) + self.maxpool = nn.MaxPool2D( + kernel_size=[3, 5], stride=stride, padding=[1, 2]) + self.proj = nn.Linear(in_channels, out_channels) + else: + self.conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=1, + weight_attr=ParamAttr(initializer=KaimingNormal())) + self.norm = eval(sub_norm)(out_channels) + if act is not None: + self.act = act() + else: + self.act = None + + def forward(self, x): + + if self.types == 'Pool': + x1 = self.avgpool(x) + x2 = self.maxpool(x) + x = (x1 + x2) * 0.5 + out = self.proj(x.flatten(2).transpose((0, 2, 1))) + else: + x = self.conv(x) + out = x.flatten(2).transpose((0, 2, 1)) + out = self.norm(out) + if self.act is not None: + out = self.act(out) + + return out + + +class SVTRNet(nn.Layer): + def __init__( + self, + img_size=[32, 100], + in_channels=3, + embed_dim=[64, 128, 256], + depth=[3, 6, 3], + num_heads=[2, 4, 8], + mixer=['Local'] * 6 + ['Global'] * + 6, # Local atten, Global atten, Conv + local_mixer=[[7, 11], [7, 11], [7, 11]], + patch_merging='Conv', # Conv, Pool, None + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + last_drop=0.1, + attn_drop_rate=0., + drop_path_rate=0.1, + norm_layer='nn.LayerNorm', + sub_norm='nn.LayerNorm', + epsilon=1e-6, + out_channels=192, + out_char_num=25, + block_unit='Block', + act='nn.GELU', + last_stage=True, + sub_num=2, + prenorm=True, + use_lenhead=False, + **kwargs): + super().__init__() + self.img_size = img_size + self.embed_dim = embed_dim + self.out_channels = out_channels + self.prenorm = prenorm + patch_merging = None if patch_merging != 'Conv' and patch_merging != 'Pool' else patch_merging + self.patch_embed = PatchEmbed( + img_size=img_size, + in_channels=in_channels, + embed_dim=embed_dim[0], + sub_num=sub_num) + num_patches = self.patch_embed.num_patches + self.HW = [img_size[0] // (2**sub_num), img_size[1] // (2**sub_num)] + self.pos_embed = self.create_parameter( + shape=[1, num_patches, embed_dim[0]], default_initializer=zeros_) + self.add_parameter("pos_embed", self.pos_embed) + self.pos_drop = nn.Dropout(p=drop_rate) + Block_unit = eval(block_unit) + + dpr = np.linspace(0, drop_path_rate, sum(depth)) + self.blocks1 = nn.LayerList([ + Block_unit( + dim=embed_dim[0], + num_heads=num_heads[0], + mixer=mixer[0:depth[0]][i], + HW=self.HW, + local_mixer=local_mixer[0], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer=eval(act), + attn_drop=attn_drop_rate, + drop_path=dpr[0:depth[0]][i], + norm_layer=norm_layer, + epsilon=epsilon, + prenorm=prenorm) for i in range(depth[0]) + ]) + if patch_merging is not None: + self.sub_sample1 = SubSample( + embed_dim[0], + embed_dim[1], + sub_norm=sub_norm, + stride=[2, 1], + types=patch_merging) + HW = [self.HW[0] // 2, self.HW[1]] + else: + HW = self.HW + self.patch_merging = patch_merging + self.blocks2 = nn.LayerList([ + Block_unit( + dim=embed_dim[1], + num_heads=num_heads[1], + mixer=mixer[depth[0]:depth[0] + depth[1]][i], + HW=HW, + local_mixer=local_mixer[1], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer=eval(act), + attn_drop=attn_drop_rate, + 
drop_path=dpr[depth[0]:depth[0] + depth[1]][i], + norm_layer=norm_layer, + epsilon=epsilon, + prenorm=prenorm) for i in range(depth[1]) + ]) + if patch_merging is not None: + self.sub_sample2 = SubSample( + embed_dim[1], + embed_dim[2], + sub_norm=sub_norm, + stride=[2, 1], + types=patch_merging) + HW = [self.HW[0] // 4, self.HW[1]] + else: + HW = self.HW + self.blocks3 = nn.LayerList([ + Block_unit( + dim=embed_dim[2], + num_heads=num_heads[2], + mixer=mixer[depth[0] + depth[1]:][i], + HW=HW, + local_mixer=local_mixer[2], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer=eval(act), + attn_drop=attn_drop_rate, + drop_path=dpr[depth[0] + depth[1]:][i], + norm_layer=norm_layer, + epsilon=epsilon, + prenorm=prenorm) for i in range(depth[2]) + ]) + self.last_stage = last_stage + if last_stage: + self.avg_pool = nn.AdaptiveAvgPool2D([1, out_char_num]) + self.last_conv = nn.Conv2D( + in_channels=embed_dim[2], + out_channels=self.out_channels, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False) + self.hardswish = nn.Hardswish() + self.dropout = nn.Dropout(p=last_drop, mode="downscale_in_infer") + if not prenorm: + self.norm = eval(norm_layer)(embed_dim[-1], epsilon=epsilon) + self.use_lenhead = use_lenhead + if use_lenhead: + self.len_conv = nn.Linear(embed_dim[2], self.out_channels) + self.hardswish_len = nn.Hardswish() + self.dropout_len = nn.Dropout( + p=last_drop, mode="downscale_in_infer") + + trunc_normal_(self.pos_embed) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward_features(self, x): + x = self.patch_embed(x) + x = x + self.pos_embed + x = self.pos_drop(x) + for blk in self.blocks1: + x = blk(x) + if self.patch_merging is not None: + x = self.sub_sample1( + x.transpose([0, 2, 1]).reshape( + [0, self.embed_dim[0], self.HW[0], self.HW[1]])) + for blk in self.blocks2: + x = blk(x) + if self.patch_merging is not None: + x = self.sub_sample2( + x.transpose([0, 2, 1]).reshape( + [0, self.embed_dim[1], self.HW[0] // 2, self.HW[1]])) + for blk in self.blocks3: + x = blk(x) + if not self.prenorm: + x = self.norm(x) + return x + + def forward(self, x): + x = self.forward_features(x) + if self.use_lenhead: + len_x = self.len_conv(x.mean(1)) + len_x = self.dropout_len(self.hardswish_len(len_x)) + if self.last_stage: + if self.patch_merging is not None: + h = self.HW[0] // 4 + else: + h = self.HW[0] + x = self.avg_pool( + x.transpose([0, 2, 1]).reshape( + [0, self.embed_dim[2], h, self.HW[1]])) + x = self.last_conv(x) + x = self.hardswish(x) + x = self.dropout(x) + if self.use_lenhead: + return x, len_x + return x diff --git a/backend/ppocr/modeling/backbones/vqa_layoutlm.py b/backend/ppocr/modeling/backbones/vqa_layoutlm.py new file mode 100644 index 0000000..ede5b7a --- /dev/null +++ b/backend/ppocr/modeling/backbones/vqa_layoutlm.py @@ -0,0 +1,172 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
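[Editor's note] The 'Local' mixer in the SVTR Attention class above restricts each token to an hk x wk spatial neighbourhood by precomputing an additive mask: allowed positions get 0 and everything else gets -inf before the softmax. A NumPy sketch of the same construction on a hypothetical 4x6 token grid with a 3x3 window:

import numpy as np

H, W, hk, wk = 4, 6, 3, 3
mask = np.ones((H * W, H + hk - 1, W + wk - 1), dtype=np.float32)
for h in range(H):
    for w in range(W):
        mask[h * W + w, h:h + hk, w:w + wk] = 0.0    # open an hk x wk window per token
# Crop the padding ring and flatten to a [H*W, H*W] pairwise mask.
local = mask[:, hk // 2:H + hk // 2, wk // 2:W + wk // 2].reshape(H * W, H * W)
attn_mask = np.where(local < 1, 0.0, -np.inf)        # 0 where attention is allowed
# An interior token (row 1, col 2) sees a full 3x3 neighbourhood of 9 positions.
print(int(np.isfinite(attn_mask[1 * W + 2]).sum()))  # 9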
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +from paddle import nn + +from paddlenlp.transformers import LayoutXLMModel, LayoutXLMForTokenClassification, LayoutXLMForRelationExtraction +from paddlenlp.transformers import LayoutLMModel, LayoutLMForTokenClassification +from paddlenlp.transformers import LayoutLMv2Model, LayoutLMv2ForTokenClassification, LayoutLMv2ForRelationExtraction + +__all__ = ["LayoutXLMForSer", 'LayoutLMForSer'] + +pretrained_model_dict = { + LayoutXLMModel: 'layoutxlm-base-uncased', + LayoutLMModel: 'layoutlm-base-uncased', + LayoutLMv2Model: 'layoutlmv2-base-uncased' +} + + +class NLPBaseModel(nn.Layer): + def __init__(self, + base_model_class, + model_class, + type='ser', + pretrained=True, + checkpoints=None, + **kwargs): + super(NLPBaseModel, self).__init__() + if checkpoints is not None: + self.model = model_class.from_pretrained(checkpoints) + else: + pretrained_model_name = pretrained_model_dict[base_model_class] + if pretrained: + base_model = base_model_class.from_pretrained( + pretrained_model_name) + else: + base_model = base_model_class( + **base_model_class.pretrained_init_configuration[ + pretrained_model_name]) + if type == 'ser': + self.model = model_class( + base_model, num_classes=kwargs['num_classes'], dropout=None) + else: + self.model = model_class(base_model, dropout=None) + self.out_channels = 1 + + +class LayoutLMForSer(NLPBaseModel): + def __init__(self, num_classes, pretrained=True, checkpoints=None, + **kwargs): + super(LayoutLMForSer, self).__init__( + LayoutLMModel, + LayoutLMForTokenClassification, + 'ser', + pretrained, + checkpoints, + num_classes=num_classes) + + def forward(self, x): + x = self.model( + input_ids=x[0], + bbox=x[2], + attention_mask=x[4], + token_type_ids=x[5], + position_ids=None, + output_hidden_states=False) + return x + + +class LayoutLMv2ForSer(NLPBaseModel): + def __init__(self, num_classes, pretrained=True, checkpoints=None, + **kwargs): + super(LayoutLMv2ForSer, self).__init__( + LayoutLMv2Model, + LayoutLMv2ForTokenClassification, + 'ser', + pretrained, + checkpoints, + num_classes=num_classes) + + def forward(self, x): + x = self.model( + input_ids=x[0], + bbox=x[2], + image=x[3], + attention_mask=x[4], + token_type_ids=x[5], + position_ids=None, + head_mask=None, + labels=None) + return x[0] + + +class LayoutXLMForSer(NLPBaseModel): + def __init__(self, num_classes, pretrained=True, checkpoints=None, + **kwargs): + super(LayoutXLMForSer, self).__init__( + LayoutXLMModel, + LayoutXLMForTokenClassification, + 'ser', + pretrained, + checkpoints, + num_classes=num_classes) + + def forward(self, x): + x = self.model( + input_ids=x[0], + bbox=x[2], + image=x[3], + attention_mask=x[4], + token_type_ids=x[5], + position_ids=None, + head_mask=None, + labels=None) + return x[0] + + +class LayoutLMv2ForRe(NLPBaseModel): + def __init__(self, pretrained=True, checkpoints=None, **kwargs): + super(LayoutLMv2ForRe, self).__init__(LayoutLMv2Model, + LayoutLMv2ForRelationExtraction, + 're', pretrained, checkpoints) + + def forward(self, 
x): + x = self.model( + input_ids=x[0], + bbox=x[1], + labels=None, + image=x[2], + attention_mask=x[3], + token_type_ids=x[4], + position_ids=None, + head_mask=None, + entities=x[5], + relations=x[6]) + return x + + +class LayoutXLMForRe(NLPBaseModel): + def __init__(self, pretrained=True, checkpoints=None, **kwargs): + super(LayoutXLMForRe, self).__init__(LayoutXLMModel, + LayoutXLMForRelationExtraction, + 're', pretrained, checkpoints) + + def forward(self, x): + x = self.model( + input_ids=x[0], + bbox=x[1], + labels=None, + image=x[2], + attention_mask=x[3], + token_type_ids=x[4], + position_ids=None, + head_mask=None, + entities=x[5], + relations=x[6]) + return x diff --git a/backend/ppocr/modeling/heads/__init__.py b/backend/ppocr/modeling/heads/__init__.py new file mode 100755 index 0000000..1670ea3 --- /dev/null +++ b/backend/ppocr/modeling/heads/__init__.py @@ -0,0 +1,58 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ['build_head'] + + +def build_head(config): + # det head + from .det_db_head import DBHead + from .det_east_head import EASTHead + from .det_sast_head import SASTHead + from .det_pse_head import PSEHead + from .det_fce_head import FCEHead + from .e2e_pg_head import PGHead + + # rec head + from .rec_ctc_head import CTCHead + from .rec_att_head import AttentionHead + from .rec_srn_head import SRNHead + from .rec_nrtr_head import Transformer + from .rec_sar_head import SARHead + from .rec_aster_head import AsterHead + from .rec_pren_head import PRENHead + from .rec_multi_head import MultiHead + + # cls head + from .cls_head import ClsHead + + #kie head + from .kie_sdmgr_head import SDMGRHead + + from .table_att_head import TableAttentionHead + + support_dict = [ + 'DBHead', 'PSEHead', 'FCEHead', 'EASTHead', 'SASTHead', 'CTCHead', + 'ClsHead', 'AttentionHead', 'SRNHead', 'PGHead', 'Transformer', + 'TableAttentionHead', 'SARHead', 'AsterHead', 'SDMGRHead', 'PRENHead', + 'MultiHead' + ] + + #table head + + module_name = config.pop('name') + assert module_name in support_dict, Exception('head only support {}'.format( + support_dict)) + module_class = eval(module_name)(**config) + return module_class diff --git a/backend/ppocr/modeling/heads/cls_head.py b/backend/ppocr/modeling/heads/cls_head.py new file mode 100644 index 0000000..91bfa61 --- /dev/null +++ b/backend/ppocr/modeling/heads/cls_head.py @@ -0,0 +1,52 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
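[Editor's note] The build_head factory in heads/__init__.py above is a small config-driven dispatcher: it pops the 'name' key from the config, checks it against support_dict, and instantiates the matching class with the remaining keys as keyword arguments (the real code resolves the class object with eval). A self-contained sketch of that pattern, using a hypothetical dummy head and an explicit registry instead of eval:

class DummyCTCHead:
    # Hypothetical stand-in for a real head class such as CTCHead.
    def __init__(self, in_channels, out_channels, **kwargs):
        self.in_channels, self.out_channels = in_channels, out_channels

support_dict = {'CTCHead': DummyCTCHead}

def build_head(config):
    config = dict(config)                        # don't mutate the caller's config
    module_name = config.pop('name')             # e.g. 'CTCHead', 'DBHead', ...
    assert module_name in support_dict, \
        'head only support {}'.format(list(support_dict))
    return support_dict[module_name](**config)   # remaining keys become kwargs

head = build_head({'name': 'CTCHead', 'in_channels': 192, 'out_channels': 6625})
print(type(head).__name__, head.out_channels)    # DummyCTCHead 6625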
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn, ParamAttr +import paddle.nn.functional as F + + +class ClsHead(nn.Layer): + """ + Class orientation + + Args: + + params(dict): super parameters for build Class network + """ + + def __init__(self, in_channels, class_dim, **kwargs): + super(ClsHead, self).__init__() + self.pool = nn.AdaptiveAvgPool2D(1) + stdv = 1.0 / math.sqrt(in_channels * 1.0) + self.fc = nn.Linear( + in_channels, + class_dim, + weight_attr=ParamAttr( + name="fc_0.w_0", + initializer=nn.initializer.Uniform(-stdv, stdv)), + bias_attr=ParamAttr(name="fc_0.b_0"), ) + + def forward(self, x, targets=None): + x = self.pool(x) + x = paddle.reshape(x, shape=[x.shape[0], x.shape[1]]) + x = self.fc(x) + if not self.training: + x = F.softmax(x, axis=1) + return x diff --git a/backend/ppocr/modeling/heads/det_db_head.py b/backend/ppocr/modeling/heads/det_db_head.py new file mode 100644 index 0000000..a686ae5 --- /dev/null +++ b/backend/ppocr/modeling/heads/det_db_head.py @@ -0,0 +1,118 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
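[Editor's note] ClsHead above is the direction classifier: global average pooling collapses the feature map to one vector per image, a single linear layer maps it to class_dim logits, and softmax is applied only at inference. The shape flow, sketched with NumPy and hypothetical sizes:

import numpy as np

def softmax(z, axis=-1):
    z = z - z.max(axis=axis, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

N, C, class_dim = 2, 200, 2                       # hypothetical: 2 images, 200 channels, 2 angles
feat = np.random.rand(N, C, 1, 25).astype(np.float32)

pooled = feat.mean(axis=(2, 3))                   # AdaptiveAvgPool2D(1) + reshape -> [N, C]
fc_w = np.random.randn(C, class_dim).astype(np.float32) * 0.01
fc_b = np.zeros(class_dim, dtype=np.float32)
logits = pooled @ fc_w + fc_b                     # nn.Linear(in_channels, class_dim)
probs = softmax(logits)                           # only applied when not training
print(probs.shape, probs.sum(axis=1))             # (2, 2), each row sums to 1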
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr + + +def get_bias_attr(k): + stdv = 1.0 / math.sqrt(k * 1.0) + initializer = paddle.nn.initializer.Uniform(-stdv, stdv) + bias_attr = ParamAttr(initializer=initializer) + return bias_attr + + +class Head(nn.Layer): + def __init__(self, in_channels, name_list, kernel_list=[3, 2, 2], **kwargs): + super(Head, self).__init__() + + self.conv1 = nn.Conv2D( + in_channels=in_channels, + out_channels=in_channels // 4, + kernel_size=kernel_list[0], + padding=int(kernel_list[0] // 2), + weight_attr=ParamAttr(), + bias_attr=False) + self.conv_bn1 = nn.BatchNorm( + num_channels=in_channels // 4, + param_attr=ParamAttr( + initializer=paddle.nn.initializer.Constant(value=1.0)), + bias_attr=ParamAttr( + initializer=paddle.nn.initializer.Constant(value=1e-4)), + act='relu') + self.conv2 = nn.Conv2DTranspose( + in_channels=in_channels // 4, + out_channels=in_channels // 4, + kernel_size=kernel_list[1], + stride=2, + weight_attr=ParamAttr( + initializer=paddle.nn.initializer.KaimingUniform()), + bias_attr=get_bias_attr(in_channels // 4)) + self.conv_bn2 = nn.BatchNorm( + num_channels=in_channels // 4, + param_attr=ParamAttr( + initializer=paddle.nn.initializer.Constant(value=1.0)), + bias_attr=ParamAttr( + initializer=paddle.nn.initializer.Constant(value=1e-4)), + act="relu") + self.conv3 = nn.Conv2DTranspose( + in_channels=in_channels // 4, + out_channels=1, + kernel_size=kernel_list[2], + stride=2, + weight_attr=ParamAttr( + initializer=paddle.nn.initializer.KaimingUniform()), + bias_attr=get_bias_attr(in_channels // 4), ) + + def forward(self, x): + x = self.conv1(x) + x = self.conv_bn1(x) + x = self.conv2(x) + x = self.conv_bn2(x) + x = self.conv3(x) + x = F.sigmoid(x) + return x + + +class DBHead(nn.Layer): + """ + Differentiable Binarization (DB) for text detection: + see https://arxiv.org/abs/1911.08947 + args: + params(dict): super parameters for build DB network + """ + + def __init__(self, in_channels, k=50, **kwargs): + super(DBHead, self).__init__() + self.k = k + binarize_name_list = [ + 'conv2d_56', 'batch_norm_47', 'conv2d_transpose_0', 'batch_norm_48', + 'conv2d_transpose_1', 'binarize' + ] + thresh_name_list = [ + 'conv2d_57', 'batch_norm_49', 'conv2d_transpose_2', 'batch_norm_50', + 'conv2d_transpose_3', 'thresh' + ] + self.binarize = Head(in_channels, binarize_name_list, **kwargs) + self.thresh = Head(in_channels, thresh_name_list, **kwargs) + + def step_function(self, x, y): + return paddle.reciprocal(1 + paddle.exp(-self.k * (x - y))) + + def forward(self, x, targets=None): + shrink_maps = self.binarize(x) + if not self.training: + return {'maps': shrink_maps} + + threshold_maps = self.thresh(x) + binary_maps = self.step_function(shrink_maps, threshold_maps) + y = paddle.concat([shrink_maps, threshold_maps, binary_maps], axis=1) + return {'maps': y} diff --git a/backend/ppocr/modeling/heads/det_east_head.py b/backend/ppocr/modeling/heads/det_east_head.py new file mode 100644 index 0000000..004eb5d --- /dev/null +++ b/backend/ppocr/modeling/heads/det_east_head.py @@ -0,0 +1,121 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
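[Editor's note] The step_function in DBHead (det_db_head.py, just above) is the differentiable binarization from the DB paper: instead of the hard threshold P > T it computes 1 / (1 + exp(-k(P - T))) with a steep k = 50, so the approximate binary map keeps useful gradients during training. A quick numeric check with hypothetical map values:

import numpy as np

def step_function(shrink, thresh, k=50):
    # reciprocal(1 + exp(-k * (shrink - thresh))), as in DBHead.step_function
    return 1.0 / (1.0 + np.exp(-k * (shrink - thresh)))

shrink_map = np.array([0.10, 0.48, 0.52, 0.90])   # hypothetical probability values
thresh_map = np.full_like(shrink_map, 0.50)       # hypothetical threshold values
print(step_function(shrink_map, thresh_map).round(3))
# [0.    0.269 0.731 1.   ] -- a soft, differentiable version of (P > T)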
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups=1, + if_act=True, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.if_act = if_act + self.act = act + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=ParamAttr(name=name + '_weights'), + bias_attr=False) + + self.bn = nn.BatchNorm( + num_channels=out_channels, + act=act, + param_attr=ParamAttr(name="bn_" + name + "_scale"), + bias_attr=ParamAttr(name="bn_" + name + "_offset"), + moving_mean_name="bn_" + name + "_mean", + moving_variance_name="bn_" + name + "_variance") + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +class EASTHead(nn.Layer): + """ + """ + def __init__(self, in_channels, model_name, **kwargs): + super(EASTHead, self).__init__() + self.model_name = model_name + if self.model_name == "large": + num_outputs = [128, 64, 1, 8] + else: + num_outputs = [64, 32, 1, 8] + + self.det_conv1 = ConvBNLayer( + in_channels=in_channels, + out_channels=num_outputs[0], + kernel_size=3, + stride=1, + padding=1, + if_act=True, + act='relu', + name="det_head1") + self.det_conv2 = ConvBNLayer( + in_channels=num_outputs[0], + out_channels=num_outputs[1], + kernel_size=3, + stride=1, + padding=1, + if_act=True, + act='relu', + name="det_head2") + self.score_conv = ConvBNLayer( + in_channels=num_outputs[1], + out_channels=num_outputs[2], + kernel_size=1, + stride=1, + padding=0, + if_act=False, + act=None, + name="f_score") + self.geo_conv = ConvBNLayer( + in_channels=num_outputs[1], + out_channels=num_outputs[3], + kernel_size=1, + stride=1, + padding=0, + if_act=False, + act=None, + name="f_geo") + + def forward(self, x, targets=None): + f_det = self.det_conv1(x) + f_det = self.det_conv2(f_det) + f_score = self.score_conv(f_det) + f_score = F.sigmoid(f_score) + f_geo = self.geo_conv(f_det) + f_geo = (F.sigmoid(f_geo) - 0.5) * 2 * 800 + + pred = {'f_score': f_score, 'f_geo': f_geo} + return pred diff --git a/backend/ppocr/modeling/heads/det_fce_head.py b/backend/ppocr/modeling/heads/det_fce_head.py new file mode 100644 index 0000000..9503989 --- /dev/null +++ b/backend/ppocr/modeling/heads/det_fce_head.py @@ -0,0 +1,99 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
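[Editor's note] The one nonstandard line in EASTHead.forward above is the geometry scaling: f_geo = (sigmoid(x) - 0.5) * 2 * 800 squashes arbitrary logits into pixel offsets in roughly [-800, 800], the maximum box extent the head can express. Checked numerically:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

logits = np.array([-5.0, 0.0, 5.0])
f_geo = (sigmoid(logits) - 0.5) * 2 * 800   # same scaling as in EASTHead.forward
print(f_geo.round(1))                       # [-789.3    0.   789.3]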
+# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/dense_heads/fce_head.py +""" + +from paddle import nn +from paddle import ParamAttr +import paddle.nn.functional as F +from paddle.nn.initializer import Normal +import paddle +from functools import partial + + +def multi_apply(func, *args, **kwargs): + pfunc = partial(func, **kwargs) if kwargs else func + map_results = map(pfunc, *args) + return tuple(map(list, zip(*map_results))) + + +class FCEHead(nn.Layer): + """The class for implementing FCENet head. + FCENet(CVPR2021): Fourier Contour Embedding for Arbitrary-shaped Text + Detection. + + [https://arxiv.org/abs/2104.10442] + + Args: + in_channels (int): The number of input channels. + scales (list[int]) : The scale of each layer. + fourier_degree (int) : The maximum Fourier transform degree k. + """ + + def __init__(self, in_channels, fourier_degree=5): + super().__init__() + assert isinstance(in_channels, int) + + self.downsample_ratio = 1.0 + self.in_channels = in_channels + self.fourier_degree = fourier_degree + self.out_channels_cls = 4 + self.out_channels_reg = (2 * self.fourier_degree + 1) * 2 + + self.out_conv_cls = nn.Conv2D( + in_channels=self.in_channels, + out_channels=self.out_channels_cls, + kernel_size=3, + stride=1, + padding=1, + groups=1, + weight_attr=ParamAttr( + name='cls_weights', + initializer=Normal( + mean=0., std=0.01)), + bias_attr=True) + self.out_conv_reg = nn.Conv2D( + in_channels=self.in_channels, + out_channels=self.out_channels_reg, + kernel_size=3, + stride=1, + padding=1, + groups=1, + weight_attr=ParamAttr( + name='reg_weights', + initializer=Normal( + mean=0., std=0.01)), + bias_attr=True) + + def forward(self, feats, targets=None): + cls_res, reg_res = multi_apply(self.forward_single, feats) + level_num = len(cls_res) + outs = {} + if not self.training: + for i in range(level_num): + tr_pred = F.softmax(cls_res[i][:, 0:2, :, :], axis=1) + tcl_pred = F.softmax(cls_res[i][:, 2:, :, :], axis=1) + outs['level_{}'.format(i)] = paddle.concat( + [tr_pred, tcl_pred, reg_res[i]], axis=1) + else: + preds = [[cls_res[i], reg_res[i]] for i in range(level_num)] + outs['levels'] = preds + return outs + + def forward_single(self, x): + cls_predict = self.out_conv_cls(x) + reg_predict = self.out_conv_reg(x) + return cls_predict, reg_predict diff --git a/backend/ppocr/modeling/heads/det_pse_head.py b/backend/ppocr/modeling/heads/det_pse_head.py new file mode 100644 index 0000000..32a5b48 --- /dev/null +++ b/backend/ppocr/modeling/heads/det_pse_head.py @@ -0,0 +1,37 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
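[Editor's note] FCEHead above regresses, per location, the real and imaginary parts of 2k + 1 Fourier coefficients (hence out_channels_reg = (2 * fourier_degree + 1) * 2); the text contour is recovered by evaluating the inverse Fourier series in post-processing. A small NumPy sketch with hypothetical coefficients (a centre term plus a single n = 1 term, i.e. a circle):

import numpy as np

k = 5                                       # fourier_degree, as in FCEHead
n = np.arange(-k, k + 1)                    # the 2k + 1 frequencies

# Hypothetical coefficients c_n: c_0 is the contour centre, the n = +1 term
# below draws a circle of radius 20 around the point (100, 50).
c = np.zeros(2 * k + 1, dtype=complex)
c[k] = 100 + 50j                            # index k corresponds to n = 0
c[k + 1] = 20 + 0j

t = np.linspace(0, 1, 8, endpoint=False)    # sample 8 points along the contour
points = (c[None, :] * np.exp(2j * np.pi * n[None, :] * t[:, None])).sum(axis=1)
print(np.round(points[:3], 1))              # approx [120.+50.j 114.1+64.1j 100.+70.j]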
+""" +This code is refer from: +https://github.com/whai362/PSENet/blob/python3/models/head/psenet_head.py +""" + +from paddle import nn + + +class PSEHead(nn.Layer): + def __init__(self, in_channels, hidden_dim=256, out_channels=7, **kwargs): + super(PSEHead, self).__init__() + self.conv1 = nn.Conv2D( + in_channels, hidden_dim, kernel_size=3, stride=1, padding=1) + self.bn1 = nn.BatchNorm2D(hidden_dim) + self.relu1 = nn.ReLU() + + self.conv2 = nn.Conv2D( + hidden_dim, out_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, x, **kwargs): + out = self.conv1(x) + out = self.relu1(self.bn1(out)) + out = self.conv2(out) + return {'maps': out} diff --git a/backend/ppocr/modeling/heads/det_sast_head.py b/backend/ppocr/modeling/heads/det_sast_head.py new file mode 100644 index 0000000..7a88a2d --- /dev/null +++ b/backend/ppocr/modeling/heads/det_sast_head.py @@ -0,0 +1,128 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + groups=1, + if_act=True, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.if_act = if_act + self.act = act + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + '_weights'), + bias_attr=False) + + self.bn = nn.BatchNorm( + num_channels=out_channels, + act=act, + param_attr=ParamAttr(name="bn_" + name + "_scale"), + bias_attr=ParamAttr(name="bn_" + name + "_offset"), + moving_mean_name="bn_" + name + "_mean", + moving_variance_name="bn_" + name + "_variance") + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +class SAST_Header1(nn.Layer): + def __init__(self, in_channels, **kwargs): + super(SAST_Header1, self).__init__() + out_channels = [64, 64, 128] + self.score_conv = nn.Sequential( + ConvBNLayer(in_channels, out_channels[0], 1, 1, act='relu', name='f_score1'), + ConvBNLayer(out_channels[0], out_channels[1], 3, 1, act='relu', name='f_score2'), + ConvBNLayer(out_channels[1], out_channels[2], 1, 1, act='relu', name='f_score3'), + ConvBNLayer(out_channels[2], 1, 3, 1, act=None, name='f_score4') + ) + self.border_conv = nn.Sequential( + ConvBNLayer(in_channels, out_channels[0], 1, 1, act='relu', name='f_border1'), + ConvBNLayer(out_channels[0], out_channels[1], 3, 1, act='relu', name='f_border2'), + ConvBNLayer(out_channels[1], out_channels[2], 1, 1, act='relu', name='f_border3'), + ConvBNLayer(out_channels[2], 4, 3, 1, act=None, name='f_border4') + ) + + def forward(self, x): + f_score = self.score_conv(x) + f_score = F.sigmoid(f_score) + f_border = 
self.border_conv(x) + return f_score, f_border + + +class SAST_Header2(nn.Layer): + def __init__(self, in_channels, **kwargs): + super(SAST_Header2, self).__init__() + out_channels = [64, 64, 128] + self.tvo_conv = nn.Sequential( + ConvBNLayer(in_channels, out_channels[0], 1, 1, act='relu', name='f_tvo1'), + ConvBNLayer(out_channels[0], out_channels[1], 3, 1, act='relu', name='f_tvo2'), + ConvBNLayer(out_channels[1], out_channels[2], 1, 1, act='relu', name='f_tvo3'), + ConvBNLayer(out_channels[2], 8, 3, 1, act=None, name='f_tvo4') + ) + self.tco_conv = nn.Sequential( + ConvBNLayer(in_channels, out_channels[0], 1, 1, act='relu', name='f_tco1'), + ConvBNLayer(out_channels[0], out_channels[1], 3, 1, act='relu', name='f_tco2'), + ConvBNLayer(out_channels[1], out_channels[2], 1, 1, act='relu', name='f_tco3'), + ConvBNLayer(out_channels[2], 2, 3, 1, act=None, name='f_tco4') + ) + + def forward(self, x): + f_tvo = self.tvo_conv(x) + f_tco = self.tco_conv(x) + return f_tvo, f_tco + + +class SASTHead(nn.Layer): + """ + """ + def __init__(self, in_channels, **kwargs): + super(SASTHead, self).__init__() + + self.head1 = SAST_Header1(in_channels) + self.head2 = SAST_Header2(in_channels) + + def forward(self, x, targets=None): + f_score, f_border = self.head1(x) + f_tvo, f_tco = self.head2(x) + + predicts = {} + predicts['f_score'] = f_score + predicts['f_border'] = f_border + predicts['f_tvo'] = f_tvo + predicts['f_tco'] = f_tco + return predicts \ No newline at end of file diff --git a/backend/ppocr/modeling/heads/e2e_pg_head.py b/backend/ppocr/modeling/heads/e2e_pg_head.py new file mode 100644 index 0000000..274e1cd --- /dev/null +++ b/backend/ppocr/modeling/heads/e2e_pg_head.py @@ -0,0 +1,253 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
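[Editor's note] Both detection heads in this stretch simply map the shared FPN feature to a fixed channel budget and leave all semantics to the loss and post-processing: PSEHead emits 7 kernel maps, and SASTHead emits four named maps (1-channel score, 4-channel border, 8-channel vertex offsets, 2-channel centre offsets). A shape-only sketch, assuming a hypothetical 160x160 feature grid:

n, h, w = 1, 160, 160                            # hypothetical batch size and feature grid
pse_out = {'maps': (n, 7, h, w)}                 # PSEHead: out_channels defaults to 7
sast_out = {                                     # SASTHead: channel counts from the conv stacks above
    'f_score': (n, 1, h, w),
    'f_border': (n, 4, h, w),
    'f_tvo': (n, 8, h, w),
    'f_tco': (n, 2, h, w),
}
print(pse_out['maps'], sast_out['f_tvo'])        # (1, 7, 160, 160) (1, 8, 160, 160)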
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups=1, + if_act=True, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.if_act = if_act + self.act = act + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=ParamAttr(name=name + '_weights'), + bias_attr=False) + + self.bn = nn.BatchNorm( + num_channels=out_channels, + act=act, + param_attr=ParamAttr(name="bn_" + name + "_scale"), + bias_attr=ParamAttr(name="bn_" + name + "_offset"), + moving_mean_name="bn_" + name + "_mean", + moving_variance_name="bn_" + name + "_variance", + use_global_stats=False) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +class PGHead(nn.Layer): + """ + """ + + def __init__(self, in_channels, **kwargs): + super(PGHead, self).__init__() + self.conv_f_score1 = ConvBNLayer( + in_channels=in_channels, + out_channels=64, + kernel_size=1, + stride=1, + padding=0, + act='relu', + name="conv_f_score{}".format(1)) + self.conv_f_score2 = ConvBNLayer( + in_channels=64, + out_channels=64, + kernel_size=3, + stride=1, + padding=1, + act='relu', + name="conv_f_score{}".format(2)) + self.conv_f_score3 = ConvBNLayer( + in_channels=64, + out_channels=128, + kernel_size=1, + stride=1, + padding=0, + act='relu', + name="conv_f_score{}".format(3)) + + self.conv1 = nn.Conv2D( + in_channels=128, + out_channels=1, + kernel_size=3, + stride=1, + padding=1, + groups=1, + weight_attr=ParamAttr(name="conv_f_score{}".format(4)), + bias_attr=False) + + self.conv_f_boder1 = ConvBNLayer( + in_channels=in_channels, + out_channels=64, + kernel_size=1, + stride=1, + padding=0, + act='relu', + name="conv_f_boder{}".format(1)) + self.conv_f_boder2 = ConvBNLayer( + in_channels=64, + out_channels=64, + kernel_size=3, + stride=1, + padding=1, + act='relu', + name="conv_f_boder{}".format(2)) + self.conv_f_boder3 = ConvBNLayer( + in_channels=64, + out_channels=128, + kernel_size=1, + stride=1, + padding=0, + act='relu', + name="conv_f_boder{}".format(3)) + self.conv2 = nn.Conv2D( + in_channels=128, + out_channels=4, + kernel_size=3, + stride=1, + padding=1, + groups=1, + weight_attr=ParamAttr(name="conv_f_boder{}".format(4)), + bias_attr=False) + self.conv_f_char1 = ConvBNLayer( + in_channels=in_channels, + out_channels=128, + kernel_size=1, + stride=1, + padding=0, + act='relu', + name="conv_f_char{}".format(1)) + self.conv_f_char2 = ConvBNLayer( + in_channels=128, + out_channels=128, + kernel_size=3, + stride=1, + padding=1, + act='relu', + name="conv_f_char{}".format(2)) + self.conv_f_char3 = ConvBNLayer( + in_channels=128, + out_channels=256, + kernel_size=1, + stride=1, + padding=0, + act='relu', + name="conv_f_char{}".format(3)) + self.conv_f_char4 = ConvBNLayer( + in_channels=256, + out_channels=256, + kernel_size=3, + stride=1, + padding=1, + act='relu', + name="conv_f_char{}".format(4)) + self.conv_f_char5 = ConvBNLayer( + in_channels=256, + out_channels=256, + kernel_size=1, + stride=1, + padding=0, + act='relu', + name="conv_f_char{}".format(5)) + self.conv3 = nn.Conv2D( + in_channels=256, + out_channels=37, + kernel_size=3, + stride=1, + padding=1, + groups=1, + 
weight_attr=ParamAttr(name="conv_f_char{}".format(6)), + bias_attr=False) + + self.conv_f_direc1 = ConvBNLayer( + in_channels=in_channels, + out_channels=64, + kernel_size=1, + stride=1, + padding=0, + act='relu', + name="conv_f_direc{}".format(1)) + self.conv_f_direc2 = ConvBNLayer( + in_channels=64, + out_channels=64, + kernel_size=3, + stride=1, + padding=1, + act='relu', + name="conv_f_direc{}".format(2)) + self.conv_f_direc3 = ConvBNLayer( + in_channels=64, + out_channels=128, + kernel_size=1, + stride=1, + padding=0, + act='relu', + name="conv_f_direc{}".format(3)) + self.conv4 = nn.Conv2D( + in_channels=128, + out_channels=2, + kernel_size=3, + stride=1, + padding=1, + groups=1, + weight_attr=ParamAttr(name="conv_f_direc{}".format(4)), + bias_attr=False) + + def forward(self, x, targets=None): + f_score = self.conv_f_score1(x) + f_score = self.conv_f_score2(f_score) + f_score = self.conv_f_score3(f_score) + f_score = self.conv1(f_score) + f_score = F.sigmoid(f_score) + + # f_border + f_border = self.conv_f_boder1(x) + f_border = self.conv_f_boder2(f_border) + f_border = self.conv_f_boder3(f_border) + f_border = self.conv2(f_border) + + f_char = self.conv_f_char1(x) + f_char = self.conv_f_char2(f_char) + f_char = self.conv_f_char3(f_char) + f_char = self.conv_f_char4(f_char) + f_char = self.conv_f_char5(f_char) + f_char = self.conv3(f_char) + + f_direction = self.conv_f_direc1(x) + f_direction = self.conv_f_direc2(f_direction) + f_direction = self.conv_f_direc3(f_direction) + f_direction = self.conv4(f_direction) + + predicts = {} + predicts['f_score'] = f_score + predicts['f_border'] = f_border + predicts['f_char'] = f_char + predicts['f_direction'] = f_direction + return predicts diff --git a/backend/ppocr/modeling/heads/kie_sdmgr_head.py b/backend/ppocr/modeling/heads/kie_sdmgr_head.py new file mode 100644 index 0000000..ac5f73f --- /dev/null +++ b/backend/ppocr/modeling/heads/kie_sdmgr_head.py @@ -0,0 +1,207 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
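[Editor's note] PGHead above runs four parallel conv stacks over the same input and returns them as a dict: a 1-channel sigmoid score, 4-channel border offsets, a 37-channel character classification map, and a 2-channel direction field. The character map is read out per pixel; a minimal sketch of that step (hypothetical sizes, and the index-to-character mapping comes from the PGNet dictionary, not from the head itself):

import numpy as np

n, h, w = 1, 128, 128                             # hypothetical PGNet feature grid
f_char = np.random.randn(n, 37, h, w)             # 37 channels of character logits per pixel

char_idx = f_char.argmax(axis=1)                  # per-pixel class index in [0, 37)
print(char_idx.shape, int(char_idx.max()) < 37)   # (1, 128, 128) True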
+# reference from : https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/kie/heads/sdmgr_head.py + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr + + +class SDMGRHead(nn.Layer): + def __init__(self, + in_channels, + num_chars=92, + visual_dim=16, + fusion_dim=1024, + node_input=32, + node_embed=256, + edge_input=5, + edge_embed=256, + num_gnn=2, + num_classes=26, + bidirectional=False): + super().__init__() + + self.fusion = Block([visual_dim, node_embed], node_embed, fusion_dim) + self.node_embed = nn.Embedding(num_chars, node_input, 0) + hidden = node_embed // 2 if bidirectional else node_embed + self.rnn = nn.LSTM( + input_size=node_input, hidden_size=hidden, num_layers=1) + self.edge_embed = nn.Linear(edge_input, edge_embed) + self.gnn_layers = nn.LayerList( + [GNNLayer(node_embed, edge_embed) for _ in range(num_gnn)]) + self.node_cls = nn.Linear(node_embed, num_classes) + self.edge_cls = nn.Linear(edge_embed, 2) + + def forward(self, input, targets): + relations, texts, x = input + node_nums, char_nums = [], [] + for text in texts: + node_nums.append(text.shape[0]) + char_nums.append(paddle.sum((text > -1).astype(int), axis=-1)) + + max_num = max([char_num.max() for char_num in char_nums]) + all_nodes = paddle.concat([ + paddle.concat( + [text, paddle.zeros( + (text.shape[0], max_num - text.shape[1]))], -1) + for text in texts + ]) + temp = paddle.clip(all_nodes, min=0).astype(int) + embed_nodes = self.node_embed(temp) + rnn_nodes, _ = self.rnn(embed_nodes) + + b, h, w = rnn_nodes.shape + nodes = paddle.zeros([b, w]) + all_nums = paddle.concat(char_nums) + valid = paddle.nonzero((all_nums > 0).astype(int)) + temp_all_nums = ( + paddle.gather(all_nums, valid) - 1).unsqueeze(-1).unsqueeze(-1) + temp_all_nums = paddle.expand(temp_all_nums, [ + temp_all_nums.shape[0], temp_all_nums.shape[1], rnn_nodes.shape[-1] + ]) + temp_all_nodes = paddle.gather(rnn_nodes, valid) + N, C, A = temp_all_nodes.shape + one_hot = F.one_hot( + temp_all_nums[:, 0, :], num_classes=C).transpose([0, 2, 1]) + one_hot = paddle.multiply( + temp_all_nodes, one_hot.astype("float32")).sum(axis=1, keepdim=True) + t = one_hot.expand([N, 1, A]).squeeze(1) + nodes = paddle.scatter(nodes, valid.squeeze(1), t) + + if x is not None: + nodes = self.fusion([x, nodes]) + + all_edges = paddle.concat( + [rel.reshape([-1, rel.shape[-1]]) for rel in relations]) + embed_edges = self.edge_embed(all_edges.astype('float32')) + embed_edges = F.normalize(embed_edges) + + for gnn_layer in self.gnn_layers: + nodes, cat_nodes = gnn_layer(nodes, embed_edges, node_nums) + + node_cls, edge_cls = self.node_cls(nodes), self.edge_cls(cat_nodes) + return node_cls, edge_cls + + +class GNNLayer(nn.Layer): + def __init__(self, node_dim=256, edge_dim=256): + super().__init__() + self.in_fc = nn.Linear(node_dim * 2 + edge_dim, node_dim) + self.coef_fc = nn.Linear(node_dim, 1) + self.out_fc = nn.Linear(node_dim, node_dim) + self.relu = nn.ReLU() + + def forward(self, nodes, edges, nums): + start, cat_nodes = 0, [] + for num in nums: + sample_nodes = nodes[start:start + num] + cat_nodes.append( + paddle.concat([ + paddle.expand(sample_nodes.unsqueeze(1), [-1, num, -1]), + paddle.expand(sample_nodes.unsqueeze(0), [num, -1, -1]) + ], -1).reshape([num**2, -1])) + start += num + cat_nodes = paddle.concat([paddle.concat(cat_nodes), edges], -1) + cat_nodes = 
self.relu(self.in_fc(cat_nodes)) + coefs = self.coef_fc(cat_nodes) + + start, residuals = 0, [] + for num in nums: + residual = F.softmax( + -paddle.eye(num).unsqueeze(-1) * 1e9 + + coefs[start:start + num**2].reshape([num, num, -1]), 1) + residuals.append((residual * cat_nodes[start:start + num**2] + .reshape([num, num, -1])).sum(1)) + start += num**2 + + nodes += self.relu(self.out_fc(paddle.concat(residuals))) + return [nodes, cat_nodes] + + +class Block(nn.Layer): + def __init__(self, + input_dims, + output_dim, + mm_dim=1600, + chunks=20, + rank=15, + shared=False, + dropout_input=0., + dropout_pre_lin=0., + dropout_output=0., + pos_norm='before_cat'): + super().__init__() + self.rank = rank + self.dropout_input = dropout_input + self.dropout_pre_lin = dropout_pre_lin + self.dropout_output = dropout_output + assert (pos_norm in ['before_cat', 'after_cat']) + self.pos_norm = pos_norm + # Modules + self.linear0 = nn.Linear(input_dims[0], mm_dim) + self.linear1 = (self.linear0 + if shared else nn.Linear(input_dims[1], mm_dim)) + self.merge_linears0 = nn.LayerList() + self.merge_linears1 = nn.LayerList() + self.chunks = self.chunk_sizes(mm_dim, chunks) + for size in self.chunks: + ml0 = nn.Linear(size, size * rank) + self.merge_linears0.append(ml0) + ml1 = ml0 if shared else nn.Linear(size, size * rank) + self.merge_linears1.append(ml1) + self.linear_out = nn.Linear(mm_dim, output_dim) + + def forward(self, x): + x0 = self.linear0(x[0]) + x1 = self.linear1(x[1]) + bs = x1.shape[0] + if self.dropout_input > 0: + x0 = F.dropout(x0, p=self.dropout_input, training=self.training) + x1 = F.dropout(x1, p=self.dropout_input, training=self.training) + x0_chunks = paddle.split(x0, self.chunks, -1) + x1_chunks = paddle.split(x1, self.chunks, -1) + zs = [] + for x0_c, x1_c, m0, m1 in zip(x0_chunks, x1_chunks, self.merge_linears0, + self.merge_linears1): + m = m0(x0_c) * m1(x1_c) # bs x split_size*rank + m = m.reshape([bs, self.rank, -1]) + z = paddle.sum(m, 1) + if self.pos_norm == 'before_cat': + z = paddle.sqrt(F.relu(z)) - paddle.sqrt(F.relu(-z)) + z = F.normalize(z) + zs.append(z) + z = paddle.concat(zs, 1) + if self.pos_norm == 'after_cat': + z = paddle.sqrt(F.relu(z)) - paddle.sqrt(F.relu(-z)) + z = F.normalize(z) + + if self.dropout_pre_lin > 0: + z = F.dropout(z, p=self.dropout_pre_lin, training=self.training) + z = self.linear_out(z) + if self.dropout_output > 0: + z = F.dropout(z, p=self.dropout_output, training=self.training) + return z + + def chunk_sizes(self, dim, chunks): + split_size = (dim + chunks - 1) // chunks + sizes_list = [split_size] * chunks + sizes_list[-1] = sizes_list[-1] - (sum(sizes_list) - dim) + return sizes_list diff --git a/backend/ppocr/modeling/heads/multiheadAttention.py b/backend/ppocr/modeling/heads/multiheadAttention.py new file mode 100755 index 0000000..900865b --- /dev/null +++ b/backend/ppocr/modeling/heads/multiheadAttention.py @@ -0,0 +1,163 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
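A quick standalone check of the Block.chunk_sizes helper defined above. This is plain Python that mirrors the method rather than importing it, so it can be run on its own:

def chunk_sizes(dim, chunks):
    # Mirrors Block.chunk_sizes above: near-equal chunks whose sizes sum to dim.
    split_size = (dim + chunks - 1) // chunks
    sizes = [split_size] * chunks
    sizes[-1] -= sum(sizes) - dim  # shrink the last chunk to absorb the remainder
    return sizes

print(chunk_sizes(1600, 20))       # the Block default: twenty chunks of 80
print(chunk_sizes(1000, 3))        # [334, 334, 332]
print(sum(chunk_sizes(1000, 3)))   # 1000

The uneven case shows why the last chunk is trimmed: paddle.split in Block.forward needs the chunk sizes to add up exactly to mm_dim.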
+ +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle.nn import Linear +from paddle.nn.initializer import XavierUniform as xavier_uniform_ +from paddle.nn.initializer import Constant as constant_ +from paddle.nn.initializer import XavierNormal as xavier_normal_ + +zeros_ = constant_(value=0.) +ones_ = constant_(value=1.) + + +class MultiheadAttention(nn.Layer): + """Allows the model to jointly attend to information + from different representation subspaces. + See reference: Attention Is All You Need + + .. math:: + \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O + \text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) + + Args: + embed_dim: total dimension of the model + num_heads: parallel attention layers, or heads + + """ + + def __init__(self, + embed_dim, + num_heads, + dropout=0., + bias=True, + add_bias_kv=False, + add_zero_attn=False): + super(MultiheadAttention, self).__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + self.scaling = self.head_dim**-0.5 + self.out_proj = Linear(embed_dim, embed_dim, bias_attr=bias) + self._reset_parameters() + self.conv1 = paddle.nn.Conv2D( + in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1)) + self.conv2 = paddle.nn.Conv2D( + in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1)) + self.conv3 = paddle.nn.Conv2D( + in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1)) + + def _reset_parameters(self): + xavier_uniform_(self.out_proj.weight) + + def forward(self, + query, + key, + value, + key_padding_mask=None, + incremental_state=None, + attn_mask=None): + """ + Inputs of forward function + query: [target length, batch size, embed dim] + key: [sequence length, batch size, embed dim] + value: [sequence length, batch size, embed dim] + key_padding_mask: if True, mask padding based on batch size + incremental_state: if provided, previous time steps are cashed + need_weights: output attn_output_weights + static_kv: key and value are static + + Outputs of forward function + attn_output: [target length, batch size, embed dim] + attn_output_weights: [batch size, target length, sequence length] + """ + q_shape = paddle.shape(query) + src_shape = paddle.shape(key) + q = self._in_proj_q(query) + k = self._in_proj_k(key) + v = self._in_proj_v(value) + q *= self.scaling + q = paddle.transpose( + paddle.reshape( + q, [q_shape[0], q_shape[1], self.num_heads, self.head_dim]), + [1, 2, 0, 3]) + k = paddle.transpose( + paddle.reshape( + k, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]), + [1, 2, 0, 3]) + v = paddle.transpose( + paddle.reshape( + v, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]), + [1, 2, 0, 3]) + if key_padding_mask is not None: + assert key_padding_mask.shape[0] == q_shape[1] + assert key_padding_mask.shape[1] == src_shape[0] + attn_output_weights = paddle.matmul(q, + paddle.transpose(k, [0, 1, 3, 2])) + if attn_mask is not None: + attn_mask = paddle.unsqueeze(paddle.unsqueeze(attn_mask, 0), 0) + attn_output_weights += attn_mask + if key_padding_mask is not None: + attn_output_weights = paddle.reshape( + attn_output_weights, + [q_shape[1], self.num_heads, q_shape[0], src_shape[0]]) + key = paddle.unsqueeze(paddle.unsqueeze(key_padding_mask, 1), 2) + key = paddle.cast(key, 'float32') + y = paddle.full( + shape=paddle.shape(key), dtype='float32', 
fill_value='-inf') + y = paddle.where(key == 0., key, y) + attn_output_weights += y + attn_output_weights = F.softmax( + attn_output_weights.astype('float32'), + axis=-1, + dtype=paddle.float32 if attn_output_weights.dtype == paddle.float16 + else attn_output_weights.dtype) + attn_output_weights = F.dropout( + attn_output_weights, p=self.dropout, training=self.training) + + attn_output = paddle.matmul(attn_output_weights, v) + attn_output = paddle.reshape( + paddle.transpose(attn_output, [2, 0, 1, 3]), + [q_shape[0], q_shape[1], self.embed_dim]) + attn_output = self.out_proj(attn_output) + + return attn_output + + def _in_proj_q(self, query): + query = paddle.transpose(query, [1, 2, 0]) + query = paddle.unsqueeze(query, axis=2) + res = self.conv1(query) + res = paddle.squeeze(res, axis=2) + res = paddle.transpose(res, [2, 0, 1]) + return res + + def _in_proj_k(self, key): + key = paddle.transpose(key, [1, 2, 0]) + key = paddle.unsqueeze(key, axis=2) + res = self.conv2(key) + res = paddle.squeeze(res, axis=2) + res = paddle.transpose(res, [2, 0, 1]) + return res + + def _in_proj_v(self, value): + value = paddle.transpose(value, [1, 2, 0]) #(1, 2, 0) + value = paddle.unsqueeze(value, axis=2) + res = self.conv3(value) + res = paddle.squeeze(res, axis=2) + res = paddle.transpose(res, [2, 0, 1]) + return res diff --git a/backend/ppocr/modeling/heads/rec_aster_head.py b/backend/ppocr/modeling/heads/rec_aster_head.py new file mode 100644 index 0000000..c95e8fd --- /dev/null +++ b/backend/ppocr/modeling/heads/rec_aster_head.py @@ -0,0 +1,393 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
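A minimal usage sketch for the MultiheadAttention layer defined above, assuming the class is in scope. The sequence lengths, batch size, embed_dim=512 and num_heads=8 are arbitrary illustration values; tensors follow the documented [sequence length, batch size, embed dim] layout.

import paddle

attn = MultiheadAttention(embed_dim=512, num_heads=8, dropout=0.1)
query = paddle.randn([10, 2, 512])   # [target length, batch size, embed dim]
key = paddle.randn([20, 2, 512])     # [source length, batch size, embed dim]
value = paddle.randn([20, 2, 512])
out = attn(query, key, value)        # -> [10, 2, 512], same layout as query
print(out.shape)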
+""" +This code is refer from: +https://github.com/ayumiymk/aster.pytorch/blob/master/lib/models/attention_recognition_head.py +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys + +import paddle +from paddle import nn +from paddle.nn import functional as F + + +class AsterHead(nn.Layer): + def __init__(self, + in_channels, + out_channels, + sDim, + attDim, + max_len_labels, + time_step=25, + beam_width=5, + **kwargs): + super(AsterHead, self).__init__() + self.num_classes = out_channels + self.in_planes = in_channels + self.sDim = sDim + self.attDim = attDim + self.max_len_labels = max_len_labels + self.decoder = AttentionRecognitionHead(in_channels, out_channels, sDim, + attDim, max_len_labels) + self.time_step = time_step + self.embeder = Embedding(self.time_step, in_channels) + self.beam_width = beam_width + self.eos = self.num_classes - 3 + + def forward(self, x, targets=None, embed=None): + return_dict = {} + embedding_vectors = self.embeder(x) + + if self.training: + rec_targets, rec_lengths, _ = targets + rec_pred = self.decoder([x, rec_targets, rec_lengths], + embedding_vectors) + return_dict['rec_pred'] = rec_pred + return_dict['embedding_vectors'] = embedding_vectors + else: + rec_pred, rec_pred_scores = self.decoder.beam_search( + x, self.beam_width, self.eos, embedding_vectors) + return_dict['rec_pred'] = rec_pred + return_dict['rec_pred_scores'] = rec_pred_scores + return_dict['embedding_vectors'] = embedding_vectors + + return return_dict + + +class Embedding(nn.Layer): + def __init__(self, in_timestep, in_planes, mid_dim=4096, embed_dim=300): + super(Embedding, self).__init__() + self.in_timestep = in_timestep + self.in_planes = in_planes + self.embed_dim = embed_dim + self.mid_dim = mid_dim + self.eEmbed = nn.Linear( + in_timestep * in_planes, + self.embed_dim) # Embed encoder output to a word-embedding like + + def forward(self, x): + x = paddle.reshape(x, [paddle.shape(x)[0], -1]) + x = self.eEmbed(x) + return x + + +class AttentionRecognitionHead(nn.Layer): + """ + input: [b x 16 x 64 x in_planes] + output: probability sequence: [b x T x num_classes] + """ + + def __init__(self, in_channels, out_channels, sDim, attDim, max_len_labels): + super(AttentionRecognitionHead, self).__init__() + self.num_classes = out_channels # this is the output classes. So it includes the . + self.in_planes = in_channels + self.sDim = sDim + self.attDim = attDim + self.max_len_labels = max_len_labels + + self.decoder = DecoderUnit( + sDim=sDim, xDim=in_channels, yDim=self.num_classes, attDim=attDim) + + def forward(self, x, embed): + x, targets, lengths = x + batch_size = paddle.shape(x)[0] + # Decoder + state = self.decoder.get_initial_state(embed) + outputs = [] + for i in range(max(lengths)): + if i == 0: + y_prev = paddle.full( + shape=[batch_size], fill_value=self.num_classes) + else: + y_prev = targets[:, i - 1] + output, state = self.decoder(x, state, y_prev) + outputs.append(output) + outputs = paddle.concat([_.unsqueeze(1) for _ in outputs], 1) + return outputs + + # inference stage. 
+ def sample(self, x): + x, _, _ = x + batch_size = x.size(0) + # Decoder + state = paddle.zeros([1, batch_size, self.sDim]) + + predicted_ids, predicted_scores = [], [] + for i in range(self.max_len_labels): + if i == 0: + y_prev = paddle.full( + shape=[batch_size], fill_value=self.num_classes) + else: + y_prev = predicted + + output, state = self.decoder(x, state, y_prev) + output = F.softmax(output, axis=1) + score, predicted = output.max(1) + predicted_ids.append(predicted.unsqueeze(1)) + predicted_scores.append(score.unsqueeze(1)) + predicted_ids = paddle.concat([predicted_ids, 1]) + predicted_scores = paddle.concat([predicted_scores, 1]) + # return predicted_ids.squeeze(), predicted_scores.squeeze() + return predicted_ids, predicted_scores + + def beam_search(self, x, beam_width, eos, embed): + def _inflate(tensor, times, dim): + repeat_dims = [1] * tensor.dim() + repeat_dims[dim] = times + output = paddle.tile(tensor, repeat_dims) + return output + + # https://github.com/IBM/pytorch-seq2seq/blob/fede87655ddce6c94b38886089e05321dc9802af/seq2seq/models/TopKDecoder.py + batch_size, l, d = x.shape + x = paddle.tile( + paddle.transpose( + x.unsqueeze(1), perm=[1, 0, 2, 3]), [beam_width, 1, 1, 1]) + inflated_encoder_feats = paddle.reshape( + paddle.transpose( + x, perm=[1, 0, 2, 3]), [-1, l, d]) + + # Initialize the decoder + state = self.decoder.get_initial_state(embed, tile_times=beam_width) + + pos_index = paddle.reshape( + paddle.arange(batch_size) * beam_width, shape=[-1, 1]) + + # Initialize the scores + sequence_scores = paddle.full( + shape=[batch_size * beam_width, 1], fill_value=-float('Inf')) + index = [i * beam_width for i in range(0, batch_size)] + sequence_scores[index] = 0.0 + + # Initialize the input vector + y_prev = paddle.full( + shape=[batch_size * beam_width], fill_value=self.num_classes) + + # Store decisions for backtracking + stored_scores = list() + stored_predecessors = list() + stored_emitted_symbols = list() + + for i in range(self.max_len_labels): + output, state = self.decoder(inflated_encoder_feats, state, y_prev) + state = paddle.unsqueeze(state, axis=0) + log_softmax_output = paddle.nn.functional.log_softmax( + output, axis=1) + + sequence_scores = _inflate(sequence_scores, self.num_classes, 1) + sequence_scores += log_softmax_output + scores, candidates = paddle.topk( + paddle.reshape(sequence_scores, [batch_size, -1]), + beam_width, + axis=1) + + # Reshape input = (bk, 1) and sequence_scores = (bk, 1) + y_prev = paddle.reshape( + candidates % self.num_classes, shape=[batch_size * beam_width]) + sequence_scores = paddle.reshape( + scores, shape=[batch_size * beam_width, 1]) + + # Update fields for next timestep + pos_index = paddle.expand_as(pos_index, candidates) + predecessors = paddle.cast( + candidates / self.num_classes + pos_index, dtype='int64') + predecessors = paddle.reshape( + predecessors, shape=[batch_size * beam_width, 1]) + state = paddle.index_select( + state, index=predecessors.squeeze(), axis=1) + + # Update sequence socres and erase scores for symbol so that they aren't expanded + stored_scores.append(sequence_scores.clone()) + y_prev = paddle.reshape(y_prev, shape=[-1, 1]) + eos_prev = paddle.full_like(y_prev, fill_value=eos) + mask = eos_prev == y_prev + mask = paddle.nonzero(mask) + if mask.dim() > 0: + sequence_scores = sequence_scores.numpy() + mask = mask.numpy() + sequence_scores[mask] = -float('inf') + sequence_scores = paddle.to_tensor(sequence_scores) + + # Cache results for backtracking + 
stored_predecessors.append(predecessors) + y_prev = paddle.squeeze(y_prev) + stored_emitted_symbols.append(y_prev) + + # Do backtracking to return the optimal values + #====== backtrak ======# + # Initialize return variables given different types + p = list() + l = [[self.max_len_labels] * beam_width for _ in range(batch_size) + ] # Placeholder for lengths of top-k sequences + + # the last step output of the beams are not sorted + # thus they are sorted here + sorted_score, sorted_idx = paddle.topk( + paddle.reshape( + stored_scores[-1], shape=[batch_size, beam_width]), + beam_width) + + # initialize the sequence scores with the sorted last step beam scores + s = sorted_score.clone() + + batch_eos_found = [0] * batch_size # the number of EOS found + # in the backward loop below for each batch + t = self.max_len_labels - 1 + # initialize the back pointer with the sorted order of the last step beams. + # add pos_index for indexing variable with b*k as the first dimension. + t_predecessors = paddle.reshape( + sorted_idx + pos_index.expand_as(sorted_idx), + shape=[batch_size * beam_width]) + while t >= 0: + # Re-order the variables with the back pointer + current_symbol = paddle.index_select( + stored_emitted_symbols[t], index=t_predecessors, axis=0) + t_predecessors = paddle.index_select( + stored_predecessors[t].squeeze(), index=t_predecessors, axis=0) + eos_indices = stored_emitted_symbols[t] == eos + eos_indices = paddle.nonzero(eos_indices) + + if eos_indices.dim() > 0: + for i in range(eos_indices.shape[0] - 1, -1, -1): + # Indices of the EOS symbol for both variables + # with b*k as the first dimension, and b, k for + # the first two dimensions + idx = eos_indices[i] + b_idx = int(idx[0] / beam_width) + # The indices of the replacing position + # according to the replacement strategy noted above + res_k_idx = beam_width - (batch_eos_found[b_idx] % + beam_width) - 1 + batch_eos_found[b_idx] += 1 + res_idx = b_idx * beam_width + res_k_idx + + # Replace the old information in return variables + # with the new ended sequence information + t_predecessors[res_idx] = stored_predecessors[t][idx[0]] + current_symbol[res_idx] = stored_emitted_symbols[t][idx[0]] + s[b_idx, res_k_idx] = stored_scores[t][idx[0], 0] + l[b_idx][res_k_idx] = t + 1 + + # record the back tracked results + p.append(current_symbol) + t -= 1 + + # Sort and re-order again as the added ended sequences may change + # the order (very unlikely) + s, re_sorted_idx = s.topk(beam_width) + for b_idx in range(batch_size): + l[b_idx] = [ + l[b_idx][k_idx.item()] for k_idx in re_sorted_idx[b_idx, :] + ] + + re_sorted_idx = paddle.reshape( + re_sorted_idx + pos_index.expand_as(re_sorted_idx), + [batch_size * beam_width]) + + # Reverse the sequences and re-order at the same time + # It is reversed because the backtracking happens in reverse time order + p = [ + paddle.reshape( + paddle.index_select(step, re_sorted_idx, 0), + shape=[batch_size, beam_width, -1]) for step in reversed(p) + ] + p = paddle.concat(p, -1)[:, 0, :] + return p, paddle.ones_like(p) + + +class AttentionUnit(nn.Layer): + def __init__(self, sDim, xDim, attDim): + super(AttentionUnit, self).__init__() + + self.sDim = sDim + self.xDim = xDim + self.attDim = attDim + + self.sEmbed = nn.Linear(sDim, attDim) + self.xEmbed = nn.Linear(xDim, attDim) + self.wEmbed = nn.Linear(attDim, 1) + + def forward(self, x, sPrev): + batch_size, T, _ = x.shape # [b x T x xDim] + x = paddle.reshape(x, [-1, self.xDim]) # [(b x T) x xDim] + xProj = self.xEmbed(x) # [(b x T) x attDim] + xProj 
= paddle.reshape(xProj, [batch_size, T, -1]) # [b x T x attDim] + + sPrev = sPrev.squeeze(0) + sProj = self.sEmbed(sPrev) # [b x attDim] + sProj = paddle.unsqueeze(sProj, 1) # [b x 1 x attDim] + sProj = paddle.expand(sProj, + [batch_size, T, self.attDim]) # [b x T x attDim] + + sumTanh = paddle.tanh(sProj + xProj) + sumTanh = paddle.reshape(sumTanh, [-1, self.attDim]) + + vProj = self.wEmbed(sumTanh) # [(b x T) x 1] + vProj = paddle.reshape(vProj, [batch_size, T]) + alpha = F.softmax( + vProj, axis=1) # attention weights for each sample in the minibatch + return alpha + + +class DecoderUnit(nn.Layer): + def __init__(self, sDim, xDim, yDim, attDim): + super(DecoderUnit, self).__init__() + self.sDim = sDim + self.xDim = xDim + self.yDim = yDim + self.attDim = attDim + self.emdDim = attDim + + self.attention_unit = AttentionUnit(sDim, xDim, attDim) + self.tgt_embedding = nn.Embedding( + yDim + 1, self.emdDim, weight_attr=nn.initializer.Normal( + std=0.01)) # the last is used for + self.gru = nn.GRUCell(input_size=xDim + self.emdDim, hidden_size=sDim) + self.fc = nn.Linear( + sDim, + yDim, + weight_attr=nn.initializer.Normal(std=0.01), + bias_attr=nn.initializer.Constant(value=0)) + self.embed_fc = nn.Linear(300, self.sDim) + + def get_initial_state(self, embed, tile_times=1): + assert embed.shape[1] == 300 + state = self.embed_fc(embed) # N * sDim + if tile_times != 1: + state = state.unsqueeze(1) + trans_state = paddle.transpose(state, perm=[1, 0, 2]) + state = paddle.tile(trans_state, repeat_times=[tile_times, 1, 1]) + trans_state = paddle.transpose(state, perm=[1, 0, 2]) + state = paddle.reshape(trans_state, shape=[-1, self.sDim]) + state = state.unsqueeze(0) # 1 * N * sDim + return state + + def forward(self, x, sPrev, yPrev): + # x: feature sequence from the image decoder. + batch_size, T, _ = x.shape + alpha = self.attention_unit(x, sPrev) + context = paddle.squeeze(paddle.matmul(alpha.unsqueeze(1), x), axis=1) + yPrev = paddle.cast(yPrev, dtype="int64") + yProj = self.tgt_embedding(yPrev) + + concat_context = paddle.concat([yProj, context], 1) + concat_context = paddle.squeeze(concat_context, 1) + sPrev = paddle.squeeze(sPrev, 0) + output, state = self.gru(concat_context, sPrev) + output = paddle.squeeze(output, axis=1) + output = self.fc(output) + return output, state \ No newline at end of file diff --git a/backend/ppocr/modeling/heads/rec_att_head.py b/backend/ppocr/modeling/heads/rec_att_head.py new file mode 100644 index 0000000..ab8b119 --- /dev/null +++ b/backend/ppocr/modeling/heads/rec_att_head.py @@ -0,0 +1,202 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
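The AttentionUnit above computes Bahdanau-style additive attention: project the state and the features, add, take tanh, score with a final linear layer, then softmax over time. A framework-free restatement with made-up dimensions, just to show the arithmetic:

import numpy as np

def additive_attention(x, s, Wx, Ws, w):
    # x: [T, xDim] encoder features, s: [sDim] previous decoder state
    scores = np.tanh(x @ Wx + s @ Ws) @ w            # [T]
    scores -= scores.max()                           # numerical stability
    alpha = np.exp(scores) / np.exp(scores).sum()    # softmax over the T steps
    return alpha

rng = np.random.default_rng(0)
T, xDim, sDim, attDim = 5, 8, 6, 4
alpha = additive_attention(rng.normal(size=(T, xDim)), rng.normal(size=sDim),
                           rng.normal(size=(xDim, attDim)),
                           rng.normal(size=(sDim, attDim)),
                           rng.normal(size=attDim))
print(alpha, alpha.sum())   # attention weights over the T time steps; sums to 1.0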
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + + +class AttentionHead(nn.Layer): + def __init__(self, in_channels, out_channels, hidden_size, **kwargs): + super(AttentionHead, self).__init__() + self.input_size = in_channels + self.hidden_size = hidden_size + self.num_classes = out_channels + + self.attention_cell = AttentionGRUCell( + in_channels, hidden_size, out_channels, use_gru=False) + self.generator = nn.Linear(hidden_size, out_channels) + + def _char_to_onehot(self, input_char, onehot_dim): + input_ont_hot = F.one_hot(input_char, onehot_dim) + return input_ont_hot + + def forward(self, inputs, targets=None, batch_max_length=25): + batch_size = paddle.shape(inputs)[0] + num_steps = batch_max_length + + hidden = paddle.zeros((batch_size, self.hidden_size)) + output_hiddens = [] + + if targets is not None: + for i in range(num_steps): + char_onehots = self._char_to_onehot( + targets[:, i], onehot_dim=self.num_classes) + (outputs, hidden), alpha = self.attention_cell(hidden, inputs, + char_onehots) + output_hiddens.append(paddle.unsqueeze(outputs, axis=1)) + output = paddle.concat(output_hiddens, axis=1) + probs = self.generator(output) + else: + targets = paddle.zeros(shape=[batch_size], dtype="int32") + probs = None + char_onehots = None + outputs = None + alpha = None + + for i in range(num_steps): + char_onehots = self._char_to_onehot( + targets, onehot_dim=self.num_classes) + (outputs, hidden), alpha = self.attention_cell(hidden, inputs, + char_onehots) + probs_step = self.generator(outputs) + if probs is None: + probs = paddle.unsqueeze(probs_step, axis=1) + else: + probs = paddle.concat( + [probs, paddle.unsqueeze( + probs_step, axis=1)], axis=1) + next_input = probs_step.argmax(axis=1) + targets = next_input + if not self.training: + probs = paddle.nn.functional.softmax(probs, axis=2) + return probs + + +class AttentionGRUCell(nn.Layer): + def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False): + super(AttentionGRUCell, self).__init__() + self.i2h = nn.Linear(input_size, hidden_size, bias_attr=False) + self.h2h = nn.Linear(hidden_size, hidden_size) + self.score = nn.Linear(hidden_size, 1, bias_attr=False) + + self.rnn = nn.GRUCell( + input_size=input_size + num_embeddings, hidden_size=hidden_size) + + self.hidden_size = hidden_size + + def forward(self, prev_hidden, batch_H, char_onehots): + + batch_H_proj = self.i2h(batch_H) + prev_hidden_proj = paddle.unsqueeze(self.h2h(prev_hidden), axis=1) + + res = paddle.add(batch_H_proj, prev_hidden_proj) + res = paddle.tanh(res) + e = self.score(res) + + alpha = F.softmax(e, axis=1) + alpha = paddle.transpose(alpha, [0, 2, 1]) + context = paddle.squeeze(paddle.mm(alpha, batch_H), axis=1) + concat_context = paddle.concat([context, char_onehots], 1) + + cur_hidden = self.rnn(concat_context, prev_hidden) + + return cur_hidden, alpha + + +class AttentionLSTM(nn.Layer): + def __init__(self, in_channels, out_channels, hidden_size, **kwargs): + super(AttentionLSTM, self).__init__() + self.input_size = in_channels + self.hidden_size = hidden_size + self.num_classes = out_channels + + self.attention_cell = AttentionLSTMCell( + in_channels, hidden_size, out_channels, use_gru=False) + self.generator = nn.Linear(hidden_size, out_channels) + + def _char_to_onehot(self, input_char, onehot_dim): + input_ont_hot = F.one_hot(input_char, onehot_dim) + return input_ont_hot + 
+ def forward(self, inputs, targets=None, batch_max_length=25): + batch_size = inputs.shape[0] + num_steps = batch_max_length + + hidden = (paddle.zeros((batch_size, self.hidden_size)), paddle.zeros( + (batch_size, self.hidden_size))) + output_hiddens = [] + + if targets is not None: + for i in range(num_steps): + # one-hot vectors for a i-th char + char_onehots = self._char_to_onehot( + targets[:, i], onehot_dim=self.num_classes) + hidden, alpha = self.attention_cell(hidden, inputs, + char_onehots) + + hidden = (hidden[1][0], hidden[1][1]) + output_hiddens.append(paddle.unsqueeze(hidden[0], axis=1)) + output = paddle.concat(output_hiddens, axis=1) + probs = self.generator(output) + + else: + targets = paddle.zeros(shape=[batch_size], dtype="int32") + probs = None + + for i in range(num_steps): + char_onehots = self._char_to_onehot( + targets, onehot_dim=self.num_classes) + hidden, alpha = self.attention_cell(hidden, inputs, + char_onehots) + probs_step = self.generator(hidden[0]) + hidden = (hidden[1][0], hidden[1][1]) + if probs is None: + probs = paddle.unsqueeze(probs_step, axis=1) + else: + probs = paddle.concat( + [probs, paddle.unsqueeze( + probs_step, axis=1)], axis=1) + + next_input = probs_step.argmax(axis=1) + + targets = next_input + + return probs + + +class AttentionLSTMCell(nn.Layer): + def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False): + super(AttentionLSTMCell, self).__init__() + self.i2h = nn.Linear(input_size, hidden_size, bias_attr=False) + self.h2h = nn.Linear(hidden_size, hidden_size) + self.score = nn.Linear(hidden_size, 1, bias_attr=False) + if not use_gru: + self.rnn = nn.LSTMCell( + input_size=input_size + num_embeddings, hidden_size=hidden_size) + else: + self.rnn = nn.GRUCell( + input_size=input_size + num_embeddings, hidden_size=hidden_size) + + self.hidden_size = hidden_size + + def forward(self, prev_hidden, batch_H, char_onehots): + batch_H_proj = self.i2h(batch_H) + prev_hidden_proj = paddle.unsqueeze(self.h2h(prev_hidden[0]), axis=1) + res = paddle.add(batch_H_proj, prev_hidden_proj) + res = paddle.tanh(res) + e = self.score(res) + + alpha = F.softmax(e, axis=1) + alpha = paddle.transpose(alpha, [0, 2, 1]) + context = paddle.squeeze(paddle.mm(alpha, batch_H), axis=1) + concat_context = paddle.concat([context, char_onehots], 1) + cur_hidden = self.rnn(concat_context, prev_hidden) + + return cur_hidden, alpha diff --git a/backend/ppocr/modeling/heads/rec_ctc_head.py b/backend/ppocr/modeling/heads/rec_ctc_head.py new file mode 100755 index 0000000..6c1cf06 --- /dev/null +++ b/backend/ppocr/modeling/heads/rec_ctc_head.py @@ -0,0 +1,87 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
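The inference branches of the attention heads above share one pattern: with no targets available, each step's argmax is fed back as the next input. A schematic, framework-free sketch of that loop; `step` is a placeholder standing in for "attention cell + generator", not a function from the project.

import numpy as np

def greedy_decode(step, state, num_steps, num_classes):
    token, outputs = 0, []                  # the heads above start from an all-zeros target
    for _ in range(num_steps):
        probs, state = step(token, state)   # probs: [num_classes]
        assert len(probs) == num_classes
        token = int(np.argmax(probs))       # feed the argmax back as the next input
        outputs.append(token)
    return outputs

rng = np.random.default_rng(0)
dummy_step = lambda token, state: (rng.random(10), state)   # stand-in cell for the sketch
print(greedy_decode(dummy_step, state=None, num_steps=5, num_classes=10))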
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +import paddle +from paddle import ParamAttr, nn +from paddle.nn import functional as F + + +def get_para_bias_attr(l2_decay, k): + regularizer = paddle.regularizer.L2Decay(l2_decay) + stdv = 1.0 / math.sqrt(k * 1.0) + initializer = nn.initializer.Uniform(-stdv, stdv) + weight_attr = ParamAttr(regularizer=regularizer, initializer=initializer) + bias_attr = ParamAttr(regularizer=regularizer, initializer=initializer) + return [weight_attr, bias_attr] + + +class CTCHead(nn.Layer): + def __init__(self, + in_channels, + out_channels, + fc_decay=0.0004, + mid_channels=None, + return_feats=False, + **kwargs): + super(CTCHead, self).__init__() + if mid_channels is None: + weight_attr, bias_attr = get_para_bias_attr( + l2_decay=fc_decay, k=in_channels) + self.fc = nn.Linear( + in_channels, + out_channels, + weight_attr=weight_attr, + bias_attr=bias_attr) + else: + weight_attr1, bias_attr1 = get_para_bias_attr( + l2_decay=fc_decay, k=in_channels) + self.fc1 = nn.Linear( + in_channels, + mid_channels, + weight_attr=weight_attr1, + bias_attr=bias_attr1) + + weight_attr2, bias_attr2 = get_para_bias_attr( + l2_decay=fc_decay, k=mid_channels) + self.fc2 = nn.Linear( + mid_channels, + out_channels, + weight_attr=weight_attr2, + bias_attr=bias_attr2) + self.out_channels = out_channels + self.mid_channels = mid_channels + self.return_feats = return_feats + + def forward(self, x, targets=None): + if self.mid_channels is None: + predicts = self.fc(x) + else: + x = self.fc1(x) + predicts = self.fc2(x) + + if self.return_feats: + result = (x, predicts) + else: + result = predicts + if not self.training: + predicts = F.softmax(predicts, axis=2) + result = predicts + + return result diff --git a/backend/ppocr/modeling/heads/rec_multi_head.py b/backend/ppocr/modeling/heads/rec_multi_head.py new file mode 100644 index 0000000..ef78bf9 --- /dev/null +++ b/backend/ppocr/modeling/heads/rec_multi_head.py @@ -0,0 +1,73 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
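A short usage sketch for the CTCHead defined above, assuming the class is in scope. The channel counts are illustrative; in practice out_channels is the label-dictionary size plus the CTC blank, and the input comes from the sequence neck as [batch, time steps, channels].

import paddle

head = CTCHead(in_channels=64, out_channels=6625)   # sizes chosen for illustration only
feats = paddle.randn([4, 80, 64])    # [batch, time steps, channels] from the neck
head.eval()
probs = head(feats)                  # eval mode applies softmax over the classes
print(probs.shape)                   # [4, 80, 6625]

With mid_channels set, the head instead stacks fc1 and fc2, and return_feats=True makes it return the intermediate features alongside the logits during training.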
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F + +from ppocr.modeling.necks.rnn import Im2Seq, EncoderWithRNN, EncoderWithFC, SequenceEncoder, EncoderWithSVTR +from .rec_ctc_head import CTCHead +from .rec_sar_head import SARHead + + +class MultiHead(nn.Layer): + def __init__(self, in_channels, out_channels_list, **kwargs): + super().__init__() + self.head_list = kwargs.pop('head_list') + self.gtc_head = 'sar' + assert len(self.head_list) >= 2 + for idx, head_name in enumerate(self.head_list): + name = list(head_name)[0] + if name == 'SARHead': + # sar head + sar_args = self.head_list[idx][name] + self.sar_head = eval(name)(in_channels=in_channels, \ + out_channels=out_channels_list['SARLabelDecode'], **sar_args) + elif name == 'CTCHead': + # ctc neck + self.encoder_reshape = Im2Seq(in_channels) + neck_args = self.head_list[idx][name]['Neck'] + encoder_type = neck_args.pop('name') + self.encoder = encoder_type + self.ctc_encoder = SequenceEncoder(in_channels=in_channels, \ + encoder_type=encoder_type, **neck_args) + # ctc head + head_args = self.head_list[idx][name]['Head'] + self.ctc_head = eval(name)(in_channels=self.ctc_encoder.out_channels, \ + out_channels=out_channels_list['CTCLabelDecode'], **head_args) + else: + raise NotImplementedError( + '{} is not supported in MultiHead yet'.format(name)) + + def forward(self, x, targets=None): + ctc_encoder = self.ctc_encoder(x) + ctc_out = self.ctc_head(ctc_encoder, targets) + head_out = dict() + head_out['ctc'] = ctc_out + head_out['ctc_neck'] = ctc_encoder + # eval mode + if not self.training: + return ctc_out + if self.gtc_head == 'sar': + sar_out = self.sar_head(x, targets[1:]) + head_out['sar'] = sar_out + return head_out + else: + return head_out diff --git a/backend/ppocr/modeling/heads/rec_nrtr_head.py b/backend/ppocr/modeling/heads/rec_nrtr_head.py new file mode 100644 index 0000000..38ba0c9 --- /dev/null +++ b/backend/ppocr/modeling/heads/rec_nrtr_head.py @@ -0,0 +1,826 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import paddle +import copy +from paddle import nn +import paddle.nn.functional as F +from paddle.nn import LayerList +from paddle.nn.initializer import XavierNormal as xavier_uniform_ +from paddle.nn import Dropout, Linear, LayerNorm, Conv2D +import numpy as np +from ppocr.modeling.heads.multiheadAttention import MultiheadAttention +from paddle.nn.initializer import Constant as constant_ +from paddle.nn.initializer import XavierNormal as xavier_normal_ + +zeros_ = constant_(value=0.) +ones_ = constant_(value=1.) + + +class Transformer(nn.Layer): + """A transformer model. User is able to modify the attributes as needed. The architechture + is based on the paper "Attention Is All You Need". 
Ashish Vaswani, Noam Shazeer, + Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and + Illia Polosukhin. 2017. Attention is all you need. In Advances in Neural Information + Processing Systems, pages 6000-6010. + + Args: + d_model: the number of expected features in the encoder/decoder inputs (default=512). + nhead: the number of heads in the multiheadattention models (default=8). + num_encoder_layers: the number of sub-encoder-layers in the encoder (default=6). + num_decoder_layers: the number of sub-decoder-layers in the decoder (default=6). + dim_feedforward: the dimension of the feedforward network model (default=2048). + dropout: the dropout value (default=0.1). + custom_encoder: custom encoder (default=None). + custom_decoder: custom decoder (default=None). + + """ + + def __init__(self, + d_model=512, + nhead=8, + num_encoder_layers=6, + beam_size=0, + num_decoder_layers=6, + dim_feedforward=1024, + attention_dropout_rate=0.0, + residual_dropout_rate=0.1, + custom_encoder=None, + custom_decoder=None, + in_channels=0, + out_channels=0, + scale_embedding=True): + super(Transformer, self).__init__() + self.out_channels = out_channels + 1 + self.embedding = Embeddings( + d_model=d_model, + vocab=self.out_channels, + padding_idx=0, + scale_embedding=scale_embedding) + self.positional_encoding = PositionalEncoding( + dropout=residual_dropout_rate, + dim=d_model, ) + if custom_encoder is not None: + self.encoder = custom_encoder + else: + if num_encoder_layers > 0: + encoder_layer = TransformerEncoderLayer( + d_model, nhead, dim_feedforward, attention_dropout_rate, + residual_dropout_rate) + self.encoder = TransformerEncoder(encoder_layer, + num_encoder_layers) + else: + self.encoder = None + + if custom_decoder is not None: + self.decoder = custom_decoder + else: + decoder_layer = TransformerDecoderLayer( + d_model, nhead, dim_feedforward, attention_dropout_rate, + residual_dropout_rate) + self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers) + + self._reset_parameters() + self.beam_size = beam_size + self.d_model = d_model + self.nhead = nhead + self.tgt_word_prj = nn.Linear( + d_model, self.out_channels, bias_attr=False) + w0 = np.random.normal(0.0, d_model**-0.5, + (d_model, self.out_channels)).astype(np.float32) + self.tgt_word_prj.weight.set_value(w0) + self.apply(self._init_weights) + + def _init_weights(self, m): + + if isinstance(m, nn.Conv2D): + xavier_normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + + def forward_train(self, src, tgt): + tgt = tgt[:, :-1] + + tgt_key_padding_mask = self.generate_padding_mask(tgt) + tgt = self.embedding(tgt).transpose([1, 0, 2]) + tgt = self.positional_encoding(tgt) + tgt_mask = self.generate_square_subsequent_mask(tgt.shape[0]) + + if self.encoder is not None: + src = self.positional_encoding(src.transpose([1, 0, 2])) + memory = self.encoder(src) + else: + memory = src.squeeze(2).transpose([2, 0, 1]) + output = self.decoder( + tgt, + memory, + tgt_mask=tgt_mask, + memory_mask=None, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=None) + output = output.transpose([1, 0, 2]) + logit = self.tgt_word_prj(output) + return logit + + def forward(self, src, targets=None): + """Take in and process masked source/target sequences. + Args: + src: the sequence to the encoder (required). + tgt: the sequence to the decoder (required). + Shape: + - src: :math:`(S, N, E)`. + - tgt: :math:`(T, N, E)`. 
+ Examples: + >>> output = transformer_model(src, tgt) + """ + + if self.training: + max_len = targets[1].max() + tgt = targets[0][:, :2 + max_len] + return self.forward_train(src, tgt) + else: + if self.beam_size > 0: + return self.forward_beam(src) + else: + return self.forward_test(src) + + def forward_test(self, src): + bs = paddle.shape(src)[0] + if self.encoder is not None: + src = self.positional_encoding(paddle.transpose(src, [1, 0, 2])) + memory = self.encoder(src) + else: + memory = paddle.transpose(paddle.squeeze(src, 2), [2, 0, 1]) + dec_seq = paddle.full((bs, 1), 2, dtype=paddle.int64) + dec_prob = paddle.full((bs, 1), 1., dtype=paddle.float32) + for len_dec_seq in range(1, 25): + dec_seq_embed = paddle.transpose(self.embedding(dec_seq), [1, 0, 2]) + dec_seq_embed = self.positional_encoding(dec_seq_embed) + tgt_mask = self.generate_square_subsequent_mask( + paddle.shape(dec_seq_embed)[0]) + output = self.decoder( + dec_seq_embed, + memory, + tgt_mask=tgt_mask, + memory_mask=None, + tgt_key_padding_mask=None, + memory_key_padding_mask=None) + dec_output = paddle.transpose(output, [1, 0, 2]) + dec_output = dec_output[:, -1, :] + word_prob = F.softmax(self.tgt_word_prj(dec_output), axis=1) + preds_idx = paddle.argmax(word_prob, axis=1) + if paddle.equal_all( + preds_idx, + paddle.full( + paddle.shape(preds_idx), 3, dtype='int64')): + break + preds_prob = paddle.max(word_prob, axis=1) + dec_seq = paddle.concat( + [dec_seq, paddle.reshape(preds_idx, [-1, 1])], axis=1) + dec_prob = paddle.concat( + [dec_prob, paddle.reshape(preds_prob, [-1, 1])], axis=1) + return [dec_seq, dec_prob] + + def forward_beam(self, images): + ''' Translation work in one batch ''' + + def get_inst_idx_to_tensor_position_map(inst_idx_list): + ''' Indicate the position of an instance in a tensor. ''' + return { + inst_idx: tensor_position + for tensor_position, inst_idx in enumerate(inst_idx_list) + } + + def collect_active_part(beamed_tensor, curr_active_inst_idx, + n_prev_active_inst, n_bm): + ''' Collect tensor parts associated to active instances. ''' + + beamed_tensor_shape = paddle.shape(beamed_tensor) + n_curr_active_inst = len(curr_active_inst_idx) + new_shape = (n_curr_active_inst * n_bm, beamed_tensor_shape[1], + beamed_tensor_shape[2]) + + beamed_tensor = beamed_tensor.reshape([n_prev_active_inst, -1]) + beamed_tensor = beamed_tensor.index_select( + curr_active_inst_idx, axis=0) + beamed_tensor = beamed_tensor.reshape(new_shape) + + return beamed_tensor + + def collate_active_info(src_enc, inst_idx_to_position_map, + active_inst_idx_list): + # Sentences which are still active are collected, + # so the decoder will not run on completed sentences. 
+ + n_prev_active_inst = len(inst_idx_to_position_map) + active_inst_idx = [ + inst_idx_to_position_map[k] for k in active_inst_idx_list + ] + active_inst_idx = paddle.to_tensor(active_inst_idx, dtype='int64') + active_src_enc = collect_active_part( + src_enc.transpose([1, 0, 2]), active_inst_idx, + n_prev_active_inst, n_bm).transpose([1, 0, 2]) + active_inst_idx_to_position_map = get_inst_idx_to_tensor_position_map( + active_inst_idx_list) + return active_src_enc, active_inst_idx_to_position_map + + def beam_decode_step(inst_dec_beams, len_dec_seq, enc_output, + inst_idx_to_position_map, n_bm, + memory_key_padding_mask): + ''' Decode and update beam status, and then return active beam idx ''' + + def prepare_beam_dec_seq(inst_dec_beams, len_dec_seq): + dec_partial_seq = [ + b.get_current_state() for b in inst_dec_beams if not b.done + ] + dec_partial_seq = paddle.stack(dec_partial_seq) + dec_partial_seq = dec_partial_seq.reshape([-1, len_dec_seq]) + return dec_partial_seq + + def predict_word(dec_seq, enc_output, n_active_inst, n_bm, + memory_key_padding_mask): + dec_seq = paddle.transpose(self.embedding(dec_seq), [1, 0, 2]) + dec_seq = self.positional_encoding(dec_seq) + tgt_mask = self.generate_square_subsequent_mask( + paddle.shape(dec_seq)[0]) + dec_output = self.decoder( + dec_seq, + enc_output, + tgt_mask=tgt_mask, + tgt_key_padding_mask=None, + memory_key_padding_mask=memory_key_padding_mask, ) + dec_output = paddle.transpose(dec_output, [1, 0, 2]) + dec_output = dec_output[:, + -1, :] # Pick the last step: (bh * bm) * d_h + word_prob = F.softmax(self.tgt_word_prj(dec_output), axis=1) + word_prob = paddle.reshape(word_prob, [n_active_inst, n_bm, -1]) + return word_prob + + def collect_active_inst_idx_list(inst_beams, word_prob, + inst_idx_to_position_map): + active_inst_idx_list = [] + for inst_idx, inst_position in inst_idx_to_position_map.items(): + is_inst_complete = inst_beams[inst_idx].advance(word_prob[ + inst_position]) + if not is_inst_complete: + active_inst_idx_list += [inst_idx] + + return active_inst_idx_list + + n_active_inst = len(inst_idx_to_position_map) + dec_seq = prepare_beam_dec_seq(inst_dec_beams, len_dec_seq) + word_prob = predict_word(dec_seq, enc_output, n_active_inst, n_bm, + None) + # Update the beam with predicted word prob information and collect incomplete instances + active_inst_idx_list = collect_active_inst_idx_list( + inst_dec_beams, word_prob, inst_idx_to_position_map) + return active_inst_idx_list + + def collect_hypothesis_and_scores(inst_dec_beams, n_best): + all_hyp, all_scores = [], [] + for inst_idx in range(len(inst_dec_beams)): + scores, tail_idxs = inst_dec_beams[inst_idx].sort_scores() + all_scores += [scores[:n_best]] + hyps = [ + inst_dec_beams[inst_idx].get_hypothesis(i) + for i in tail_idxs[:n_best] + ] + all_hyp += [hyps] + return all_hyp, all_scores + + with paddle.no_grad(): + #-- Encode + if self.encoder is not None: + src = self.positional_encoding(images.transpose([1, 0, 2])) + src_enc = self.encoder(src) + else: + src_enc = images.squeeze(2).transpose([0, 2, 1]) + + n_bm = self.beam_size + src_shape = paddle.shape(src_enc) + inst_dec_beams = [Beam(n_bm) for _ in range(1)] + active_inst_idx_list = list(range(1)) + # Repeat data for beam search + src_enc = paddle.tile(src_enc, [1, n_bm, 1]) + inst_idx_to_position_map = get_inst_idx_to_tensor_position_map( + active_inst_idx_list) + # Decode + for len_dec_seq in range(1, 25): + src_enc_copy = src_enc.clone() + active_inst_idx_list = beam_decode_step( + inst_dec_beams, len_dec_seq, 
src_enc_copy, + inst_idx_to_position_map, n_bm, None) + if not active_inst_idx_list: + break # all instances have finished their path to + src_enc, inst_idx_to_position_map = collate_active_info( + src_enc_copy, inst_idx_to_position_map, + active_inst_idx_list) + batch_hyp, batch_scores = collect_hypothesis_and_scores(inst_dec_beams, + 1) + result_hyp = [] + hyp_scores = [] + for bs_hyp, score in zip(batch_hyp, batch_scores): + l = len(bs_hyp[0]) + bs_hyp_pad = bs_hyp[0] + [3] * (25 - l) + result_hyp.append(bs_hyp_pad) + score = float(score) / l + hyp_score = [score for _ in range(25)] + hyp_scores.append(hyp_score) + return [ + paddle.to_tensor( + np.array(result_hyp), dtype=paddle.int64), + paddle.to_tensor(hyp_scores) + ] + + def generate_square_subsequent_mask(self, sz): + """Generate a square mask for the sequence. The masked positions are filled with float('-inf'). + Unmasked positions are filled with float(0.0). + """ + mask = paddle.zeros([sz, sz], dtype='float32') + mask_inf = paddle.triu( + paddle.full( + shape=[sz, sz], dtype='float32', fill_value='-inf'), + diagonal=1) + mask = mask + mask_inf + return mask + + def generate_padding_mask(self, x): + padding_mask = paddle.equal(x, paddle.to_tensor(0, dtype=x.dtype)) + return padding_mask + + def _reset_parameters(self): + """Initiate parameters in the transformer model.""" + + for p in self.parameters(): + if p.dim() > 1: + xavier_uniform_(p) + + +class TransformerEncoder(nn.Layer): + """TransformerEncoder is a stack of N encoder layers + Args: + encoder_layer: an instance of the TransformerEncoderLayer() class (required). + num_layers: the number of sub-encoder-layers in the encoder (required). + norm: the layer normalization component (optional). + """ + + def __init__(self, encoder_layer, num_layers): + super(TransformerEncoder, self).__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + + def forward(self, src): + """Pass the input through the endocder layers in turn. + Args: + src: the sequnce to the encoder (required). + mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional). + """ + output = src + + for i in range(self.num_layers): + output = self.layers[i](output, + src_mask=None, + src_key_padding_mask=None) + + return output + + +class TransformerDecoder(nn.Layer): + """TransformerDecoder is a stack of N decoder layers + + Args: + decoder_layer: an instance of the TransformerDecoderLayer() class (required). + num_layers: the number of sub-decoder-layers in the decoder (required). + norm: the layer normalization component (optional). + + """ + + def __init__(self, decoder_layer, num_layers): + super(TransformerDecoder, self).__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + + def forward(self, + tgt, + memory, + tgt_mask=None, + memory_mask=None, + tgt_key_padding_mask=None, + memory_key_padding_mask=None): + """Pass the inputs (and mask) through the decoder layer in turn. + + Args: + tgt: the sequence to the decoder (required). + memory: the sequnce from the last layer of the encoder (required). + tgt_mask: the mask for the tgt sequence (optional). + memory_mask: the mask for the memory sequence (optional). + tgt_key_padding_mask: the mask for the tgt keys per batch (optional). + memory_key_padding_mask: the mask for the memory keys per batch (optional). 
+ """ + output = tgt + for i in range(self.num_layers): + output = self.layers[i]( + output, + memory, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask) + + return output + + +class TransformerEncoderLayer(nn.Layer): + """TransformerEncoderLayer is made up of self-attn and feedforward network. + This standard encoder layer is based on the paper "Attention Is All You Need". + Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, + Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in + Neural Information Processing Systems, pages 6000-6010. Users may modify or implement + in a different way during application. + + Args: + d_model: the number of expected features in the input (required). + nhead: the number of heads in the multiheadattention models (required). + dim_feedforward: the dimension of the feedforward network model (default=2048). + dropout: the dropout value (default=0.1). + + """ + + def __init__(self, + d_model, + nhead, + dim_feedforward=2048, + attention_dropout_rate=0.0, + residual_dropout_rate=0.1): + super(TransformerEncoderLayer, self).__init__() + self.self_attn = MultiheadAttention( + d_model, nhead, dropout=attention_dropout_rate) + + self.conv1 = Conv2D( + in_channels=d_model, + out_channels=dim_feedforward, + kernel_size=(1, 1)) + self.conv2 = Conv2D( + in_channels=dim_feedforward, + out_channels=d_model, + kernel_size=(1, 1)) + + self.norm1 = LayerNorm(d_model) + self.norm2 = LayerNorm(d_model) + self.dropout1 = Dropout(residual_dropout_rate) + self.dropout2 = Dropout(residual_dropout_rate) + + def forward(self, src, src_mask=None, src_key_padding_mask=None): + """Pass the input through the endocder layer. + Args: + src: the sequnce to the encoder layer (required). + src_mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional). + """ + src2 = self.self_attn( + src, + src, + src, + attn_mask=src_mask, + key_padding_mask=src_key_padding_mask) + src = src + self.dropout1(src2) + src = self.norm1(src) + + src = paddle.transpose(src, [1, 2, 0]) + src = paddle.unsqueeze(src, 2) + src2 = self.conv2(F.relu(self.conv1(src))) + src2 = paddle.squeeze(src2, 2) + src2 = paddle.transpose(src2, [2, 0, 1]) + src = paddle.squeeze(src, 2) + src = paddle.transpose(src, [2, 0, 1]) + + src = src + self.dropout2(src2) + src = self.norm2(src) + return src + + +class TransformerDecoderLayer(nn.Layer): + """TransformerDecoderLayer is made up of self-attn, multi-head-attn and feedforward network. + This standard decoder layer is based on the paper "Attention Is All You Need". + Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, + Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in + Neural Information Processing Systems, pages 6000-6010. Users may modify or implement + in a different way during application. + + Args: + d_model: the number of expected features in the input (required). + nhead: the number of heads in the multiheadattention models (required). + dim_feedforward: the dimension of the feedforward network model (default=2048). + dropout: the dropout value (default=0.1). 
+ + """ + + def __init__(self, + d_model, + nhead, + dim_feedforward=2048, + attention_dropout_rate=0.0, + residual_dropout_rate=0.1): + super(TransformerDecoderLayer, self).__init__() + self.self_attn = MultiheadAttention( + d_model, nhead, dropout=attention_dropout_rate) + self.multihead_attn = MultiheadAttention( + d_model, nhead, dropout=attention_dropout_rate) + + self.conv1 = Conv2D( + in_channels=d_model, + out_channels=dim_feedforward, + kernel_size=(1, 1)) + self.conv2 = Conv2D( + in_channels=dim_feedforward, + out_channels=d_model, + kernel_size=(1, 1)) + + self.norm1 = LayerNorm(d_model) + self.norm2 = LayerNorm(d_model) + self.norm3 = LayerNorm(d_model) + self.dropout1 = Dropout(residual_dropout_rate) + self.dropout2 = Dropout(residual_dropout_rate) + self.dropout3 = Dropout(residual_dropout_rate) + + def forward(self, + tgt, + memory, + tgt_mask=None, + memory_mask=None, + tgt_key_padding_mask=None, + memory_key_padding_mask=None): + """Pass the inputs (and mask) through the decoder layer. + + Args: + tgt: the sequence to the decoder layer (required). + memory: the sequnce from the last layer of the encoder (required). + tgt_mask: the mask for the tgt sequence (optional). + memory_mask: the mask for the memory sequence (optional). + tgt_key_padding_mask: the mask for the tgt keys per batch (optional). + memory_key_padding_mask: the mask for the memory keys per batch (optional). + + """ + tgt2 = self.self_attn( + tgt, + tgt, + tgt, + attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask) + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + tgt2 = self.multihead_attn( + tgt, + memory, + memory, + attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask) + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + + # default + tgt = paddle.transpose(tgt, [1, 2, 0]) + tgt = paddle.unsqueeze(tgt, 2) + tgt2 = self.conv2(F.relu(self.conv1(tgt))) + tgt2 = paddle.squeeze(tgt2, 2) + tgt2 = paddle.transpose(tgt2, [2, 0, 1]) + tgt = paddle.squeeze(tgt, 2) + tgt = paddle.transpose(tgt, [2, 0, 1]) + + tgt = tgt + self.dropout3(tgt2) + tgt = self.norm3(tgt) + return tgt + + +def _get_clones(module, N): + return LayerList([copy.deepcopy(module) for i in range(N)]) + + +class PositionalEncoding(nn.Layer): + """Inject some information about the relative or absolute position of the tokens + in the sequence. The positional encodings have the same dimension as + the embeddings, so that the two can be summed. Here, we use sine and cosine + functions of different frequencies. + .. math:: + \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model)) + \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model)) + \text{where pos is the word position and i is the embed idx) + Args: + d_model: the embed dim (required). + dropout: the dropout value (default=0.1). + max_len: the max. length of the incoming sequence (default=5000). 
+ Examples: + >>> pos_encoder = PositionalEncoding(d_model) + """ + + def __init__(self, dropout, dim, max_len=5000): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = paddle.zeros([max_len, dim]) + position = paddle.arange(0, max_len, dtype=paddle.float32).unsqueeze(1) + div_term = paddle.exp( + paddle.arange(0, dim, 2).astype('float32') * + (-math.log(10000.0) / dim)) + pe[:, 0::2] = paddle.sin(position * div_term) + pe[:, 1::2] = paddle.cos(position * div_term) + pe = paddle.unsqueeze(pe, 0) + pe = paddle.transpose(pe, [1, 0, 2]) + self.register_buffer('pe', pe) + + def forward(self, x): + """Inputs of forward function + Args: + x: the sequence fed to the positional encoder model (required). + Shape: + x: [sequence length, batch size, embed dim] + output: [sequence length, batch size, embed dim] + Examples: + >>> output = pos_encoder(x) + """ + x = x + self.pe[:paddle.shape(x)[0], :] + return self.dropout(x) + + +class PositionalEncoding_2d(nn.Layer): + """Inject some information about the relative or absolute position of the tokens + in the sequence. The positional encodings have the same dimension as + the embeddings, so that the two can be summed. Here, we use sine and cosine + functions of different frequencies. + .. math:: + \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model)) + \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model)) + \text{where pos is the word position and i is the embed idx) + Args: + d_model: the embed dim (required). + dropout: the dropout value (default=0.1). + max_len: the max. length of the incoming sequence (default=5000). + Examples: + >>> pos_encoder = PositionalEncoding(d_model) + """ + + def __init__(self, dropout, dim, max_len=5000): + super(PositionalEncoding_2d, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = paddle.zeros([max_len, dim]) + position = paddle.arange(0, max_len, dtype=paddle.float32).unsqueeze(1) + div_term = paddle.exp( + paddle.arange(0, dim, 2).astype('float32') * + (-math.log(10000.0) / dim)) + pe[:, 0::2] = paddle.sin(position * div_term) + pe[:, 1::2] = paddle.cos(position * div_term) + pe = paddle.transpose(paddle.unsqueeze(pe, 0), [1, 0, 2]) + self.register_buffer('pe', pe) + + self.avg_pool_1 = nn.AdaptiveAvgPool2D((1, 1)) + self.linear1 = nn.Linear(dim, dim) + self.linear1.weight.data.fill_(1.) + self.avg_pool_2 = nn.AdaptiveAvgPool2D((1, 1)) + self.linear2 = nn.Linear(dim, dim) + self.linear2.weight.data.fill_(1.) + + def forward(self, x): + """Inputs of forward function + Args: + x: the sequence fed to the positional encoder model (required). 
+ Shape: + x: [sequence length, batch size, embed dim] + output: [sequence length, batch size, embed dim] + Examples: + >>> output = pos_encoder(x) + """ + w_pe = self.pe[:paddle.shape(x)[-1], :] + w1 = self.linear1(self.avg_pool_1(x).squeeze()).unsqueeze(0) + w_pe = w_pe * w1 + w_pe = paddle.transpose(w_pe, [1, 2, 0]) + w_pe = paddle.unsqueeze(w_pe, 2) + + h_pe = self.pe[:paddle.shape(x).shape[-2], :] + w2 = self.linear2(self.avg_pool_2(x).squeeze()).unsqueeze(0) + h_pe = h_pe * w2 + h_pe = paddle.transpose(h_pe, [1, 2, 0]) + h_pe = paddle.unsqueeze(h_pe, 3) + + x = x + w_pe + h_pe + x = paddle.transpose( + paddle.reshape(x, + [x.shape[0], x.shape[1], x.shape[2] * x.shape[3]]), + [2, 0, 1]) + + return self.dropout(x) + + +class Embeddings(nn.Layer): + def __init__(self, d_model, vocab, padding_idx, scale_embedding): + super(Embeddings, self).__init__() + self.embedding = nn.Embedding(vocab, d_model, padding_idx=padding_idx) + w0 = np.random.normal(0.0, d_model**-0.5, + (vocab, d_model)).astype(np.float32) + self.embedding.weight.set_value(w0) + self.d_model = d_model + self.scale_embedding = scale_embedding + + def forward(self, x): + if self.scale_embedding: + x = self.embedding(x) + return x * math.sqrt(self.d_model) + return self.embedding(x) + + +class Beam(): + ''' Beam search ''' + + def __init__(self, size, device=False): + + self.size = size + self._done = False + # The score for each translation on the beam. + self.scores = paddle.zeros((size, ), dtype=paddle.float32) + self.all_scores = [] + # The backpointers at each time-step. + self.prev_ks = [] + # The outputs at each time-step. + self.next_ys = [paddle.full((size, ), 0, dtype=paddle.int64)] + self.next_ys[0][0] = 2 + + def get_current_state(self): + "Get the outputs for the current timestep." + return self.get_tentative_hypothesis() + + def get_current_origin(self): + "Get the backpointers for the current timestep." + return self.prev_ks[-1] + + @property + def done(self): + return self._done + + def advance(self, word_prob): + "Update beam status and check if finished or not." + num_words = word_prob.shape[1] + + # Sum the previous scores. + if len(self.prev_ks) > 0: + beam_lk = word_prob + self.scores.unsqueeze(1).expand_as(word_prob) + else: + beam_lk = word_prob[0] + + flat_beam_lk = beam_lk.reshape([-1]) + best_scores, best_scores_id = flat_beam_lk.topk(self.size, 0, True, + True) # 1st sort + self.all_scores.append(self.scores) + self.scores = best_scores + # bestScoresId is flattened as a (beam x word) array, + # so we need to calculate which word and beam each score came from + prev_k = best_scores_id // num_words + self.prev_ks.append(prev_k) + self.next_ys.append(best_scores_id - prev_k * num_words) + # End condition is when top-of-beam is EOS. + if self.next_ys[-1][0] == 3: + self._done = True + self.all_scores.append(self.scores) + + return self._done + + def sort_scores(self): + "Sort the scores." + return self.scores, paddle.to_tensor( + [i for i in range(int(self.scores.shape[0]))], dtype='int32') + + def get_the_best_score_and_idx(self): + "Get the score of the best in the beam." + scores, ids = self.sort_scores() + return scores[1], ids[1] + + def get_tentative_hypothesis(self): + "Get the decoded sequence for the current timestep." 
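+        # The beam stores, for every step, the emitted token ids (next_ys) and
+        # the parent slot each of them extended (prev_ks). A sequence is
+        # recovered by walking these backpointers from the last step back to
+        # the first (see get_hypothesis below) and prepending the start token,
+        # id 2 here, with id 3 treated as the end token in advance().
+        # For example, with next_ys = [[2, 2], [7, 9], [4, 1]] and
+        # prev_ks = [[0, 1], [1, 0]], slot 0 decodes to [2, 9, 4].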
+ if len(self.next_ys) == 1: + dec_seq = self.next_ys[0].unsqueeze(1) + else: + _, keys = self.sort_scores() + hyps = [self.get_hypothesis(k) for k in keys] + hyps = [[2] + h for h in hyps] + dec_seq = paddle.to_tensor(hyps, dtype='int64') + return dec_seq + + def get_hypothesis(self, k): + """ Walk back to construct the full hypothesis. """ + hyp = [] + for j in range(len(self.prev_ks) - 1, -1, -1): + hyp.append(self.next_ys[j + 1][k]) + k = self.prev_ks[j][k] + return list(map(lambda x: x.item(), hyp[::-1])) diff --git a/backend/ppocr/modeling/heads/rec_pren_head.py b/backend/ppocr/modeling/heads/rec_pren_head.py new file mode 100644 index 0000000..c9e4b3e --- /dev/null +++ b/backend/ppocr/modeling/heads/rec_pren_head.py @@ -0,0 +1,34 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import nn +from paddle.nn import functional as F + + +class PRENHead(nn.Layer): + def __init__(self, in_channels, out_channels, **kwargs): + super(PRENHead, self).__init__() + self.linear = nn.Linear(in_channels, out_channels) + + def forward(self, x, targets=None): + predicts = self.linear(x) + + if not self.training: + predicts = F.softmax(predicts, axis=2) + + return predicts diff --git a/backend/ppocr/modeling/heads/rec_sar_head.py b/backend/ppocr/modeling/heads/rec_sar_head.py new file mode 100644 index 0000000..0e6b344 --- /dev/null +++ b/backend/ppocr/modeling/heads/rec_sar_head.py @@ -0,0 +1,410 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/encoders/sar_encoder.py +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/decoders/sar_decoder.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F + + +class SAREncoder(nn.Layer): + """ + Args: + enc_bi_rnn (bool): If True, use bidirectional RNN in encoder. + enc_drop_rnn (float): Dropout probability of RNN layer in encoder. + enc_gru (bool): If True, use GRU, else LSTM in encoder. + d_model (int): Dim of channels from backbone. + d_enc (int): Dim of encoder RNN layer. 
+ mask (bool): If True, mask padding in RNN sequence. + """ + + def __init__(self, + enc_bi_rnn=False, + enc_drop_rnn=0.1, + enc_gru=False, + d_model=512, + d_enc=512, + mask=True, + **kwargs): + super().__init__() + assert isinstance(enc_bi_rnn, bool) + assert isinstance(enc_drop_rnn, (int, float)) + assert 0 <= enc_drop_rnn < 1.0 + assert isinstance(enc_gru, bool) + assert isinstance(d_model, int) + assert isinstance(d_enc, int) + assert isinstance(mask, bool) + + self.enc_bi_rnn = enc_bi_rnn + self.enc_drop_rnn = enc_drop_rnn + self.mask = mask + + # LSTM Encoder + if enc_bi_rnn: + direction = 'bidirectional' + else: + direction = 'forward' + kwargs = dict( + input_size=d_model, + hidden_size=d_enc, + num_layers=2, + time_major=False, + dropout=enc_drop_rnn, + direction=direction) + if enc_gru: + self.rnn_encoder = nn.GRU(**kwargs) + else: + self.rnn_encoder = nn.LSTM(**kwargs) + + # global feature transformation + encoder_rnn_out_size = d_enc * (int(enc_bi_rnn) + 1) + self.linear = nn.Linear(encoder_rnn_out_size, encoder_rnn_out_size) + + def forward(self, feat, img_metas=None): + if img_metas is not None: + assert len(img_metas[0]) == feat.shape[0] + + valid_ratios = None + if img_metas is not None and self.mask: + valid_ratios = img_metas[-1] + + h_feat = feat.shape[2] # bsz c h w + feat_v = F.max_pool2d( + feat, kernel_size=(h_feat, 1), stride=1, padding=0) + feat_v = feat_v.squeeze(2) # bsz * C * W + feat_v = paddle.transpose(feat_v, perm=[0, 2, 1]) # bsz * W * C + holistic_feat = self.rnn_encoder(feat_v)[0] # bsz * T * C + + if valid_ratios is not None: + valid_hf = [] + T = holistic_feat.shape[1] + for i in range(len(valid_ratios)): + valid_step = min(T, math.ceil(T * valid_ratios[i])) - 1 + valid_hf.append(holistic_feat[i, valid_step, :]) + valid_hf = paddle.stack(valid_hf, axis=0) + else: + valid_hf = holistic_feat[:, -1, :] # bsz * C + holistic_feat = self.linear(valid_hf) # bsz * C + + return holistic_feat + + +class BaseDecoder(nn.Layer): + def __init__(self, **kwargs): + super().__init__() + + def forward_train(self, feat, out_enc, targets, img_metas): + raise NotImplementedError + + def forward_test(self, feat, out_enc, img_metas): + raise NotImplementedError + + def forward(self, + feat, + out_enc, + label=None, + img_metas=None, + train_mode=True): + self.train_mode = train_mode + + if train_mode: + return self.forward_train(feat, out_enc, label, img_metas) + return self.forward_test(feat, out_enc, img_metas) + + +class ParallelSARDecoder(BaseDecoder): + """ + Args: + out_channels (int): Output class number. + enc_bi_rnn (bool): If True, use bidirectional RNN in encoder. + dec_bi_rnn (bool): If True, use bidirectional RNN in decoder. + dec_drop_rnn (float): Dropout of RNN layer in decoder. + dec_gru (bool): If True, use GRU, else LSTM in decoder. + d_model (int): Dim of channels from backbone. + d_enc (int): Dim of encoder RNN layer. + d_k (int): Dim of channels of attention module. + pred_dropout (float): Dropout probability of prediction layer. + max_seq_len (int): Maximum sequence length for decoding. + mask (bool): If True, mask padding in feature map. + start_idx (int): Index of start token. + padding_idx (int): Index of padding token. + pred_concat (bool): If True, concat glimpse feature from + attention with holistic feature and hidden state. 
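+        Note: out_channels counts the character set plus the unknown, start and
+            padding symbols, so the start token index is out_channels - 2 and
+            the padding index is out_channels - 1 (see __init__ below).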
+ """ + + def __init__( + self, + out_channels, # 90 + unknown + start + padding + enc_bi_rnn=False, + dec_bi_rnn=False, + dec_drop_rnn=0.0, + dec_gru=False, + d_model=512, + d_enc=512, + d_k=64, + pred_dropout=0.1, + max_text_length=30, + mask=True, + pred_concat=True, + **kwargs): + super().__init__() + + self.num_classes = out_channels + self.enc_bi_rnn = enc_bi_rnn + self.d_k = d_k + self.start_idx = out_channels - 2 + self.padding_idx = out_channels - 1 + self.max_seq_len = max_text_length + self.mask = mask + self.pred_concat = pred_concat + + encoder_rnn_out_size = d_enc * (int(enc_bi_rnn) + 1) + decoder_rnn_out_size = encoder_rnn_out_size * (int(dec_bi_rnn) + 1) + + # 2D attention layer + self.conv1x1_1 = nn.Linear(decoder_rnn_out_size, d_k) + self.conv3x3_1 = nn.Conv2D( + d_model, d_k, kernel_size=3, stride=1, padding=1) + self.conv1x1_2 = nn.Linear(d_k, 1) + + # Decoder RNN layer + if dec_bi_rnn: + direction = 'bidirectional' + else: + direction = 'forward' + + kwargs = dict( + input_size=encoder_rnn_out_size, + hidden_size=encoder_rnn_out_size, + num_layers=2, + time_major=False, + dropout=dec_drop_rnn, + direction=direction) + if dec_gru: + self.rnn_decoder = nn.GRU(**kwargs) + else: + self.rnn_decoder = nn.LSTM(**kwargs) + + # Decoder input embedding + self.embedding = nn.Embedding( + self.num_classes, + encoder_rnn_out_size, + padding_idx=self.padding_idx) + + # Prediction layer + self.pred_dropout = nn.Dropout(pred_dropout) + pred_num_classes = self.num_classes - 1 + if pred_concat: + fc_in_channel = decoder_rnn_out_size + d_model + encoder_rnn_out_size + else: + fc_in_channel = d_model + self.prediction = nn.Linear(fc_in_channel, pred_num_classes) + + def _2d_attention(self, + decoder_input, + feat, + holistic_feat, + valid_ratios=None): + + y = self.rnn_decoder(decoder_input)[0] + # y: bsz * (seq_len + 1) * hidden_size + + attn_query = self.conv1x1_1(y) # bsz * (seq_len + 1) * attn_size + bsz, seq_len, attn_size = attn_query.shape + attn_query = paddle.unsqueeze(attn_query, axis=[3, 4]) + # (bsz, seq_len + 1, attn_size, 1, 1) + + attn_key = self.conv3x3_1(feat) + # bsz * attn_size * h * w + attn_key = attn_key.unsqueeze(1) + # bsz * 1 * attn_size * h * w + + attn_weight = paddle.tanh(paddle.add(attn_key, attn_query)) + + # bsz * (seq_len + 1) * attn_size * h * w + attn_weight = paddle.transpose(attn_weight, perm=[0, 1, 3, 4, 2]) + # bsz * (seq_len + 1) * h * w * attn_size + attn_weight = self.conv1x1_2(attn_weight) + # bsz * (seq_len + 1) * h * w * 1 + bsz, T, h, w, c = attn_weight.shape + assert c == 1 + + if valid_ratios is not None: + # cal mask of attention weight + for i in range(len(valid_ratios)): + valid_width = min(w, math.ceil(w * valid_ratios[i])) + if valid_width < w: + attn_weight[i, :, :, valid_width:, :] = float('-inf') + + attn_weight = paddle.reshape(attn_weight, [bsz, T, -1]) + attn_weight = F.softmax(attn_weight, axis=-1) + + attn_weight = paddle.reshape(attn_weight, [bsz, T, h, w, c]) + attn_weight = paddle.transpose(attn_weight, perm=[0, 1, 4, 2, 3]) + # attn_weight: bsz * T * c * h * w + # feat: bsz * c * h * w + attn_feat = paddle.sum(paddle.multiply(feat.unsqueeze(1), attn_weight), + (3, 4), + keepdim=False) + # bsz * (seq_len + 1) * C + + # Linear transformation + if self.pred_concat: + hf_c = holistic_feat.shape[-1] + holistic_feat = paddle.expand( + holistic_feat, shape=[bsz, seq_len, hf_c]) + y = self.prediction(paddle.concat((y, attn_feat, holistic_feat), 2)) + else: + y = self.prediction(attn_feat) + # bsz * (seq_len + 1) * num_classes + if 
self.train_mode: + y = self.pred_dropout(y) + + return y + + def forward_train(self, feat, out_enc, label, img_metas): + ''' + img_metas: [label, valid_ratio] + ''' + if img_metas is not None: + assert len(img_metas[0]) == feat.shape[0] + + valid_ratios = None + if img_metas is not None and self.mask: + valid_ratios = img_metas[-1] + + lab_embedding = self.embedding(label) + # bsz * seq_len * emb_dim + out_enc = out_enc.unsqueeze(1) + # bsz * 1 * emb_dim + in_dec = paddle.concat((out_enc, lab_embedding), axis=1) + # bsz * (seq_len + 1) * C + out_dec = self._2d_attention( + in_dec, feat, out_enc, valid_ratios=valid_ratios) + # bsz * (seq_len + 1) * num_classes + + return out_dec[:, 1:, :] # bsz * seq_len * num_classes + + def forward_test(self, feat, out_enc, img_metas): + if img_metas is not None: + assert len(img_metas[0]) == feat.shape[0] + + valid_ratios = None + if img_metas is not None and self.mask: + valid_ratios = img_metas[-1] + + seq_len = self.max_seq_len + bsz = feat.shape[0] + start_token = paddle.full( + (bsz, ), fill_value=self.start_idx, dtype='int64') + # bsz + start_token = self.embedding(start_token) + # bsz * emb_dim + emb_dim = start_token.shape[1] + start_token = start_token.unsqueeze(1) + start_token = paddle.expand(start_token, shape=[bsz, seq_len, emb_dim]) + # bsz * seq_len * emb_dim + out_enc = out_enc.unsqueeze(1) + # bsz * 1 * emb_dim + decoder_input = paddle.concat((out_enc, start_token), axis=1) + # bsz * (seq_len + 1) * emb_dim + + outputs = [] + for i in range(1, seq_len + 1): + decoder_output = self._2d_attention( + decoder_input, feat, out_enc, valid_ratios=valid_ratios) + char_output = decoder_output[:, i, :] # bsz * num_classes + char_output = F.softmax(char_output, -1) + outputs.append(char_output) + max_idx = paddle.argmax(char_output, axis=1, keepdim=False) + char_embedding = self.embedding(max_idx) # bsz * emb_dim + if i < seq_len: + decoder_input[:, i + 1, :] = char_embedding + + outputs = paddle.stack(outputs, 1) # bsz * seq_len * num_classes + + return outputs + + +class SARHead(nn.Layer): + def __init__(self, + in_channels, + out_channels, + enc_dim=512, + max_text_length=30, + enc_bi_rnn=False, + enc_drop_rnn=0.1, + enc_gru=False, + dec_bi_rnn=False, + dec_drop_rnn=0.0, + dec_gru=False, + d_k=512, + pred_dropout=0.1, + pred_concat=True, + **kwargs): + super(SARHead, self).__init__() + + # encoder module + self.encoder = SAREncoder( + enc_bi_rnn=enc_bi_rnn, + enc_drop_rnn=enc_drop_rnn, + enc_gru=enc_gru, + d_model=in_channels, + d_enc=enc_dim) + + # decoder module + self.decoder = ParallelSARDecoder( + out_channels=out_channels, + enc_bi_rnn=enc_bi_rnn, + dec_bi_rnn=dec_bi_rnn, + dec_drop_rnn=dec_drop_rnn, + dec_gru=dec_gru, + d_model=in_channels, + d_enc=enc_dim, + d_k=d_k, + pred_dropout=pred_dropout, + max_text_length=max_text_length, + pred_concat=pred_concat) + + def forward(self, feat, targets=None): + ''' + img_metas: [label, valid_ratio] + ''' + holistic_feat = self.encoder(feat, targets) # bsz c + + if self.training: + label = targets[0] # label + label = paddle.to_tensor(label, dtype='int64') + final_out = self.decoder( + feat, holistic_feat, label, img_metas=targets) + else: + final_out = self.decoder( + feat, + holistic_feat, + label=None, + img_metas=targets, + train_mode=False) + # (bsz, seq_len, num_classes) + + return final_out diff --git a/backend/ppocr/modeling/heads/rec_srn_head.py b/backend/ppocr/modeling/heads/rec_srn_head.py new file mode 100644 index 0000000..8d59e47 --- /dev/null +++ 
b/backend/ppocr/modeling/heads/rec_srn_head.py @@ -0,0 +1,280 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn, ParamAttr +from paddle.nn import functional as F +import paddle.fluid as fluid +import numpy as np +from .self_attention import WrapEncoderForFeature +from .self_attention import WrapEncoder +from paddle.static import Program +from ppocr.modeling.backbones.rec_resnet_fpn import ResNetFPN +import paddle.fluid.framework as framework + +from collections import OrderedDict +gradient_clip = 10 + + +class PVAM(nn.Layer): + def __init__(self, in_channels, char_num, max_text_length, num_heads, + num_encoder_tus, hidden_dims): + super(PVAM, self).__init__() + self.char_num = char_num + self.max_length = max_text_length + self.num_heads = num_heads + self.num_encoder_TUs = num_encoder_tus + self.hidden_dims = hidden_dims + # Transformer encoder + t = 256 + c = 512 + self.wrap_encoder_for_feature = WrapEncoderForFeature( + src_vocab_size=1, + max_length=t, + n_layer=self.num_encoder_TUs, + n_head=self.num_heads, + d_key=int(self.hidden_dims / self.num_heads), + d_value=int(self.hidden_dims / self.num_heads), + d_model=self.hidden_dims, + d_inner_hid=self.hidden_dims, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + preprocess_cmd="n", + postprocess_cmd="da", + weight_sharing=True) + + # PVAM + self.flatten0 = paddle.nn.Flatten(start_axis=0, stop_axis=1) + self.fc0 = paddle.nn.Linear( + in_features=in_channels, + out_features=in_channels, ) + self.emb = paddle.nn.Embedding( + num_embeddings=self.max_length, embedding_dim=in_channels) + self.flatten1 = paddle.nn.Flatten(start_axis=0, stop_axis=2) + self.fc1 = paddle.nn.Linear( + in_features=in_channels, out_features=1, bias_attr=False) + + def forward(self, inputs, encoder_word_pos, gsrm_word_pos): + b, c, h, w = inputs.shape + conv_features = paddle.reshape(inputs, shape=[-1, c, h * w]) + conv_features = paddle.transpose(conv_features, perm=[0, 2, 1]) + # transformer encoder + b, t, c = conv_features.shape + + enc_inputs = [conv_features, encoder_word_pos, None] + word_features = self.wrap_encoder_for_feature(enc_inputs) + + # pvam + b, t, c = word_features.shape + word_features = self.fc0(word_features) + word_features_ = paddle.reshape(word_features, [-1, 1, t, c]) + word_features_ = paddle.tile(word_features_, [1, self.max_length, 1, 1]) + word_pos_feature = self.emb(gsrm_word_pos) + word_pos_feature_ = paddle.reshape(word_pos_feature, + [-1, self.max_length, 1, c]) + word_pos_feature_ = paddle.tile(word_pos_feature_, [1, 1, t, 1]) + y = word_pos_feature_ + word_features_ + y = F.tanh(y) + attention_weight = self.fc1(y) + attention_weight = paddle.reshape( + attention_weight, shape=[-1, self.max_length, t]) + attention_weight = F.softmax(attention_weight, axis=-1) + pvam_features = 
paddle.matmul(attention_weight, + word_features) #[b, max_length, c] + return pvam_features + + +class GSRM(nn.Layer): + def __init__(self, in_channels, char_num, max_text_length, num_heads, + num_encoder_tus, num_decoder_tus, hidden_dims): + super(GSRM, self).__init__() + self.char_num = char_num + self.max_length = max_text_length + self.num_heads = num_heads + self.num_encoder_TUs = num_encoder_tus + self.num_decoder_TUs = num_decoder_tus + self.hidden_dims = hidden_dims + + self.fc0 = paddle.nn.Linear( + in_features=in_channels, out_features=self.char_num) + self.wrap_encoder0 = WrapEncoder( + src_vocab_size=self.char_num + 1, + max_length=self.max_length, + n_layer=self.num_decoder_TUs, + n_head=self.num_heads, + d_key=int(self.hidden_dims / self.num_heads), + d_value=int(self.hidden_dims / self.num_heads), + d_model=self.hidden_dims, + d_inner_hid=self.hidden_dims, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + preprocess_cmd="n", + postprocess_cmd="da", + weight_sharing=True) + + self.wrap_encoder1 = WrapEncoder( + src_vocab_size=self.char_num + 1, + max_length=self.max_length, + n_layer=self.num_decoder_TUs, + n_head=self.num_heads, + d_key=int(self.hidden_dims / self.num_heads), + d_value=int(self.hidden_dims / self.num_heads), + d_model=self.hidden_dims, + d_inner_hid=self.hidden_dims, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + preprocess_cmd="n", + postprocess_cmd="da", + weight_sharing=True) + + self.mul = lambda x: paddle.matmul(x=x, + y=self.wrap_encoder0.prepare_decoder.emb0.weight, + transpose_y=True) + + def forward(self, inputs, gsrm_word_pos, gsrm_slf_attn_bias1, + gsrm_slf_attn_bias2): + # ===== GSRM Visual-to-semantic embedding block ===== + b, t, c = inputs.shape + pvam_features = paddle.reshape(inputs, [-1, c]) + word_out = self.fc0(pvam_features) + word_ids = paddle.argmax(F.softmax(word_out), axis=1) + word_ids = paddle.reshape(x=word_ids, shape=[-1, t, 1]) + + #===== GSRM Semantic reasoning block ===== + """ + This module is achieved through bi-transformers, + ngram_feature1 is the froward one, ngram_fetaure2 is the backward one + """ + pad_idx = self.char_num + + word1 = paddle.cast(word_ids, "float32") + word1 = F.pad(word1, [1, 0], value=1.0 * pad_idx, data_format="NLC") + word1 = paddle.cast(word1, "int64") + word1 = word1[:, :-1, :] + word2 = word_ids + + enc_inputs_1 = [word1, gsrm_word_pos, gsrm_slf_attn_bias1] + enc_inputs_2 = [word2, gsrm_word_pos, gsrm_slf_attn_bias2] + + gsrm_feature1 = self.wrap_encoder0(enc_inputs_1) + gsrm_feature2 = self.wrap_encoder1(enc_inputs_2) + + gsrm_feature2 = F.pad(gsrm_feature2, [0, 1], + value=0., + data_format="NLC") + gsrm_feature2 = gsrm_feature2[:, 1:, ] + gsrm_features = gsrm_feature1 + gsrm_feature2 + + gsrm_out = self.mul(gsrm_features) + + b, t, c = gsrm_out.shape + gsrm_out = paddle.reshape(gsrm_out, [-1, c]) + + return gsrm_features, word_out, gsrm_out + + +class VSFD(nn.Layer): + def __init__(self, in_channels=512, pvam_ch=512, char_num=38): + super(VSFD, self).__init__() + self.char_num = char_num + self.fc0 = paddle.nn.Linear( + in_features=in_channels * 2, out_features=pvam_ch) + self.fc1 = paddle.nn.Linear( + in_features=pvam_ch, out_features=self.char_num) + + def forward(self, pvam_feature, gsrm_feature): + b, t, c1 = pvam_feature.shape + b, t, c2 = gsrm_feature.shape + combine_feature_ = paddle.concat([pvam_feature, gsrm_feature], axis=2) + img_comb_feature_ = paddle.reshape( + combine_feature_, shape=[-1, c1 + c2]) + img_comb_feature_map = 
self.fc0(img_comb_feature_) + img_comb_feature_map = F.sigmoid(img_comb_feature_map) + img_comb_feature_map = paddle.reshape( + img_comb_feature_map, shape=[-1, t, c1]) + combine_feature = img_comb_feature_map * pvam_feature + ( + 1.0 - img_comb_feature_map) * gsrm_feature + img_comb_feature = paddle.reshape(combine_feature, shape=[-1, c1]) + + out = self.fc1(img_comb_feature) + return out + + +class SRNHead(nn.Layer): + def __init__(self, in_channels, out_channels, max_text_length, num_heads, + num_encoder_TUs, num_decoder_TUs, hidden_dims, **kwargs): + super(SRNHead, self).__init__() + self.char_num = out_channels + self.max_length = max_text_length + self.num_heads = num_heads + self.num_encoder_TUs = num_encoder_TUs + self.num_decoder_TUs = num_decoder_TUs + self.hidden_dims = hidden_dims + + self.pvam = PVAM( + in_channels=in_channels, + char_num=self.char_num, + max_text_length=self.max_length, + num_heads=self.num_heads, + num_encoder_tus=self.num_encoder_TUs, + hidden_dims=self.hidden_dims) + + self.gsrm = GSRM( + in_channels=in_channels, + char_num=self.char_num, + max_text_length=self.max_length, + num_heads=self.num_heads, + num_encoder_tus=self.num_encoder_TUs, + num_decoder_tus=self.num_decoder_TUs, + hidden_dims=self.hidden_dims) + self.vsfd = VSFD(in_channels=in_channels, char_num=self.char_num) + + self.gsrm.wrap_encoder1.prepare_decoder.emb0 = self.gsrm.wrap_encoder0.prepare_decoder.emb0 + + def forward(self, inputs, targets=None): + others = targets[-4:] + encoder_word_pos = others[0] + gsrm_word_pos = others[1] + gsrm_slf_attn_bias1 = others[2] + gsrm_slf_attn_bias2 = others[3] + + pvam_feature = self.pvam(inputs, encoder_word_pos, gsrm_word_pos) + + gsrm_feature, word_out, gsrm_out = self.gsrm( + pvam_feature, gsrm_word_pos, gsrm_slf_attn_bias1, + gsrm_slf_attn_bias2) + + final_out = self.vsfd(pvam_feature, gsrm_feature) + if not self.training: + final_out = F.softmax(final_out, axis=1) + + _, decoded_out = paddle.topk(final_out, k=1) + + predicts = OrderedDict([ + ('predict', final_out), + ('pvam_feature', pvam_feature), + ('decoded_out', decoded_out), + ('word_out', word_out), + ('gsrm_out', gsrm_out), + ]) + + return predicts diff --git a/backend/ppocr/modeling/heads/self_attention.py b/backend/ppocr/modeling/heads/self_attention.py new file mode 100644 index 0000000..6c27fdb --- /dev/null +++ b/backend/ppocr/modeling/heads/self_attention.py @@ -0,0 +1,406 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
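+
+# This module provides the Transformer building blocks used by the SRN head in
+# rec_srn_head.py: WrapEncoderForFeature / WrapEncoder combine an embedding
+# front end (PrepareEncoder / PrepareDecoder) with a stack of EncoderLayer
+# blocks (MultiHeadAttention + FFN). Every sub-layer is wrapped by
+# PrePostProcessLayer, whose process_cmd string is applied one character at a
+# time: "n" = LayerNorm, "a" = add residual, "d" = dropout. With the settings
+# used in this file (preprocess_cmd="n", postprocess_cmd="da") each sub-layer
+# therefore computes: out = residual + dropout(sublayer(LayerNorm(residual))).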
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +import paddle +from paddle import ParamAttr, nn +from paddle import nn, ParamAttr +from paddle.nn import functional as F +import paddle.fluid as fluid +import numpy as np +gradient_clip = 10 + + +class WrapEncoderForFeature(nn.Layer): + def __init__(self, + src_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + bos_idx=0): + super(WrapEncoderForFeature, self).__init__() + + self.prepare_encoder = PrepareEncoder( + src_vocab_size, + d_model, + max_length, + prepostprocess_dropout, + bos_idx=bos_idx, + word_emb_param_name="src_word_emb_table") + self.encoder = Encoder(n_layer, n_head, d_key, d_value, d_model, + d_inner_hid, prepostprocess_dropout, + attention_dropout, relu_dropout, preprocess_cmd, + postprocess_cmd) + + def forward(self, enc_inputs): + conv_features, src_pos, src_slf_attn_bias = enc_inputs + enc_input = self.prepare_encoder(conv_features, src_pos) + enc_output = self.encoder(enc_input, src_slf_attn_bias) + return enc_output + + +class WrapEncoder(nn.Layer): + """ + embedder + encoder + """ + + def __init__(self, + src_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + bos_idx=0): + super(WrapEncoder, self).__init__() + + self.prepare_decoder = PrepareDecoder( + src_vocab_size, + d_model, + max_length, + prepostprocess_dropout, + bos_idx=bos_idx) + self.encoder = Encoder(n_layer, n_head, d_key, d_value, d_model, + d_inner_hid, prepostprocess_dropout, + attention_dropout, relu_dropout, preprocess_cmd, + postprocess_cmd) + + def forward(self, enc_inputs): + src_word, src_pos, src_slf_attn_bias = enc_inputs + enc_input = self.prepare_decoder(src_word, src_pos) + enc_output = self.encoder(enc_input, src_slf_attn_bias) + return enc_output + + +class Encoder(nn.Layer): + """ + encoder + """ + + def __init__(self, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd="n", + postprocess_cmd="da"): + + super(Encoder, self).__init__() + + self.encoder_layers = list() + for i in range(n_layer): + self.encoder_layers.append( + self.add_sublayer( + "layer_%d" % i, + EncoderLayer(n_head, d_key, d_value, d_model, d_inner_hid, + prepostprocess_dropout, attention_dropout, + relu_dropout, preprocess_cmd, + postprocess_cmd))) + self.processer = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + + def forward(self, enc_input, attn_bias): + for encoder_layer in self.encoder_layers: + enc_output = encoder_layer(enc_input, attn_bias) + enc_input = enc_output + enc_output = self.processer(enc_output) + return enc_output + + +class EncoderLayer(nn.Layer): + """ + EncoderLayer + """ + + def __init__(self, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd="n", + postprocess_cmd="da"): + + super(EncoderLayer, self).__init__() + self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head, + attention_dropout) + self.postprocesser1 = 
PrePostProcessLayer(postprocess_cmd, d_model, + prepostprocess_dropout) + + self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + self.ffn = FFN(d_inner_hid, d_model, relu_dropout) + self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model, + prepostprocess_dropout) + + def forward(self, enc_input, attn_bias): + attn_output = self.self_attn( + self.preprocesser1(enc_input), None, None, attn_bias) + attn_output = self.postprocesser1(attn_output, enc_input) + ffn_output = self.ffn(self.preprocesser2(attn_output)) + ffn_output = self.postprocesser2(ffn_output, attn_output) + return ffn_output + + +class MultiHeadAttention(nn.Layer): + """ + Multi-Head Attention + """ + + def __init__(self, d_key, d_value, d_model, n_head=1, dropout_rate=0.): + super(MultiHeadAttention, self).__init__() + self.n_head = n_head + self.d_key = d_key + self.d_value = d_value + self.d_model = d_model + self.dropout_rate = dropout_rate + self.q_fc = paddle.nn.Linear( + in_features=d_model, out_features=d_key * n_head, bias_attr=False) + self.k_fc = paddle.nn.Linear( + in_features=d_model, out_features=d_key * n_head, bias_attr=False) + self.v_fc = paddle.nn.Linear( + in_features=d_model, out_features=d_value * n_head, bias_attr=False) + self.proj_fc = paddle.nn.Linear( + in_features=d_value * n_head, out_features=d_model, bias_attr=False) + + def _prepare_qkv(self, queries, keys, values, cache=None): + if keys is None: # self-attention + keys, values = queries, queries + static_kv = False + else: # cross-attention + static_kv = True + + q = self.q_fc(queries) + q = paddle.reshape(x=q, shape=[0, 0, self.n_head, self.d_key]) + q = paddle.transpose(x=q, perm=[0, 2, 1, 3]) + + if cache is not None and static_kv and "static_k" in cache: + # for encoder-decoder attention in inference and has cached + k = cache["static_k"] + v = cache["static_v"] + else: + k = self.k_fc(keys) + v = self.v_fc(values) + k = paddle.reshape(x=k, shape=[0, 0, self.n_head, self.d_key]) + k = paddle.transpose(x=k, perm=[0, 2, 1, 3]) + v = paddle.reshape(x=v, shape=[0, 0, self.n_head, self.d_value]) + v = paddle.transpose(x=v, perm=[0, 2, 1, 3]) + + if cache is not None: + if static_kv and not "static_k" in cache: + # for encoder-decoder attention in inference and has not cached + cache["static_k"], cache["static_v"] = k, v + elif not static_kv: + # for decoder self-attention in inference + cache_k, cache_v = cache["k"], cache["v"] + k = paddle.concat([cache_k, k], axis=2) + v = paddle.concat([cache_v, v], axis=2) + cache["k"], cache["v"] = k, v + + return q, k, v + + def forward(self, queries, keys, values, attn_bias, cache=None): + # compute q ,k ,v + keys = queries if keys is None else keys + values = keys if values is None else values + q, k, v = self._prepare_qkv(queries, keys, values, cache) + + # scale dot product attention + product = paddle.matmul(x=q, y=k, transpose_y=True) + product = product * self.d_model**-0.5 + if attn_bias is not None: + product += attn_bias + weights = F.softmax(product) + if self.dropout_rate: + weights = F.dropout( + weights, p=self.dropout_rate, mode="downscale_in_infer") + out = paddle.matmul(weights, v) + + # combine heads + out = paddle.transpose(out, perm=[0, 2, 1, 3]) + out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + + # project to output + out = self.proj_fc(out) + + return out + + +class PrePostProcessLayer(nn.Layer): + """ + PrePostProcessLayer + """ + + def __init__(self, process_cmd, d_model, dropout_rate): + 
super(PrePostProcessLayer, self).__init__() + self.process_cmd = process_cmd + self.functors = [] + for cmd in self.process_cmd: + if cmd == "a": # add residual connection + self.functors.append(lambda x, y: x + y if y is not None else x) + elif cmd == "n": # add layer normalization + self.functors.append( + self.add_sublayer( + "layer_norm_%d" % len(self.sublayers()), + paddle.nn.LayerNorm( + normalized_shape=d_model, + weight_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(1.)), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(0.))))) + elif cmd == "d": # add dropout + self.functors.append(lambda x: F.dropout( + x, p=dropout_rate, mode="downscale_in_infer") + if dropout_rate else x) + + def forward(self, x, residual=None): + for i, cmd in enumerate(self.process_cmd): + if cmd == "a": + x = self.functors[i](x, residual) + else: + x = self.functors[i](x) + return x + + +class PrepareEncoder(nn.Layer): + def __init__(self, + src_vocab_size, + src_emb_dim, + src_max_len, + dropout_rate=0, + bos_idx=0, + word_emb_param_name=None, + pos_enc_param_name=None): + super(PrepareEncoder, self).__init__() + self.src_emb_dim = src_emb_dim + self.src_max_len = src_max_len + self.emb = paddle.nn.Embedding( + num_embeddings=self.src_max_len, embedding_dim=self.src_emb_dim) + self.dropout_rate = dropout_rate + + def forward(self, src_word, src_pos): + src_word_emb = src_word + src_word_emb = fluid.layers.cast(src_word_emb, 'float32') + src_word_emb = paddle.scale(x=src_word_emb, scale=self.src_emb_dim**0.5) + src_pos = paddle.squeeze(src_pos, axis=-1) + src_pos_enc = self.emb(src_pos) + src_pos_enc.stop_gradient = True + enc_input = src_word_emb + src_pos_enc + if self.dropout_rate: + out = F.dropout( + x=enc_input, p=self.dropout_rate, mode="downscale_in_infer") + else: + out = enc_input + return out + + +class PrepareDecoder(nn.Layer): + def __init__(self, + src_vocab_size, + src_emb_dim, + src_max_len, + dropout_rate=0, + bos_idx=0, + word_emb_param_name=None, + pos_enc_param_name=None): + super(PrepareDecoder, self).__init__() + self.src_emb_dim = src_emb_dim + """ + self.emb0 = Embedding(num_embeddings=src_vocab_size, + embedding_dim=src_emb_dim) + """ + self.emb0 = paddle.nn.Embedding( + num_embeddings=src_vocab_size, + embedding_dim=self.src_emb_dim, + padding_idx=bos_idx, + weight_attr=paddle.ParamAttr( + name=word_emb_param_name, + initializer=nn.initializer.Normal(0., src_emb_dim**-0.5))) + self.emb1 = paddle.nn.Embedding( + num_embeddings=src_max_len, + embedding_dim=self.src_emb_dim, + weight_attr=paddle.ParamAttr(name=pos_enc_param_name)) + self.dropout_rate = dropout_rate + + def forward(self, src_word, src_pos): + src_word = fluid.layers.cast(src_word, 'int64') + src_word = paddle.squeeze(src_word, axis=-1) + src_word_emb = self.emb0(src_word) + src_word_emb = paddle.scale(x=src_word_emb, scale=self.src_emb_dim**0.5) + src_pos = paddle.squeeze(src_pos, axis=-1) + src_pos_enc = self.emb1(src_pos) + src_pos_enc.stop_gradient = True + enc_input = src_word_emb + src_pos_enc + if self.dropout_rate: + out = F.dropout( + x=enc_input, p=self.dropout_rate, mode="downscale_in_infer") + else: + out = enc_input + return out + + +class FFN(nn.Layer): + """ + Feed-Forward Network + """ + + def __init__(self, d_inner_hid, d_model, dropout_rate): + super(FFN, self).__init__() + self.dropout_rate = dropout_rate + self.fc1 = paddle.nn.Linear( + in_features=d_model, out_features=d_inner_hid) + self.fc2 = paddle.nn.Linear( + in_features=d_inner_hid, out_features=d_model) + + 
def forward(self, x): + hidden = self.fc1(x) + hidden = F.relu(hidden) + if self.dropout_rate: + hidden = F.dropout( + hidden, p=self.dropout_rate, mode="downscale_in_infer") + out = self.fc2(hidden) + return out diff --git a/backend/ppocr/modeling/heads/table_att_head.py b/backend/ppocr/modeling/heads/table_att_head.py new file mode 100644 index 0000000..e354f40 --- /dev/null +++ b/backend/ppocr/modeling/heads/table_att_head.py @@ -0,0 +1,246 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + + +class TableAttentionHead(nn.Layer): + def __init__(self, + in_channels, + hidden_size, + loc_type, + in_max_len=488, + max_text_length=100, + max_elem_length=800, + max_cell_num=500, + **kwargs): + super(TableAttentionHead, self).__init__() + self.input_size = in_channels[-1] + self.hidden_size = hidden_size + self.elem_num = 30 + self.max_text_length = max_text_length + self.max_elem_length = max_elem_length + self.max_cell_num = max_cell_num + + self.structure_attention_cell = AttentionGRUCell( + self.input_size, hidden_size, self.elem_num, use_gru=False) + self.structure_generator = nn.Linear(hidden_size, self.elem_num) + self.loc_type = loc_type + self.in_max_len = in_max_len + + if self.loc_type == 1: + self.loc_generator = nn.Linear(hidden_size, 4) + else: + if self.in_max_len == 640: + self.loc_fea_trans = nn.Linear(400, self.max_elem_length + 1) + elif self.in_max_len == 800: + self.loc_fea_trans = nn.Linear(625, self.max_elem_length + 1) + else: + self.loc_fea_trans = nn.Linear(256, self.max_elem_length + 1) + self.loc_generator = nn.Linear(self.input_size + hidden_size, 4) + + def _char_to_onehot(self, input_char, onehot_dim): + input_ont_hot = F.one_hot(input_char, onehot_dim) + return input_ont_hot + + def forward(self, inputs, targets=None): + # if and else branch are both needed when you want to assign a variable + # if you modify the var in just one branch, then the modification will not work. 
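+        # Decoding below is autoregressive over structure tokens: in training
+        # mode the ground-truth structure (targets[0]) is fed step by step
+        # (teacher forcing), while at inference the loop starts from a zero
+        # token and feeds back the argmax of each step's prediction, running
+        # for max_elem_length + 1 steps in both cases.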
+ fea = inputs[-1] + if len(fea.shape) == 3: + pass + else: + last_shape = int(np.prod(fea.shape[2:])) # gry added + fea = paddle.reshape(fea, [fea.shape[0], fea.shape[1], last_shape]) + fea = fea.transpose([0, 2, 1]) # (NTC)(batch, width, channels) + batch_size = fea.shape[0] + + hidden = paddle.zeros((batch_size, self.hidden_size)) + output_hiddens = [] + if self.training and targets is not None: + structure = targets[0] + for i in range(self.max_elem_length + 1): + elem_onehots = self._char_to_onehot( + structure[:, i], onehot_dim=self.elem_num) + (outputs, hidden), alpha = self.structure_attention_cell( + hidden, fea, elem_onehots) + output_hiddens.append(paddle.unsqueeze(outputs, axis=1)) + output = paddle.concat(output_hiddens, axis=1) + structure_probs = self.structure_generator(output) + if self.loc_type == 1: + loc_preds = self.loc_generator(output) + loc_preds = F.sigmoid(loc_preds) + else: + loc_fea = fea.transpose([0, 2, 1]) + loc_fea = self.loc_fea_trans(loc_fea) + loc_fea = loc_fea.transpose([0, 2, 1]) + loc_concat = paddle.concat([output, loc_fea], axis=2) + loc_preds = self.loc_generator(loc_concat) + loc_preds = F.sigmoid(loc_preds) + else: + temp_elem = paddle.zeros(shape=[batch_size], dtype="int32") + structure_probs = None + loc_preds = None + elem_onehots = None + outputs = None + alpha = None + max_elem_length = paddle.to_tensor(self.max_elem_length) + i = 0 + while i < max_elem_length + 1: + elem_onehots = self._char_to_onehot( + temp_elem, onehot_dim=self.elem_num) + (outputs, hidden), alpha = self.structure_attention_cell( + hidden, fea, elem_onehots) + output_hiddens.append(paddle.unsqueeze(outputs, axis=1)) + structure_probs_step = self.structure_generator(outputs) + temp_elem = structure_probs_step.argmax(axis=1, dtype="int32") + i += 1 + + output = paddle.concat(output_hiddens, axis=1) + structure_probs = self.structure_generator(output) + structure_probs = F.softmax(structure_probs) + if self.loc_type == 1: + loc_preds = self.loc_generator(output) + loc_preds = F.sigmoid(loc_preds) + else: + loc_fea = fea.transpose([0, 2, 1]) + loc_fea = self.loc_fea_trans(loc_fea) + loc_fea = loc_fea.transpose([0, 2, 1]) + loc_concat = paddle.concat([output, loc_fea], axis=2) + loc_preds = self.loc_generator(loc_concat) + loc_preds = F.sigmoid(loc_preds) + return {'structure_probs': structure_probs, 'loc_preds': loc_preds} + + +class AttentionGRUCell(nn.Layer): + def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False): + super(AttentionGRUCell, self).__init__() + self.i2h = nn.Linear(input_size, hidden_size, bias_attr=False) + self.h2h = nn.Linear(hidden_size, hidden_size) + self.score = nn.Linear(hidden_size, 1, bias_attr=False) + self.rnn = nn.GRUCell( + input_size=input_size + num_embeddings, hidden_size=hidden_size) + self.hidden_size = hidden_size + + def forward(self, prev_hidden, batch_H, char_onehots): + batch_H_proj = self.i2h(batch_H) + prev_hidden_proj = paddle.unsqueeze(self.h2h(prev_hidden), axis=1) + res = paddle.add(batch_H_proj, prev_hidden_proj) + res = paddle.tanh(res) + e = self.score(res) + alpha = F.softmax(e, axis=1) + alpha = paddle.transpose(alpha, [0, 2, 1]) + context = paddle.squeeze(paddle.mm(alpha, batch_H), axis=1) + concat_context = paddle.concat([context, char_onehots], 1) + cur_hidden = self.rnn(concat_context, prev_hidden) + return cur_hidden, alpha + + +class AttentionLSTM(nn.Layer): + def __init__(self, in_channels, out_channels, hidden_size, **kwargs): + super(AttentionLSTM, self).__init__() + self.input_size = 
in_channels + self.hidden_size = hidden_size + self.num_classes = out_channels + + self.attention_cell = AttentionLSTMCell( + in_channels, hidden_size, out_channels, use_gru=False) + self.generator = nn.Linear(hidden_size, out_channels) + + def _char_to_onehot(self, input_char, onehot_dim): + input_ont_hot = F.one_hot(input_char, onehot_dim) + return input_ont_hot + + def forward(self, inputs, targets=None, batch_max_length=25): + batch_size = inputs.shape[0] + num_steps = batch_max_length + + hidden = (paddle.zeros((batch_size, self.hidden_size)), paddle.zeros( + (batch_size, self.hidden_size))) + output_hiddens = [] + + if targets is not None: + for i in range(num_steps): + # one-hot vectors for a i-th char + char_onehots = self._char_to_onehot( + targets[:, i], onehot_dim=self.num_classes) + hidden, alpha = self.attention_cell(hidden, inputs, + char_onehots) + + hidden = (hidden[1][0], hidden[1][1]) + output_hiddens.append(paddle.unsqueeze(hidden[0], axis=1)) + output = paddle.concat(output_hiddens, axis=1) + probs = self.generator(output) + + else: + targets = paddle.zeros(shape=[batch_size], dtype="int32") + probs = None + + for i in range(num_steps): + char_onehots = self._char_to_onehot( + targets, onehot_dim=self.num_classes) + hidden, alpha = self.attention_cell(hidden, inputs, + char_onehots) + probs_step = self.generator(hidden[0]) + hidden = (hidden[1][0], hidden[1][1]) + if probs is None: + probs = paddle.unsqueeze(probs_step, axis=1) + else: + probs = paddle.concat( + [probs, paddle.unsqueeze( + probs_step, axis=1)], axis=1) + + next_input = probs_step.argmax(axis=1) + + targets = next_input + + return probs + + +class AttentionLSTMCell(nn.Layer): + def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False): + super(AttentionLSTMCell, self).__init__() + self.i2h = nn.Linear(input_size, hidden_size, bias_attr=False) + self.h2h = nn.Linear(hidden_size, hidden_size) + self.score = nn.Linear(hidden_size, 1, bias_attr=False) + if not use_gru: + self.rnn = nn.LSTMCell( + input_size=input_size + num_embeddings, hidden_size=hidden_size) + else: + self.rnn = nn.GRUCell( + input_size=input_size + num_embeddings, hidden_size=hidden_size) + + self.hidden_size = hidden_size + + def forward(self, prev_hidden, batch_H, char_onehots): + batch_H_proj = self.i2h(batch_H) + prev_hidden_proj = paddle.unsqueeze(self.h2h(prev_hidden[0]), axis=1) + res = paddle.add(batch_H_proj, prev_hidden_proj) + res = paddle.tanh(res) + e = self.score(res) + + alpha = F.softmax(e, axis=1) + alpha = paddle.transpose(alpha, [0, 2, 1]) + context = paddle.squeeze(paddle.mm(alpha, batch_H), axis=1) + concat_context = paddle.concat([context, char_onehots], 1) + cur_hidden = self.rnn(concat_context, prev_hidden) + + return cur_hidden, alpha diff --git a/backend/ppocr/modeling/necks/__init__.py b/backend/ppocr/modeling/necks/__init__.py new file mode 100644 index 0000000..e10b082 --- /dev/null +++ b/backend/ppocr/modeling/necks/__init__.py @@ -0,0 +1,37 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ['build_neck'] + + +def build_neck(config): + from .db_fpn import DBFPN, RSEFPN, LKPAN + from .east_fpn import EASTFPN + from .sast_fpn import SASTFPN + from .rnn import SequenceEncoder + from .pg_fpn import PGFPN + from .table_fpn import TableFPN + from .fpn import FPN + from .fce_fpn import FCEFPN + from .pren_fpn import PRENFPN + support_dict = [ + 'FPN', 'FCEFPN', 'LKPAN', 'DBFPN', 'RSEFPN', 'EASTFPN', 'SASTFPN', + 'SequenceEncoder', 'PGFPN', 'TableFPN', 'PRENFPN' + ] + + module_name = config.pop('name') + assert module_name in support_dict, Exception('neck only support {}'.format( + support_dict)) + module_class = eval(module_name)(**config) + return module_class diff --git a/backend/ppocr/modeling/necks/db_fpn.py b/backend/ppocr/modeling/necks/db_fpn.py new file mode 100644 index 0000000..93ed2db --- /dev/null +++ b/backend/ppocr/modeling/necks/db_fpn.py @@ -0,0 +1,358 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../../..'))) + +from ppocr.modeling.backbones.det_mobilenet_v3 import SEModule + + +class DSConv(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + padding, + stride=1, + groups=None, + if_act=True, + act="relu", + **kwargs): + super(DSConv, self).__init__() + if groups == None: + groups = in_channels + self.if_act = if_act + self.act = act + self.conv1 = nn.Conv2D( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias_attr=False) + + self.bn1 = nn.BatchNorm(num_channels=in_channels, act=None) + + self.conv2 = nn.Conv2D( + in_channels=in_channels, + out_channels=int(in_channels * 4), + kernel_size=1, + stride=1, + bias_attr=False) + + self.bn2 = nn.BatchNorm(num_channels=int(in_channels * 4), act=None) + + self.conv3 = nn.Conv2D( + in_channels=int(in_channels * 4), + out_channels=out_channels, + kernel_size=1, + stride=1, + bias_attr=False) + self._c = [in_channels, out_channels] + if in_channels != out_channels: + self.conv_end = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + bias_attr=False) + + def forward(self, inputs): + + x = self.conv1(inputs) + x = self.bn1(x) + + x = self.conv2(x) + x = self.bn2(x) + if self.if_act: + if self.act == "relu": + x = F.relu(x) + elif self.act == "hardswish": + x = F.hardswish(x) + else: + print("The activation function({}) is selected incorrectly.". 
+ format(self.act)) + exit() + + x = self.conv3(x) + if self._c[0] != self._c[1]: + x = x + self.conv_end(inputs) + return x + + +class DBFPN(nn.Layer): + def __init__(self, in_channels, out_channels, **kwargs): + super(DBFPN, self).__init__() + self.out_channels = out_channels + weight_attr = paddle.nn.initializer.KaimingUniform() + + self.in2_conv = nn.Conv2D( + in_channels=in_channels[0], + out_channels=self.out_channels, + kernel_size=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.in3_conv = nn.Conv2D( + in_channels=in_channels[1], + out_channels=self.out_channels, + kernel_size=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.in4_conv = nn.Conv2D( + in_channels=in_channels[2], + out_channels=self.out_channels, + kernel_size=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.in5_conv = nn.Conv2D( + in_channels=in_channels[3], + out_channels=self.out_channels, + kernel_size=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.p5_conv = nn.Conv2D( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + kernel_size=3, + padding=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.p4_conv = nn.Conv2D( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + kernel_size=3, + padding=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.p3_conv = nn.Conv2D( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + kernel_size=3, + padding=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.p2_conv = nn.Conv2D( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + kernel_size=3, + padding=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + + def forward(self, x): + c2, c3, c4, c5 = x + + in5 = self.in5_conv(c5) + in4 = self.in4_conv(c4) + in3 = self.in3_conv(c3) + in2 = self.in2_conv(c2) + + out4 = in4 + F.upsample( + in5, scale_factor=2, mode="nearest", align_mode=1) # 1/16 + out3 = in3 + F.upsample( + out4, scale_factor=2, mode="nearest", align_mode=1) # 1/8 + out2 = in2 + F.upsample( + out3, scale_factor=2, mode="nearest", align_mode=1) # 1/4 + + p5 = self.p5_conv(in5) + p4 = self.p4_conv(out4) + p3 = self.p3_conv(out3) + p2 = self.p2_conv(out2) + p5 = F.upsample(p5, scale_factor=8, mode="nearest", align_mode=1) + p4 = F.upsample(p4, scale_factor=4, mode="nearest", align_mode=1) + p3 = F.upsample(p3, scale_factor=2, mode="nearest", align_mode=1) + + fuse = paddle.concat([p5, p4, p3, p2], axis=1) + return fuse + + +class RSELayer(nn.Layer): + def __init__(self, in_channels, out_channels, kernel_size, shortcut=True): + super(RSELayer, self).__init__() + weight_attr = paddle.nn.initializer.KaimingUniform() + self.out_channels = out_channels + self.in_conv = nn.Conv2D( + in_channels=in_channels, + out_channels=self.out_channels, + kernel_size=kernel_size, + padding=int(kernel_size // 2), + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.se_block = SEModule(self.out_channels) + self.shortcut = shortcut + + def forward(self, ins): + x = self.in_conv(ins) + if self.shortcut: + out = x + self.se_block(x) + else: + out = self.se_block(x) + return out + + +class RSEFPN(nn.Layer): + def __init__(self, in_channels, out_channels, shortcut=True, **kwargs): + super(RSEFPN, self).__init__() + self.out_channels = out_channels + self.ins_conv = nn.LayerList() + self.inp_conv = nn.LayerList() + + for 
i in range(len(in_channels)): + self.ins_conv.append( + RSELayer( + in_channels[i], + out_channels, + kernel_size=1, + shortcut=shortcut)) + self.inp_conv.append( + RSELayer( + out_channels, + out_channels // 4, + kernel_size=3, + shortcut=shortcut)) + + def forward(self, x): + c2, c3, c4, c5 = x + + in5 = self.ins_conv[3](c5) + in4 = self.ins_conv[2](c4) + in3 = self.ins_conv[1](c3) + in2 = self.ins_conv[0](c2) + + out4 = in4 + F.upsample( + in5, scale_factor=2, mode="nearest", align_mode=1) # 1/16 + out3 = in3 + F.upsample( + out4, scale_factor=2, mode="nearest", align_mode=1) # 1/8 + out2 = in2 + F.upsample( + out3, scale_factor=2, mode="nearest", align_mode=1) # 1/4 + + p5 = self.inp_conv[3](in5) + p4 = self.inp_conv[2](out4) + p3 = self.inp_conv[1](out3) + p2 = self.inp_conv[0](out2) + + p5 = F.upsample(p5, scale_factor=8, mode="nearest", align_mode=1) + p4 = F.upsample(p4, scale_factor=4, mode="nearest", align_mode=1) + p3 = F.upsample(p3, scale_factor=2, mode="nearest", align_mode=1) + + fuse = paddle.concat([p5, p4, p3, p2], axis=1) + return fuse + + +class LKPAN(nn.Layer): + def __init__(self, in_channels, out_channels, mode='large', **kwargs): + super(LKPAN, self).__init__() + self.out_channels = out_channels + weight_attr = paddle.nn.initializer.KaimingUniform() + + self.ins_conv = nn.LayerList() + self.inp_conv = nn.LayerList() + # pan head + self.pan_head_conv = nn.LayerList() + self.pan_lat_conv = nn.LayerList() + + if mode.lower() == 'lite': + p_layer = DSConv + elif mode.lower() == 'large': + p_layer = nn.Conv2D + else: + raise ValueError( + "mode can only be one of ['lite', 'large'], but received {}". + format(mode)) + + for i in range(len(in_channels)): + self.ins_conv.append( + nn.Conv2D( + in_channels=in_channels[i], + out_channels=self.out_channels, + kernel_size=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False)) + + self.inp_conv.append( + p_layer( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + kernel_size=9, + padding=4, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False)) + + if i > 0: + self.pan_head_conv.append( + nn.Conv2D( + in_channels=self.out_channels // 4, + out_channels=self.out_channels // 4, + kernel_size=3, + padding=1, + stride=2, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False)) + self.pan_lat_conv.append( + p_layer( + in_channels=self.out_channels // 4, + out_channels=self.out_channels // 4, + kernel_size=9, + padding=4, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False)) + + def forward(self, x): + c2, c3, c4, c5 = x + + in5 = self.ins_conv[3](c5) + in4 = self.ins_conv[2](c4) + in3 = self.ins_conv[1](c3) + in2 = self.ins_conv[0](c2) + + out4 = in4 + F.upsample( + in5, scale_factor=2, mode="nearest", align_mode=1) # 1/16 + out3 = in3 + F.upsample( + out4, scale_factor=2, mode="nearest", align_mode=1) # 1/8 + out2 = in2 + F.upsample( + out3, scale_factor=2, mode="nearest", align_mode=1) # 1/4 + + f5 = self.inp_conv[3](in5) + f4 = self.inp_conv[2](out4) + f3 = self.inp_conv[1](out3) + f2 = self.inp_conv[0](out2) + + pan3 = f3 + self.pan_head_conv[0](f2) + pan4 = f4 + self.pan_head_conv[1](pan3) + pan5 = f5 + self.pan_head_conv[2](pan4) + + p2 = self.pan_lat_conv[0](f2) + p3 = self.pan_lat_conv[1](pan3) + p4 = self.pan_lat_conv[2](pan4) + p5 = self.pan_lat_conv[3](pan5) + + p5 = F.upsample(p5, scale_factor=8, mode="nearest", align_mode=1) + p4 = F.upsample(p4, scale_factor=4, mode="nearest", align_mode=1) + p3 = F.upsample(p3, scale_factor=2, 
mode="nearest", align_mode=1) + + fuse = paddle.concat([p5, p4, p3, p2], axis=1) + return fuse diff --git a/backend/ppocr/modeling/necks/east_fpn.py b/backend/ppocr/modeling/necks/east_fpn.py new file mode 100644 index 0000000..120ff15 --- /dev/null +++ b/backend/ppocr/modeling/necks/east_fpn.py @@ -0,0 +1,188 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups=1, + if_act=True, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.if_act = if_act + self.act = act + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=ParamAttr(name=name + '_weights'), + bias_attr=False) + + self.bn = nn.BatchNorm( + num_channels=out_channels, + act=act, + param_attr=ParamAttr(name="bn_" + name + "_scale"), + bias_attr=ParamAttr(name="bn_" + name + "_offset"), + moving_mean_name="bn_" + name + "_mean", + moving_variance_name="bn_" + name + "_variance") + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +class DeConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups=1, + if_act=True, + act=None, + name=None): + super(DeConvBNLayer, self).__init__() + self.if_act = if_act + self.act = act + self.deconv = nn.Conv2DTranspose( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=ParamAttr(name=name + '_weights'), + bias_attr=False) + self.bn = nn.BatchNorm( + num_channels=out_channels, + act=act, + param_attr=ParamAttr(name="bn_" + name + "_scale"), + bias_attr=ParamAttr(name="bn_" + name + "_offset"), + moving_mean_name="bn_" + name + "_mean", + moving_variance_name="bn_" + name + "_variance") + + def forward(self, x): + x = self.deconv(x) + x = self.bn(x) + return x + + +class EASTFPN(nn.Layer): + def __init__(self, in_channels, model_name, **kwargs): + super(EASTFPN, self).__init__() + self.model_name = model_name + if self.model_name == "large": + self.out_channels = 128 + else: + self.out_channels = 64 + self.in_channels = in_channels[::-1] + self.h1_conv = ConvBNLayer( + in_channels=self.out_channels+self.in_channels[1], + out_channels=self.out_channels, + kernel_size=3, + stride=1, + padding=1, + if_act=True, + act='relu', + name="unet_h_1") + self.h2_conv = ConvBNLayer( + in_channels=self.out_channels+self.in_channels[2], + out_channels=self.out_channels, + kernel_size=3, + stride=1, + padding=1, + if_act=True, + act='relu', + name="unet_h_2") + self.h3_conv = ConvBNLayer( + 
in_channels=self.out_channels+self.in_channels[3], + out_channels=self.out_channels, + kernel_size=3, + stride=1, + padding=1, + if_act=True, + act='relu', + name="unet_h_3") + self.g0_deconv = DeConvBNLayer( + in_channels=self.in_channels[0], + out_channels=self.out_channels, + kernel_size=4, + stride=2, + padding=1, + if_act=True, + act='relu', + name="unet_g_0") + self.g1_deconv = DeConvBNLayer( + in_channels=self.out_channels, + out_channels=self.out_channels, + kernel_size=4, + stride=2, + padding=1, + if_act=True, + act='relu', + name="unet_g_1") + self.g2_deconv = DeConvBNLayer( + in_channels=self.out_channels, + out_channels=self.out_channels, + kernel_size=4, + stride=2, + padding=1, + if_act=True, + act='relu', + name="unet_g_2") + self.g3_conv = ConvBNLayer( + in_channels=self.out_channels, + out_channels=self.out_channels, + kernel_size=3, + stride=1, + padding=1, + if_act=True, + act='relu', + name="unet_g_3") + + def forward(self, x): + f = x[::-1] + + h = f[0] + g = self.g0_deconv(h) + h = paddle.concat([g, f[1]], axis=1) + h = self.h1_conv(h) + g = self.g1_deconv(h) + h = paddle.concat([g, f[2]], axis=1) + h = self.h2_conv(h) + g = self.g2_deconv(h) + h = paddle.concat([g, f[3]], axis=1) + h = self.h3_conv(h) + g = self.g3_conv(h) + + return g \ No newline at end of file diff --git a/backend/ppocr/modeling/necks/fce_fpn.py b/backend/ppocr/modeling/necks/fce_fpn.py new file mode 100644 index 0000000..954e964 --- /dev/null +++ b/backend/ppocr/modeling/necks/fce_fpn.py @@ -0,0 +1,280 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.3/ppdet/modeling/necks/fpn.py +""" + +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn.initializer import XavierUniform +from paddle.nn.initializer import Normal +from paddle.regularizer import L2Decay + +__all__ = ['FCEFPN'] + + +class ConvNormLayer(nn.Layer): + def __init__(self, + ch_in, + ch_out, + filter_size, + stride, + groups=1, + norm_type='bn', + norm_decay=0., + norm_groups=32, + lr_scale=1., + freeze_norm=False, + initializer=Normal( + mean=0., std=0.01)): + super(ConvNormLayer, self).__init__() + assert norm_type in ['bn', 'sync_bn', 'gn'] + + bias_attr = False + + self.conv = nn.Conv2D( + in_channels=ch_in, + out_channels=ch_out, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr( + initializer=initializer, learning_rate=1.), + bias_attr=bias_attr) + + norm_lr = 0. if freeze_norm else 1. 
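+        # a frozen norm layer gets learning_rate=0 for its scale/offset, so they are never updated;
+        # norm_decay (when not None) additionally applies L2 decay to those same parameters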
+ param_attr = ParamAttr( + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay) if norm_decay is not None else None) + bias_attr = ParamAttr( + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay) if norm_decay is not None else None) + if norm_type == 'bn': + self.norm = nn.BatchNorm2D( + ch_out, weight_attr=param_attr, bias_attr=bias_attr) + elif norm_type == 'sync_bn': + self.norm = nn.SyncBatchNorm( + ch_out, weight_attr=param_attr, bias_attr=bias_attr) + elif norm_type == 'gn': + self.norm = nn.GroupNorm( + num_groups=norm_groups, + num_channels=ch_out, + weight_attr=param_attr, + bias_attr=bias_attr) + + def forward(self, inputs): + out = self.conv(inputs) + out = self.norm(out) + return out + + +class FCEFPN(nn.Layer): + """ + Feature Pyramid Network, see https://arxiv.org/abs/1612.03144 + Args: + in_channels (list[int]): input channels of each level which can be + derived from the output shape of backbone by from_config + out_channels (list[int]): output channel of each level + spatial_scales (list[float]): the spatial scales between input feature + maps and original input image which can be derived from the output + shape of backbone by from_config + has_extra_convs (bool): whether to add extra conv to the last level. + default False + extra_stage (int): the number of extra stages added to the last level. + default 1 + use_c5 (bool): Whether to use c5 as the input of extra stage, + otherwise p5 is used. default True + norm_type (string|None): The normalization type in FPN module. If + norm_type is None, norm will not be used after conv and if + norm_type is string, bn, gn, sync_bn are available. default None + norm_decay (float): weight decay for normalization layer weights. + default 0. + freeze_norm (bool): whether to freeze normalization layer. + default False + relu_before_extra_convs (bool): whether to add relu before extra convs. + default False + + """ + + def __init__(self, + in_channels, + out_channels, + spatial_scales=[0.25, 0.125, 0.0625, 0.03125], + has_extra_convs=False, + extra_stage=1, + use_c5=True, + norm_type=None, + norm_decay=0., + freeze_norm=False, + relu_before_extra_convs=True): + super(FCEFPN, self).__init__() + self.out_channels = out_channels + for s in range(extra_stage): + spatial_scales = spatial_scales + [spatial_scales[-1] / 2.] 
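+            # each extra stage is stacked on top of the last level, so its spatial scale halves again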
+ self.spatial_scales = spatial_scales + self.has_extra_convs = has_extra_convs + self.extra_stage = extra_stage + self.use_c5 = use_c5 + self.relu_before_extra_convs = relu_before_extra_convs + self.norm_type = norm_type + self.norm_decay = norm_decay + self.freeze_norm = freeze_norm + + self.lateral_convs = [] + self.fpn_convs = [] + fan = out_channels * 3 * 3 + + # stage index 0,1,2,3 stands for res2,res3,res4,res5 on ResNet Backbone + # 0 <= st_stage < ed_stage <= 3 + st_stage = 4 - len(in_channels) + ed_stage = st_stage + len(in_channels) - 1 + for i in range(st_stage, ed_stage + 1): + if i == 3: + lateral_name = 'fpn_inner_res5_sum' + else: + lateral_name = 'fpn_inner_res{}_sum_lateral'.format(i + 2) + in_c = in_channels[i - st_stage] + if self.norm_type is not None: + lateral = self.add_sublayer( + lateral_name, + ConvNormLayer( + ch_in=in_c, + ch_out=out_channels, + filter_size=1, + stride=1, + norm_type=self.norm_type, + norm_decay=self.norm_decay, + freeze_norm=self.freeze_norm, + initializer=XavierUniform(fan_out=in_c))) + else: + lateral = self.add_sublayer( + lateral_name, + nn.Conv2D( + in_channels=in_c, + out_channels=out_channels, + kernel_size=1, + weight_attr=ParamAttr( + initializer=XavierUniform(fan_out=in_c)))) + self.lateral_convs.append(lateral) + + for i in range(st_stage, ed_stage + 1): + fpn_name = 'fpn_res{}_sum'.format(i + 2) + if self.norm_type is not None: + fpn_conv = self.add_sublayer( + fpn_name, + ConvNormLayer( + ch_in=out_channels, + ch_out=out_channels, + filter_size=3, + stride=1, + norm_type=self.norm_type, + norm_decay=self.norm_decay, + freeze_norm=self.freeze_norm, + initializer=XavierUniform(fan_out=fan))) + else: + fpn_conv = self.add_sublayer( + fpn_name, + nn.Conv2D( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + padding=1, + weight_attr=ParamAttr( + initializer=XavierUniform(fan_out=fan)))) + self.fpn_convs.append(fpn_conv) + + # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5) + if self.has_extra_convs: + for i in range(self.extra_stage): + lvl = ed_stage + 1 + i + if i == 0 and self.use_c5: + in_c = in_channels[-1] + else: + in_c = out_channels + extra_fpn_name = 'fpn_{}'.format(lvl + 2) + if self.norm_type is not None: + extra_fpn_conv = self.add_sublayer( + extra_fpn_name, + ConvNormLayer( + ch_in=in_c, + ch_out=out_channels, + filter_size=3, + stride=2, + norm_type=self.norm_type, + norm_decay=self.norm_decay, + freeze_norm=self.freeze_norm, + initializer=XavierUniform(fan_out=fan))) + else: + extra_fpn_conv = self.add_sublayer( + extra_fpn_name, + nn.Conv2D( + in_channels=in_c, + out_channels=out_channels, + kernel_size=3, + stride=2, + padding=1, + weight_attr=ParamAttr( + initializer=XavierUniform(fan_out=fan)))) + self.fpn_convs.append(extra_fpn_conv) + + @classmethod + def from_config(cls, cfg, input_shape): + return { + 'in_channels': [i.channels for i in input_shape], + 'spatial_scales': [1.0 / i.stride for i in input_shape], + } + + def forward(self, body_feats): + laterals = [] + num_levels = len(body_feats) + + for i in range(num_levels): + laterals.append(self.lateral_convs[i](body_feats[i])) + + for i in range(1, num_levels): + lvl = num_levels - i + upsample = F.interpolate( + laterals[lvl], + scale_factor=2., + mode='nearest', ) + laterals[lvl - 1] += upsample + + fpn_output = [] + for lvl in range(num_levels): + fpn_output.append(self.fpn_convs[lvl](laterals[lvl])) + + if self.extra_stage > 0: + # use max pool to get more levels on top of outputs (Faster R-CNN, Mask R-CNN) + if not 
self.has_extra_convs: + assert self.extra_stage == 1, 'extra_stage should be 1 if FPN has not extra convs' + fpn_output.append(F.max_pool2d(fpn_output[-1], 1, stride=2)) + # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5) + else: + if self.use_c5: + extra_source = body_feats[-1] + else: + extra_source = fpn_output[-1] + fpn_output.append(self.fpn_convs[num_levels](extra_source)) + + for i in range(1, self.extra_stage): + if self.relu_before_extra_convs: + fpn_output.append(self.fpn_convs[num_levels + i](F.relu( + fpn_output[-1]))) + else: + fpn_output.append(self.fpn_convs[num_levels + i]( + fpn_output[-1])) + return fpn_output diff --git a/backend/ppocr/modeling/necks/fpn.py b/backend/ppocr/modeling/necks/fpn.py new file mode 100644 index 0000000..48c85b1 --- /dev/null +++ b/backend/ppocr/modeling/necks/fpn.py @@ -0,0 +1,138 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/whai362/PSENet/blob/python3/models/neck/fpn.py +""" + +import paddle.nn as nn +import paddle +import math +import paddle.nn.functional as F + + +class Conv_BN_ReLU(nn.Layer): + def __init__(self, + in_planes, + out_planes, + kernel_size=1, + stride=1, + padding=0): + super(Conv_BN_ReLU, self).__init__() + self.conv = nn.Conv2D( + in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + bias_attr=False) + self.bn = nn.BatchNorm2D(out_planes, momentum=0.1) + self.relu = nn.ReLU() + + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + m.weight = paddle.create_parameter( + shape=m.weight.shape, + dtype='float32', + default_initializer=paddle.nn.initializer.Normal( + 0, math.sqrt(2. 
/ n))) + elif isinstance(m, nn.BatchNorm2D): + m.weight = paddle.create_parameter( + shape=m.weight.shape, + dtype='float32', + default_initializer=paddle.nn.initializer.Constant(1.0)) + m.bias = paddle.create_parameter( + shape=m.bias.shape, + dtype='float32', + default_initializer=paddle.nn.initializer.Constant(0.0)) + + def forward(self, x): + return self.relu(self.bn(self.conv(x))) + + +class FPN(nn.Layer): + def __init__(self, in_channels, out_channels): + super(FPN, self).__init__() + + # Top layer + self.toplayer_ = Conv_BN_ReLU( + in_channels[3], out_channels, kernel_size=1, stride=1, padding=0) + # Lateral layers + self.latlayer1_ = Conv_BN_ReLU( + in_channels[2], out_channels, kernel_size=1, stride=1, padding=0) + + self.latlayer2_ = Conv_BN_ReLU( + in_channels[1], out_channels, kernel_size=1, stride=1, padding=0) + + self.latlayer3_ = Conv_BN_ReLU( + in_channels[0], out_channels, kernel_size=1, stride=1, padding=0) + + # Smooth layers + self.smooth1_ = Conv_BN_ReLU( + out_channels, out_channels, kernel_size=3, stride=1, padding=1) + + self.smooth2_ = Conv_BN_ReLU( + out_channels, out_channels, kernel_size=3, stride=1, padding=1) + + self.smooth3_ = Conv_BN_ReLU( + out_channels, out_channels, kernel_size=3, stride=1, padding=1) + + self.out_channels = out_channels * 4 + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + m.weight = paddle.create_parameter( + shape=m.weight.shape, + dtype='float32', + default_initializer=paddle.nn.initializer.Normal( + 0, math.sqrt(2. / n))) + elif isinstance(m, nn.BatchNorm2D): + m.weight = paddle.create_parameter( + shape=m.weight.shape, + dtype='float32', + default_initializer=paddle.nn.initializer.Constant(1.0)) + m.bias = paddle.create_parameter( + shape=m.bias.shape, + dtype='float32', + default_initializer=paddle.nn.initializer.Constant(0.0)) + + def _upsample(self, x, scale=1): + return F.upsample(x, scale_factor=scale, mode='bilinear') + + def _upsample_add(self, x, y, scale=1): + return F.upsample(x, scale_factor=scale, mode='bilinear') + y + + def forward(self, x): + f2, f3, f4, f5 = x + p5 = self.toplayer_(f5) + + f4 = self.latlayer1_(f4) + p4 = self._upsample_add(p5, f4, 2) + p4 = self.smooth1_(p4) + + f3 = self.latlayer2_(f3) + p3 = self._upsample_add(p4, f3, 2) + p3 = self.smooth2_(p3) + + f2 = self.latlayer3_(f2) + p2 = self._upsample_add(p3, f2, 2) + p2 = self.smooth3_(p2) + + p3 = self._upsample(p3, 2) + p4 = self._upsample(p4, 4) + p5 = self._upsample(p5, 8) + + fuse = paddle.concat([p2, p3, p4, p5], axis=1) + return fuse diff --git a/backend/ppocr/modeling/necks/pg_fpn.py b/backend/ppocr/modeling/necks/pg_fpn.py new file mode 100644 index 0000000..3f64539 --- /dev/null +++ b/backend/ppocr/modeling/necks/pg_fpn.py @@ -0,0 +1,314 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
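+# PGFPN (PGNet neck): a down-fusion branch strides the shallow features (c0-c2) to a common scale,
+# an up-fusion branch deconvolves the deep features (c6-c2) back to that scale, and the two
+# 128-channel maps are summed and passed through a ReLU to form the output feature map.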
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = nn.AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self._conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = nn.BatchNorm( + out_channels, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance', + use_global_stats=False) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class DeConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size=4, + stride=2, + padding=1, + groups=1, + if_act=True, + act=None, + name=None): + super(DeConvBNLayer, self).__init__() + + self.if_act = if_act + self.act = act + self.deconv = nn.Conv2DTranspose( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=ParamAttr(name=name + '_weights'), + bias_attr=False) + self.bn = nn.BatchNorm( + num_channels=out_channels, + act=act, + param_attr=ParamAttr(name="bn_" + name + "_scale"), + bias_attr=ParamAttr(name="bn_" + name + "_offset"), + moving_mean_name="bn_" + name + "_mean", + moving_variance_name="bn_" + name + "_variance", + use_global_stats=False) + + def forward(self, x): + x = self.deconv(x) + x = self.bn(x) + return x + + +class PGFPN(nn.Layer): + def __init__(self, in_channels, **kwargs): + super(PGFPN, self).__init__() + num_inputs = [2048, 2048, 1024, 512, 256] + num_outputs = [256, 256, 192, 192, 128] + self.out_channels = 128 + self.conv_bn_layer_1 = ConvBNLayer( + in_channels=3, + out_channels=32, + kernel_size=3, + stride=1, + act=None, + name='FPN_d1') + self.conv_bn_layer_2 = ConvBNLayer( + in_channels=64, + out_channels=64, + kernel_size=3, + stride=1, + act=None, + name='FPN_d2') + self.conv_bn_layer_3 = ConvBNLayer( + in_channels=256, + out_channels=128, + kernel_size=3, + stride=1, + act=None, + name='FPN_d3') + self.conv_bn_layer_4 = ConvBNLayer( + in_channels=32, + out_channels=64, + kernel_size=3, + stride=2, + act=None, + name='FPN_d4') + self.conv_bn_layer_5 = ConvBNLayer( + in_channels=64, + out_channels=64, + kernel_size=3, + stride=1, + act='relu', + name='FPN_d5') + self.conv_bn_layer_6 = ConvBNLayer( + in_channels=64, + out_channels=128, + kernel_size=3, + stride=2, + act=None, + name='FPN_d6') + self.conv_bn_layer_7 = ConvBNLayer( + in_channels=128, + out_channels=128, + kernel_size=3, + stride=1, + act='relu', + name='FPN_d7') + self.conv_bn_layer_8 = ConvBNLayer( + in_channels=128, + out_channels=128, + kernel_size=1, + stride=1, + act=None, + name='FPN_d8') + + self.conv_h0 = ConvBNLayer( + in_channels=num_inputs[0], + out_channels=num_outputs[0], + kernel_size=1, + stride=1, + act=None, + 
name="conv_h{}".format(0)) + self.conv_h1 = ConvBNLayer( + in_channels=num_inputs[1], + out_channels=num_outputs[1], + kernel_size=1, + stride=1, + act=None, + name="conv_h{}".format(1)) + self.conv_h2 = ConvBNLayer( + in_channels=num_inputs[2], + out_channels=num_outputs[2], + kernel_size=1, + stride=1, + act=None, + name="conv_h{}".format(2)) + self.conv_h3 = ConvBNLayer( + in_channels=num_inputs[3], + out_channels=num_outputs[3], + kernel_size=1, + stride=1, + act=None, + name="conv_h{}".format(3)) + self.conv_h4 = ConvBNLayer( + in_channels=num_inputs[4], + out_channels=num_outputs[4], + kernel_size=1, + stride=1, + act=None, + name="conv_h{}".format(4)) + + self.dconv0 = DeConvBNLayer( + in_channels=num_outputs[0], + out_channels=num_outputs[0 + 1], + name="dconv_{}".format(0)) + self.dconv1 = DeConvBNLayer( + in_channels=num_outputs[1], + out_channels=num_outputs[1 + 1], + act=None, + name="dconv_{}".format(1)) + self.dconv2 = DeConvBNLayer( + in_channels=num_outputs[2], + out_channels=num_outputs[2 + 1], + act=None, + name="dconv_{}".format(2)) + self.dconv3 = DeConvBNLayer( + in_channels=num_outputs[3], + out_channels=num_outputs[3 + 1], + act=None, + name="dconv_{}".format(3)) + self.conv_g1 = ConvBNLayer( + in_channels=num_outputs[1], + out_channels=num_outputs[1], + kernel_size=3, + stride=1, + act='relu', + name="conv_g{}".format(1)) + self.conv_g2 = ConvBNLayer( + in_channels=num_outputs[2], + out_channels=num_outputs[2], + kernel_size=3, + stride=1, + act='relu', + name="conv_g{}".format(2)) + self.conv_g3 = ConvBNLayer( + in_channels=num_outputs[3], + out_channels=num_outputs[3], + kernel_size=3, + stride=1, + act='relu', + name="conv_g{}".format(3)) + self.conv_g4 = ConvBNLayer( + in_channels=num_outputs[4], + out_channels=num_outputs[4], + kernel_size=3, + stride=1, + act='relu', + name="conv_g{}".format(4)) + self.convf = ConvBNLayer( + in_channels=num_outputs[4], + out_channels=num_outputs[4], + kernel_size=1, + stride=1, + act=None, + name="conv_f{}".format(4)) + + def forward(self, x): + c0, c1, c2, c3, c4, c5, c6 = x + # FPN_Down_Fusion + f = [c0, c1, c2] + g = [None, None, None] + h = [None, None, None] + h[0] = self.conv_bn_layer_1(f[0]) + h[1] = self.conv_bn_layer_2(f[1]) + h[2] = self.conv_bn_layer_3(f[2]) + + g[0] = self.conv_bn_layer_4(h[0]) + g[1] = paddle.add(g[0], h[1]) + g[1] = F.relu(g[1]) + g[1] = self.conv_bn_layer_5(g[1]) + g[1] = self.conv_bn_layer_6(g[1]) + + g[2] = paddle.add(g[1], h[2]) + g[2] = F.relu(g[2]) + g[2] = self.conv_bn_layer_7(g[2]) + f_down = self.conv_bn_layer_8(g[2]) + + # FPN UP Fusion + f1 = [c6, c5, c4, c3, c2] + g = [None, None, None, None, None] + h = [None, None, None, None, None] + h[0] = self.conv_h0(f1[0]) + h[1] = self.conv_h1(f1[1]) + h[2] = self.conv_h2(f1[2]) + h[3] = self.conv_h3(f1[3]) + h[4] = self.conv_h4(f1[4]) + + g[0] = self.dconv0(h[0]) + g[1] = paddle.add(g[0], h[1]) + g[1] = F.relu(g[1]) + g[1] = self.conv_g1(g[1]) + g[1] = self.dconv1(g[1]) + + g[2] = paddle.add(g[1], h[2]) + g[2] = F.relu(g[2]) + g[2] = self.conv_g2(g[2]) + g[2] = self.dconv2(g[2]) + + g[3] = paddle.add(g[2], h[3]) + g[3] = F.relu(g[3]) + g[3] = self.conv_g3(g[3]) + g[3] = self.dconv3(g[3]) + + g[4] = paddle.add(x=g[3], y=h[4]) + g[4] = F.relu(g[4]) + g[4] = self.conv_g4(g[4]) + f_up = self.convf(g[4]) + f_common = paddle.add(f_down, f_up) + f_common = F.relu(f_common) + return f_common diff --git a/backend/ppocr/modeling/necks/pren_fpn.py b/backend/ppocr/modeling/necks/pren_fpn.py new file mode 100644 index 0000000..afbdcea --- /dev/null +++ 
b/backend/ppocr/modeling/necks/pren_fpn.py @@ -0,0 +1,163 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Code is refer from: +https://github.com/RuijieJ/pren/blob/main/Nets/Aggregation.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +import paddle.nn.functional as F + + +class PoolAggregate(nn.Layer): + def __init__(self, n_r, d_in, d_middle=None, d_out=None): + super(PoolAggregate, self).__init__() + if not d_middle: + d_middle = d_in + if not d_out: + d_out = d_in + + self.d_in = d_in + self.d_middle = d_middle + self.d_out = d_out + self.act = nn.Swish() + + self.n_r = n_r + self.aggs = self._build_aggs() + + def _build_aggs(self): + aggs = [] + for i in range(self.n_r): + aggs.append( + self.add_sublayer( + '{}'.format(i), + nn.Sequential( + ('conv1', nn.Conv2D( + self.d_in, self.d_middle, 3, 2, 1, bias_attr=False) + ), ('bn1', nn.BatchNorm(self.d_middle)), + ('act', self.act), ('conv2', nn.Conv2D( + self.d_middle, self.d_out, 3, 2, 1, bias_attr=False + )), ('bn2', nn.BatchNorm(self.d_out))))) + return aggs + + def forward(self, x): + b = x.shape[0] + outs = [] + for agg in self.aggs: + y = agg(x) + p = F.adaptive_avg_pool2d(y, 1) + outs.append(p.reshape((b, 1, self.d_out))) + out = paddle.concat(outs, 1) + return out + + +class WeightAggregate(nn.Layer): + def __init__(self, n_r, d_in, d_middle=None, d_out=None): + super(WeightAggregate, self).__init__() + if not d_middle: + d_middle = d_in + if not d_out: + d_out = d_in + + self.n_r = n_r + self.d_out = d_out + self.act = nn.Swish() + + self.conv_n = nn.Sequential( + ('conv1', nn.Conv2D( + d_in, d_in, 3, 1, 1, + bias_attr=False)), ('bn1', nn.BatchNorm(d_in)), + ('act1', self.act), ('conv2', nn.Conv2D( + d_in, n_r, 1, bias_attr=False)), ('bn2', nn.BatchNorm(n_r)), + ('act2', nn.Sigmoid())) + self.conv_d = nn.Sequential( + ('conv1', nn.Conv2D( + d_in, d_middle, 3, 1, 1, + bias_attr=False)), ('bn1', nn.BatchNorm(d_middle)), + ('act1', self.act), ('conv2', nn.Conv2D( + d_middle, d_out, 1, + bias_attr=False)), ('bn2', nn.BatchNorm(d_out))) + + def forward(self, x): + b, _, h, w = x.shape + + hmaps = self.conv_n(x) + fmaps = self.conv_d(x) + r = paddle.bmm( + hmaps.reshape((b, self.n_r, h * w)), + fmaps.reshape((b, self.d_out, h * w)).transpose((0, 2, 1))) + return r + + +class GCN(nn.Layer): + def __init__(self, d_in, n_in, d_out=None, n_out=None, dropout=0.1): + super(GCN, self).__init__() + if not d_out: + d_out = d_in + if not n_out: + n_out = d_in + + self.conv_n = nn.Conv1D(n_in, n_out, 1) + self.linear = nn.Linear(d_in, d_out) + self.dropout = nn.Dropout(dropout) + self.act = nn.Swish() + + def forward(self, x): + x = self.conv_n(x) + x = self.dropout(self.linear(x)) + return self.act(x) + + +class PRENFPN(nn.Layer): + def __init__(self, in_channels, n_r, d_model, max_len, dropout): + super(PRENFPN, self).__init__() + assert len(in_channels) == 3, 
"in_channels' length must be 3." + c1, c2, c3 = in_channels # the depths are from big to small + # build fpn + assert d_model % 3 == 0, "{} can't be divided by 3.".format(d_model) + self.agg_p1 = PoolAggregate(n_r, c1, d_out=d_model // 3) + self.agg_p2 = PoolAggregate(n_r, c2, d_out=d_model // 3) + self.agg_p3 = PoolAggregate(n_r, c3, d_out=d_model // 3) + + self.agg_w1 = WeightAggregate(n_r, c1, 4 * c1, d_model // 3) + self.agg_w2 = WeightAggregate(n_r, c2, 4 * c2, d_model // 3) + self.agg_w3 = WeightAggregate(n_r, c3, 4 * c3, d_model // 3) + + self.gcn_pool = GCN(d_model, n_r, d_model, max_len, dropout) + self.gcn_weight = GCN(d_model, n_r, d_model, max_len, dropout) + + self.out_channels = d_model + + def forward(self, inputs): + f3, f5, f7 = inputs + + rp1 = self.agg_p1(f3) + rp2 = self.agg_p2(f5) + rp3 = self.agg_p3(f7) + rp = paddle.concat([rp1, rp2, rp3], 2) # [b,nr,d] + + rw1 = self.agg_w1(f3) + rw2 = self.agg_w2(f5) + rw3 = self.agg_w3(f7) + rw = paddle.concat([rw1, rw2, rw3], 2) # [b,nr,d] + + y1 = self.gcn_pool(rp) + y2 = self.gcn_weight(rw) + y = 0.5 * (y1 + y2) + return y # [b,max_len,d] diff --git a/backend/ppocr/modeling/necks/rnn.py b/backend/ppocr/modeling/necks/rnn.py new file mode 100644 index 0000000..c8a774b --- /dev/null +++ b/backend/ppocr/modeling/necks/rnn.py @@ -0,0 +1,191 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn + +from ppocr.modeling.heads.rec_ctc_head import get_para_bias_attr +from ppocr.modeling.backbones.rec_svtrnet import Block, ConvBNLayer, trunc_normal_, zeros_, ones_ + + +class Im2Seq(nn.Layer): + def __init__(self, in_channels, **kwargs): + super().__init__() + self.out_channels = in_channels + + def forward(self, x): + B, C, H, W = x.shape + assert H == 1 + x = x.squeeze(axis=2) + x = x.transpose([0, 2, 1]) # (NTC)(batch, width, channels) + return x + + +class EncoderWithRNN(nn.Layer): + def __init__(self, in_channels, hidden_size): + super(EncoderWithRNN, self).__init__() + self.out_channels = hidden_size * 2 + self.lstm = nn.LSTM( + in_channels, hidden_size, direction='bidirectional', num_layers=2) + + def forward(self, x): + x, _ = self.lstm(x) + return x + + +class EncoderWithFC(nn.Layer): + def __init__(self, in_channels, hidden_size): + super(EncoderWithFC, self).__init__() + self.out_channels = hidden_size + weight_attr, bias_attr = get_para_bias_attr( + l2_decay=0.00001, k=in_channels) + self.fc = nn.Linear( + in_channels, + hidden_size, + weight_attr=weight_attr, + bias_attr=bias_attr, + name='reduce_encoder_fea') + + def forward(self, x): + x = self.fc(x) + return x + + +class EncoderWithSVTR(nn.Layer): + def __init__( + self, + in_channels, + dims=64, # XS + depth=2, + hidden_dims=120, + use_guide=False, + num_heads=8, + qkv_bias=True, + mlp_ratio=2.0, + drop_rate=0.1, + attn_drop_rate=0.1, + drop_path=0., + qk_scale=None): + super(EncoderWithSVTR, self).__init__() + self.depth = depth + self.use_guide = use_guide + self.conv1 = ConvBNLayer( + in_channels, in_channels // 8, padding=1, act=nn.Swish) + self.conv2 = ConvBNLayer( + in_channels // 8, hidden_dims, kernel_size=1, act=nn.Swish) + + self.svtr_block = nn.LayerList([ + Block( + dim=hidden_dims, + num_heads=num_heads, + mixer='Global', + HW=None, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer=nn.Swish, + attn_drop=attn_drop_rate, + drop_path=drop_path, + norm_layer='nn.LayerNorm', + epsilon=1e-05, + prenorm=False) for i in range(depth) + ]) + self.norm = nn.LayerNorm(hidden_dims, epsilon=1e-6) + self.conv3 = ConvBNLayer( + hidden_dims, in_channels, kernel_size=1, act=nn.Swish) + # last conv-nxn, the input is concat of input tensor and conv3 output tensor + self.conv4 = ConvBNLayer( + 2 * in_channels, in_channels // 8, padding=1, act=nn.Swish) + + self.conv1x1 = ConvBNLayer( + in_channels // 8, dims, kernel_size=1, act=nn.Swish) + self.out_channels = dims + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward(self, x): + # for use guide + if self.use_guide: + z = x.clone() + z.stop_gradient = True + else: + z = x + # for short cut + h = z + # reduce dim + z = self.conv1(z) + z = self.conv2(z) + # SVTR global block + B, C, H, W = z.shape + z = z.flatten(2).transpose([0, 2, 1]) + for blk in self.svtr_block: + z = blk(z) + z = self.norm(z) + # last stage + z = z.reshape([0, H, W, C]).transpose([0, 3, 1, 2]) + z = self.conv3(z) + z = paddle.concat((h, z), axis=1) + z = self.conv1x1(self.conv4(z)) + return z + + +class SequenceEncoder(nn.Layer): + def __init__(self, in_channels, encoder_type, hidden_size=48, 
**kwargs): + super(SequenceEncoder, self).__init__() + self.encoder_reshape = Im2Seq(in_channels) + self.out_channels = self.encoder_reshape.out_channels + self.encoder_type = encoder_type + if encoder_type == 'reshape': + self.only_reshape = True + else: + support_encoder_dict = { + 'reshape': Im2Seq, + 'fc': EncoderWithFC, + 'rnn': EncoderWithRNN, + 'svtr': EncoderWithSVTR + } + assert encoder_type in support_encoder_dict, '{} must in {}'.format( + encoder_type, support_encoder_dict.keys()) + if encoder_type == "svtr": + self.encoder = support_encoder_dict[encoder_type]( + self.encoder_reshape.out_channels, **kwargs) + else: + self.encoder = support_encoder_dict[encoder_type]( + self.encoder_reshape.out_channels, hidden_size) + self.out_channels = self.encoder.out_channels + self.only_reshape = False + + def forward(self, x): + if self.encoder_type != 'svtr': + x = self.encoder_reshape(x) + if not self.only_reshape: + x = self.encoder(x) + return x + else: + x = self.encoder(x) + x = self.encoder_reshape(x) + return x diff --git a/backend/ppocr/modeling/necks/sast_fpn.py b/backend/ppocr/modeling/necks/sast_fpn.py new file mode 100644 index 0000000..9b60245 --- /dev/null +++ b/backend/ppocr/modeling/necks/sast_fpn.py @@ -0,0 +1,284 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
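+# SAST neck: FPN_Down_Fusion fuses the three shallow levels and FPN_Up_Fusion the five deep levels;
+# their sum gives a 128-channel map that Cross_Attention refines along the horizontal and vertical
+# directions when with_cab=True.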
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + groups=1, + if_act=True, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.if_act = if_act + self.act = act + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + '_weights'), + bias_attr=False) + + self.bn = nn.BatchNorm( + num_channels=out_channels, + act=act, + param_attr=ParamAttr(name="bn_" + name + "_scale"), + bias_attr=ParamAttr(name="bn_" + name + "_offset"), + moving_mean_name="bn_" + name + "_mean", + moving_variance_name="bn_" + name + "_variance") + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +class DeConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + groups=1, + if_act=True, + act=None, + name=None): + super(DeConvBNLayer, self).__init__() + self.if_act = if_act + self.act = act + self.deconv = nn.Conv2DTranspose( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + '_weights'), + bias_attr=False) + self.bn = nn.BatchNorm( + num_channels=out_channels, + act=act, + param_attr=ParamAttr(name="bn_" + name + "_scale"), + bias_attr=ParamAttr(name="bn_" + name + "_offset"), + moving_mean_name="bn_" + name + "_mean", + moving_variance_name="bn_" + name + "_variance") + + def forward(self, x): + x = self.deconv(x) + x = self.bn(x) + return x + + +class FPN_Up_Fusion(nn.Layer): + def __init__(self, in_channels): + super(FPN_Up_Fusion, self).__init__() + in_channels = in_channels[::-1] + out_channels = [256, 256, 192, 192, 128] + + self.h0_conv = ConvBNLayer(in_channels[0], out_channels[0], 1, 1, act=None, name='fpn_up_h0') + self.h1_conv = ConvBNLayer(in_channels[1], out_channels[1], 1, 1, act=None, name='fpn_up_h1') + self.h2_conv = ConvBNLayer(in_channels[2], out_channels[2], 1, 1, act=None, name='fpn_up_h2') + self.h3_conv = ConvBNLayer(in_channels[3], out_channels[3], 1, 1, act=None, name='fpn_up_h3') + self.h4_conv = ConvBNLayer(in_channels[4], out_channels[4], 1, 1, act=None, name='fpn_up_h4') + + self.g0_conv = DeConvBNLayer(out_channels[0], out_channels[1], 4, 2, act=None, name='fpn_up_g0') + + self.g1_conv = nn.Sequential( + ConvBNLayer(out_channels[1], out_channels[1], 3, 1, act='relu', name='fpn_up_g1_1'), + DeConvBNLayer(out_channels[1], out_channels[2], 4, 2, act=None, name='fpn_up_g1_2') + ) + self.g2_conv = nn.Sequential( + ConvBNLayer(out_channels[2], out_channels[2], 3, 1, act='relu', name='fpn_up_g2_1'), + DeConvBNLayer(out_channels[2], out_channels[3], 4, 2, act=None, name='fpn_up_g2_2') + ) + self.g3_conv = nn.Sequential( + ConvBNLayer(out_channels[3], out_channels[3], 3, 1, act='relu', name='fpn_up_g3_1'), + DeConvBNLayer(out_channels[3], out_channels[4], 4, 2, act=None, name='fpn_up_g3_2') + ) + + self.g4_conv = nn.Sequential( + ConvBNLayer(out_channels[4], out_channels[4], 3, 1, act='relu', name='fpn_up_fusion_1'), + ConvBNLayer(out_channels[4], out_channels[4], 1, 1, act=None, name='fpn_up_fusion_2') + ) + + def _add_relu(self, x1, x2): + x = 
paddle.add(x=x1, y=x2) + x = F.relu(x) + return x + + def forward(self, x): + f = x[2:][::-1] + h0 = self.h0_conv(f[0]) + h1 = self.h1_conv(f[1]) + h2 = self.h2_conv(f[2]) + h3 = self.h3_conv(f[3]) + h4 = self.h4_conv(f[4]) + + g0 = self.g0_conv(h0) + g1 = self._add_relu(g0, h1) + g1 = self.g1_conv(g1) + g2 = self.g2_conv(self._add_relu(g1, h2)) + g3 = self.g3_conv(self._add_relu(g2, h3)) + g4 = self.g4_conv(self._add_relu(g3, h4)) + + return g4 + + +class FPN_Down_Fusion(nn.Layer): + def __init__(self, in_channels): + super(FPN_Down_Fusion, self).__init__() + out_channels = [32, 64, 128] + + self.h0_conv = ConvBNLayer(in_channels[0], out_channels[0], 3, 1, act=None, name='fpn_down_h0') + self.h1_conv = ConvBNLayer(in_channels[1], out_channels[1], 3, 1, act=None, name='fpn_down_h1') + self.h2_conv = ConvBNLayer(in_channels[2], out_channels[2], 3, 1, act=None, name='fpn_down_h2') + + self.g0_conv = ConvBNLayer(out_channels[0], out_channels[1], 3, 2, act=None, name='fpn_down_g0') + + self.g1_conv = nn.Sequential( + ConvBNLayer(out_channels[1], out_channels[1], 3, 1, act='relu', name='fpn_down_g1_1'), + ConvBNLayer(out_channels[1], out_channels[2], 3, 2, act=None, name='fpn_down_g1_2') + ) + + self.g2_conv = nn.Sequential( + ConvBNLayer(out_channels[2], out_channels[2], 3, 1, act='relu', name='fpn_down_fusion_1'), + ConvBNLayer(out_channels[2], out_channels[2], 1, 1, act=None, name='fpn_down_fusion_2') + ) + + def forward(self, x): + f = x[:3] + h0 = self.h0_conv(f[0]) + h1 = self.h1_conv(f[1]) + h2 = self.h2_conv(f[2]) + g0 = self.g0_conv(h0) + g1 = paddle.add(x=g0, y=h1) + g1 = F.relu(g1) + g1 = self.g1_conv(g1) + g2 = paddle.add(x=g1, y=h2) + g2 = F.relu(g2) + g2 = self.g2_conv(g2) + return g2 + + +class Cross_Attention(nn.Layer): + def __init__(self, in_channels): + super(Cross_Attention, self).__init__() + self.theta_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act='relu', name='f_theta') + self.phi_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act='relu', name='f_phi') + self.g_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act='relu', name='f_g') + + self.fh_weight_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fh_weight') + self.fh_sc_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fh_sc') + + self.fv_weight_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fv_weight') + self.fv_sc_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fv_sc') + + self.f_attn_conv = ConvBNLayer(in_channels * 2, in_channels, 1, 1, act='relu', name='f_attn') + + def _cal_fweight(self, f, shape): + f_theta, f_phi, f_g = f + #flatten + f_theta = paddle.transpose(f_theta, [0, 2, 3, 1]) + f_theta = paddle.reshape(f_theta, [shape[0] * shape[1], shape[2], 128]) + f_phi = paddle.transpose(f_phi, [0, 2, 3, 1]) + f_phi = paddle.reshape(f_phi, [shape[0] * shape[1], shape[2], 128]) + f_g = paddle.transpose(f_g, [0, 2, 3, 1]) + f_g = paddle.reshape(f_g, [shape[0] * shape[1], shape[2], 128]) + #correlation + f_attn = paddle.matmul(f_theta, paddle.transpose(f_phi, [0, 2, 1])) + #scale + f_attn = f_attn / (128**0.5) + f_attn = F.softmax(f_attn) + #weighted sum + f_weight = paddle.matmul(f_attn, f_g) + f_weight = paddle.reshape( + f_weight, [shape[0], shape[1], shape[2], 128]) + return f_weight + + def forward(self, f_common): + f_shape = paddle.shape(f_common) + # print('f_shape: ', f_shape) + + f_theta = self.theta_conv(f_common) + f_phi = self.phi_conv(f_common) + f_g = self.g_conv(f_common) + + ######## horizon ######## + fh_weight = 
self._cal_fweight([f_theta, f_phi, f_g], + [f_shape[0], f_shape[2], f_shape[3]]) + fh_weight = paddle.transpose(fh_weight, [0, 3, 1, 2]) + fh_weight = self.fh_weight_conv(fh_weight) + #short cut + fh_sc = self.fh_sc_conv(f_common) + f_h = F.relu(fh_weight + fh_sc) + + ######## vertical ######## + fv_theta = paddle.transpose(f_theta, [0, 1, 3, 2]) + fv_phi = paddle.transpose(f_phi, [0, 1, 3, 2]) + fv_g = paddle.transpose(f_g, [0, 1, 3, 2]) + fv_weight = self._cal_fweight([fv_theta, fv_phi, fv_g], + [f_shape[0], f_shape[3], f_shape[2]]) + fv_weight = paddle.transpose(fv_weight, [0, 3, 2, 1]) + fv_weight = self.fv_weight_conv(fv_weight) + #short cut + fv_sc = self.fv_sc_conv(f_common) + f_v = F.relu(fv_weight + fv_sc) + + ######## merge ######## + f_attn = paddle.concat([f_h, f_v], axis=1) + f_attn = self.f_attn_conv(f_attn) + return f_attn + + +class SASTFPN(nn.Layer): + def __init__(self, in_channels, with_cab=False, **kwargs): + super(SASTFPN, self).__init__() + self.in_channels = in_channels + self.with_cab = with_cab + self.FPN_Down_Fusion = FPN_Down_Fusion(self.in_channels) + self.FPN_Up_Fusion = FPN_Up_Fusion(self.in_channels) + self.out_channels = 128 + self.cross_attention = Cross_Attention(self.out_channels) + + def forward(self, x): + #down fpn + f_down = self.FPN_Down_Fusion(x) + + #up fpn + f_up = self.FPN_Up_Fusion(x) + + #fusion + f_common = paddle.add(x=f_down, y=f_up) + f_common = F.relu(f_common) + + if self.with_cab: + # print('enhence f_common with CAB.') + f_common = self.cross_attention(f_common) + + return f_common diff --git a/backend/ppocr/modeling/necks/table_fpn.py b/backend/ppocr/modeling/necks/table_fpn.py new file mode 100644 index 0000000..734f15a --- /dev/null +++ b/backend/ppocr/modeling/necks/table_fpn.py @@ -0,0 +1,110 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
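+# TableFPN: the fused pyramid is scaled by 0.005 and added back onto c5 as a small residual,
+# so the neck outputs a single 512-channel feature map at c5 resolution.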
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr + + +class TableFPN(nn.Layer): + def __init__(self, in_channels, out_channels, **kwargs): + super(TableFPN, self).__init__() + self.out_channels = 512 + weight_attr = paddle.nn.initializer.KaimingUniform() + self.in2_conv = nn.Conv2D( + in_channels=in_channels[0], + out_channels=self.out_channels, + kernel_size=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.in3_conv = nn.Conv2D( + in_channels=in_channels[1], + out_channels=self.out_channels, + kernel_size=1, + stride = 1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.in4_conv = nn.Conv2D( + in_channels=in_channels[2], + out_channels=self.out_channels, + kernel_size=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.in5_conv = nn.Conv2D( + in_channels=in_channels[3], + out_channels=self.out_channels, + kernel_size=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.p5_conv = nn.Conv2D( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + kernel_size=3, + padding=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.p4_conv = nn.Conv2D( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + kernel_size=3, + padding=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.p3_conv = nn.Conv2D( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + kernel_size=3, + padding=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.p2_conv = nn.Conv2D( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + kernel_size=3, + padding=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.fuse_conv = nn.Conv2D( + in_channels=self.out_channels * 4, + out_channels=512, + kernel_size=3, + padding=1, + weight_attr=ParamAttr(initializer=weight_attr), bias_attr=False) + + def forward(self, x): + c2, c3, c4, c5 = x + + in5 = self.in5_conv(c5) + in4 = self.in4_conv(c4) + in3 = self.in3_conv(c3) + in2 = self.in2_conv(c2) + + out4 = in4 + F.upsample( + in5, size=in4.shape[2:4], mode="nearest", align_mode=1) # 1/16 + out3 = in3 + F.upsample( + out4, size=in3.shape[2:4], mode="nearest", align_mode=1) # 1/8 + out2 = in2 + F.upsample( + out3, size=in2.shape[2:4], mode="nearest", align_mode=1) # 1/4 + + p4 = F.upsample(out4, size=in5.shape[2:4], mode="nearest", align_mode=1) + p3 = F.upsample(out3, size=in5.shape[2:4], mode="nearest", align_mode=1) + p2 = F.upsample(out2, size=in5.shape[2:4], mode="nearest", align_mode=1) + fuse = paddle.concat([in5, p4, p3, p2], axis=1) + fuse_conv = self.fuse_conv(fuse) * 0.005 + return [c5 + fuse_conv] diff --git a/backend/ppocr/modeling/transforms/__init__.py b/backend/ppocr/modeling/transforms/__init__.py new file mode 100755 index 0000000..405ab3c --- /dev/null +++ b/backend/ppocr/modeling/transforms/__init__.py @@ -0,0 +1,28 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ['build_transform'] + + +def build_transform(config): + from .tps import TPS + from .stn import STN_ON + + support_dict = ['TPS', 'STN_ON'] + + module_name = config.pop('name') + assert module_name in support_dict, Exception( + 'transform only support {}'.format(support_dict)) + module_class = eval(module_name)(**config) + return module_class diff --git a/backend/ppocr/modeling/transforms/stn.py b/backend/ppocr/modeling/transforms/stn.py new file mode 100644 index 0000000..6f2bdda --- /dev/null +++ b/backend/ppocr/modeling/transforms/stn.py @@ -0,0 +1,135 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/ayumiymk/aster.pytorch/blob/master/lib/models/stn_head.py +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn, ParamAttr +from paddle.nn import functional as F +import numpy as np + +from .tps_spatial_transformer import TPSSpatialTransformer + + +def conv3x3_block(in_channels, out_channels, stride=1): + n = 3 * 3 * out_channels + w = math.sqrt(2. 
/ n) + conv_layer = nn.Conv2D( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=1, + weight_attr=nn.initializer.Normal( + mean=0.0, std=w), + bias_attr=nn.initializer.Constant(0)) + block = nn.Sequential(conv_layer, nn.BatchNorm2D(out_channels), nn.ReLU()) + return block + + +class STN(nn.Layer): + def __init__(self, in_channels, num_ctrlpoints, activation='none'): + super(STN, self).__init__() + self.in_channels = in_channels + self.num_ctrlpoints = num_ctrlpoints + self.activation = activation + self.stn_convnet = nn.Sequential( + conv3x3_block(in_channels, 32), #32x64 + nn.MaxPool2D( + kernel_size=2, stride=2), + conv3x3_block(32, 64), #16x32 + nn.MaxPool2D( + kernel_size=2, stride=2), + conv3x3_block(64, 128), # 8*16 + nn.MaxPool2D( + kernel_size=2, stride=2), + conv3x3_block(128, 256), # 4*8 + nn.MaxPool2D( + kernel_size=2, stride=2), + conv3x3_block(256, 256), # 2*4, + nn.MaxPool2D( + kernel_size=2, stride=2), + conv3x3_block(256, 256)) # 1*2 + self.stn_fc1 = nn.Sequential( + nn.Linear( + 2 * 256, + 512, + weight_attr=nn.initializer.Normal(0, 0.001), + bias_attr=nn.initializer.Constant(0)), + nn.BatchNorm1D(512), + nn.ReLU()) + fc2_bias = self.init_stn() + self.stn_fc2 = nn.Linear( + 512, + num_ctrlpoints * 2, + weight_attr=nn.initializer.Constant(0.0), + bias_attr=nn.initializer.Assign(fc2_bias)) + + def init_stn(self): + margin = 0.01 + sampling_num_per_side = int(self.num_ctrlpoints / 2) + ctrl_pts_x = np.linspace(margin, 1. - margin, sampling_num_per_side) + ctrl_pts_y_top = np.ones(sampling_num_per_side) * margin + ctrl_pts_y_bottom = np.ones(sampling_num_per_side) * (1 - margin) + ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1) + ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1) + ctrl_points = np.concatenate( + [ctrl_pts_top, ctrl_pts_bottom], axis=0).astype(np.float32) + if self.activation == 'none': + pass + elif self.activation == 'sigmoid': + ctrl_points = -np.log(1. / ctrl_points - 1.) + ctrl_points = paddle.to_tensor(ctrl_points) + fc2_bias = paddle.reshape( + ctrl_points, shape=[ctrl_points.shape[0] * ctrl_points.shape[1]]) + return fc2_bias + + def forward(self, x): + x = self.stn_convnet(x) + batch_size, _, h, w = x.shape + x = paddle.reshape(x, shape=(batch_size, -1)) + img_feat = self.stn_fc1(x) + x = self.stn_fc2(0.1 * img_feat) + if self.activation == 'sigmoid': + x = F.sigmoid(x) + x = paddle.reshape(x, shape=[-1, self.num_ctrlpoints, 2]) + return img_feat, x + + +class STN_ON(nn.Layer): + def __init__(self, in_channels, tps_inputsize, tps_outputsize, + num_control_points, tps_margins, stn_activation): + super(STN_ON, self).__init__() + self.tps = TPSSpatialTransformer( + output_image_size=tuple(tps_outputsize), + num_control_points=num_control_points, + margins=tuple(tps_margins)) + self.stn_head = STN(in_channels=in_channels, + num_ctrlpoints=num_control_points, + activation=stn_activation) + self.tps_inputsize = tps_inputsize + self.out_channels = in_channels + + def forward(self, image): + stn_input = paddle.nn.functional.interpolate( + image, self.tps_inputsize, mode="bilinear", align_corners=True) + stn_img_feat, ctrl_points = self.stn_head(stn_input) + x, _ = self.tps(image, ctrl_points) + return x diff --git a/backend/ppocr/modeling/transforms/tps.py b/backend/ppocr/modeling/transforms/tps.py new file mode 100644 index 0000000..9bdab0f --- /dev/null +++ b/backend/ppocr/modeling/transforms/tps.py @@ -0,0 +1,308 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/clovaai/deep-text-recognition-benchmark/blob/master/modules/transformation.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn, ParamAttr +from paddle.nn import functional as F +import numpy as np + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + bn_name = "bn_" + name + self.bn = nn.BatchNorm( + out_channels, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +class LocalizationNetwork(nn.Layer): + def __init__(self, in_channels, num_fiducial, loc_lr, model_name): + super(LocalizationNetwork, self).__init__() + self.F = num_fiducial + F = num_fiducial + if model_name == "large": + num_filters_list = [64, 128, 256, 512] + fc_dim = 256 + else: + num_filters_list = [16, 32, 64, 128] + fc_dim = 64 + + self.block_list = [] + for fno in range(0, len(num_filters_list)): + num_filters = num_filters_list[fno] + name = "loc_conv%d" % fno + conv = self.add_sublayer( + name, + ConvBNLayer( + in_channels=in_channels, + out_channels=num_filters, + kernel_size=3, + act='relu', + name=name)) + self.block_list.append(conv) + if fno == len(num_filters_list) - 1: + pool = nn.AdaptiveAvgPool2D(1) + else: + pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + in_channels = num_filters + self.block_list.append(pool) + name = "loc_fc1" + stdv = 1.0 / math.sqrt(num_filters_list[-1] * 1.0) + self.fc1 = nn.Linear( + in_channels, + fc_dim, + weight_attr=ParamAttr( + learning_rate=loc_lr, + name=name + "_w", + initializer=nn.initializer.Uniform(-stdv, stdv)), + bias_attr=ParamAttr(name=name + '.b_0'), + name=name) + + # Init fc2 in LocalizationNetwork + initial_bias = self.get_initial_fiducials() + initial_bias = initial_bias.reshape(-1) + name = "loc_fc2" + param_attr = ParamAttr( + learning_rate=loc_lr, + initializer=nn.initializer.Assign(np.zeros([fc_dim, F * 2])), + name=name + "_w") + bias_attr = ParamAttr( + learning_rate=loc_lr, + initializer=nn.initializer.Assign(initial_bias), + name=name + "_b") + self.fc2 = nn.Linear( + fc_dim, + F * 2, + weight_attr=param_attr, + bias_attr=bias_attr, + name=name) + self.out_channels = F * 2 + + def forward(self, x): + """ + Estimating parameters of geometric transformation + Args: + image: input + Return: + batch_C_prime: the matrix of the geometric transformation 
+ """ + B = x.shape[0] + i = 0 + for block in self.block_list: + x = block(x) + x = x.squeeze(axis=2).squeeze(axis=2) + x = self.fc1(x) + + x = F.relu(x) + x = self.fc2(x) + x = x.reshape(shape=[-1, self.F, 2]) + return x + + def get_initial_fiducials(self): + """ see RARE paper Fig. 6 (a) """ + F = self.F + ctrl_pts_x = np.linspace(-1.0, 1.0, int(F / 2)) + ctrl_pts_y_top = np.linspace(0.0, -1.0, num=int(F / 2)) + ctrl_pts_y_bottom = np.linspace(1.0, 0.0, num=int(F / 2)) + ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1) + ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1) + initial_bias = np.concatenate([ctrl_pts_top, ctrl_pts_bottom], axis=0) + return initial_bias + + +class GridGenerator(nn.Layer): + def __init__(self, in_channels, num_fiducial): + super(GridGenerator, self).__init__() + self.eps = 1e-6 + self.F = num_fiducial + + name = "ex_fc" + initializer = nn.initializer.Constant(value=0.0) + param_attr = ParamAttr( + learning_rate=0.0, initializer=initializer, name=name + "_w") + bias_attr = ParamAttr( + learning_rate=0.0, initializer=initializer, name=name + "_b") + self.fc = nn.Linear( + in_channels, + 6, + weight_attr=param_attr, + bias_attr=bias_attr, + name=name) + + def forward(self, batch_C_prime, I_r_size): + """ + Generate the grid for the grid_sampler. + Args: + batch_C_prime: the matrix of the geometric transformation + I_r_size: the shape of the input image + Return: + batch_P_prime: the grid for the grid_sampler + """ + C = self.build_C_paddle() + P = self.build_P_paddle(I_r_size) + + inv_delta_C_tensor = self.build_inv_delta_C_paddle(C).astype('float32') + P_hat_tensor = self.build_P_hat_paddle( + C, paddle.to_tensor(P)).astype('float32') + + inv_delta_C_tensor.stop_gradient = True + P_hat_tensor.stop_gradient = True + + batch_C_ex_part_tensor = self.get_expand_tensor(batch_C_prime) + + batch_C_ex_part_tensor.stop_gradient = True + + batch_C_prime_with_zeros = paddle.concat( + [batch_C_prime, batch_C_ex_part_tensor], axis=1) + batch_T = paddle.matmul(inv_delta_C_tensor, batch_C_prime_with_zeros) + batch_P_prime = paddle.matmul(P_hat_tensor, batch_T) + return batch_P_prime + + def build_C_paddle(self): + """ Return coordinates of fiducial points in I_r; C """ + F = self.F + ctrl_pts_x = paddle.linspace(-1.0, 1.0, int(F / 2), dtype='float64') + ctrl_pts_y_top = -1 * paddle.ones([int(F / 2)], dtype='float64') + ctrl_pts_y_bottom = paddle.ones([int(F / 2)], dtype='float64') + ctrl_pts_top = paddle.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1) + ctrl_pts_bottom = paddle.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1) + C = paddle.concat([ctrl_pts_top, ctrl_pts_bottom], axis=0) + return C # F x 2 + + def build_P_paddle(self, I_r_size): + I_r_height, I_r_width = I_r_size + I_r_grid_x = (paddle.arange( + -I_r_width, I_r_width, 2, dtype='float64') + 1.0 + ) / paddle.to_tensor(np.array([I_r_width])) + + I_r_grid_y = (paddle.arange( + -I_r_height, I_r_height, 2, dtype='float64') + 1.0 + ) / paddle.to_tensor(np.array([I_r_height])) + + # P: self.I_r_width x self.I_r_height x 2 + P = paddle.stack(paddle.meshgrid(I_r_grid_x, I_r_grid_y), axis=2) + P = paddle.transpose(P, perm=[1, 0, 2]) + # n (= self.I_r_width x self.I_r_height) x 2 + return P.reshape([-1, 2]) + + def build_inv_delta_C_paddle(self, C): + """ Return inv_delta_C which is needed to calculate T """ + F = self.F + hat_eye = paddle.eye(F, dtype='float64') # F x F + hat_C = paddle.norm( + C.reshape([1, F, 2]) - C.reshape([F, 1, 2]), axis=2) + hat_eye + hat_C = (hat_C**2) * paddle.log(hat_C) + 
delta_C = paddle.concat( # F+3 x F+3 + [ + paddle.concat( + [paddle.ones( + (F, 1), dtype='float64'), C, hat_C], axis=1), # F x F+3 + paddle.concat( + [ + paddle.zeros( + (2, 3), dtype='float64'), paddle.transpose( + C, perm=[1, 0]) + ], + axis=1), # 2 x F+3 + paddle.concat( + [ + paddle.zeros( + (1, 3), dtype='float64'), paddle.ones( + (1, F), dtype='float64') + ], + axis=1) # 1 x F+3 + ], + axis=0) + inv_delta_C = paddle.inverse(delta_C) + return inv_delta_C # F+3 x F+3 + + def build_P_hat_paddle(self, C, P): + F = self.F + eps = self.eps + n = P.shape[0] # n (= self.I_r_width x self.I_r_height) + # P_tile: n x 2 -> n x 1 x 2 -> n x F x 2 + P_tile = paddle.tile(paddle.unsqueeze(P, axis=1), (1, F, 1)) + C_tile = paddle.unsqueeze(C, axis=0) # 1 x F x 2 + P_diff = P_tile - C_tile # n x F x 2 + # rbf_norm: n x F + rbf_norm = paddle.norm(P_diff, p=2, axis=2, keepdim=False) + + # rbf: n x F + rbf = paddle.multiply( + paddle.square(rbf_norm), paddle.log(rbf_norm + eps)) + P_hat = paddle.concat( + [paddle.ones( + (n, 1), dtype='float64'), P, rbf], axis=1) + return P_hat # n x F+3 + + def get_expand_tensor(self, batch_C_prime): + B, H, C = batch_C_prime.shape + batch_C_prime = batch_C_prime.reshape([B, H * C]) + batch_C_ex_part_tensor = self.fc(batch_C_prime) + batch_C_ex_part_tensor = batch_C_ex_part_tensor.reshape([-1, 3, 2]) + return batch_C_ex_part_tensor + + +class TPS(nn.Layer): + def __init__(self, in_channels, num_fiducial, loc_lr, model_name): + super(TPS, self).__init__() + self.loc_net = LocalizationNetwork(in_channels, num_fiducial, loc_lr, + model_name) + self.grid_generator = GridGenerator(self.loc_net.out_channels, + num_fiducial) + self.out_channels = in_channels + + def forward(self, image): + image.stop_gradient = False + batch_C_prime = self.loc_net(image) + batch_P_prime = self.grid_generator(batch_C_prime, image.shape[2:]) + batch_P_prime = batch_P_prime.reshape( + [-1, image.shape[2], image.shape[3], 2]) + batch_I_r = F.grid_sample(x=image, grid=batch_P_prime) + return batch_I_r diff --git a/backend/ppocr/modeling/transforms/tps_spatial_transformer.py b/backend/ppocr/modeling/transforms/tps_spatial_transformer.py new file mode 100644 index 0000000..cb1cb10 --- /dev/null +++ b/backend/ppocr/modeling/transforms/tps_spatial_transformer.py @@ -0,0 +1,156 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
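For reference, a minimal usage sketch of the TPS module defined above. The hyper-parameters (a 1-channel 32x100 input batch, 20 fiducial points, loc_lr=0.1, model_name="small") are illustrative assumptions, and the import path assumes the ppocr package added in this commit is on the Python path:

import paddle
from ppocr.modeling.transforms.tps import TPS

# Build the TPS rectification module; the values below are illustrative only.
tps = TPS(in_channels=1, num_fiducial=20, loc_lr=0.1, model_name="small")

# A dummy batch of grayscale text images, N x C x H x W.
image = paddle.rand([2, 1, 32, 100])

# The localization network predicts fiducial points, the grid generator turns
# them into a sampling grid, and grid_sample yields the rectified images.
rectified = tps(image)
print(rectified.shape)  # [2, 1, 32, 100], same spatial size as the input
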
+""" +This code is refer from: +https://github.com/ayumiymk/aster.pytorch/blob/master/lib/models/tps_spatial_transformer.py +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn, ParamAttr +from paddle.nn import functional as F +import numpy as np +import itertools + + +def grid_sample(input, grid, canvas=None): + input.stop_gradient = False + output = F.grid_sample(input, grid) + if canvas is None: + return output + else: + input_mask = paddle.ones(shape=input.shape) + output_mask = F.grid_sample(input_mask, grid) + padded_output = output * output_mask + canvas * (1 - output_mask) + return padded_output + + +# phi(x1, x2) = r^2 * log(r), where r = ||x1 - x2||_2 +def compute_partial_repr(input_points, control_points): + N = input_points.shape[0] + M = control_points.shape[0] + pairwise_diff = paddle.reshape( + input_points, shape=[N, 1, 2]) - paddle.reshape( + control_points, shape=[1, M, 2]) + # original implementation, very slow + # pairwise_dist = torch.sum(pairwise_diff ** 2, dim = 2) # square of distance + pairwise_diff_square = pairwise_diff * pairwise_diff + pairwise_dist = pairwise_diff_square[:, :, 0] + pairwise_diff_square[:, :, + 1] + repr_matrix = 0.5 * pairwise_dist * paddle.log(pairwise_dist) + # fix numerical error for 0 * log(0), substitute all nan with 0 + mask = np.array(repr_matrix != repr_matrix) + repr_matrix[mask] = 0 + return repr_matrix + + +# output_ctrl_pts are specified, according to our task. +def build_output_control_points(num_control_points, margins): + margin_x, margin_y = margins + num_ctrl_pts_per_side = num_control_points // 2 + ctrl_pts_x = np.linspace(margin_x, 1.0 - margin_x, num_ctrl_pts_per_side) + ctrl_pts_y_top = np.ones(num_ctrl_pts_per_side) * margin_y + ctrl_pts_y_bottom = np.ones(num_ctrl_pts_per_side) * (1.0 - margin_y) + ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1) + ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1) + output_ctrl_pts_arr = np.concatenate( + [ctrl_pts_top, ctrl_pts_bottom], axis=0) + output_ctrl_pts = paddle.to_tensor(output_ctrl_pts_arr) + return output_ctrl_pts + + +class TPSSpatialTransformer(nn.Layer): + def __init__(self, + output_image_size=None, + num_control_points=None, + margins=None): + super(TPSSpatialTransformer, self).__init__() + self.output_image_size = output_image_size + self.num_control_points = num_control_points + self.margins = margins + + self.target_height, self.target_width = output_image_size + target_control_points = build_output_control_points(num_control_points, + margins) + N = num_control_points + + # create padded kernel matrix + forward_kernel = paddle.zeros(shape=[N + 3, N + 3]) + target_control_partial_repr = compute_partial_repr( + target_control_points, target_control_points) + target_control_partial_repr = paddle.cast(target_control_partial_repr, + forward_kernel.dtype) + forward_kernel[:N, :N] = target_control_partial_repr + forward_kernel[:N, -3] = 1 + forward_kernel[-3, :N] = 1 + target_control_points = paddle.cast(target_control_points, + forward_kernel.dtype) + forward_kernel[:N, -2:] = target_control_points + forward_kernel[-2:, :N] = paddle.transpose( + target_control_points, perm=[1, 0]) + # compute inverse matrix + inverse_kernel = paddle.inverse(forward_kernel) + + # create target cordinate matrix + HW = self.target_height * self.target_width + target_coordinate = list( + itertools.product( + range(self.target_height), 
range(self.target_width))) + target_coordinate = paddle.to_tensor(target_coordinate) # HW x 2 + Y, X = paddle.split( + target_coordinate, target_coordinate.shape[1], axis=1) + Y = Y / (self.target_height - 1) + X = X / (self.target_width - 1) + target_coordinate = paddle.concat( + [X, Y], axis=1) # convert from (y, x) to (x, y) + target_coordinate_partial_repr = compute_partial_repr( + target_coordinate, target_control_points) + target_coordinate_repr = paddle.concat( + [ + target_coordinate_partial_repr, paddle.ones(shape=[HW, 1]), + target_coordinate + ], + axis=1) + + # register precomputed matrices + self.inverse_kernel = inverse_kernel + self.padding_matrix = paddle.zeros(shape=[3, 2]) + self.target_coordinate_repr = target_coordinate_repr + self.target_control_points = target_control_points + + def forward(self, input, source_control_points): + assert source_control_points.ndimension() == 3 + assert source_control_points.shape[1] == self.num_control_points + assert source_control_points.shape[2] == 2 + batch_size = paddle.shape(source_control_points)[0] + + padding_matrix = paddle.expand( + self.padding_matrix, shape=[batch_size, 3, 2]) + Y = paddle.concat([source_control_points, padding_matrix], 1) + mapping_matrix = paddle.matmul(self.inverse_kernel, Y) + source_coordinate = paddle.matmul(self.target_coordinate_repr, + mapping_matrix) + + grid = paddle.reshape( + source_coordinate, + shape=[-1, self.target_height, self.target_width, 2]) + grid = paddle.clip(grid, 0, + 1) # the source_control_points may be out of [0, 1]. + # the input to grid_sample is normalized [-1, 1], but what we get is [0, 1] + grid = 2.0 * grid - 1.0 + output_maps = grid_sample(input, grid, canvas=None) + return output_maps, source_coordinate diff --git a/backend/ppocr/optimizer/__init__.py b/backend/ppocr/optimizer/__init__.py new file mode 100644 index 0000000..a6bd2eb --- /dev/null +++ b/backend/ppocr/optimizer/__init__.py @@ -0,0 +1,62 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +import copy +import paddle + +__all__ = ['build_optimizer'] + + +def build_lr_scheduler(lr_config, epochs, step_each_epoch): + from . import learning_rate + lr_config.update({'epochs': epochs, 'step_each_epoch': step_each_epoch}) + lr_name = lr_config.pop('name', 'Const') + lr = getattr(learning_rate, lr_name)(**lr_config)() + return lr + + +def build_optimizer(config, epochs, step_each_epoch, model): + from . 
import regularizer, optimizer + config = copy.deepcopy(config) + # step1 build lr + lr = build_lr_scheduler(config.pop('lr'), epochs, step_each_epoch) + + # step2 build regularization + if 'regularizer' in config and config['regularizer'] is not None: + reg_config = config.pop('regularizer') + reg_name = reg_config.pop('name') + if not hasattr(regularizer, reg_name): + reg_name += 'Decay' + reg = getattr(regularizer, reg_name)(**reg_config)() + elif 'weight_decay' in config: + reg = config.pop('weight_decay') + else: + reg = None + + # step3 build optimizer + optim_name = config.pop('name') + if 'clip_norm' in config: + clip_norm = config.pop('clip_norm') + grad_clip = paddle.nn.ClipGradByNorm(clip_norm=clip_norm) + else: + grad_clip = None + optim = getattr(optimizer, optim_name)(learning_rate=lr, + weight_decay=reg, + grad_clip=grad_clip, + **config) + return optim(model), lr diff --git a/backend/ppocr/optimizer/learning_rate.py b/backend/ppocr/optimizer/learning_rate.py new file mode 100644 index 0000000..fe251f3 --- /dev/null +++ b/backend/ppocr/optimizer/learning_rate.py @@ -0,0 +1,310 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from paddle.optimizer import lr +from .lr_scheduler import CyclicalCosineDecay, OneCycleDecay + + +class Linear(object): + """ + Linear learning rate decay + Args: + lr (float): The initial learning rate. It is a python float number. + epochs(int): The decay step size. It determines the decay cycle. + end_lr(float, optional): The minimum final learning rate. Default: 0.0001. + power(float, optional): Power of polynomial. Default: 1.0. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. 
+ """ + + def __init__(self, + learning_rate, + epochs, + step_each_epoch, + end_lr=0.0, + power=1.0, + warmup_epoch=0, + last_epoch=-1, + **kwargs): + super(Linear, self).__init__() + self.learning_rate = learning_rate + self.epochs = epochs * step_each_epoch + self.end_lr = end_lr + self.power = power + self.last_epoch = last_epoch + self.warmup_epoch = round(warmup_epoch * step_each_epoch) + + def __call__(self): + learning_rate = lr.PolynomialDecay( + learning_rate=self.learning_rate, + decay_steps=self.epochs, + end_lr=self.end_lr, + power=self.power, + last_epoch=self.last_epoch) + if self.warmup_epoch > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_epoch, + start_lr=0.0, + end_lr=self.learning_rate, + last_epoch=self.last_epoch) + return learning_rate + + +class Cosine(object): + """ + Cosine learning rate decay + lr = 0.05 * (math.cos(epoch * (math.pi / epochs)) + 1) + Args: + lr(float): initial learning rate + step_each_epoch(int): steps each epoch + epochs(int): total training epochs + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + """ + + def __init__(self, + learning_rate, + step_each_epoch, + epochs, + warmup_epoch=0, + last_epoch=-1, + **kwargs): + super(Cosine, self).__init__() + self.learning_rate = learning_rate + self.T_max = step_each_epoch * epochs + self.last_epoch = last_epoch + self.warmup_epoch = round(warmup_epoch * step_each_epoch) + + def __call__(self): + learning_rate = lr.CosineAnnealingDecay( + learning_rate=self.learning_rate, + T_max=self.T_max, + last_epoch=self.last_epoch) + if self.warmup_epoch > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_epoch, + start_lr=0.0, + end_lr=self.learning_rate, + last_epoch=self.last_epoch) + return learning_rate + + +class Step(object): + """ + Piecewise learning rate decay + Args: + step_each_epoch(int): steps each epoch + learning_rate (float): The initial learning rate. It is a python float number. + step_size (int): the interval to update. + gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . + It should be less than 1.0. Default: 0.1. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + """ + + def __init__(self, + learning_rate, + step_size, + step_each_epoch, + gamma, + warmup_epoch=0, + last_epoch=-1, + **kwargs): + super(Step, self).__init__() + self.step_size = step_each_epoch * step_size + self.learning_rate = learning_rate + self.gamma = gamma + self.last_epoch = last_epoch + self.warmup_epoch = round(warmup_epoch * step_each_epoch) + + def __call__(self): + learning_rate = lr.StepDecay( + learning_rate=self.learning_rate, + step_size=self.step_size, + gamma=self.gamma, + last_epoch=self.last_epoch) + if self.warmup_epoch > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_epoch, + start_lr=0.0, + end_lr=self.learning_rate, + last_epoch=self.last_epoch) + return learning_rate + + +class Piecewise(object): + """ + Piecewise learning rate decay + Args: + boundaries(list): A list of steps numbers. The type of element in the list is python int. + values(list): A list of learning rate values that will be picked during different epoch boundaries. + The type of element in the list is python float. + last_epoch (int, optional): The index of last epoch. 
Can be set to restart training. Default: -1, means initial learning rate. + """ + + def __init__(self, + step_each_epoch, + decay_epochs, + values, + warmup_epoch=0, + last_epoch=-1, + **kwargs): + super(Piecewise, self).__init__() + self.boundaries = [step_each_epoch * e for e in decay_epochs] + self.values = values + self.last_epoch = last_epoch + self.warmup_epoch = round(warmup_epoch * step_each_epoch) + + def __call__(self): + learning_rate = lr.PiecewiseDecay( + boundaries=self.boundaries, + values=self.values, + last_epoch=self.last_epoch) + if self.warmup_epoch > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_epoch, + start_lr=0.0, + end_lr=self.values[0], + last_epoch=self.last_epoch) + return learning_rate + + +class CyclicalCosine(object): + """ + Cyclical cosine learning rate decay + Args: + learning_rate(float): initial learning rate + step_each_epoch(int): steps each epoch + epochs(int): total training epochs + cycle(int): period of the cosine learning rate + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + """ + + def __init__(self, + learning_rate, + step_each_epoch, + epochs, + cycle, + warmup_epoch=0, + last_epoch=-1, + **kwargs): + super(CyclicalCosine, self).__init__() + self.learning_rate = learning_rate + self.T_max = step_each_epoch * epochs + self.last_epoch = last_epoch + self.warmup_epoch = round(warmup_epoch * step_each_epoch) + self.cycle = round(cycle * step_each_epoch) + + def __call__(self): + learning_rate = CyclicalCosineDecay( + learning_rate=self.learning_rate, + T_max=self.T_max, + cycle=self.cycle, + last_epoch=self.last_epoch) + if self.warmup_epoch > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_epoch, + start_lr=0.0, + end_lr=self.learning_rate, + last_epoch=self.last_epoch) + return learning_rate + + +class OneCycle(object): + """ + One Cycle learning rate decay + Args: + max_lr(float): Upper learning rate boundaries + epochs(int): total training epochs + step_each_epoch(int): steps each epoch + anneal_strategy(str): {‘cos’, ‘linear’} Specifies the annealing strategy: “cos” for cosine annealing, “linear” for linear annealing. + Default: ‘cos’ + three_phase(bool): If True, use a third phase of the schedule to annihilate the learning rate according to ‘final_div_factor’ + instead of modifying the second phase (the first two phases will be symmetrical about the step indicated by ‘pct_start’). + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. 
+ """ + + def __init__(self, + max_lr, + epochs, + step_each_epoch, + anneal_strategy='cos', + three_phase=False, + warmup_epoch=0, + last_epoch=-1, + **kwargs): + super(OneCycle, self).__init__() + self.max_lr = max_lr + self.epochs = epochs + self.steps_per_epoch = step_each_epoch + self.anneal_strategy = anneal_strategy + self.three_phase = three_phase + self.last_epoch = last_epoch + self.warmup_epoch = round(warmup_epoch * step_each_epoch) + + def __call__(self): + learning_rate = OneCycleDecay( + max_lr=self.max_lr, + epochs=self.epochs, + steps_per_epoch=self.steps_per_epoch, + anneal_strategy=self.anneal_strategy, + three_phase=self.three_phase, + last_epoch=self.last_epoch) + if self.warmup_epoch > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_epoch, + start_lr=0.0, + end_lr=self.max_lr, + last_epoch=self.last_epoch) + return learning_rate + + +class Const(object): + """ + Const learning rate decay + Args: + learning_rate(float): initial learning rate + step_each_epoch(int): steps each epoch + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + """ + + def __init__(self, + learning_rate, + step_each_epoch, + warmup_epoch=0, + last_epoch=-1, + **kwargs): + super(Const, self).__init__() + self.learning_rate = learning_rate + self.last_epoch = last_epoch + self.warmup_epoch = round(warmup_epoch * step_each_epoch) + + def __call__(self): + learning_rate = self.learning_rate + if self.warmup_epoch > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_epoch, + start_lr=0.0, + end_lr=self.learning_rate, + last_epoch=self.last_epoch) + return learning_rate diff --git a/backend/ppocr/optimizer/lr_scheduler.py b/backend/ppocr/optimizer/lr_scheduler.py new file mode 100644 index 0000000..f62f1f3 --- /dev/null +++ b/backend/ppocr/optimizer/lr_scheduler.py @@ -0,0 +1,162 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from paddle.optimizer.lr import LRScheduler + + +class CyclicalCosineDecay(LRScheduler): + def __init__(self, + learning_rate, + T_max, + cycle=1, + last_epoch=-1, + eta_min=0.0, + verbose=False): + """ + Cyclical cosine learning rate decay + A learning rate which can be referred in https://arxiv.org/pdf/2012.12645.pdf + Args: + learning rate(float): learning rate + T_max(int): maximum epoch num + cycle(int): period of the cosine decay + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. 
+ eta_min(float): minimum learning rate during training + verbose(bool): whether to print learning rate for each epoch + """ + super(CyclicalCosineDecay, self).__init__(learning_rate, last_epoch, + verbose) + self.cycle = cycle + self.eta_min = eta_min + + def get_lr(self): + if self.last_epoch == 0: + return self.base_lr + reletive_epoch = self.last_epoch % self.cycle + lr = self.eta_min + 0.5 * (self.base_lr - self.eta_min) * \ + (1 + math.cos(math.pi * reletive_epoch / self.cycle)) + return lr + + +class OneCycleDecay(LRScheduler): + """ + One Cycle learning rate decay + A learning rate which can be referred in https://arxiv.org/abs/1708.07120 + Code refered in https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR + """ + + def __init__(self, + max_lr, + epochs=None, + steps_per_epoch=None, + pct_start=0.3, + anneal_strategy='cos', + div_factor=25., + final_div_factor=1e4, + three_phase=False, + last_epoch=-1, + verbose=False): + + # Validate total_steps + if epochs <= 0 or not isinstance(epochs, int): + raise ValueError( + "Expected positive integer epochs, but got {}".format(epochs)) + if steps_per_epoch <= 0 or not isinstance(steps_per_epoch, int): + raise ValueError( + "Expected positive integer steps_per_epoch, but got {}".format( + steps_per_epoch)) + self.total_steps = epochs * steps_per_epoch + + self.max_lr = max_lr + self.initial_lr = self.max_lr / div_factor + self.min_lr = self.initial_lr / final_div_factor + + if three_phase: + self._schedule_phases = [ + { + 'end_step': float(pct_start * self.total_steps) - 1, + 'start_lr': self.initial_lr, + 'end_lr': self.max_lr, + }, + { + 'end_step': float(2 * pct_start * self.total_steps) - 2, + 'start_lr': self.max_lr, + 'end_lr': self.initial_lr, + }, + { + 'end_step': self.total_steps - 1, + 'start_lr': self.initial_lr, + 'end_lr': self.min_lr, + }, + ] + else: + self._schedule_phases = [ + { + 'end_step': float(pct_start * self.total_steps) - 1, + 'start_lr': self.initial_lr, + 'end_lr': self.max_lr, + }, + { + 'end_step': self.total_steps - 1, + 'start_lr': self.max_lr, + 'end_lr': self.min_lr, + }, + ] + + # Validate pct_start + if pct_start < 0 or pct_start > 1 or not isinstance(pct_start, float): + raise ValueError( + "Expected float between 0 and 1 pct_start, but got {}".format( + pct_start)) + + # Validate anneal_strategy + if anneal_strategy not in ['cos', 'linear']: + raise ValueError( + "anneal_strategy must by one of 'cos' or 'linear', instead got {}". + format(anneal_strategy)) + elif anneal_strategy == 'cos': + self.anneal_func = self._annealing_cos + elif anneal_strategy == 'linear': + self.anneal_func = self._annealing_linear + + super(OneCycleDecay, self).__init__(max_lr, last_epoch, verbose) + + def _annealing_cos(self, start, end, pct): + "Cosine anneal from `start` to `end` as pct goes from 0.0 to 1.0." + cos_out = math.cos(math.pi * pct) + 1 + return end + (start - end) / 2.0 * cos_out + + def _annealing_linear(self, start, end, pct): + "Linearly anneal from `start` to `end` as pct goes from 0.0 to 1.0." + return (end - start) * pct + start + + def get_lr(self): + computed_lr = 0.0 + step_num = self.last_epoch + + if step_num > self.total_steps: + raise ValueError( + "Tried to step {} times. 
The specified number of total steps is {}" + .format(step_num + 1, self.total_steps)) + start_step = 0 + for i, phase in enumerate(self._schedule_phases): + end_step = phase['end_step'] + if step_num <= end_step or i == len(self._schedule_phases) - 1: + pct = (step_num - start_step) / (end_step - start_step) + computed_lr = self.anneal_func(phase['start_lr'], + phase['end_lr'], pct) + break + start_step = phase['end_step'] + + return computed_lr diff --git a/backend/ppocr/optimizer/optimizer.py b/backend/ppocr/optimizer/optimizer.py new file mode 100644 index 0000000..dd8544e --- /dev/null +++ b/backend/ppocr/optimizer/optimizer.py @@ -0,0 +1,234 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from paddle import optimizer as optim + + +class Momentum(object): + """ + Simple Momentum optimizer with velocity state. + Args: + learning_rate (float|Variable) - The learning rate used to update parameters. + Can be a float value or a Variable with one float value as data element. + momentum (float) - Momentum factor. + regularization (WeightDecayRegularizer, optional) - The strategy of regularization. + """ + + def __init__(self, + learning_rate, + momentum, + weight_decay=None, + grad_clip=None, + **args): + super(Momentum, self).__init__() + self.learning_rate = learning_rate + self.momentum = momentum + self.weight_decay = weight_decay + self.grad_clip = grad_clip + + def __call__(self, model): + train_params = [ + param for param in model.parameters() if param.trainable is True + ] + opt = optim.Momentum( + learning_rate=self.learning_rate, + momentum=self.momentum, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + parameters=train_params) + return opt + + +class Adam(object): + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + parameter_list=None, + weight_decay=None, + grad_clip=None, + name=None, + lazy_mode=False, + **kwargs): + self.learning_rate = learning_rate + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.parameter_list = parameter_list + self.learning_rate = learning_rate + self.weight_decay = weight_decay + self.grad_clip = grad_clip + self.name = name + self.lazy_mode = lazy_mode + + def __call__(self, model): + train_params = [ + param for param in model.parameters() if param.trainable is True + ] + opt = optim.Adam( + learning_rate=self.learning_rate, + beta1=self.beta1, + beta2=self.beta2, + epsilon=self.epsilon, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + name=self.name, + lazy_mode=self.lazy_mode, + parameters=train_params) + return opt + + +class RMSProp(object): + """ + Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning rate method. + Args: + learning_rate (float|Variable) - The learning rate used to update parameters. 
+ Can be a float value or a Variable with one float value as data element. + momentum (float) - Momentum factor. + rho (float) - rho value in equation. + epsilon (float) - avoid division by zero, default is 1e-6. + regularization (WeightDecayRegularizer, optional) - The strategy of regularization. + """ + + def __init__(self, + learning_rate, + momentum=0.0, + rho=0.95, + epsilon=1e-6, + weight_decay=None, + grad_clip=None, + **args): + super(RMSProp, self).__init__() + self.learning_rate = learning_rate + self.momentum = momentum + self.rho = rho + self.epsilon = epsilon + self.weight_decay = weight_decay + self.grad_clip = grad_clip + + def __call__(self, model): + train_params = [ + param for param in model.parameters() if param.trainable is True + ] + opt = optim.RMSProp( + learning_rate=self.learning_rate, + momentum=self.momentum, + rho=self.rho, + epsilon=self.epsilon, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + parameters=train_params) + return opt + + +class Adadelta(object): + def __init__(self, + learning_rate=0.001, + epsilon=1e-08, + rho=0.95, + parameter_list=None, + weight_decay=None, + grad_clip=None, + name=None, + **kwargs): + self.learning_rate = learning_rate + self.epsilon = epsilon + self.rho = rho + self.parameter_list = parameter_list + self.learning_rate = learning_rate + self.weight_decay = weight_decay + self.grad_clip = grad_clip + self.name = name + + def __call__(self, model): + train_params = [ + param for param in model.parameters() if param.trainable is True + ] + opt = optim.Adadelta( + learning_rate=self.learning_rate, + epsilon=self.epsilon, + rho=self.rho, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + name=self.name, + parameters=train_params) + return opt + + +class AdamW(object): + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + weight_decay=0.01, + multi_precision=False, + grad_clip=None, + no_weight_decay_name=None, + one_dim_param_no_weight_decay=False, + name=None, + lazy_mode=False, + **args): + super().__init__() + self.learning_rate = learning_rate + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.grad_clip = grad_clip + self.weight_decay = 0.01 if weight_decay is None else weight_decay + self.grad_clip = grad_clip + self.name = name + self.lazy_mode = lazy_mode + self.multi_precision = multi_precision + self.no_weight_decay_name_list = no_weight_decay_name.split( + ) if no_weight_decay_name else [] + self.one_dim_param_no_weight_decay = one_dim_param_no_weight_decay + + def __call__(self, model): + parameters = [ + param for param in model.parameters() if param.trainable is True + ] + + self.no_weight_decay_param_name_list = [ + p.name for n, p in model.named_parameters() + if any(nd in n for nd in self.no_weight_decay_name_list) + ] + + if self.one_dim_param_no_weight_decay: + self.no_weight_decay_param_name_list += [ + p.name for n, p in model.named_parameters() if len(p.shape) == 1 + ] + + opt = optim.AdamW( + learning_rate=self.learning_rate, + beta1=self.beta1, + beta2=self.beta2, + epsilon=self.epsilon, + parameters=parameters, + weight_decay=self.weight_decay, + multi_precision=self.multi_precision, + grad_clip=self.grad_clip, + name=self.name, + lazy_mode=self.lazy_mode, + apply_decay_param_fun=self._apply_decay_param_fun) + return opt + + def _apply_decay_param_fun(self, name): + return name not in self.no_weight_decay_param_name_list diff --git a/backend/ppocr/optimizer/regularizer.py b/backend/ppocr/optimizer/regularizer.py new file 
mode 100644 index 0000000..2ce68f7 --- /dev/null +++ b/backend/ppocr/optimizer/regularizer.py @@ -0,0 +1,51 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import paddle + + +class L1Decay(object): + """ + L1 Weight Decay Regularization, which encourages the weights to be sparse. + Args: + factor(float): regularization coeff. Default:0.0. + """ + + def __init__(self, factor=0.0): + super(L1Decay, self).__init__() + self.coeff = factor + + def __call__(self): + reg = paddle.regularizer.L1Decay(self.coeff) + return reg + + +class L2Decay(object): + """ + L2 Weight Decay Regularization, which helps to prevent the model over-fitting. + Args: + factor(float): regularization coeff. Default:0.0. + """ + + def __init__(self, factor=0.0): + super(L2Decay, self).__init__() + self.coeff = float(factor) + + def __call__(self): + return self.coeff \ No newline at end of file diff --git a/backend/ppocr/postprocess/__init__.py b/backend/ppocr/postprocess/__init__.py new file mode 100644 index 0000000..f50b5f1 --- /dev/null +++ b/backend/ppocr/postprocess/__init__.py @@ -0,0 +1,61 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
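Taken together, the optimizer package above (learning_rate, regularizer, optimizer) is driven by a single config dict. A minimal sketch of how build_optimizer wires a scheduler, a regularizer and an optimizer together; the concrete values and the stand-in model are illustrative assumptions:

import paddle
from ppocr.optimizer import build_optimizer

model = paddle.nn.Linear(10, 2)  # stand-in model for illustration

config = {
    'name': 'Adam',
    'beta1': 0.9,
    'beta2': 0.999,
    'lr': {'name': 'Cosine', 'learning_rate': 0.001, 'warmup_epoch': 2},
    'regularizer': {'name': 'L2', 'factor': 1e-5},
}
# build_optimizer pops 'lr' and 'regularizer', resolves 'L2' to L2Decay,
# and returns the paddle optimizer together with the lr scheduler.
optimizer, lr_scheduler = build_optimizer(
    config, epochs=100, step_each_epoch=500, model=model)
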
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import copy + +__all__ = ['build_post_process'] + +from .db_postprocess import DBPostProcess, DistillationDBPostProcess +from .east_postprocess import EASTPostProcess +from .sast_postprocess import SASTPostProcess +from .fce_postprocess import FCEPostProcess +from .rec_postprocess import CTCLabelDecode, AttnLabelDecode, SRNLabelDecode, \ + DistillationCTCLabelDecode, TableLabelDecode, NRTRLabelDecode, SARLabelDecode, \ + SEEDLabelDecode, PRENLabelDecode +from .cls_postprocess import ClsPostProcess +from .pg_postprocess import PGPostProcess +from .vqa_token_ser_layoutlm_postprocess import VQASerTokenLayoutLMPostProcess +from .vqa_token_re_layoutlm_postprocess import VQAReTokenLayoutLMPostProcess + + +def build_post_process(config, global_config=None): + support_dict = [ + 'DBPostProcess', 'EASTPostProcess', 'SASTPostProcess', 'FCEPostProcess', + 'CTCLabelDecode', 'AttnLabelDecode', 'ClsPostProcess', 'SRNLabelDecode', + 'PGPostProcess', 'DistillationCTCLabelDecode', 'TableLabelDecode', + 'DistillationDBPostProcess', 'NRTRLabelDecode', 'SARLabelDecode', + 'SEEDLabelDecode', 'VQASerTokenLayoutLMPostProcess', + 'VQAReTokenLayoutLMPostProcess', 'PRENLabelDecode', + 'DistillationSARLabelDecode' + ] + + if config['name'] == 'PSEPostProcess': + from .pse_postprocess import PSEPostProcess + support_dict.append('PSEPostProcess') + + config = copy.deepcopy(config) + module_name = config.pop('name') + if module_name == "None": + return + if global_config is not None: + config.update(global_config) + assert module_name in support_dict, Exception( + 'post process only support {}'.format(support_dict)) + module_class = eval(module_name)(**config) + return module_class diff --git a/backend/ppocr/postprocess/cls_postprocess.py b/backend/ppocr/postprocess/cls_postprocess.py new file mode 100644 index 0000000..9a27ba0 --- /dev/null +++ b/backend/ppocr/postprocess/cls_postprocess.py @@ -0,0 +1,42 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
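A minimal sketch of building a post-processor from a config dict via build_post_process above; the DBPostProcess thresholds below are illustrative assumptions that mirror common detection configs:

from ppocr.postprocess import build_post_process

db_config = {
    'name': 'DBPostProcess',
    'thresh': 0.3,
    'box_thresh': 0.6,
    'max_candidates': 1000,
    'unclip_ratio': 1.5,
}
post_process = build_post_process(db_config)
# Calling post_process(outs_dict, shape_list) returns a list with one dict per
# image, each holding the detected text boxes under the 'points' key.
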
+import paddle + + +class ClsPostProcess(object): + """ Convert between text-label and text-index """ + + def __init__(self, label_list=None, key=None, **kwargs): + super(ClsPostProcess, self).__init__() + self.label_list = label_list + self.key = key + + def __call__(self, preds, label=None, *args, **kwargs): + if self.key is not None: + preds = preds[self.key] + + label_list = self.label_list + if label_list is None: + label_list = {idx: idx for idx in range(preds.shape[-1])} + + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() + + pred_idxs = preds.argmax(axis=1) + decode_out = [(label_list[idx], preds[i, idx]) + for i, idx in enumerate(pred_idxs)] + if label is None: + return decode_out + label = [(label_list[idx], 1.0) for idx in label] + return decode_out, label diff --git a/backend/ppocr/postprocess/db_postprocess.py b/backend/ppocr/postprocess/db_postprocess.py new file mode 100755 index 0000000..27b428e --- /dev/null +++ b/backend/ppocr/postprocess/db_postprocess.py @@ -0,0 +1,220 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refered from: +https://github.com/WenmuZhou/DBNet.pytorch/blob/master/post_processing/seg_detector_representer.py +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import cv2 +import paddle +from shapely.geometry import Polygon +import pyclipper + + +class DBPostProcess(object): + """ + The post process for Differentiable Binarization (DB). 
+ """ + + def __init__(self, + thresh=0.3, + box_thresh=0.7, + max_candidates=1000, + unclip_ratio=2.0, + use_dilation=False, + score_mode="fast", + **kwargs): + self.thresh = thresh + self.box_thresh = box_thresh + self.max_candidates = max_candidates + self.unclip_ratio = unclip_ratio + self.min_size = 3 + self.score_mode = score_mode + assert score_mode in [ + "slow", "fast" + ], "Score mode must be in [slow, fast] but got: {}".format(score_mode) + + self.dilation_kernel = None if not use_dilation else np.array( + [[1, 1], [1, 1]]) + + def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height): + ''' + _bitmap: single map with shape (1, H, W), + whose values are binarized as {0, 1} + ''' + + bitmap = _bitmap + height, width = bitmap.shape + + outs = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST, + cv2.CHAIN_APPROX_SIMPLE) + if len(outs) == 3: + img, contours, _ = outs[0], outs[1], outs[2] + elif len(outs) == 2: + contours, _ = outs[0], outs[1] + + num_contours = min(len(contours), self.max_candidates) + + boxes = [] + scores = [] + for index in range(num_contours): + contour = contours[index] + points, sside = self.get_mini_boxes(contour) + if sside < self.min_size: + continue + points = np.array(points) + if self.score_mode == "fast": + score = self.box_score_fast(pred, points.reshape(-1, 2)) + else: + score = self.box_score_slow(pred, contour) + if self.box_thresh > score: + continue + + box = self.unclip(points).reshape(-1, 1, 2) + box, sside = self.get_mini_boxes(box) + if sside < self.min_size + 2: + continue + box = np.array(box) + + box[:, 0] = np.clip( + np.round(box[:, 0] / width * dest_width), 0, dest_width) + box[:, 1] = np.clip( + np.round(box[:, 1] / height * dest_height), 0, dest_height) + boxes.append(box.astype(np.int16)) + scores.append(score) + return np.array(boxes, dtype=np.int16), scores + + def unclip(self, box): + unclip_ratio = self.unclip_ratio + poly = Polygon(box) + distance = poly.area * unclip_ratio / poly.length + offset = pyclipper.PyclipperOffset() + offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON) + expanded = np.array(offset.Execute(distance)) + return expanded + + def get_mini_boxes(self, contour): + bounding_box = cv2.minAreaRect(contour) + points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0]) + + index_1, index_2, index_3, index_4 = 0, 1, 2, 3 + if points[1][1] > points[0][1]: + index_1 = 0 + index_4 = 1 + else: + index_1 = 1 + index_4 = 0 + if points[3][1] > points[2][1]: + index_2 = 2 + index_3 = 3 + else: + index_2 = 3 + index_3 = 2 + + box = [ + points[index_1], points[index_2], points[index_3], points[index_4] + ] + return box, min(bounding_box[1]) + + def box_score_fast(self, bitmap, _box): + ''' + box_score_fast: use bbox mean score as the mean score + ''' + h, w = bitmap.shape[:2] + box = _box.copy() + xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int), 0, w - 1) + xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int), 0, w - 1) + ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int), 0, h - 1) + ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int), 0, h - 1) + + mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) + box[:, 0] = box[:, 0] - xmin + box[:, 1] = box[:, 1] - ymin + cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1) + return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0] + + def box_score_slow(self, bitmap, contour): + ''' + box_score_slow: use polyon mean score as the mean score + ''' + h, w = bitmap.shape[:2] + contour = 
contour.copy() + contour = np.reshape(contour, (-1, 2)) + + xmin = np.clip(np.min(contour[:, 0]), 0, w - 1) + xmax = np.clip(np.max(contour[:, 0]), 0, w - 1) + ymin = np.clip(np.min(contour[:, 1]), 0, h - 1) + ymax = np.clip(np.max(contour[:, 1]), 0, h - 1) + + mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) + + contour[:, 0] = contour[:, 0] - xmin + contour[:, 1] = contour[:, 1] - ymin + + cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype(np.int32), 1) + return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0] + + def __call__(self, outs_dict, shape_list): + pred = outs_dict['maps'] + if isinstance(pred, paddle.Tensor): + pred = pred.numpy() + pred = pred[:, 0, :, :] + segmentation = pred > self.thresh + + boxes_batch = [] + for batch_index in range(pred.shape[0]): + src_h, src_w, ratio_h, ratio_w = shape_list[batch_index] + if self.dilation_kernel is not None: + mask = cv2.dilate( + np.array(segmentation[batch_index]).astype(np.uint8), + self.dilation_kernel) + else: + mask = segmentation[batch_index] + boxes, scores = self.boxes_from_bitmap(pred[batch_index], mask, + src_w, src_h) + + boxes_batch.append({'points': boxes}) + return boxes_batch + + +class DistillationDBPostProcess(object): + def __init__(self, + model_name=["student"], + key=None, + thresh=0.3, + box_thresh=0.6, + max_candidates=1000, + unclip_ratio=1.5, + use_dilation=False, + score_mode="fast", + **kwargs): + self.model_name = model_name + self.key = key + self.post_process = DBPostProcess( + thresh=thresh, + box_thresh=box_thresh, + max_candidates=max_candidates, + unclip_ratio=unclip_ratio, + use_dilation=use_dilation, + score_mode=score_mode) + + def __call__(self, predicts, shape_list): + results = {} + for k in self.model_name: + results[k] = self.post_process(predicts[k], shape_list=shape_list) + return results diff --git a/backend/ppocr/postprocess/east_postprocess.py b/backend/ppocr/postprocess/east_postprocess.py new file mode 100755 index 0000000..c194c81 --- /dev/null +++ b/backend/ppocr/postprocess/east_postprocess.py @@ -0,0 +1,143 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from .locality_aware_nms import nms_locality +import cv2 +import paddle + +import os +import sys + + +class EASTPostProcess(object): + """ + The post process for EAST. + """ + + def __init__(self, + score_thresh=0.8, + cover_thresh=0.1, + nms_thresh=0.2, + **kwargs): + + self.score_thresh = score_thresh + self.cover_thresh = cover_thresh + self.nms_thresh = nms_thresh + + def restore_rectangle_quad(self, origin, geometry): + """ + Restore rectangle from quadrangle. 
+ """ + # quad + origin_concat = np.concatenate( + (origin, origin, origin, origin), axis=1) # (n, 8) + pred_quads = origin_concat - geometry + pred_quads = pred_quads.reshape((-1, 4, 2)) # (n, 4, 2) + return pred_quads + + def detect(self, + score_map, + geo_map, + score_thresh=0.8, + cover_thresh=0.1, + nms_thresh=0.2): + """ + restore text boxes from score map and geo map + """ + + score_map = score_map[0] + geo_map = np.swapaxes(geo_map, 1, 0) + geo_map = np.swapaxes(geo_map, 1, 2) + # filter the score map + xy_text = np.argwhere(score_map > score_thresh) + if len(xy_text) == 0: + return [] + # sort the text boxes via the y axis + xy_text = xy_text[np.argsort(xy_text[:, 0])] + #restore quad proposals + text_box_restored = self.restore_rectangle_quad( + xy_text[:, ::-1] * 4, geo_map[xy_text[:, 0], xy_text[:, 1], :]) + boxes = np.zeros((text_box_restored.shape[0], 9), dtype=np.float32) + boxes[:, :8] = text_box_restored.reshape((-1, 8)) + boxes[:, 8] = score_map[xy_text[:, 0], xy_text[:, 1]] + + try: + import lanms + boxes = lanms.merge_quadrangle_n9(boxes, nms_thresh) + except: + print( + 'you should install lanms by pip3 install lanms-nova to speed up nms_locality' + ) + boxes = nms_locality(boxes.astype(np.float64), nms_thresh) + if boxes.shape[0] == 0: + return [] + # Here we filter some low score boxes by the average score map, + # this is different from the orginal paper. + for i, box in enumerate(boxes): + mask = np.zeros_like(score_map, dtype=np.uint8) + cv2.fillPoly(mask, box[:8].reshape( + (-1, 4, 2)).astype(np.int32) // 4, 1) + boxes[i, 8] = cv2.mean(score_map, mask)[0] + boxes = boxes[boxes[:, 8] > cover_thresh] + return boxes + + def sort_poly(self, p): + """ + Sort polygons. + """ + min_axis = np.argmin(np.sum(p, axis=1)) + p = p[[min_axis, (min_axis + 1) % 4,\ + (min_axis + 2) % 4, (min_axis + 3) % 4]] + if abs(p[0, 0] - p[1, 0]) > abs(p[0, 1] - p[1, 1]): + return p + else: + return p[[0, 3, 2, 1]] + + def __call__(self, outs_dict, shape_list): + score_list = outs_dict['f_score'] + geo_list = outs_dict['f_geo'] + if isinstance(score_list, paddle.Tensor): + score_list = score_list.numpy() + geo_list = geo_list.numpy() + img_num = len(shape_list) + dt_boxes_list = [] + for ino in range(img_num): + score = score_list[ino] + geo = geo_list[ino] + boxes = self.detect( + score_map=score, + geo_map=geo, + score_thresh=self.score_thresh, + cover_thresh=self.cover_thresh, + nms_thresh=self.nms_thresh) + boxes_norm = [] + if len(boxes) > 0: + h, w = score.shape[1:] + src_h, src_w, ratio_h, ratio_w = shape_list[ino] + boxes = boxes[:, :8].reshape((-1, 4, 2)) + boxes[:, :, 0] /= ratio_w + boxes[:, :, 1] /= ratio_h + for i_box, box in enumerate(boxes): + box = self.sort_poly(box.astype(np.int32)) + if np.linalg.norm(box[0] - box[1]) < 5 \ + or np.linalg.norm(box[3] - box[0]) < 5: + continue + boxes_norm.append(box) + dt_boxes_list.append({'points': np.array(boxes_norm)}) + return dt_boxes_list diff --git a/backend/ppocr/postprocess/fce_postprocess.py b/backend/ppocr/postprocess/fce_postprocess.py new file mode 100755 index 0000000..8e0716f --- /dev/null +++ b/backend/ppocr/postprocess/fce_postprocess.py @@ -0,0 +1,241 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/v0.3.0/mmocr/models/textdet/postprocess/wrapper.py +""" + +import cv2 +import paddle +import numpy as np +from numpy.fft import ifft +from ppocr.utils.poly_nms import poly_nms, valid_boundary + + +def fill_hole(input_mask): + h, w = input_mask.shape + canvas = np.zeros((h + 2, w + 2), np.uint8) + canvas[1:h + 1, 1:w + 1] = input_mask.copy() + + mask = np.zeros((h + 4, w + 4), np.uint8) + + cv2.floodFill(canvas, mask, (0, 0), 1) + canvas = canvas[1:h + 1, 1:w + 1].astype(np.bool) + + return ~canvas | input_mask + + +def fourier2poly(fourier_coeff, num_reconstr_points=50): + """ Inverse Fourier transform + Args: + fourier_coeff (ndarray): Fourier coefficients shaped (n, 2k+1), + with n and k being candidates number and Fourier degree + respectively. + num_reconstr_points (int): Number of reconstructed polygon points. + Returns: + Polygons (ndarray): The reconstructed polygons shaped (n, n') + """ + + a = np.zeros((len(fourier_coeff), num_reconstr_points), dtype='complex') + k = (len(fourier_coeff[0]) - 1) // 2 + + a[:, 0:k + 1] = fourier_coeff[:, k:] + a[:, -k:] = fourier_coeff[:, :k] + + poly_complex = ifft(a) * num_reconstr_points + polygon = np.zeros((len(fourier_coeff), num_reconstr_points, 2)) + polygon[:, :, 0] = poly_complex.real + polygon[:, :, 1] = poly_complex.imag + return polygon.astype('int32').reshape((len(fourier_coeff), -1)) + + +class FCEPostProcess(object): + """ + The post process for FCENet. + """ + + def __init__(self, + scales, + fourier_degree=5, + num_reconstr_points=50, + decoding_type='fcenet', + score_thr=0.3, + nms_thr=0.1, + alpha=1.0, + beta=1.0, + box_type='poly', + **kwargs): + + self.scales = scales + self.fourier_degree = fourier_degree + self.num_reconstr_points = num_reconstr_points + self.decoding_type = decoding_type + self.score_thr = score_thr + self.nms_thr = nms_thr + self.alpha = alpha + self.beta = beta + self.box_type = box_type + + def __call__(self, preds, shape_list): + score_maps = [] + for key, value in preds.items(): + if isinstance(value, paddle.Tensor): + value = value.numpy() + cls_res = value[:, :4, :, :] + reg_res = value[:, 4:, :, :] + score_maps.append([cls_res, reg_res]) + + return self.get_boundary(score_maps, shape_list) + + def resize_boundary(self, boundaries, scale_factor): + """Rescale boundaries via scale_factor. + + Args: + boundaries (list[list[float]]): The boundary list. Each boundary + with size 2k+1 with k>=4. + scale_factor(ndarray): The scale factor of size (4,). + + Returns: + boundaries (list[list[float]]): The scaled boundaries. 
+ """ + boxes = [] + scores = [] + for b in boundaries: + sz = len(b) + valid_boundary(b, True) + scores.append(b[-1]) + b = (np.array(b[:sz - 1]) * + (np.tile(scale_factor[:2], int( + (sz - 1) / 2)).reshape(1, sz - 1))).flatten().tolist() + boxes.append(np.array(b).reshape([-1, 2])) + + return np.array(boxes, dtype=np.float32), scores + + def get_boundary(self, score_maps, shape_list): + assert len(score_maps) == len(self.scales) + boundaries = [] + for idx, score_map in enumerate(score_maps): + scale = self.scales[idx] + boundaries = boundaries + self._get_boundary_single(score_map, + scale) + + # nms + boundaries = poly_nms(boundaries, self.nms_thr) + boundaries, scores = self.resize_boundary( + boundaries, (1 / shape_list[0, 2:]).tolist()[::-1]) + + boxes_batch = [dict(points=boundaries, scores=scores)] + return boxes_batch + + def _get_boundary_single(self, score_map, scale): + assert len(score_map) == 2 + assert score_map[1].shape[1] == 4 * self.fourier_degree + 2 + + return self.fcenet_decode( + preds=score_map, + fourier_degree=self.fourier_degree, + num_reconstr_points=self.num_reconstr_points, + scale=scale, + alpha=self.alpha, + beta=self.beta, + box_type=self.box_type, + score_thr=self.score_thr, + nms_thr=self.nms_thr) + + def fcenet_decode(self, + preds, + fourier_degree, + num_reconstr_points, + scale, + alpha=1.0, + beta=2.0, + box_type='poly', + score_thr=0.3, + nms_thr=0.1): + """Decoding predictions of FCENet to instances. + + Args: + preds (list(Tensor)): The head output tensors. + fourier_degree (int): The maximum Fourier transform degree k. + num_reconstr_points (int): The points number of the polygon + reconstructed from predicted Fourier coefficients. + scale (int): The down-sample scale of the prediction. + alpha (float) : The parameter to calculate final scores. Score_{final} + = (Score_{text region} ^ alpha) + * (Score_{text center region}^ beta) + beta (float) : The parameter to calculate final score. + box_type (str): Boundary encoding type 'poly' or 'quad'. + score_thr (float) : The threshold used to filter out the final + candidates. + nms_thr (float) : The threshold of nms. + + Returns: + boundaries (list[list[float]]): The instance boundary and confidence + list. 
+ """ + assert isinstance(preds, list) + assert len(preds) == 2 + assert box_type in ['poly', 'quad'] + + cls_pred = preds[0][0] + tr_pred = cls_pred[0:2] + tcl_pred = cls_pred[2:] + + reg_pred = preds[1][0].transpose([1, 2, 0]) + x_pred = reg_pred[:, :, :2 * fourier_degree + 1] + y_pred = reg_pred[:, :, 2 * fourier_degree + 1:] + + score_pred = (tr_pred[1]**alpha) * (tcl_pred[1]**beta) + tr_pred_mask = (score_pred) > score_thr + tr_mask = fill_hole(tr_pred_mask) + + tr_contours, _ = cv2.findContours( + tr_mask.astype(np.uint8), cv2.RETR_TREE, + cv2.CHAIN_APPROX_SIMPLE) # opencv4 + + mask = np.zeros_like(tr_mask) + boundaries = [] + for cont in tr_contours: + deal_map = mask.copy().astype(np.int8) + cv2.drawContours(deal_map, [cont], -1, 1, -1) + + score_map = score_pred * deal_map + score_mask = score_map > 0 + xy_text = np.argwhere(score_mask) + dxy = xy_text[:, 1] + xy_text[:, 0] * 1j + + x, y = x_pred[score_mask], y_pred[score_mask] + c = x + y * 1j + c[:, fourier_degree] = c[:, fourier_degree] + dxy + c *= scale + + polygons = fourier2poly(c, num_reconstr_points) + score = score_map[score_mask].reshape(-1, 1) + polygons = poly_nms(np.hstack((polygons, score)).tolist(), nms_thr) + + boundaries = boundaries + polygons + + boundaries = poly_nms(boundaries, nms_thr) + + if box_type == 'quad': + new_boundaries = [] + for boundary in boundaries: + poly = np.array(boundary[:-1]).reshape(-1, 2).astype(np.float32) + score = boundary[-1] + points = cv2.boxPoints(cv2.minAreaRect(poly)) + points = np.int0(points) + new_boundaries.append(points.reshape(-1).tolist() + [score]) + boundaries = new_boundaries + + return boundaries diff --git a/backend/ppocr/postprocess/locality_aware_nms.py b/backend/ppocr/postprocess/locality_aware_nms.py new file mode 100644 index 0000000..d305ef6 --- /dev/null +++ b/backend/ppocr/postprocess/locality_aware_nms.py @@ -0,0 +1,200 @@ +""" +Locality aware nms. +This code is refered from: https://github.com/songdejia/EAST/blob/master/locality_aware_nms.py +""" + +import numpy as np +from shapely.geometry import Polygon + + +def intersection(g, p): + """ + Intersection. + """ + g = Polygon(g[:8].reshape((4, 2))) + p = Polygon(p[:8].reshape((4, 2))) + g = g.buffer(0) + p = p.buffer(0) + if not g.is_valid or not p.is_valid: + return 0 + inter = Polygon(g).intersection(Polygon(p)).area + union = g.area + p.area - inter + if union == 0: + return 0 + else: + return inter / union + + +def intersection_iog(g, p): + """ + Intersection_iog. + """ + g = Polygon(g[:8].reshape((4, 2))) + p = Polygon(p[:8].reshape((4, 2))) + if not g.is_valid or not p.is_valid: + return 0 + inter = Polygon(g).intersection(Polygon(p)).area + #union = g.area + p.area - inter + union = p.area + if union == 0: + print("p_area is very small") + return 0 + else: + return inter / union + + +def weighted_merge(g, p): + """ + Weighted merge. + """ + g[:8] = (g[8] * g[:8] + p[8] * p[:8]) / (g[8] + p[8]) + g[8] = (g[8] + p[8]) + return g + + +def standard_nms(S, thres): + """ + Standard nms. + """ + order = np.argsort(S[:, 8])[::-1] + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + ovr = np.array([intersection(S[i], S[t]) for t in order[1:]]) + + inds = np.where(ovr <= thres)[0] + order = order[inds + 1] + + return S[keep] + + +def standard_nms_inds(S, thres): + """ + Standard nms, retun inds. 
+ """ + order = np.argsort(S[:, 8])[::-1] + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + ovr = np.array([intersection(S[i], S[t]) for t in order[1:]]) + + inds = np.where(ovr <= thres)[0] + order = order[inds + 1] + + return keep + + +def nms(S, thres): + """ + nms. + """ + order = np.argsort(S[:, 8])[::-1] + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + ovr = np.array([intersection(S[i], S[t]) for t in order[1:]]) + + inds = np.where(ovr <= thres)[0] + order = order[inds + 1] + + return keep + + +def soft_nms(boxes_in, Nt_thres=0.3, threshold=0.8, sigma=0.5, method=2): + """ + soft_nms + :para boxes_in, N x 9 (coords + score) + :para threshould, eliminate cases min score(0.001) + :para Nt_thres, iou_threshi + :para sigma, gaussian weght + :method, linear or gaussian + """ + boxes = boxes_in.copy() + N = boxes.shape[0] + if N is None or N < 1: + return np.array([]) + pos, maxpos = 0, 0 + weight = 0.0 + inds = np.arange(N) + tbox, sbox = boxes[0].copy(), boxes[0].copy() + for i in range(N): + maxscore = boxes[i, 8] + maxpos = i + tbox = boxes[i].copy() + ti = inds[i] + pos = i + 1 + #get max box + while pos < N: + if maxscore < boxes[pos, 8]: + maxscore = boxes[pos, 8] + maxpos = pos + pos = pos + 1 + #add max box as a detection + boxes[i, :] = boxes[maxpos, :] + inds[i] = inds[maxpos] + #swap + boxes[maxpos, :] = tbox + inds[maxpos] = ti + tbox = boxes[i].copy() + pos = i + 1 + #NMS iteration + while pos < N: + sbox = boxes[pos].copy() + ts_iou_val = intersection(tbox, sbox) + if ts_iou_val > 0: + if method == 1: + if ts_iou_val > Nt_thres: + weight = 1 - ts_iou_val + else: + weight = 1 + elif method == 2: + weight = np.exp(-1.0 * ts_iou_val**2 / sigma) + else: + if ts_iou_val > Nt_thres: + weight = 0 + else: + weight = 1 + boxes[pos, 8] = weight * boxes[pos, 8] + #if box score falls below thresold, discard the box by + #swaping last box update N + if boxes[pos, 8] < threshold: + boxes[pos, :] = boxes[N - 1, :] + inds[pos] = inds[N - 1] + N = N - 1 + pos = pos - 1 + pos = pos + 1 + + return boxes[:N] + + +def nms_locality(polys, thres=0.3): + """ + locality aware nms of EAST + :param polys: a N*9 numpy array. first 8 coordinates, then prob + :return: boxes after nms + """ + S = [] + p = None + for g in polys: + if p is not None and intersection(g, p) > thres: + p = weighted_merge(g, p) + else: + if p is not None: + S.append(p) + p = g + if p is not None: + S.append(p) + + if len(S) == 0: + return np.array([]) + return standard_nms(np.array(S), thres) + + +if __name__ == '__main__': + # 343,350,448,135,474,143,369,359 + print( + Polygon(np.array([[343, 350], [448, 135], [474, 143], [369, 359]])) + .area) \ No newline at end of file diff --git a/backend/ppocr/postprocess/pg_postprocess.py b/backend/ppocr/postprocess/pg_postprocess.py new file mode 100644 index 0000000..0b14551 --- /dev/null +++ b/backend/ppocr/postprocess/pg_postprocess.py @@ -0,0 +1,52 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +__dir__ = os.path.dirname(__file__) +sys.path.append(__dir__) +sys.path.append(os.path.join(__dir__, '..')) +from ppocr.utils.e2e_utils.pgnet_pp_utils import PGNet_PostProcess + + +class PGPostProcess(object): + """ + The post process for PGNet. + """ + + def __init__(self, character_dict_path, valid_set, score_thresh, mode, + **kwargs): + self.character_dict_path = character_dict_path + self.valid_set = valid_set + self.score_thresh = score_thresh + self.mode = mode + + # c++ la-nms is faster, but only support python 3.5 + self.is_python35 = False + if sys.version_info.major == 3 and sys.version_info.minor == 5: + self.is_python35 = True + + def __call__(self, outs_dict, shape_list): + post = PGNet_PostProcess(self.character_dict_path, self.valid_set, + self.score_thresh, outs_dict, shape_list) + if self.mode == 'fast': + data = post.pg_postprocess_fast() + else: + data = post.pg_postprocess_slow() + return data diff --git a/backend/ppocr/postprocess/pse_postprocess/__init__.py b/backend/ppocr/postprocess/pse_postprocess/__init__.py new file mode 100644 index 0000000..680473b --- /dev/null +++ b/backend/ppocr/postprocess/pse_postprocess/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .pse_postprocess import PSEPostProcess \ No newline at end of file diff --git a/backend/ppocr/postprocess/pse_postprocess/pse/README.md b/backend/ppocr/postprocess/pse_postprocess/pse/README.md new file mode 100644 index 0000000..6a19d5d --- /dev/null +++ b/backend/ppocr/postprocess/pse_postprocess/pse/README.md @@ -0,0 +1,6 @@ +## 编译 +This code is refer from: +https://github.com/whai362/PSENet/blob/python3/models/post_processing/pse +```python +python3 setup.py build_ext --inplace +``` diff --git a/backend/ppocr/postprocess/pse_postprocess/pse/__init__.py b/backend/ppocr/postprocess/pse_postprocess/pse/__init__.py new file mode 100644 index 0000000..1903a91 --- /dev/null +++ b/backend/ppocr/postprocess/pse_postprocess/pse/__init__.py @@ -0,0 +1,29 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
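`PGPostProcess` above is a thin wrapper: it stores its configuration and delegates decoding to `PGNet_PostProcess`, choosing `pg_postprocess_fast()` when `mode == 'fast'` and the slow path otherwise. A hedged construction sketch follows; the dictionary path, dataset name, and threshold are placeholders rather than values taken from this commit, and `backend/` (with its dependencies) is assumed to be importable.

```python
# Sketch only -- the arguments below are illustrative placeholders.
from ppocr.postprocess.pg_postprocess import PGPostProcess

post_process = PGPostProcess(
    character_dict_path="ppocr/utils/ic15_dict.txt",  # hypothetical character dict path
    valid_set="totaltext",   # dataset convention understood by PGNet_PostProcess
    score_thresh=0.5,
    mode="fast",             # 'fast' -> pg_postprocess_fast(); anything else -> pg_postprocess_slow()
)

# At inference time the caller hands over the raw PGNet outputs and the resize metadata:
#   results = post_process(outs_dict, shape_list)
```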
+import sys +import os +import subprocess + +python_path = sys.executable + +ori_path = os.getcwd() +os.chdir('ppocr/postprocess/pse_postprocess/pse') +if subprocess.call( + '{} setup.py build_ext --inplace'.format(python_path), shell=True) != 0: + raise RuntimeError( + 'Cannot compile pse: {}, if your system is windows, you need to install all the default components of `desktop development using C++` in visual studio 2019+'. + format(os.path.dirname(os.path.realpath(__file__)))) +os.chdir(ori_path) + +from .pse import pse diff --git a/backend/ppocr/postprocess/pse_postprocess/pse/pse.pyx b/backend/ppocr/postprocess/pse_postprocess/pse/pse.pyx new file mode 100644 index 0000000..b2be49e --- /dev/null +++ b/backend/ppocr/postprocess/pse_postprocess/pse/pse.pyx @@ -0,0 +1,70 @@ + +import numpy as np +import cv2 +cimport numpy as np +cimport cython +cimport libcpp +cimport libcpp.pair +cimport libcpp.queue +from libcpp.pair cimport * +from libcpp.queue cimport * + +@cython.boundscheck(False) +@cython.wraparound(False) +cdef np.ndarray[np.int32_t, ndim=2] _pse(np.ndarray[np.uint8_t, ndim=3] kernels, + np.ndarray[np.int32_t, ndim=2] label, + int kernel_num, + int label_num, + float min_area=0): + cdef np.ndarray[np.int32_t, ndim=2] pred + pred = np.zeros((label.shape[0], label.shape[1]), dtype=np.int32) + + for label_idx in range(1, label_num): + if np.sum(label == label_idx) < min_area: + label[label == label_idx] = 0 + + cdef libcpp.queue.queue[libcpp.pair.pair[np.int16_t,np.int16_t]] que = \ + queue[libcpp.pair.pair[np.int16_t,np.int16_t]]() + cdef libcpp.queue.queue[libcpp.pair.pair[np.int16_t,np.int16_t]] nxt_que = \ + queue[libcpp.pair.pair[np.int16_t,np.int16_t]]() + cdef np.int16_t* dx = [-1, 1, 0, 0] + cdef np.int16_t* dy = [0, 0, -1, 1] + cdef np.int16_t tmpx, tmpy + + points = np.array(np.where(label > 0)).transpose((1, 0)) + for point_idx in range(points.shape[0]): + tmpx, tmpy = points[point_idx, 0], points[point_idx, 1] + que.push(pair[np.int16_t,np.int16_t](tmpx, tmpy)) + pred[tmpx, tmpy] = label[tmpx, tmpy] + + cdef libcpp.pair.pair[np.int16_t,np.int16_t] cur + cdef int cur_label + for kernel_idx in range(kernel_num - 1, -1, -1): + while not que.empty(): + cur = que.front() + que.pop() + cur_label = pred[cur.first, cur.second] + + is_edge = True + for j in range(4): + tmpx = cur.first + dx[j] + tmpy = cur.second + dy[j] + if tmpx < 0 or tmpx >= label.shape[0] or tmpy < 0 or tmpy >= label.shape[1]: + continue + if kernels[kernel_idx, tmpx, tmpy] == 0 or pred[tmpx, tmpy] > 0: + continue + + que.push(pair[np.int16_t,np.int16_t](tmpx, tmpy)) + pred[tmpx, tmpy] = cur_label + is_edge = False + if is_edge: + nxt_que.push(cur) + + que, nxt_que = nxt_que, que + + return pred + +def pse(kernels, min_area): + kernel_num = kernels.shape[0] + label_num, label = cv2.connectedComponents(kernels[-1], connectivity=4) + return _pse(kernels[:-1], label, kernel_num, label_num, min_area) \ No newline at end of file diff --git a/backend/ppocr/postprocess/pse_postprocess/pse/setup.py b/backend/ppocr/postprocess/pse_postprocess/pse/setup.py new file mode 100644 index 0000000..0374678 --- /dev/null +++ b/backend/ppocr/postprocess/pse_postprocess/pse/setup.py @@ -0,0 +1,14 @@ +from distutils.core import setup, Extension +from Cython.Build import cythonize +import numpy + +setup(ext_modules=cythonize(Extension( + 'pse', + sources=['pse.pyx'], + language='c++', + include_dirs=[numpy.get_include()], + library_dirs=[], + libraries=[], + extra_compile_args=['-O3'], + extra_link_args=[] +))) diff --git 
a/backend/ppocr/postprocess/pse_postprocess/pse_postprocess.py b/backend/ppocr/postprocess/pse_postprocess/pse_postprocess.py new file mode 100755 index 0000000..34f1b8c --- /dev/null +++ b/backend/ppocr/postprocess/pse_postprocess/pse_postprocess.py @@ -0,0 +1,118 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/whai362/PSENet/blob/python3/models/head/psenet_head.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import cv2 +import paddle +from paddle.nn import functional as F + +from ppocr.postprocess.pse_postprocess.pse import pse + + +class PSEPostProcess(object): + """ + The post process for PSE. + """ + + def __init__(self, + thresh=0.5, + box_thresh=0.85, + min_area=16, + box_type='quad', + scale=4, + **kwargs): + assert box_type in ['quad', 'poly'], 'Only quad and poly is supported' + self.thresh = thresh + self.box_thresh = box_thresh + self.min_area = min_area + self.box_type = box_type + self.scale = scale + + def __call__(self, outs_dict, shape_list): + pred = outs_dict['maps'] + if not isinstance(pred, paddle.Tensor): + pred = paddle.to_tensor(pred) + pred = F.interpolate( + pred, scale_factor=4 // self.scale, mode='bilinear') + + score = F.sigmoid(pred[:, 0, :, :]) + + kernels = (pred > self.thresh).astype('float32') + text_mask = kernels[:, 0, :, :] + kernels[:, 0:, :, :] = kernels[:, 0:, :, :] * text_mask + + score = score.numpy() + kernels = kernels.numpy().astype(np.uint8) + + boxes_batch = [] + for batch_index in range(pred.shape[0]): + boxes, scores = self.boxes_from_bitmap(score[batch_index], + kernels[batch_index], + shape_list[batch_index]) + + boxes_batch.append({'points': boxes, 'scores': scores}) + return boxes_batch + + def boxes_from_bitmap(self, score, kernels, shape): + label = pse(kernels, self.min_area) + return self.generate_box(score, label, shape) + + def generate_box(self, score, label, shape): + src_h, src_w, ratio_h, ratio_w = shape + label_num = np.max(label) + 1 + + boxes = [] + scores = [] + for i in range(1, label_num): + ind = label == i + points = np.array(np.where(ind)).transpose((1, 0))[:, ::-1] + + if points.shape[0] < self.min_area: + label[ind] = 0 + continue + + score_i = np.mean(score[ind]) + if score_i < self.box_thresh: + label[ind] = 0 + continue + + if self.box_type == 'quad': + rect = cv2.minAreaRect(points) + bbox = cv2.boxPoints(rect) + elif self.box_type == 'poly': + box_height = np.max(points[:, 1]) + 10 + box_width = np.max(points[:, 0]) + 10 + + mask = np.zeros((box_height, box_width), np.uint8) + mask[points[:, 1], points[:, 0]] = 255 + + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, + cv2.CHAIN_APPROX_SIMPLE) + bbox = np.squeeze(contours[0], 1) + else: + raise NotImplementedError + + bbox[:, 0] = np.clip(np.round(bbox[:, 0] / ratio_w), 0, src_w) + bbox[:, 1] = np.clip(np.round(bbox[:, 1] / ratio_h), 0, src_h) + 
boxes.append(bbox) + scores.append(score_i) + return boxes, scores diff --git a/backend/ppocr/postprocess/rec_postprocess.py b/backend/ppocr/postprocess/rec_postprocess.py new file mode 100644 index 0000000..bf0fd89 --- /dev/null +++ b/backend/ppocr/postprocess/rec_postprocess.py @@ -0,0 +1,754 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +from paddle.nn import functional as F +import re + + +class BaseRecLabelDecode(object): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False): + self.beg_str = "sos" + self.end_str = "eos" + + self.character_str = [] + if character_dict_path is None: + self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" + dict_character = list(self.character_str) + else: + with open(character_dict_path, "rb") as fin: + lines = fin.readlines() + for line in lines: + line = line.decode('utf-8').strip("\n").strip("\r\n") + self.character_str.append(line) + if use_space_char: + self.character_str.append(" ") + dict_character = list(self.character_str) + + dict_character = self.add_special_char(dict_character) + self.dict = {} + for i, char in enumerate(dict_character): + self.dict[char] = i + self.character = dict_character + + def add_special_char(self, dict_character): + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + ignored_tokens = self.get_ignored_tokens() + batch_size = len(text_index) + for batch_idx in range(batch_size): + selection = np.ones(len(text_index[batch_idx]), dtype=bool) + if is_remove_duplicate: + selection[1:] = text_index[batch_idx][1:] != text_index[ + batch_idx][:-1] + for ignored_token in ignored_tokens: + selection &= text_index[batch_idx] != ignored_token + + char_list = [ + self.character[text_id] + for text_id in text_index[batch_idx][selection] + ] + if text_prob is not None: + conf_list = text_prob[batch_idx][selection] + else: + conf_list = [1] * len(selection) + if len(conf_list) == 0: + conf_list = [0] + + text = ''.join(char_list) + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def get_ignored_tokens(self): + return [0] # for ctc blank + + +class CTCLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(CTCLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def __call__(self, preds, label=None, *args, **kwargs): + if isinstance(preds, tuple) or isinstance(preds, list): + preds = preds[-1] + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=True) + if label is None: + return text + label = self.decode(label) + return text, label + + def add_special_char(self, dict_character): + dict_character = ['blank'] + dict_character + return dict_character + + +class DistillationCTCLabelDecode(CTCLabelDecode): + """ + Convert + Convert between text-label and text-index + """ + + def __init__(self, + character_dict_path=None, + use_space_char=False, + model_name=["student"], + key=None, + multi_head=False, + **kwargs): + super(DistillationCTCLabelDecode, self).__init__(character_dict_path, + use_space_char) + if not isinstance(model_name, list): + model_name = [model_name] + self.model_name = model_name + + self.key = key + self.multi_head = multi_head + + def __call__(self, preds, label=None, *args, **kwargs): + output = dict() + for name in self.model_name: + pred = preds[name] + if self.key is not None: + pred = pred[self.key] + if self.multi_head and isinstance(pred, dict): + pred = pred['ctc'] + output[name] = super().__call__(pred, label=label, *args, **kwargs) + return output + + +class NRTRLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=True, **kwargs): + super(NRTRLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def __call__(self, preds, label=None, *args, **kwargs): + + if len(preds) == 2: + preds_id = preds[0] + preds_prob = preds[1] + if isinstance(preds_id, paddle.Tensor): + preds_id = preds_id.numpy() + if isinstance(preds_prob, paddle.Tensor): + preds_prob = preds_prob.numpy() + if preds_id[0][0] == 2: + preds_idx = preds_id[:, 1:] + preds_prob = preds_prob[:, 1:] + else: + preds_idx = preds_id + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label[:, 1:]) + else: + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label[:, 1:]) + 
return text, label + + def add_special_char(self, dict_character): + dict_character = ['blank', '', '', ''] + dict_character + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. """ + result_list = [] + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] == 3: # end + break + try: + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + except: + continue + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + result_list.append((text.lower(), np.mean(conf_list).tolist())) + return result_list + + +class AttnLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(AttnLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def add_special_char(self, dict_character): + self.beg_str = "sos" + self.end_str = "eos" + dict_character = dict_character + dict_character = [self.beg_str] + dict_character + [self.end_str] + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. """ + result_list = [] + ignored_tokens = self.get_ignored_tokens() + [beg_idx, end_idx] = self.get_ignored_tokens() + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] in ignored_tokens: + continue + if int(text_index[batch_idx][idx]) == int(end_idx): + break + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + """ + text = self.decode(text) + if label is None: + return text + else: + label = self.decode(label, is_remove_duplicate=False) + return text, label + """ + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() + + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label, is_remove_duplicate=False) + return text, label + + def get_ignored_tokens(self): + beg_idx = self.get_beg_end_flag_idx("beg") + end_idx = self.get_beg_end_flag_idx("end") + return [beg_idx, end_idx] + + def get_beg_end_flag_idx(self, beg_or_end): + if beg_or_end == "beg": + idx = np.array(self.dict[self.beg_str]) + elif beg_or_end == "end": + idx = np.array(self.dict[self.end_str]) + else: + assert False, "unsupport type %s in get_beg_end_flag_idx" \ + % beg_or_end + return idx + + +class SEEDLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(SEEDLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def add_special_char(self, dict_character): + self.padding_str = "padding" + self.end_str 
= "eos" + self.unknown = "unknown" + dict_character = dict_character + [ + self.end_str, self.padding_str, self.unknown + ] + return dict_character + + def get_ignored_tokens(self): + end_idx = self.get_beg_end_flag_idx("eos") + return [end_idx] + + def get_beg_end_flag_idx(self, beg_or_end): + if beg_or_end == "sos": + idx = np.array(self.dict[self.beg_str]) + elif beg_or_end == "eos": + idx = np.array(self.dict[self.end_str]) + else: + assert False, "unsupport type %s in get_beg_end_flag_idx" % beg_or_end + return idx + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. """ + result_list = [] + [end_idx] = self.get_ignored_tokens() + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if int(text_index[batch_idx][idx]) == int(end_idx): + break + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + """ + text = self.decode(text) + if label is None: + return text + else: + label = self.decode(label, is_remove_duplicate=False) + return text, label + """ + preds_idx = preds["rec_pred"] + if isinstance(preds_idx, paddle.Tensor): + preds_idx = preds_idx.numpy() + if "rec_pred_scores" in preds: + preds_idx = preds["rec_pred"] + preds_prob = preds["rec_pred_scores"] + else: + preds_idx = preds["rec_pred"].argmax(axis=2) + preds_prob = preds["rec_pred"].max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label, is_remove_duplicate=False) + return text, label + + +class SRNLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(SRNLabelDecode, self).__init__(character_dict_path, + use_space_char) + self.max_text_length = kwargs.get('max_text_length', 25) + + def __call__(self, preds, label=None, *args, **kwargs): + pred = preds['predict'] + char_num = len(self.character_str) + 2 + if isinstance(pred, paddle.Tensor): + pred = pred.numpy() + pred = np.reshape(pred, [-1, char_num]) + + preds_idx = np.argmax(pred, axis=1) + preds_prob = np.max(pred, axis=1) + + preds_idx = np.reshape(preds_idx, [-1, self.max_text_length]) + + preds_prob = np.reshape(preds_prob, [-1, self.max_text_length]) + + text = self.decode(preds_idx, preds_prob) + + if label is None: + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + return text + label = self.decode(label) + return text, label + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + ignored_tokens = self.get_ignored_tokens() + batch_size = len(text_index) + + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] in ignored_tokens: + continue + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + + text = ''.join(char_list) + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def add_special_char(self, dict_character): + dict_character = dict_character + [self.beg_str, self.end_str] + return dict_character + + def get_ignored_tokens(self): + beg_idx = self.get_beg_end_flag_idx("beg") + end_idx = self.get_beg_end_flag_idx("end") + return [beg_idx, end_idx] + + def get_beg_end_flag_idx(self, beg_or_end): + if beg_or_end == "beg": + idx = np.array(self.dict[self.beg_str]) + elif beg_or_end == "end": + idx = np.array(self.dict[self.end_str]) + else: + assert False, "unsupport type %s in get_beg_end_flag_idx" \ + % beg_or_end + return idx + + +class TableLabelDecode(object): + """ """ + + def __init__(self, character_dict_path, **kwargs): + list_character, list_elem = self.load_char_elem_dict( + character_dict_path) + list_character = self.add_special_char(list_character) + list_elem = self.add_special_char(list_elem) + self.dict_character = {} + self.dict_idx_character = {} + for i, char in enumerate(list_character): + self.dict_idx_character[i] = char + self.dict_character[char] = i + self.dict_elem = {} + self.dict_idx_elem = {} + for i, elem in enumerate(list_elem): + self.dict_idx_elem[i] = elem + self.dict_elem[elem] = i + + def load_char_elem_dict(self, character_dict_path): + list_character = [] + list_elem = [] + with open(character_dict_path, "rb") as fin: + lines = fin.readlines() + substr = lines[0].decode('utf-8').strip("\n").strip("\r\n").split( + "\t") + character_num = int(substr[0]) + elem_num = int(substr[1]) + for cno in range(1, 1 + character_num): + character = lines[cno].decode('utf-8').strip("\n").strip("\r\n") + list_character.append(character) + for eno in range(1 + character_num, 1 + character_num + elem_num): + elem = lines[eno].decode('utf-8').strip("\n").strip("\r\n") + list_elem.append(elem) + return list_character, list_elem + + def add_special_char(self, list_character): + self.beg_str = "sos" + self.end_str = "eos" + list_character = [self.beg_str] + list_character + [self.end_str] + return list_character + + def __call__(self, preds): + structure_probs = preds['structure_probs'] + loc_preds = preds['loc_preds'] + if isinstance(structure_probs, paddle.Tensor): + structure_probs = structure_probs.numpy() + if isinstance(loc_preds, paddle.Tensor): + loc_preds = loc_preds.numpy() + structure_idx = structure_probs.argmax(axis=2) + structure_probs = structure_probs.max(axis=2) + structure_str, structure_pos, result_score_list, result_elem_idx_list = self.decode( + structure_idx, structure_probs, 'elem') + res_html_code_list = [] + res_loc_list = [] + batch_num = len(structure_str) + for bno in range(batch_num): + res_loc = [] + for sno in range(len(structure_str[bno])): + text = structure_str[bno][sno] + if text in ['', ' 0 and tmp_elem_idx == end_idx: + break + if tmp_elem_idx in ignored_tokens: + continue + + 
char_list.append(current_dict[tmp_elem_idx]) + elem_pos_list.append(idx) + score_list.append(structure_probs[batch_idx, idx]) + elem_idx_list.append(tmp_elem_idx) + result_list.append(char_list) + result_pos_list.append(elem_pos_list) + result_score_list.append(score_list) + result_elem_idx_list.append(elem_idx_list) + return result_list, result_pos_list, result_score_list, result_elem_idx_list + + def get_ignored_tokens(self, char_or_elem): + beg_idx = self.get_beg_end_flag_idx("beg", char_or_elem) + end_idx = self.get_beg_end_flag_idx("end", char_or_elem) + return [beg_idx, end_idx] + + def get_beg_end_flag_idx(self, beg_or_end, char_or_elem): + if char_or_elem == "char": + if beg_or_end == "beg": + idx = self.dict_character[self.beg_str] + elif beg_or_end == "end": + idx = self.dict_character[self.end_str] + else: + assert False, "Unsupport type %s in get_beg_end_flag_idx of char" \ + % beg_or_end + elif char_or_elem == "elem": + if beg_or_end == "beg": + idx = self.dict_elem[self.beg_str] + elif beg_or_end == "end": + idx = self.dict_elem[self.end_str] + else: + assert False, "Unsupport type %s in get_beg_end_flag_idx of elem" \ + % beg_or_end + else: + assert False, "Unsupport type %s in char_or_elem" \ + % char_or_elem + return idx + + +class SARLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(SARLabelDecode, self).__init__(character_dict_path, + use_space_char) + + self.rm_symbol = kwargs.get('rm_symbol', False) + + def add_special_char(self, dict_character): + beg_end_str = "" + unknown_str = "" + padding_str = "" + dict_character = dict_character + [unknown_str] + self.unknown_idx = len(dict_character) - 1 + dict_character = dict_character + [beg_end_str] + self.start_idx = len(dict_character) - 1 + self.end_idx = len(dict_character) - 1 + dict_character = dict_character + [padding_str] + self.padding_idx = len(dict_character) - 1 + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + ignored_tokens = self.get_ignored_tokens() + + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] in ignored_tokens: + continue + if int(text_index[batch_idx][idx]) == int(self.end_idx): + if text_prob is None and idx == 0: + continue + else: + break + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + if self.rm_symbol: + comp = re.compile('[^A-Z^a-z^0-9^\u4e00-\u9fa5]') + text = text.lower() + text = comp.sub('', text) + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + + if label is None: + return text + label = self.decode(label, is_remove_duplicate=False) + return text, label + + def get_ignored_tokens(self): + return [self.padding_idx] + + +class DistillationSARLabelDecode(SARLabelDecode): + """ + Convert + Convert between text-label and text-index + """ + + def __init__(self, + character_dict_path=None, + use_space_char=False, + model_name=["student"], + key=None, + multi_head=False, + **kwargs): + super(DistillationSARLabelDecode, self).__init__(character_dict_path, + use_space_char) + if not isinstance(model_name, list): + model_name = [model_name] + self.model_name = model_name + + self.key = key + self.multi_head = multi_head + + def __call__(self, preds, label=None, *args, **kwargs): + output = dict() + for name in self.model_name: + pred = preds[name] + if self.key is not None: + pred = pred[self.key] + if self.multi_head and isinstance(pred, dict): + pred = pred['sar'] + output[name] = super().__call__(pred, label=label, *args, **kwargs) + return output + + +class PRENLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(PRENLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def add_special_char(self, dict_character): + padding_str = '' # 0 + end_str = '' # 1 + unknown_str = '' # 2 + + dict_character = [padding_str, end_str, unknown_str] + dict_character + self.padding_idx = 0 + self.end_idx = 1 + self.unknown_idx = 2 + + return dict_character + + def decode(self, text_index, text_prob=None): + """ convert text-index into text-label. 
""" + result_list = [] + batch_size = len(text_index) + + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] == self.end_idx: + break + if text_index[batch_idx][idx] in \ + [self.padding_idx, self.unknown_idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + + text = ''.join(char_list) + if len(text) > 0: + result_list.append((text, np.mean(conf_list).tolist())) + else: + # here confidence of empty recog result is 1 + result_list.append(('', 1)) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + preds = preds.numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob) + if label is None: + return text + label = self.decode(label) + return text, label diff --git a/backend/ppocr/postprocess/sast_postprocess.py b/backend/ppocr/postprocess/sast_postprocess.py new file mode 100755 index 0000000..bee75c0 --- /dev/null +++ b/backend/ppocr/postprocess/sast_postprocess.py @@ -0,0 +1,355 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +__dir__ = os.path.dirname(__file__) +sys.path.append(__dir__) +sys.path.append(os.path.join(__dir__, '..')) + +import numpy as np +from .locality_aware_nms import nms_locality +import paddle +import cv2 +import time + + +class SASTPostProcess(object): + """ + The post process for SAST. + """ + + def __init__(self, + score_thresh=0.5, + nms_thresh=0.2, + sample_pts_num=2, + shrink_ratio_of_width=0.3, + expand_scale=1.0, + tcl_map_thresh=0.5, + **kwargs): + + self.score_thresh = score_thresh + self.nms_thresh = nms_thresh + self.sample_pts_num = sample_pts_num + self.shrink_ratio_of_width = shrink_ratio_of_width + self.expand_scale = expand_scale + self.tcl_map_thresh = tcl_map_thresh + + # c++ la-nms is faster, but only support python 3.5 + self.is_python35 = False + if sys.version_info.major == 3 and sys.version_info.minor == 5: + self.is_python35 = True + + def point_pair2poly(self, point_pair_list): + """ + Transfer vertical point_pairs into poly point in clockwise. + """ + # constract poly + point_num = len(point_pair_list) * 2 + point_list = [0] * point_num + for idx, point_pair in enumerate(point_pair_list): + point_list[idx] = point_pair[0] + point_list[point_num - 1 - idx] = point_pair[1] + return np.array(point_list).reshape(-1, 2) + + def shrink_quad_along_width(self, + quad, + begin_width_ratio=0., + end_width_ratio=1.): + """ + Generate shrink_quad_along_width. 
+ """ + ratio_pair = np.array( + [[begin_width_ratio], [end_width_ratio]], dtype=np.float32) + p0_1 = quad[0] + (quad[1] - quad[0]) * ratio_pair + p3_2 = quad[3] + (quad[2] - quad[3]) * ratio_pair + return np.array([p0_1[0], p0_1[1], p3_2[1], p3_2[0]]) + + def expand_poly_along_width(self, poly, shrink_ratio_of_width=0.3): + """ + expand poly along width. + """ + point_num = poly.shape[0] + left_quad = np.array( + [poly[0], poly[1], poly[-2], poly[-1]], dtype=np.float32) + left_ratio = -shrink_ratio_of_width * np.linalg.norm(left_quad[0] - left_quad[3]) / \ + (np.linalg.norm(left_quad[0] - left_quad[1]) + 1e-6) + left_quad_expand = self.shrink_quad_along_width(left_quad, left_ratio, + 1.0) + right_quad = np.array( + [ + poly[point_num // 2 - 2], poly[point_num // 2 - 1], + poly[point_num // 2], poly[point_num // 2 + 1] + ], + dtype=np.float32) + right_ratio = 1.0 + \ + shrink_ratio_of_width * np.linalg.norm(right_quad[0] - right_quad[3]) / \ + (np.linalg.norm(right_quad[0] - right_quad[1]) + 1e-6) + right_quad_expand = self.shrink_quad_along_width(right_quad, 0.0, + right_ratio) + poly[0] = left_quad_expand[0] + poly[-1] = left_quad_expand[-1] + poly[point_num // 2 - 1] = right_quad_expand[1] + poly[point_num // 2] = right_quad_expand[2] + return poly + + def restore_quad(self, tcl_map, tcl_map_thresh, tvo_map): + """Restore quad.""" + xy_text = np.argwhere(tcl_map[:, :, 0] > tcl_map_thresh) + xy_text = xy_text[:, ::-1] # (n, 2) + + # Sort the text boxes via the y axis + xy_text = xy_text[np.argsort(xy_text[:, 1])] + + scores = tcl_map[xy_text[:, 1], xy_text[:, 0], 0] + scores = scores[:, np.newaxis] + + # Restore + point_num = int(tvo_map.shape[-1] / 2) + assert point_num == 4 + tvo_map = tvo_map[xy_text[:, 1], xy_text[:, 0], :] + xy_text_tile = np.tile(xy_text, (1, point_num)) # (n, point_num * 2) + quads = xy_text_tile - tvo_map + + return scores, quads, xy_text + + def quad_area(self, quad): + """ + compute area of a quad. + """ + edge = [(quad[1][0] - quad[0][0]) * (quad[1][1] + quad[0][1]), + (quad[2][0] - quad[1][0]) * (quad[2][1] + quad[1][1]), + (quad[3][0] - quad[2][0]) * (quad[3][1] + quad[2][1]), + (quad[0][0] - quad[3][0]) * (quad[0][1] + quad[3][1])] + return np.sum(edge) / 2. + + def nms(self, dets): + if self.is_python35: + import lanms + dets = lanms.merge_quadrangle_n9(dets, self.nms_thresh) + else: + dets = nms_locality(dets, self.nms_thresh) + return dets + + def cluster_by_quads_tco(self, tcl_map, tcl_map_thresh, quads, tco_map): + """ + Cluster pixels in tcl_map based on quads. + """ + instance_count = quads.shape[0] + 1 # contain background + instance_label_map = np.zeros(tcl_map.shape[:2], dtype=np.int32) + if instance_count == 1: + return instance_count, instance_label_map + + # predict text center + xy_text = np.argwhere(tcl_map[:, :, 0] > tcl_map_thresh) + n = xy_text.shape[0] + xy_text = xy_text[:, ::-1] # (n, 2) + tco = tco_map[xy_text[:, 1], xy_text[:, 0], :] # (n, 2) + pred_tc = xy_text - tco + + # get gt text center + m = quads.shape[0] + gt_tc = np.mean(quads, axis=1) # (m, 2) + + pred_tc_tile = np.tile(pred_tc[:, np.newaxis, :], + (1, m, 1)) # (n, m, 2) + gt_tc_tile = np.tile(gt_tc[np.newaxis, :, :], (n, 1, 1)) # (n, m, 2) + dist_mat = np.linalg.norm(pred_tc_tile - gt_tc_tile, axis=2) # (n, m) + xy_text_assign = np.argmin(dist_mat, axis=1) + 1 # (n,) + + instance_label_map[xy_text[:, 1], xy_text[:, 0]] = xy_text_assign + return instance_count, instance_label_map + + def estimate_sample_pts_num(self, quad, xy_text): + """ + Estimate sample points number. 
+ """ + eh = (np.linalg.norm(quad[0] - quad[3]) + + np.linalg.norm(quad[1] - quad[2])) / 2.0 + ew = (np.linalg.norm(quad[0] - quad[1]) + + np.linalg.norm(quad[2] - quad[3])) / 2.0 + + dense_sample_pts_num = max(2, int(ew)) + dense_xy_center_line = xy_text[np.linspace( + 0, + xy_text.shape[0] - 1, + dense_sample_pts_num, + endpoint=True, + dtype=np.float32).astype(np.int32)] + + dense_xy_center_line_diff = dense_xy_center_line[ + 1:] - dense_xy_center_line[:-1] + estimate_arc_len = np.sum( + np.linalg.norm( + dense_xy_center_line_diff, axis=1)) + + sample_pts_num = max(2, int(estimate_arc_len / eh)) + return sample_pts_num + + def detect_sast(self, + tcl_map, + tvo_map, + tbo_map, + tco_map, + ratio_w, + ratio_h, + src_w, + src_h, + shrink_ratio_of_width=0.3, + tcl_map_thresh=0.5, + offset_expand=1.0, + out_strid=4.0): + """ + first resize the tcl_map, tvo_map and tbo_map to the input_size, then restore the polys + """ + # restore quad + scores, quads, xy_text = self.restore_quad(tcl_map, tcl_map_thresh, + tvo_map) + dets = np.hstack((quads, scores)).astype(np.float32, copy=False) + dets = self.nms(dets) + if dets.shape[0] == 0: + return [] + quads = dets[:, :-1].reshape(-1, 4, 2) + + # Compute quad area + quad_areas = [] + for quad in quads: + quad_areas.append(-self.quad_area(quad)) + + # instance segmentation + # instance_count, instance_label_map = cv2.connectedComponents(tcl_map.astype(np.uint8), connectivity=8) + instance_count, instance_label_map = self.cluster_by_quads_tco( + tcl_map, tcl_map_thresh, quads, tco_map) + + # restore single poly with tcl instance. + poly_list = [] + for instance_idx in range(1, instance_count): + xy_text = np.argwhere(instance_label_map == instance_idx)[:, ::-1] + quad = quads[instance_idx - 1] + q_area = quad_areas[instance_idx - 1] + if q_area < 5: + continue + + # + len1 = float(np.linalg.norm(quad[0] - quad[1])) + len2 = float(np.linalg.norm(quad[1] - quad[2])) + min_len = min(len1, len2) + if min_len < 3: + continue + + # filter small CC + if xy_text.shape[0] <= 0: + continue + + # filter low confidence instance + xy_text_scores = tcl_map[xy_text[:, 1], xy_text[:, 0], 0] + if np.sum(xy_text_scores) / quad_areas[instance_idx - 1] < 0.1: + # if np.sum(xy_text_scores) / quad_areas[instance_idx - 1] < 0.05: + continue + + # sort xy_text + left_center_pt = np.array( + [[(quad[0, 0] + quad[-1, 0]) / 2.0, + (quad[0, 1] + quad[-1, 1]) / 2.0]]) # (1, 2) + right_center_pt = np.array( + [[(quad[1, 0] + quad[2, 0]) / 2.0, + (quad[1, 1] + quad[2, 1]) / 2.0]]) # (1, 2) + proj_unit_vec = (right_center_pt - left_center_pt) / \ + (np.linalg.norm(right_center_pt - left_center_pt) + 1e-6) + proj_value = np.sum(xy_text * proj_unit_vec, axis=1) + xy_text = xy_text[np.argsort(proj_value)] + + # Sample pts in tcl map + if self.sample_pts_num == 0: + sample_pts_num = self.estimate_sample_pts_num(quad, xy_text) + else: + sample_pts_num = self.sample_pts_num + xy_center_line = xy_text[np.linspace( + 0, + xy_text.shape[0] - 1, + sample_pts_num, + endpoint=True, + dtype=np.float32).astype(np.int32)] + + point_pair_list = [] + for x, y in xy_center_line: + # get corresponding offset + offset = tbo_map[y, x, :].reshape(2, 2) + if offset_expand != 1.0: + offset_length = np.linalg.norm( + offset, axis=1, keepdims=True) + expand_length = np.clip( + offset_length * (offset_expand - 1), + a_min=0.5, + a_max=3.0) + offset_detal = offset / offset_length * expand_length + offset = offset + offset_detal + # original point + ori_yx = np.array([y, x], dtype=np.float32) + point_pair = 
(ori_yx + offset)[:, ::-1] * out_strid / np.array( + [ratio_w, ratio_h]).reshape(-1, 2) + point_pair_list.append(point_pair) + + # ndarry: (x, 2), expand poly along width + detected_poly = self.point_pair2poly(point_pair_list) + detected_poly = self.expand_poly_along_width(detected_poly, + shrink_ratio_of_width) + detected_poly[:, 0] = np.clip( + detected_poly[:, 0], a_min=0, a_max=src_w) + detected_poly[:, 1] = np.clip( + detected_poly[:, 1], a_min=0, a_max=src_h) + poly_list.append(detected_poly) + + return poly_list + + def __call__(self, outs_dict, shape_list): + score_list = outs_dict['f_score'] + border_list = outs_dict['f_border'] + tvo_list = outs_dict['f_tvo'] + tco_list = outs_dict['f_tco'] + if isinstance(score_list, paddle.Tensor): + score_list = score_list.numpy() + border_list = border_list.numpy() + tvo_list = tvo_list.numpy() + tco_list = tco_list.numpy() + + img_num = len(shape_list) + poly_lists = [] + for ino in range(img_num): + p_score = score_list[ino].transpose((1, 2, 0)) + p_border = border_list[ino].transpose((1, 2, 0)) + p_tvo = tvo_list[ino].transpose((1, 2, 0)) + p_tco = tco_list[ino].transpose((1, 2, 0)) + src_h, src_w, ratio_h, ratio_w = shape_list[ino] + + poly_list = self.detect_sast( + p_score, + p_tvo, + p_border, + p_tco, + ratio_w, + ratio_h, + src_w, + src_h, + shrink_ratio_of_width=self.shrink_ratio_of_width, + tcl_map_thresh=self.tcl_map_thresh, + offset_expand=self.expand_scale) + poly_lists.append({'points': np.array(poly_list)}) + + return poly_lists diff --git a/backend/ppocr/postprocess/vqa_token_re_layoutlm_postprocess.py b/backend/ppocr/postprocess/vqa_token_re_layoutlm_postprocess.py new file mode 100644 index 0000000..1d55d13 --- /dev/null +++ b/backend/ppocr/postprocess/vqa_token_re_layoutlm_postprocess.py @@ -0,0 +1,51 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
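`SASTPostProcess.quad_area` above is a shoelace-style signed-area sum; for a quad listed in the usual top-left, top-right, bottom-right, bottom-left order (image coordinates, y pointing down) it returns a negative value, which is why `detect_sast` negates it before applying the small-area filters. Below is a self-contained check of that sign convention, with the helper re-stated locally so nothing from the repo needs to be imported (the unit square is just an arbitrary test quad).

```python
import numpy as np

def quad_area(quad):
    # Same edge sum as SASTPostProcess.quad_area.
    edge = [(quad[1][0] - quad[0][0]) * (quad[1][1] + quad[0][1]),
            (quad[2][0] - quad[1][0]) * (quad[2][1] + quad[1][1]),
            (quad[3][0] - quad[2][0]) * (quad[3][1] + quad[2][1]),
            (quad[0][0] - quad[3][0]) * (quad[0][1] + quad[3][1])]
    return np.sum(edge) / 2.

unit_square = np.array([[0, 0], [1, 0], [1, 1], [0, 1]], dtype=np.float32)
print(quad_area(unit_square))  # -1.0; detect_sast stores -quad_area(quad), i.e. +1.0
```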
+import paddle + + +class VQAReTokenLayoutLMPostProcess(object): + """ Convert between text-label and text-index """ + + def __init__(self, **kwargs): + super(VQAReTokenLayoutLMPostProcess, self).__init__() + + def __call__(self, preds, label=None, *args, **kwargs): + if label is not None: + return self._metric(preds, label) + else: + return self._infer(preds, *args, **kwargs) + + def _metric(self, preds, label): + return preds['pred_relations'], label[6], label[5] + + def _infer(self, preds, *args, **kwargs): + ser_results = kwargs['ser_results'] + entity_idx_dict_batch = kwargs['entity_idx_dict_batch'] + pred_relations = preds['pred_relations'] + + # merge relations and ocr info + results = [] + for pred_relation, ser_result, entity_idx_dict in zip( + pred_relations, ser_results, entity_idx_dict_batch): + result = [] + used_tail_id = [] + for relation in pred_relation: + if relation['tail_id'] in used_tail_id: + continue + used_tail_id.append(relation['tail_id']) + ocr_info_head = ser_result[entity_idx_dict[relation['head_id']]] + ocr_info_tail = ser_result[entity_idx_dict[relation['tail_id']]] + result.append((ocr_info_head, ocr_info_tail)) + results.append(result) + return results diff --git a/backend/ppocr/postprocess/vqa_token_ser_layoutlm_postprocess.py b/backend/ppocr/postprocess/vqa_token_ser_layoutlm_postprocess.py new file mode 100644 index 0000000..782cdea --- /dev/null +++ b/backend/ppocr/postprocess/vqa_token_ser_layoutlm_postprocess.py @@ -0,0 +1,93 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
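`VQAReTokenLayoutLMPostProcess` above does no tensor math at inference time: for every predicted relation it looks the head and tail entities up through `entity_idx_dict` and pairs the corresponding SER results, skipping tail ids that were already linked. A toy example of that pairing logic follows; the entities and relation are fabricated, and `backend/` (with paddle installed) is assumed to be importable.

```python
# Fabricated inputs -- only meant to show how relations are mapped back to OCR entities.
from ppocr.postprocess.vqa_token_re_layoutlm_postprocess import VQAReTokenLayoutLMPostProcess

post = VQAReTokenLayoutLMPostProcess()
preds = {"pred_relations": [[{"head_id": 0, "tail_id": 1}]]}   # one image, one relation
ser_results = [[{"text": "Name:", "pred": "QUESTION"},
                {"text": "Alice", "pred": "ANSWER"}]]          # SER output, one dict per entity
entity_idx_dict_batch = [{0: 0, 1: 1}]                         # entity id -> index into ser_results

linked = post(preds,
              ser_results=ser_results,
              entity_idx_dict_batch=entity_idx_dict_batch)
print(linked[0][0][0]["text"], "->", linked[0][0][1]["text"])  # Name: -> Alice
```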
+import numpy as np +import paddle +from ppocr.utils.utility import load_vqa_bio_label_maps + + +class VQASerTokenLayoutLMPostProcess(object): + """ Convert between text-label and text-index """ + + def __init__(self, class_path, **kwargs): + super(VQASerTokenLayoutLMPostProcess, self).__init__() + label2id_map, self.id2label_map = load_vqa_bio_label_maps(class_path) + + self.label2id_map_for_draw = dict() + for key in label2id_map: + if key.startswith("I-"): + self.label2id_map_for_draw[key] = label2id_map["B" + key[1:]] + else: + self.label2id_map_for_draw[key] = label2id_map[key] + + self.id2label_map_for_show = dict() + for key in self.label2id_map_for_draw: + val = self.label2id_map_for_draw[key] + if key == "O": + self.id2label_map_for_show[val] = key + if key.startswith("B-") or key.startswith("I-"): + self.id2label_map_for_show[val] = key[2:] + else: + self.id2label_map_for_show[val] = key + + def __call__(self, preds, batch=None, *args, **kwargs): + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() + + if batch is not None: + return self._metric(preds, batch[1]) + else: + return self._infer(preds, **kwargs) + + def _metric(self, preds, label): + pred_idxs = preds.argmax(axis=2) + decode_out_list = [[] for _ in range(pred_idxs.shape[0])] + label_decode_out_list = [[] for _ in range(pred_idxs.shape[0])] + + for i in range(pred_idxs.shape[0]): + for j in range(pred_idxs.shape[1]): + if label[i, j] != -100: + label_decode_out_list[i].append(self.id2label_map[label[i, + j]]) + decode_out_list[i].append(self.id2label_map[pred_idxs[i, + j]]) + return decode_out_list, label_decode_out_list + + def _infer(self, preds, attention_masks, segment_offset_ids, ocr_infos): + results = [] + + for pred, attention_mask, segment_offset_id, ocr_info in zip( + preds, attention_masks, segment_offset_ids, ocr_infos): + pred = np.argmax(pred, axis=1) + pred = [self.id2label_map[idx] for idx in pred] + + for idx in range(len(segment_offset_id)): + if idx == 0: + start_id = 0 + else: + start_id = segment_offset_id[idx - 1] + + end_id = segment_offset_id[idx] + + curr_pred = pred[start_id:end_id] + curr_pred = [self.label2id_map_for_draw[p] for p in curr_pred] + + if len(curr_pred) <= 0: + pred_id = 0 + else: + counts = np.bincount(curr_pred) + pred_id = np.argmax(counts) + ocr_info[idx]["pred_id"] = int(pred_id) + ocr_info[idx]["pred"] = self.id2label_map_for_show[int(pred_id)] + results.append(ocr_info) + return results diff --git a/backend/ppocr/utils/__init__.py b/backend/ppocr/utils/__init__.py new file mode 100755 index 0000000..abf198b --- /dev/null +++ b/backend/ppocr/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/backend/ppocr/utils/dict/ar_dict.txt b/backend/ppocr/utils/dict/ar_dict.txt new file mode 100644 index 0000000..fc63802 --- /dev/null +++ b/backend/ppocr/utils/dict/ar_dict.txt @@ -0,0 +1,117 @@ +a +r +b +i +c +_ +m +g +/ +1 +0 +I +L +S +V +R +C +2 +v +l +6 +3 +9 +. 
+j +p +ا +ل +م +ر +ج +و +ح +ي +ة +5 +8 +7 +أ +ب +ض +4 +ك +س +ه +ث +ن +ط +ع +ت +غ +خ +ف +ئ +ز +إ +د +ص +ظ +ذ +ش +ى +ق +ؤ +آ +ء +s +e +n +w +t +u +z +d +A +N +G +h +o +E +T +H +O +B +y +F +U +J +X +W +P +Z +M +k +q +Y +Q +D +f +K +x +' +% +- +# +@ +! +& +$ +, +: +é +? ++ +É +( + diff --git a/backend/ppocr/utils/dict/arabic_dict.txt b/backend/ppocr/utils/dict/arabic_dict.txt new file mode 100644 index 0000000..916d421 --- /dev/null +++ b/backend/ppocr/utils/dict/arabic_dict.txt @@ -0,0 +1,161 @@ +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +ء +آ +أ +ؤ +إ +ئ +ا +ب +ة +ت +ث +ج +ح +خ +د +ذ +ر +ز +س +ش +ص +ض +ط +ظ +ع +غ +ف +ق +ك +ل +م +ن +ه +و +ى +ي +ً +ٌ +ٍ +َ +ُ +ِ +ّ +ْ +ٓ +ٔ +ٰ +ٱ +ٹ +پ +چ +ڈ +ڑ +ژ +ک +ڭ +گ +ں +ھ +ۀ +ہ +ۂ +ۃ +ۆ +ۇ +ۈ +ۋ +ی +ې +ے +ۓ +ە +١ +٢ +٣ +٤ +٥ +٦ +٧ +٨ +٩ diff --git a/backend/ppocr/utils/dict/be_dict.txt b/backend/ppocr/utils/dict/be_dict.txt new file mode 100644 index 0000000..f8458ba --- /dev/null +++ b/backend/ppocr/utils/dict/be_dict.txt @@ -0,0 +1,145 @@ +b +e +_ +i +m +g +/ +2 +0 +I +L +S +V +R +C +1 +v +a +l +6 +9 +4 +3 +. +j +p +п +а +з +б +у +г +н +ц +ь +8 +м +л +і +о +ў +ы +7 +5 +М +х +с +р +ф +я +е +д +ж +ю +ч +й +к +Д +в +Б +т +І +ш +ё +э +К +Л +Н +А +Ж +Г +В +П +З +Е +О +Р +С +У +Ё +Й +Т +Ч +Э +Ц +Ю +Ш +Ф +Х +Я +Ь +Ы +Ў +s +c +n +w +M +o +t +T +E +A +B +u +h +y +k +r +H +d +Y +O +U +F +f +x +D +G +N +K +P +z +J +X +W +Z +Q +% +- +q +@ +' +! +# +& +, +: +$ +( +? +é ++ +É + diff --git a/backend/ppocr/utils/dict/bg_dict.txt b/backend/ppocr/utils/dict/bg_dict.txt new file mode 100644 index 0000000..84713c3 --- /dev/null +++ b/backend/ppocr/utils/dict/bg_dict.txt @@ -0,0 +1,140 @@ +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +А +Б +В +Г +Д +Е +Ж +З +И +Й +К +Л +М +Н +О +П +Р +С +Т +У +Ф +Х +Ц +Ч +Ш +Щ +Ъ +Ю +Я +а +б +в +г +д +е +ж +з +и +й +к +л +м +н +о +п +р +с +т +у +ф +х +ц +ч +ш +щ +ъ +ь +ю +я + diff --git a/backend/ppocr/utils/dict/ch_dict.txt b/backend/ppocr/utils/dict/ch_dict.txt new file mode 100644 index 0000000..84b885d --- /dev/null +++ b/backend/ppocr/utils/dict/ch_dict.txt @@ -0,0 +1,6623 @@ +' +疗 +绚 +诚 +娇 +溜 +题 +贿 +者 +廖 +更 +纳 +加 +奉 +公 +一 +就 +汴 +计 +与 +路 +房 +原 +妇 +2 +0 +8 +- +7 +其 +> +: +] +, +, +骑 +刈 +全 +消 +昏 +傈 +安 +久 +钟 +嗅 +不 +影 +处 +驽 +蜿 +资 +关 +椤 +地 +瘸 +专 +问 +忖 +票 +嫉 +炎 +韵 +要 +月 +田 +节 +陂 +鄙 +捌 +备 +拳 +伺 +眼 +网 +盎 +大 +傍 +心 +东 +愉 +汇 +蹿 +科 +每 +业 +里 +航 +晏 +字 +平 +录 +先 +1 +3 +彤 +鲶 +产 +稍 +督 +腴 +有 +象 +岳 +注 +绍 +在 +泺 +文 +定 +核 +名 +水 +过 +理 +让 +偷 +率 +等 +这 +发 +” +为 +含 +肥 +酉 +相 +鄱 +七 +编 +猥 +锛 +日 +镀 +蒂 +掰 +倒 +辆 +栾 +栗 +综 +涩 +州 +雌 +滑 +馀 +了 +机 +块 +司 +宰 +甙 +兴 +矽 +抚 +保 +用 +沧 +秩 +如 +收 +息 +滥 +页 +疑 +埠 +! +! 
+姥 +异 +橹 +钇 +向 +下 +跄 +的 +椴 +沫 +国 +绥 +獠 +报 +开 +民 +蜇 +何 +分 +凇 +长 +讥 +藏 +掏 +施 +羽 +中 +讲 +派 +嘟 +人 +提 +浼 +间 +世 +而 +古 +多 +倪 +唇 +饯 +控 +庚 +首 +赛 +蜓 +味 +断 +制 +觉 +技 +替 +艰 +溢 +潮 +夕 +钺 +外 +摘 +枋 +动 +双 +单 +啮 +户 +枇 +确 +锦 +曜 +杜 +或 +能 +效 +霜 +盒 +然 +侗 +电 +晁 +放 +步 +鹃 +新 +杖 +蜂 +吒 +濂 +瞬 +评 +总 +隍 +对 +独 +合 +也 +是 +府 +青 +天 +诲 +墙 +组 +滴 +级 +邀 +帘 +示 +已 +时 +骸 +仄 +泅 +和 +遨 +店 +雇 +疫 +持 +巍 +踮 +境 +只 +亨 +目 +鉴 +崤 +闲 +体 +泄 +杂 +作 +般 +轰 +化 +解 +迂 +诿 +蛭 +璀 +腾 +告 +版 +服 +省 +师 +小 +规 +程 +线 +海 +办 +引 +二 +桧 +牌 +砺 +洄 +裴 +修 +图 +痫 +胡 +许 +犊 +事 +郛 +基 +柴 +呼 +食 +研 +奶 +律 +蛋 +因 +葆 +察 +戏 +褒 +戒 +再 +李 +骁 +工 +貂 +油 +鹅 +章 +啄 +休 +场 +给 +睡 +纷 +豆 +器 +捎 +说 +敏 +学 +会 +浒 +设 +诊 +格 +廓 +查 +来 +霓 +室 +溆 +¢ +诡 +寥 +焕 +舜 +柒 +狐 +回 +戟 +砾 +厄 +实 +翩 +尿 +五 +入 +径 +惭 +喹 +股 +宇 +篝 +| +; +美 +期 +云 +九 +祺 +扮 +靠 +锝 +槌 +系 +企 +酰 +阊 +暂 +蚕 +忻 +豁 +本 +羹 +执 +条 +钦 +H +獒 +限 +进 +季 +楦 +于 +芘 +玖 +铋 +茯 +未 +答 +粘 +括 +样 +精 +欠 +矢 +甥 +帷 +嵩 +扣 +令 +仔 +风 +皈 +行 +支 +部 +蓉 +刮 +站 +蜡 +救 +钊 +汗 +松 +嫌 +成 +可 +. +鹤 +院 +从 +交 +政 +怕 +活 +调 +球 +局 +验 +髌 +第 +韫 +谗 +串 +到 +圆 +年 +米 +/ +* +友 +忿 +检 +区 +看 +自 +敢 +刃 +个 +兹 +弄 +流 +留 +同 +没 +齿 +星 +聆 +轼 +湖 +什 +三 +建 +蛔 +儿 +椋 +汕 +震 +颧 +鲤 +跟 +力 +情 +璺 +铨 +陪 +务 +指 +族 +训 +滦 +鄣 +濮 +扒 +商 +箱 +十 +召 +慷 +辗 +所 +莞 +管 +护 +臭 +横 +硒 +嗓 +接 +侦 +六 +露 +党 +馋 +驾 +剖 +高 +侬 +妪 +幂 +猗 +绺 +骐 +央 +酐 +孝 +筝 +课 +徇 +缰 +门 +男 +西 +项 +句 +谙 +瞒 +秃 +篇 +教 +碲 +罚 +声 +呐 +景 +前 +富 +嘴 +鳌 +稀 +免 +朋 +啬 +睐 +去 +赈 +鱼 +住 +肩 +愕 +速 +旁 +波 +厅 +健 +茼 +厥 +鲟 +谅 +投 +攸 +炔 +数 +方 +击 +呋 +谈 +绩 +别 +愫 +僚 +躬 +鹧 +胪 +炳 +招 +喇 +膨 +泵 +蹦 +毛 +结 +5 +4 +谱 +识 +陕 +粽 +婚 +拟 +构 +且 +搜 +任 +潘 +比 +郢 +妨 +醪 +陀 +桔 +碘 +扎 +选 +哈 +骷 +楷 +亿 +明 +缆 +脯 +监 +睫 +逻 +婵 +共 +赴 +淝 +凡 +惦 +及 +达 +揖 +谩 +澹 +减 +焰 +蛹 +番 +祁 +柏 +员 +禄 +怡 +峤 +龙 +白 +叽 +生 +闯 +起 +细 +装 +谕 +竟 +聚 +钙 +上 +导 +渊 +按 +艾 +辘 +挡 +耒 +盹 +饪 +臀 +记 +邮 +蕙 +受 +各 +医 +搂 +普 +滇 +朗 +茸 +带 +翻 +酚 +( +光 +堤 +墟 +蔷 +万 +幻 +〓 +瑙 +辈 +昧 +盏 +亘 +蛀 +吉 +铰 +请 +子 +假 +闻 +税 +井 +诩 +哨 +嫂 +好 +面 +琐 +校 +馊 +鬣 +缂 +营 +访 +炖 +占 +农 +缀 +否 +经 +钚 +棵 +趟 +张 +亟 +吏 +茶 +谨 +捻 +论 +迸 +堂 +玉 +信 +吧 +瞠 +乡 +姬 +寺 +咬 +溏 +苄 +皿 +意 +赉 +宝 +尔 +钰 +艺 +特 +唳 +踉 +都 +荣 +倚 +登 +荐 +丧 +奇 +涵 +批 +炭 +近 +符 +傩 +感 +道 +着 +菊 +虹 +仲 +众 +懈 +濯 +颞 +眺 +南 +释 +北 +缝 +标 +既 +茗 +整 +撼 +迤 +贲 +挎 +耱 +拒 +某 +妍 +卫 +哇 +英 +矶 +藩 +治 +他 +元 +领 +膜 +遮 +穗 +蛾 +飞 +荒 +棺 +劫 +么 +市 +火 +温 +拈 +棚 +洼 +转 +果 +奕 +卸 +迪 +伸 +泳 +斗 +邡 +侄 +涨 +屯 +萋 +胭 +氡 +崮 +枞 +惧 +冒 +彩 +斜 +手 +豚 +随 +旭 +淑 +妞 +形 +菌 +吲 +沱 +争 +驯 +歹 +挟 +兆 +柱 +传 +至 +包 +内 +响 +临 +红 +功 +弩 +衡 +寂 +禁 +老 +棍 +耆 +渍 +织 +害 +氵 +渑 +布 +载 +靥 +嗬 +虽 +苹 +咨 +娄 +库 +雉 +榜 +帜 +嘲 +套 +瑚 +亲 +簸 +欧 +边 +6 +腿 +旮 +抛 +吹 +瞳 +得 +镓 +梗 +厨 +继 +漾 +愣 +憨 +士 +策 +窑 +抑 +躯 +襟 +脏 +参 +贸 +言 +干 +绸 +鳄 +穷 +藜 +音 +折 +详 +) +举 +悍 +甸 +癌 +黎 +谴 +死 +罩 +迁 +寒 +驷 +袖 +媒 +蒋 +掘 +模 +纠 +恣 +观 +祖 +蛆 +碍 +位 +稿 +主 +澧 +跌 +筏 +京 +锏 +帝 +贴 +证 +糠 +才 +黄 +鲸 +略 +炯 +饱 +四 +出 +园 +犀 +牧 +容 +汉 +杆 +浈 +汰 +瑷 +造 +虫 +瘩 +怪 +驴 +济 +应 +花 +沣 +谔 +夙 +旅 +价 +矿 +以 +考 +s +u +呦 +晒 +巡 +茅 +准 +肟 +瓴 +詹 +仟 +褂 +译 +桌 +混 +宁 +怦 +郑 +抿 +些 +余 +鄂 +饴 +攒 +珑 +群 +阖 +岔 +琨 +藓 +预 +环 +洮 +岌 +宀 +杲 +瀵 +最 +常 +囡 +周 +踊 +女 +鼓 +袭 +喉 +简 +范 +薯 +遐 +疏 +粱 +黜 +禧 +法 +箔 +斤 +遥 +汝 +奥 +直 +贞 +撑 +置 +绱 +集 +她 +馅 +逗 +钧 +橱 +魉 +[ +恙 +躁 +唤 +9 +旺 +膘 +待 +脾 +惫 +购 +吗 +依 +盲 +度 +瘿 +蠖 +俾 +之 +镗 +拇 +鲵 +厝 +簧 +续 +款 +展 +啃 +表 +剔 +品 +钻 +腭 +损 +清 +锶 +统 +涌 +寸 +滨 +贪 +链 +吠 +冈 +伎 +迥 +咏 +吁 +览 +防 +迅 +失 +汾 +阔 +逵 +绀 +蔑 +列 +川 +凭 +努 +熨 +揪 +利 +俱 +绉 +抢 +鸨 +我 +即 +责 +膦 +易 +毓 +鹊 +刹 +玷 +岿 +空 +嘞 +绊 +排 +术 +估 +锷 +违 +们 +苟 +铜 +播 +肘 +件 +烫 +审 +鲂 +广 +像 +铌 +惰 +铟 +巳 +胍 +鲍 +康 +憧 +色 +恢 +想 +拷 +尤 +疳 +知 +S +Y +F +D +A +峄 +裕 +帮 +握 +搔 +氐 +氘 +难 +墒 +沮 +雨 +叁 +缥 +悴 +藐 +湫 +娟 +苑 +稠 +颛 +簇 +后 +阕 +闭 +蕤 +缚 +怎 +佞 +码 +嘤 +蔡 +痊 +舱 +螯 +帕 +赫 +昵 +升 +烬 +岫 +、 +疵 +蜻 +髁 +蕨 +隶 +烛 +械 +丑 +盂 +梁 +强 +鲛 +由 +拘 +揉 +劭 +龟 +撤 +钩 +呕 +孛 +费 +妻 +漂 +求 +阑 +崖 +秤 +甘 +通 +深 +补 +赃 +坎 +床 +啪 +承 +吼 +量 +暇 +钼 +烨 +阂 +擎 +脱 +逮 +称 +P +神 +属 +矗 +华 +届 +狍 +葑 +汹 +育 +患 +窒 +蛰 +佼 +静 +槎 +运 +鳗 +庆 +逝 +曼 +疱 +克 +代 +官 +此 +麸 +耧 +蚌 +晟 +例 +础 +榛 +副 +测 +唰 +缢 +迹 +灬 +霁 +身 +岁 +赭 
+扛 +又 +菡 +乜 +雾 +板 +读 +陷 +徉 +贯 +郁 +虑 +变 +钓 +菜 +圾 +现 +琢 +式 +乐 +维 +渔 +浜 +左 +吾 +脑 +钡 +警 +T +啵 +拴 +偌 +漱 +湿 +硕 +止 +骼 +魄 +积 +燥 +联 +踢 +玛 +则 +窿 +见 +振 +畿 +送 +班 +钽 +您 +赵 +刨 +印 +讨 +踝 +籍 +谡 +舌 +崧 +汽 +蔽 +沪 +酥 +绒 +怖 +财 +帖 +肱 +私 +莎 +勋 +羔 +霸 +励 +哼 +帐 +将 +帅 +渠 +纪 +婴 +娩 +岭 +厘 +滕 +吻 +伤 +坝 +冠 +戊 +隆 +瘁 +介 +涧 +物 +黍 +并 +姗 +奢 +蹑 +掣 +垸 +锴 +命 +箍 +捉 +病 +辖 +琰 +眭 +迩 +艘 +绌 +繁 +寅 +若 +毋 +思 +诉 +类 +诈 +燮 +轲 +酮 +狂 +重 +反 +职 +筱 +县 +委 +磕 +绣 +奖 +晋 +濉 +志 +徽 +肠 +呈 +獐 +坻 +口 +片 +碰 +几 +村 +柿 +劳 +料 +获 +亩 +惕 +晕 +厌 +号 +罢 +池 +正 +鏖 +煨 +家 +棕 +复 +尝 +懋 +蜥 +锅 +岛 +扰 +队 +坠 +瘾 +钬 +@ +卧 +疣 +镇 +譬 +冰 +彷 +频 +黯 +据 +垄 +采 +八 +缪 +瘫 +型 +熹 +砰 +楠 +襁 +箐 +但 +嘶 +绳 +啤 +拍 +盥 +穆 +傲 +洗 +盯 +塘 +怔 +筛 +丿 +台 +恒 +喂 +葛 +永 +¥ +烟 +酒 +桦 +书 +砂 +蚝 +缉 +态 +瀚 +袄 +圳 +轻 +蛛 +超 +榧 +遛 +姒 +奘 +铮 +右 +荽 +望 +偻 +卡 +丶 +氰 +附 +做 +革 +索 +戚 +坨 +桷 +唁 +垅 +榻 +岐 +偎 +坛 +莨 +山 +殊 +微 +骇 +陈 +爨 +推 +嗝 +驹 +澡 +藁 +呤 +卤 +嘻 +糅 +逛 +侵 +郓 +酌 +德 +摇 +※ +鬃 +被 +慨 +殡 +羸 +昌 +泡 +戛 +鞋 +河 +宪 +沿 +玲 +鲨 +翅 +哽 +源 +铅 +语 +照 +邯 +址 +荃 +佬 +顺 +鸳 +町 +霭 +睾 +瓢 +夸 +椁 +晓 +酿 +痈 +咔 +侏 +券 +噎 +湍 +签 +嚷 +离 +午 +尚 +社 +锤 +背 +孟 +使 +浪 +缦 +潍 +鞅 +军 +姹 +驶 +笑 +鳟 +鲁 +》 +孽 +钜 +绿 +洱 +礴 +焯 +椰 +颖 +囔 +乌 +孔 +巴 +互 +性 +椽 +哞 +聘 +昨 +早 +暮 +胶 +炀 +隧 +低 +彗 +昝 +铁 +呓 +氽 +藉 +喔 +癖 +瑗 +姨 +权 +胱 +韦 +堑 +蜜 +酋 +楝 +砝 +毁 +靓 +歙 +锲 +究 +屋 +喳 +骨 +辨 +碑 +武 +鸠 +宫 +辜 +烊 +适 +坡 +殃 +培 +佩 +供 +走 +蜈 +迟 +翼 +况 +姣 +凛 +浔 +吃 +飘 +债 +犟 +金 +促 +苛 +崇 +坂 +莳 +畔 +绂 +兵 +蠕 +斋 +根 +砍 +亢 +欢 +恬 +崔 +剁 +餐 +榫 +快 +扶 +‖ +濒 +缠 +鳜 +当 +彭 +驭 +浦 +篮 +昀 +锆 +秸 +钳 +弋 +娣 +瞑 +夷 +龛 +苫 +拱 +致 +% +嵊 +障 +隐 +弑 +初 +娓 +抉 +汩 +累 +蓖 +" +唬 +助 +苓 +昙 +押 +毙 +破 +城 +郧 +逢 +嚏 +獭 +瞻 +溱 +婿 +赊 +跨 +恼 +璧 +萃 +姻 +貉 +灵 +炉 +密 +氛 +陶 +砸 +谬 +衔 +点 +琛 +沛 +枳 +层 +岱 +诺 +脍 +榈 +埂 +征 +冷 +裁 +打 +蹴 +素 +瘘 +逞 +蛐 +聊 +激 +腱 +萘 +踵 +飒 +蓟 +吆 +取 +咙 +簋 +涓 +矩 +曝 +挺 +揣 +座 +你 +史 +舵 +焱 +尘 +苏 +笈 +脚 +溉 +榨 +诵 +樊 +邓 +焊 +义 +庶 +儋 +蟋 +蒲 +赦 +呷 +杞 +诠 +豪 +还 +试 +颓 +茉 +太 +除 +紫 +逃 +痴 +草 +充 +鳕 +珉 +祗 +墨 +渭 +烩 +蘸 +慕 +璇 +镶 +穴 +嵘 +恶 +骂 +险 +绋 +幕 +碉 +肺 +戳 +刘 +潞 +秣 +纾 +潜 +銮 +洛 +须 +罘 +销 +瘪 +汞 +兮 +屉 +r +林 +厕 +质 +探 +划 +狸 +殚 +善 +煊 +烹 +〒 +锈 +逯 +宸 +辍 +泱 +柚 +袍 +远 +蹋 +嶙 +绝 +峥 +娥 +缍 +雀 +徵 +认 +镱 +谷 += +贩 +勉 +撩 +鄯 +斐 +洋 +非 +祚 +泾 +诒 +饿 +撬 +威 +晷 +搭 +芍 +锥 +笺 +蓦 +候 +琊 +档 +礁 +沼 +卵 +荠 +忑 +朝 +凹 +瑞 +头 +仪 +弧 +孵 +畏 +铆 +突 +衲 +车 +浩 +气 +茂 +悖 +厢 +枕 +酝 +戴 +湾 +邹 +飚 +攘 +锂 +写 +宵 +翁 +岷 +无 +喜 +丈 +挑 +嗟 +绛 +殉 +议 +槽 +具 +醇 +淞 +笃 +郴 +阅 +饼 +底 +壕 +砚 +弈 +询 +缕 +庹 +翟 +零 +筷 +暨 +舟 +闺 +甯 +撞 +麂 +茌 +蔼 +很 +珲 +捕 +棠 +角 +阉 +媛 +娲 +诽 +剿 +尉 +爵 +睬 +韩 +诰 +匣 +危 +糍 +镯 +立 +浏 +阳 +少 +盆 +舔 +擘 +匪 +申 +尬 +铣 +旯 +抖 +赘 +瓯 +居 +ˇ +哮 +游 +锭 +茏 +歌 +坏 +甚 +秒 +舞 +沙 +仗 +劲 +潺 +阿 +燧 +郭 +嗖 +霏 +忠 +材 +奂 +耐 +跺 +砀 +输 +岖 +媳 +氟 +极 +摆 +灿 +今 +扔 +腻 +枝 +奎 +药 +熄 +吨 +话 +q +额 +慑 +嘌 +协 +喀 +壳 +埭 +视 +著 +於 +愧 +陲 +翌 +峁 +颅 +佛 +腹 +聋 +侯 +咎 +叟 +秀 +颇 +存 +较 +罪 +哄 +岗 +扫 +栏 +钾 +羌 +己 +璨 +枭 +霉 +煌 +涸 +衿 +键 +镝 +益 +岢 +奏 +连 +夯 +睿 +冥 +均 +糖 +狞 +蹊 +稻 +爸 +刿 +胥 +煜 +丽 +肿 +璃 +掸 +跚 +灾 +垂 +樾 +濑 +乎 +莲 +窄 +犹 +撮 +战 +馄 +软 +络 +显 +鸢 +胸 +宾 +妲 +恕 +埔 +蝌 +份 +遇 +巧 +瞟 +粒 +恰 +剥 +桡 +博 +讯 +凯 +堇 +阶 +滤 +卖 +斌 +骚 +彬 +兑 +磺 +樱 +舷 +两 +娱 +福 +仃 +差 +找 +桁 +÷ +净 +把 +阴 +污 +戬 +雷 +碓 +蕲 +楚 +罡 +焖 +抽 +妫 +咒 +仑 +闱 +尽 +邑 +菁 +爱 +贷 +沥 +鞑 +牡 +嗉 +崴 +骤 +塌 +嗦 +订 +拮 +滓 +捡 +锻 +次 +坪 +杩 +臃 +箬 +融 +珂 +鹗 +宗 +枚 +降 +鸬 +妯 +阄 +堰 +盐 +毅 +必 +杨 +崃 +俺 +甬 +状 +莘 +货 +耸 +菱 +腼 +铸 +唏 +痤 +孚 +澳 +懒 +溅 +翘 +疙 +杷 +淼 +缙 +骰 +喊 +悉 +砻 +坷 +艇 +赁 +界 +谤 +纣 +宴 +晃 +茹 +归 +饭 +梢 +铡 +街 +抄 +肼 +鬟 +苯 +颂 +撷 +戈 +炒 +咆 +茭 +瘙 +负 +仰 +客 +琉 +铢 +封 +卑 +珥 +椿 +镧 +窨 +鬲 +寿 +御 +袤 +铃 +萎 +砖 +餮 +脒 +裳 +肪 +孕 +嫣 +馗 +嵇 +恳 +氯 +江 +石 +褶 +冢 +祸 +阻 +狈 +羞 +银 +靳 +透 +咳 +叼 +敷 +芷 +啥 +它 +瓤 +兰 +痘 +懊 +逑 +肌 +往 +捺 +坊 +甩 +呻 +〃 +沦 +忘 +膻 +祟 +菅 +剧 +崆 +智 +坯 +臧 +霍 +墅 +攻 +眯 +倘 +拢 +骠 +铐 +庭 +岙 +瓠 +′ +缺 +泥 +迢 +捶 +? +? 
+郏 +喙 +掷 +沌 +纯 +秘 +种 +听 +绘 +固 +螨 +团 +香 +盗 +妒 +埚 +蓝 +拖 +旱 +荞 +铀 +血 +遏 +汲 +辰 +叩 +拽 +幅 +硬 +惶 +桀 +漠 +措 +泼 +唑 +齐 +肾 +念 +酱 +虚 +屁 +耶 +旗 +砦 +闵 +婉 +馆 +拭 +绅 +韧 +忏 +窝 +醋 +葺 +顾 +辞 +倜 +堆 +辋 +逆 +玟 +贱 +疾 +董 +惘 +倌 +锕 +淘 +嘀 +莽 +俭 +笏 +绑 +鲷 +杈 +择 +蟀 +粥 +嗯 +驰 +逾 +案 +谪 +褓 +胫 +哩 +昕 +颚 +鲢 +绠 +躺 +鹄 +崂 +儒 +俨 +丝 +尕 +泌 +啊 +萸 +彰 +幺 +吟 +骄 +苣 +弦 +脊 +瑰 +〈 +诛 +镁 +析 +闪 +剪 +侧 +哟 +框 +螃 +守 +嬗 +燕 +狭 +铈 +缮 +概 +迳 +痧 +鲲 +俯 +售 +笼 +痣 +扉 +挖 +满 +咋 +援 +邱 +扇 +歪 +便 +玑 +绦 +峡 +蛇 +叨 +〖 +泽 +胃 +斓 +喋 +怂 +坟 +猪 +该 +蚬 +炕 +弥 +赞 +棣 +晔 +娠 +挲 +狡 +创 +疖 +铕 +镭 +稷 +挫 +弭 +啾 +翔 +粉 +履 +苘 +哦 +楼 +秕 +铂 +土 +锣 +瘟 +挣 +栉 +习 +享 +桢 +袅 +磨 +桂 +谦 +延 +坚 +蔚 +噗 +署 +谟 +猬 +钎 +恐 +嬉 +雒 +倦 +衅 +亏 +璩 +睹 +刻 +殿 +王 +算 +雕 +麻 +丘 +柯 +骆 +丸 +塍 +谚 +添 +鲈 +垓 +桎 +蚯 +芥 +予 +飕 +镦 +谌 +窗 +醚 +菀 +亮 +搪 +莺 +蒿 +羁 +足 +J +真 +轶 +悬 +衷 +靛 +翊 +掩 +哒 +炅 +掐 +冼 +妮 +l +谐 +稚 +荆 +擒 +犯 +陵 +虏 +浓 +崽 +刍 +陌 +傻 +孜 +千 +靖 +演 +矜 +钕 +煽 +杰 +酗 +渗 +伞 +栋 +俗 +泫 +戍 +罕 +沾 +疽 +灏 +煦 +芬 +磴 +叱 +阱 +榉 +湃 +蜀 +叉 +醒 +彪 +租 +郡 +篷 +屎 +良 +垢 +隗 +弱 +陨 +峪 +砷 +掴 +颁 +胎 +雯 +绵 +贬 +沐 +撵 +隘 +篙 +暖 +曹 +陡 +栓 +填 +臼 +彦 +瓶 +琪 +潼 +哪 +鸡 +摩 +啦 +俟 +锋 +域 +耻 +蔫 +疯 +纹 +撇 +毒 +绶 +痛 +酯 +忍 +爪 +赳 +歆 +嘹 +辕 +烈 +册 +朴 +钱 +吮 +毯 +癜 +娃 +谀 +邵 +厮 +炽 +璞 +邃 +丐 +追 +词 +瓒 +忆 +轧 +芫 +谯 +喷 +弟 +半 +冕 +裙 +掖 +墉 +绮 +寝 +苔 +势 +顷 +褥 +切 +衮 +君 +佳 +嫒 +蚩 +霞 +佚 +洙 +逊 +镖 +暹 +唛 +& +殒 +顶 +碗 +獗 +轭 +铺 +蛊 +废 +恹 +汨 +崩 +珍 +那 +杵 +曲 +纺 +夏 +薰 +傀 +闳 +淬 +姘 +舀 +拧 +卷 +楂 +恍 +讪 +厩 +寮 +篪 +赓 +乘 +灭 +盅 +鞣 +沟 +慎 +挂 +饺 +鼾 +杳 +树 +缨 +丛 +絮 +娌 +臻 +嗳 +篡 +侩 +述 +衰 +矛 +圈 +蚜 +匕 +筹 +匿 +濞 +晨 +叶 +骋 +郝 +挚 +蚴 +滞 +增 +侍 +描 +瓣 +吖 +嫦 +蟒 +匾 +圣 +赌 +毡 +癞 +恺 +百 +曳 +需 +篓 +肮 +庖 +帏 +卿 +驿 +遗 +蹬 +鬓 +骡 +歉 +芎 +胳 +屐 +禽 +烦 +晌 +寄 +媾 +狄 +翡 +苒 +船 +廉 +终 +痞 +殇 +々 +畦 +饶 +改 +拆 +悻 +萄 +£ +瓿 +乃 +訾 +桅 +匮 +溧 +拥 +纱 +铍 +骗 +蕃 +龋 +缬 +父 +佐 +疚 +栎 +醍 +掳 +蓄 +x +惆 +颜 +鲆 +榆 +〔 +猎 +敌 +暴 +谥 +鲫 +贾 +罗 +玻 +缄 +扦 +芪 +癣 +落 +徒 +臾 +恿 +猩 +托 +邴 +肄 +牵 +春 +陛 +耀 +刊 +拓 +蓓 +邳 +堕 +寇 +枉 +淌 +啡 +湄 +兽 +酷 +萼 +碚 +濠 +萤 +夹 +旬 +戮 +梭 +琥 +椭 +昔 +勺 +蜊 +绐 +晚 +孺 +僵 +宣 +摄 +冽 +旨 +萌 +忙 +蚤 +眉 +噼 +蟑 +付 +契 +瓜 +悼 +颡 +壁 +曾 +窕 +颢 +澎 +仿 +俑 +浑 +嵌 +浣 +乍 +碌 +褪 +乱 +蔟 +隙 +玩 +剐 +葫 +箫 +纲 +围 +伐 +决 +伙 +漩 +瑟 +刑 +肓 +镳 +缓 +蹭 +氨 +皓 +典 +畲 +坍 +铑 +檐 +塑 +洞 +倬 +储 +胴 +淳 +戾 +吐 +灼 +惺 +妙 +毕 +珐 +缈 +虱 +盖 +羰 +鸿 +磅 +谓 +髅 +娴 +苴 +唷 +蚣 +霹 +抨 +贤 +唠 +犬 +誓 +逍 +庠 +逼 +麓 +籼 +釉 +呜 +碧 +秧 +氩 +摔 +霄 +穸 +纨 +辟 +妈 +映 +完 +牛 +缴 +嗷 +炊 +恩 +荔 +茆 +掉 +紊 +慌 +莓 +羟 +阙 +萁 +磐 +另 +蕹 +辱 +鳐 +湮 +吡 +吩 +唐 +睦 +垠 +舒 +圜 +冗 +瞿 +溺 +芾 +囱 +匠 +僳 +汐 +菩 +饬 +漓 +黑 +霰 +浸 +濡 +窥 +毂 +蒡 +兢 +驻 +鹉 +芮 +诙 +迫 +雳 +厂 +忐 +臆 +猴 +鸣 +蚪 +栈 +箕 +羡 +渐 +莆 +捍 +眈 +哓 +趴 +蹼 +埕 +嚣 +骛 +宏 +淄 +斑 +噜 +严 +瑛 +垃 +椎 +诱 +压 +庾 +绞 +焘 +廿 +抡 +迄 +棘 +夫 +纬 +锹 +眨 +瞌 +侠 +脐 +竞 +瀑 +孳 +骧 +遁 +姜 +颦 +荪 +滚 +萦 +伪 +逸 +粳 +爬 +锁 +矣 +役 +趣 +洒 +颔 +诏 +逐 +奸 +甭 +惠 +攀 +蹄 +泛 +尼 +拼 +阮 +鹰 +亚 +颈 +惑 +勒 +〉 +际 +肛 +爷 +刚 +钨 +丰 +养 +冶 +鲽 +辉 +蔻 +画 +覆 +皴 +妊 +麦 +返 +醉 +皂 +擀 +〗 +酶 +凑 +粹 +悟 +诀 +硖 +港 +卜 +z +杀 +涕 +± +舍 +铠 +抵 +弛 +段 +敝 +镐 +奠 +拂 +轴 +跛 +袱 +e +t +沉 +菇 +俎 +薪 +峦 +秭 +蟹 +历 +盟 +菠 +寡 +液 +肢 +喻 +染 +裱 +悱 +抱 +氙 +赤 +捅 +猛 +跑 +氮 +谣 +仁 +尺 +辊 +窍 +烙 +衍 +架 +擦 +倏 +璐 +瑁 +币 +楞 +胖 +夔 +趸 +邛 +惴 +饕 +虔 +蝎 +§ +哉 +贝 +宽 +辫 +炮 +扩 +饲 +籽 +魏 +菟 +锰 +伍 +猝 +末 +琳 +哚 +蛎 +邂 +呀 +姿 +鄞 +却 +歧 +仙 +恸 +椐 +森 +牒 +寤 +袒 +婆 +虢 +雅 +钉 +朵 +贼 +欲 +苞 +寰 +故 +龚 +坭 +嘘 +咫 +礼 +硷 +兀 +睢 +汶 +’ +铲 +烧 +绕 +诃 +浃 +钿 +哺 +柜 +讼 +颊 +璁 +腔 +洽 +咐 +脲 +簌 +筠 +镣 +玮 +鞠 +谁 +兼 +姆 +挥 +梯 +蝴 +谘 +漕 +刷 +躏 +宦 +弼 +b +垌 +劈 +麟 +莉 +揭 +笙 +渎 +仕 +嗤 +仓 +配 +怏 +抬 +错 +泯 +镊 +孰 +猿 +邪 +仍 +秋 +鼬 +壹 +歇 +吵 +炼 +< +尧 +射 +柬 +廷 +胧 +霾 +凳 +隋 +肚 +浮 +梦 +祥 +株 +堵 +退 +L +鹫 +跎 +凶 +毽 +荟 +炫 +栩 +玳 +甜 +沂 +鹿 +顽 +伯 +爹 +赔 +蛴 +徐 +匡 +欣 +狰 +缸 +雹 +蟆 +疤 +默 +沤 +啜 +痂 +衣 +禅 +w +i +h +辽 +葳 +黝 +钗 +停 +沽 +棒 +馨 +颌 +肉 +吴 +硫 +悯 +劾 +娈 +马 +啧 +吊 +悌 +镑 +峭 +帆 +瀣 +涉 +咸 +疸 +滋 +泣 +翦 +拙 +癸 +钥 +蜒 ++ +尾 +庄 +凝 +泉 +婢 +渴 +谊 +乞 +陆 +锉 +糊 +鸦 +淮 +I +B +N +晦 +弗 +乔 +庥 +葡 +尻 +席 +橡 +傣 +渣 +拿 +惩 +麋 +斛 +缃 +矮 +蛏 +岘 +鸽 +姐 +膏 +催 +奔 +镒 +喱 +蠡 +摧 +钯 +胤 +柠 +拐 +璋 +鸥 +卢 +荡 +倾 +^ +_ +珀 +逄 +萧 +塾 +掇 +贮 +笆 +聂 +圃 +冲 +嵬 +M +滔 +笕 +值 
+炙 +偶 +蜱 +搐 +梆 +汪 +蔬 +腑 +鸯 +蹇 +敞 +绯 +仨 +祯 +谆 +梧 +糗 +鑫 +啸 +豺 +囹 +猾 +巢 +柄 +瀛 +筑 +踌 +沭 +暗 +苁 +鱿 +蹉 +脂 +蘖 +牢 +热 +木 +吸 +溃 +宠 +序 +泞 +偿 +拜 +檩 +厚 +朐 +毗 +螳 +吞 +媚 +朽 +担 +蝗 +橘 +畴 +祈 +糟 +盱 +隼 +郜 +惜 +珠 +裨 +铵 +焙 +琚 +唯 +咚 +噪 +骊 +丫 +滢 +勤 +棉 +呸 +咣 +淀 +隔 +蕾 +窈 +饨 +挨 +煅 +短 +匙 +粕 +镜 +赣 +撕 +墩 +酬 +馁 +豌 +颐 +抗 +酣 +氓 +佑 +搁 +哭 +递 +耷 +涡 +桃 +贻 +碣 +截 +瘦 +昭 +镌 +蔓 +氚 +甲 +猕 +蕴 +蓬 +散 +拾 +纛 +狼 +猷 +铎 +埋 +旖 +矾 +讳 +囊 +糜 +迈 +粟 +蚂 +紧 +鲳 +瘢 +栽 +稼 +羊 +锄 +斟 +睁 +桥 +瓮 +蹙 +祉 +醺 +鼻 +昱 +剃 +跳 +篱 +跷 +蒜 +翎 +宅 +晖 +嗑 +壑 +峻 +癫 +屏 +狠 +陋 +袜 +途 +憎 +祀 +莹 +滟 +佶 +溥 +臣 +约 +盛 +峰 +磁 +慵 +婪 +拦 +莅 +朕 +鹦 +粲 +裤 +哎 +疡 +嫖 +琵 +窟 +堪 +谛 +嘉 +儡 +鳝 +斩 +郾 +驸 +酊 +妄 +胜 +贺 +徙 +傅 +噌 +钢 +栅 +庇 +恋 +匝 +巯 +邈 +尸 +锚 +粗 +佟 +蛟 +薹 +纵 +蚊 +郅 +绢 +锐 +苗 +俞 +篆 +淆 +膀 +鲜 +煎 +诶 +秽 +寻 +涮 +刺 +怀 +噶 +巨 +褰 +魅 +灶 +灌 +桉 +藕 +谜 +舸 +薄 +搀 +恽 +借 +牯 +痉 +渥 +愿 +亓 +耘 +杠 +柩 +锔 +蚶 +钣 +珈 +喘 +蹒 +幽 +赐 +稗 +晤 +莱 +泔 +扯 +肯 +菪 +裆 +腩 +豉 +疆 +骜 +腐 +倭 +珏 +唔 +粮 +亡 +润 +慰 +伽 +橄 +玄 +誉 +醐 +胆 +龊 +粼 +塬 +陇 +彼 +削 +嗣 +绾 +芽 +妗 +垭 +瘴 +爽 +薏 +寨 +龈 +泠 +弹 +赢 +漪 +猫 +嘧 +涂 +恤 +圭 +茧 +烽 +屑 +痕 +巾 +赖 +荸 +凰 +腮 +畈 +亵 +蹲 +偃 +苇 +澜 +艮 +换 +骺 +烘 +苕 +梓 +颉 +肇 +哗 +悄 +氤 +涠 +葬 +屠 +鹭 +植 +竺 +佯 +诣 +鲇 +瘀 +鲅 +邦 +移 +滁 +冯 +耕 +癔 +戌 +茬 +沁 +巩 +悠 +湘 +洪 +痹 +锟 +循 +谋 +腕 +鳃 +钠 +捞 +焉 +迎 +碱 +伫 +急 +榷 +奈 +邝 +卯 +辄 +皲 +卟 +醛 +畹 +忧 +稳 +雄 +昼 +缩 +阈 +睑 +扌 +耗 +曦 +涅 +捏 +瞧 +邕 +淖 +漉 +铝 +耦 +禹 +湛 +喽 +莼 +琅 +诸 +苎 +纂 +硅 +始 +嗨 +傥 +燃 +臂 +赅 +嘈 +呆 +贵 +屹 +壮 +肋 +亍 +蚀 +卅 +豹 +腆 +邬 +迭 +浊 +} +童 +螂 +捐 +圩 +勐 +触 +寞 +汊 +壤 +荫 +膺 +渌 +芳 +懿 +遴 +螈 +泰 +蓼 +蛤 +茜 +舅 +枫 +朔 +膝 +眙 +避 +梅 +判 +鹜 +璜 +牍 +缅 +垫 +藻 +黔 +侥 +惚 +懂 +踩 +腰 +腈 +札 +丞 +唾 +慈 +顿 +摹 +荻 +琬 +~ +斧 +沈 +滂 +胁 +胀 +幄 +莜 +Z +匀 +鄄 +掌 +绰 +茎 +焚 +赋 +萱 +谑 +汁 +铒 +瞎 +夺 +蜗 +野 +娆 +冀 +弯 +篁 +懵 +灞 +隽 +芡 +脘 +俐 +辩 +芯 +掺 +喏 +膈 +蝈 +觐 +悚 +踹 +蔗 +熠 +鼠 +呵 +抓 +橼 +峨 +畜 +缔 +禾 +崭 +弃 +熊 +摒 +凸 +拗 +穹 +蒙 +抒 +祛 +劝 +闫 +扳 +阵 +醌 +踪 +喵 +侣 +搬 +仅 +荧 +赎 +蝾 +琦 +买 +婧 +瞄 +寓 +皎 +冻 +赝 +箩 +莫 +瞰 +郊 +笫 +姝 +筒 +枪 +遣 +煸 +袋 +舆 +痱 +涛 +母 +〇 +启 +践 +耙 +绲 +盘 +遂 +昊 +搞 +槿 +诬 +纰 +泓 +惨 +檬 +亻 +越 +C +o +憩 +熵 +祷 +钒 +暧 +塔 +阗 +胰 +咄 +娶 +魔 +琶 +钞 +邻 +扬 +杉 +殴 +咽 +弓 +〆 +髻 +】 +吭 +揽 +霆 +拄 +殖 +脆 +彻 +岩 +芝 +勃 +辣 +剌 +钝 +嘎 +甄 +佘 +皖 +伦 +授 +徕 +憔 +挪 +皇 +庞 +稔 +芜 +踏 +溴 +兖 +卒 +擢 +饥 +鳞 +煲 +‰ +账 +颗 +叻 +斯 +捧 +鳍 +琮 +讹 +蛙 +纽 +谭 +酸 +兔 +莒 +睇 +伟 +觑 +羲 +嗜 +宜 +褐 +旎 +辛 +卦 +诘 +筋 +鎏 +溪 +挛 +熔 +阜 +晰 +鳅 +丢 +奚 +灸 +呱 +献 +陉 +黛 +鸪 +甾 +萨 +疮 +拯 +洲 +疹 +辑 +叙 +恻 +谒 +允 +柔 +烂 +氏 +逅 +漆 +拎 +惋 +扈 +湟 +纭 +啕 +掬 +擞 +哥 +忽 +涤 +鸵 +靡 +郗 +瓷 +扁 +廊 +怨 +雏 +钮 +敦 +E +懦 +憋 +汀 +拚 +啉 +腌 +岸 +f +痼 +瞅 +尊 +咀 +眩 +飙 +忌 +仝 +迦 +熬 +毫 +胯 +篑 +茄 +腺 +凄 +舛 +碴 +锵 +诧 +羯 +後 +漏 +汤 +宓 +仞 +蚁 +壶 +谰 +皑 +铄 +棰 +罔 +辅 +晶 +苦 +牟 +闽 +\ +烃 +饮 +聿 +丙 +蛳 +朱 +煤 +涔 +鳖 +犁 +罐 +荼 +砒 +淦 +妤 +黏 +戎 +孑 +婕 +瑾 +戢 +钵 +枣 +捋 +砥 +衩 +狙 +桠 +稣 +阎 +肃 +梏 +诫 +孪 +昶 +婊 +衫 +嗔 +侃 +塞 +蜃 +樵 +峒 +貌 +屿 +欺 +缫 +阐 +栖 +诟 +珞 +荭 +吝 +萍 +嗽 +恂 +啻 +蜴 +磬 +峋 +俸 +豫 +谎 +徊 +镍 +韬 +魇 +晴 +U +囟 +猜 +蛮 +坐 +囿 +伴 +亭 +肝 +佗 +蝠 +妃 +胞 +滩 +榴 +氖 +垩 +苋 +砣 +扪 +馏 +姓 +轩 +厉 +夥 +侈 +禀 +垒 +岑 +赏 +钛 +辐 +痔 +披 +纸 +碳 +“ +坞 +蠓 +挤 +荥 +沅 +悔 +铧 +帼 +蒌 +蝇 +a +p +y +n +g +哀 +浆 +瑶 +凿 +桶 +馈 +皮 +奴 +苜 +佤 +伶 +晗 +铱 +炬 +优 +弊 +氢 +恃 +甫 +攥 +端 +锌 +灰 +稹 +炝 +曙 +邋 +亥 +眶 +碾 +拉 +萝 +绔 +捷 +浍 +腋 +姑 +菖 +凌 +涞 +麽 +锢 +桨 +潢 +绎 +镰 +殆 +锑 +渝 +铬 +困 +绽 +觎 +匈 +糙 +暑 +裹 +鸟 +盔 +肽 +迷 +綦 +『 +亳 +佝 +俘 +钴 +觇 +骥 +仆 +疝 +跪 +婶 +郯 +瀹 +唉 +脖 +踞 +针 +晾 +忒 +扼 +瞩 +叛 +椒 +疟 +嗡 +邗 +肆 +跆 +玫 +忡 +捣 +咧 +唆 +艄 +蘑 +潦 +笛 +阚 +沸 +泻 +掊 +菽 +贫 +斥 +髂 +孢 +镂 +赂 +麝 +鸾 +屡 +衬 +苷 +恪 +叠 +希 +粤 +爻 +喝 +茫 +惬 +郸 +绻 +庸 +撅 +碟 +宄 +妹 +膛 +叮 +饵 +崛 +嗲 +椅 +冤 +搅 +咕 +敛 +尹 +垦 +闷 +蝉 +霎 +勰 +败 +蓑 +泸 +肤 +鹌 +幌 +焦 +浠 +鞍 +刁 +舰 +乙 +竿 +裔 +。 +茵 +函 +伊 +兄 +丨 +娜 +匍 +謇 +莪 +宥 +似 +蝽 +翳 +酪 +翠 +粑 +薇 +祢 +骏 +赠 +叫 +Q +噤 +噻 +竖 +芗 +莠 +潭 +俊 +羿 +耜 +O +郫 +趁 +嗪 +囚 +蹶 +芒 +洁 +笋 +鹑 +敲 +硝 +啶 +堡 +渲 +揩 +』 +携 +宿 +遒 +颍 +扭 +棱 +割 +萜 +蔸 +葵 +琴 +捂 +饰 +衙 +耿 +掠 +募 +岂 +窖 +涟 +蔺 +瘤 +柞 +瞪 +怜 +匹 +距 +楔 +炜 +哆 +秦 +缎 +幼 +茁 +绪 +痨 +恨 +楸 +娅 +瓦 +桩 +雪 +嬴 +伏 +榔 +妥 +铿 +拌 +眠 +雍 +缇 +‘ +卓 +搓 +哌 +觞 +噩 +屈 +哧 +髓 +咦 +巅 +娑 +侑 +淫 +膳 +祝 +勾 +姊 +莴 
+胄 +疃 +薛 +蜷 +胛 +巷 +芙 +芋 +熙 +闰 +勿 +窃 +狱 +剩 +钏 +幢 +陟 +铛 +慧 +靴 +耍 +k +浙 +浇 +飨 +惟 +绗 +祜 +澈 +啼 +咪 +磷 +摞 +诅 +郦 +抹 +跃 +壬 +吕 +肖 +琏 +颤 +尴 +剡 +抠 +凋 +赚 +泊 +津 +宕 +殷 +倔 +氲 +漫 +邺 +涎 +怠 +$ +垮 +荬 +遵 +俏 +叹 +噢 +饽 +蜘 +孙 +筵 +疼 +鞭 +羧 +牦 +箭 +潴 +c +眸 +祭 +髯 +啖 +坳 +愁 +芩 +驮 +倡 +巽 +穰 +沃 +胚 +怒 +凤 +槛 +剂 +趵 +嫁 +v +邢 +灯 +鄢 +桐 +睽 +檗 +锯 +槟 +婷 +嵋 +圻 +诗 +蕈 +颠 +遭 +痢 +芸 +怯 +馥 +竭 +锗 +徜 +恭 +遍 +籁 +剑 +嘱 +苡 +龄 +僧 +桑 +潸 +弘 +澶 +楹 +悲 +讫 +愤 +腥 +悸 +谍 +椹 +呢 +桓 +葭 +攫 +阀 +翰 +躲 +敖 +柑 +郎 +笨 +橇 +呃 +魁 +燎 +脓 +葩 +磋 +垛 +玺 +狮 +沓 +砜 +蕊 +锺 +罹 +蕉 +翱 +虐 +闾 +巫 +旦 +茱 +嬷 +枯 +鹏 +贡 +芹 +汛 +矫 +绁 +拣 +禺 +佃 +讣 +舫 +惯 +乳 +趋 +疲 +挽 +岚 +虾 +衾 +蠹 +蹂 +飓 +氦 +铖 +孩 +稞 +瑜 +壅 +掀 +勘 +妓 +畅 +髋 +W +庐 +牲 +蓿 +榕 +练 +垣 +唱 +邸 +菲 +昆 +婺 +穿 +绡 +麒 +蚱 +掂 +愚 +泷 +涪 +漳 +妩 +娉 +榄 +讷 +觅 +旧 +藤 +煮 +呛 +柳 +腓 +叭 +庵 +烷 +阡 +罂 +蜕 +擂 +猖 +咿 +媲 +脉 +【 +沏 +貅 +黠 +熏 +哲 +烁 +坦 +酵 +兜 +× +潇 +撒 +剽 +珩 +圹 +乾 +摸 +樟 +帽 +嗒 +襄 +魂 +轿 +憬 +锡 +〕 +喃 +皆 +咖 +隅 +脸 +残 +泮 +袂 +鹂 +珊 +囤 +捆 +咤 +误 +徨 +闹 +淙 +芊 +淋 +怆 +囗 +拨 +梳 +渤 +R +G +绨 +蚓 +婀 +幡 +狩 +麾 +谢 +唢 +裸 +旌 +伉 +纶 +裂 +驳 +砼 +咛 +澄 +樨 +蹈 +宙 +澍 +倍 +貔 +操 +勇 +蟠 +摈 +砧 +虬 +够 +缁 +悦 +藿 +撸 +艹 +摁 +淹 +豇 +虎 +榭 +ˉ +吱 +d +° +喧 +荀 +踱 +侮 +奋 +偕 +饷 +犍 +惮 +坑 +璎 +徘 +宛 +妆 +袈 +倩 +窦 +昂 +荏 +乖 +K +怅 +撰 +鳙 +牙 +袁 +酞 +X +痿 +琼 +闸 +雁 +趾 +荚 +虻 +涝 +《 +杏 +韭 +偈 +烤 +绫 +鞘 +卉 +症 +遢 +蓥 +诋 +杭 +荨 +匆 +竣 +簪 +辙 +敕 +虞 +丹 +缭 +咩 +黟 +m +淤 +瑕 +咂 +铉 +硼 +茨 +嶂 +痒 +畸 +敬 +涿 +粪 +窘 +熟 +叔 +嫔 +盾 +忱 +裘 +憾 +梵 +赡 +珙 +咯 +娘 +庙 +溯 +胺 +葱 +痪 +摊 +荷 +卞 +乒 +髦 +寐 +铭 +坩 +胗 +枷 +爆 +溟 +嚼 +羚 +砬 +轨 +惊 +挠 +罄 +竽 +菏 +氧 +浅 +楣 +盼 +枢 +炸 +阆 +杯 +谏 +噬 +淇 +渺 +俪 +秆 +墓 +泪 +跻 +砌 +痰 +垡 +渡 +耽 +釜 +讶 +鳎 +煞 +呗 +韶 +舶 +绷 +鹳 +缜 +旷 +铊 +皱 +龌 +檀 +霖 +奄 +槐 +艳 +蝶 +旋 +哝 +赶 +骞 +蚧 +腊 +盈 +丁 +` +蜚 +矸 +蝙 +睨 +嚓 +僻 +鬼 +醴 +夜 +彝 +磊 +笔 +拔 +栀 +糕 +厦 +邰 +纫 +逭 +纤 +眦 +膊 +馍 +躇 +烯 +蘼 +冬 +诤 +暄 +骶 +哑 +瘠 +」 +臊 +丕 +愈 +咱 +螺 +擅 +跋 +搏 +硪 +谄 +笠 +淡 +嘿 +骅 +谧 +鼎 +皋 +姚 +歼 +蠢 +驼 +耳 +胬 +挝 +涯 +狗 +蒽 +孓 +犷 +凉 +芦 +箴 +铤 +孤 +嘛 +坤 +V +茴 +朦 +挞 +尖 +橙 +诞 +搴 +碇 +洵 +浚 +帚 +蜍 +漯 +柘 +嚎 +讽 +芭 +荤 +咻 +祠 +秉 +跖 +埃 +吓 +糯 +眷 +馒 +惹 +娼 +鲑 +嫩 +讴 +轮 +瞥 +靶 +褚 +乏 +缤 +宋 +帧 +删 +驱 +碎 +扑 +俩 +俄 +偏 +涣 +竹 +噱 +皙 +佰 +渚 +唧 +斡 +# +镉 +刀 +崎 +筐 +佣 +夭 +贰 +肴 +峙 +哔 +艿 +匐 +牺 +镛 +缘 +仡 +嫡 +劣 +枸 +堀 +梨 +簿 +鸭 +蒸 +亦 +稽 +浴 +{ +衢 +束 +槲 +j +阁 +揍 +疥 +棋 +潋 +聪 +窜 +乓 +睛 +插 +冉 +阪 +苍 +搽 +「 +蟾 +螟 +幸 +仇 +樽 +撂 +慢 +跤 +幔 +俚 +淅 +覃 +觊 +溶 +妖 +帛 +侨 +曰 +妾 +泗 +· +: +瀘 +風 +Ë +( +) +∶ +紅 +紗 +瑭 +雲 +頭 +鶏 +財 +許 +• +¥ +樂 +焗 +麗 +— +; +滙 +東 +榮 +繪 +興 +… +門 +業 +π +楊 +國 +顧 +é +盤 +寳 +Λ +龍 +鳳 +島 +誌 +緣 +結 +銭 +萬 +勝 +祎 +璟 +優 +歡 +臨 +時 +購 += +★ +藍 +昇 +鐵 +觀 +勅 +農 +聲 +畫 +兿 +術 +發 +劉 +記 +專 +耑 +園 +書 +壴 +種 +Ο +● +褀 +號 +銀 +匯 +敟 +锘 +葉 +橪 +廣 +進 +蒄 +鑽 +阝 +祙 +貢 +鍋 +豊 +夬 +喆 +團 +閣 +開 +燁 +賓 +館 +酡 +沔 +順 ++ +硚 +劵 +饸 +陽 +車 +湓 +復 +萊 +氣 +軒 +華 +堃 +迮 +纟 +戶 +馬 +學 +裡 +電 +嶽 +獨 +マ +シ +サ +ジ +燘 +袪 +環 +❤ +臺 +灣 +専 +賣 +孖 +聖 +攝 +線 +▪ +α +傢 +俬 +夢 +達 +莊 +喬 +貝 +薩 +劍 +羅 +壓 +棛 +饦 +尃 +璈 +囍 +醫 +G +I +A +# +N +鷄 +髙 +嬰 +啓 +約 +隹 +潔 +賴 +藝 +~ +寶 +籣 +麺 +  +嶺 +√ +義 +網 +峩 +長 +∧ +魚 +機 +構 +② +鳯 +偉 +L +B +㙟 +畵 +鴿 +' +詩 +溝 +嚞 +屌 +藔 +佧 +玥 +蘭 +織 +1 +3 +9 +0 +7 +點 +砭 +鴨 +鋪 +銘 +廳 +弍 +‧ +創 +湯 +坶 +℃ +卩 +骝 +& +烜 +荘 +當 +潤 +扞 +係 +懷 +碶 +钅 +蚨 +讠 +☆ +叢 +爲 +埗 +涫 +塗 +→ +楽 +現 +鯨 +愛 +瑪 +鈺 +忄 +悶 +藥 +飾 +樓 +視 +孬 +ㆍ +燚 +苪 +師 +① +丼 +锽 +│ +韓 +標 +è +兒 +閏 +匋 +張 +漢 +Ü +髪 +會 +閑 +檔 +習 +裝 +の +峯 +菘 +輝 +И +雞 +釣 +億 +浐 +K +O +R +8 +H +E +P +T +W +D +S +C +M +F +姌 +饹 +» +晞 +廰 +ä +嵯 +鷹 +負 +飲 +絲 +冚 +楗 +澤 +綫 +區 +❋ +← +質 +靑 +揚 +③ +滬 +統 +産 +協 +﹑ +乸 +畐 +經 +運 +際 +洺 +岽 +為 +粵 +諾 +崋 +豐 +碁 +ɔ +V +2 +6 +齋 +誠 +訂 +´ +勑 +雙 +陳 +無 +í +泩 +媄 +夌 +刂 +i +c +t +o +r +a +嘢 +耄 +燴 +暃 +壽 +媽 +靈 +抻 +體 +唻 +É +冮 +甹 +鎮 +錦 +ʌ +蜛 +蠄 +尓 +駕 +戀 +飬 +逹 +倫 +貴 +極 +Я +Й +寬 +磚 +嶪 +郎 +職 +| +間 +n +d +剎 +伈 +課 +飛 +橋 +瘊 +№ +譜 +骓 +圗 +滘 +縣 +粿 +咅 +養 +濤 +彳 +® +% +Ⅱ +啰 +㴪 +見 +矞 +薬 +糁 +邨 +鲮 +顔 +罱 +З +選 +話 +贏 +氪 +俵 +競 +瑩 +繡 +枱 +β +綉 +á +獅 +爾 +™ +麵 +戋 +淩 +徳 +個 +劇 +場 +務 +簡 +寵 +h +實 +膠 +轱 +圖 +築 +嘣 +樹 +㸃 +營 +耵 +孫 +饃 +鄺 +飯 +麯 +遠 +輸 +坫 +孃 +乚 
+閃 +鏢 +㎡ +題 +廠 +關 +↑ +爺 +將 +軍 +連 +篦 +覌 +參 +箸 +- +窠 +棽 +寕 +夀 +爰 +歐 +呙 +閥 +頡 +熱 +雎 +垟 +裟 +凬 +勁 +帑 +馕 +夆 +疌 +枼 +馮 +貨 +蒤 +樸 +彧 +旸 +靜 +龢 +暢 +㐱 +鳥 +珺 +鏡 +灡 +爭 +堷 +廚 +Ó +騰 +診 +┅ +蘇 +褔 +凱 +頂 +豕 +亞 +帥 +嘬 +⊥ +仺 +桖 +複 +饣 +絡 +穂 +顏 +棟 +納 +▏ +濟 +親 +設 +計 +攵 +埌 +烺 +ò +頤 +燦 +蓮 +撻 +節 +講 +濱 +濃 +娽 +洳 +朿 +燈 +鈴 +護 +膚 +铔 +過 +補 +Z +U +5 +4 +坋 +闿 +䖝 +餘 +缐 +铞 +貿 +铪 +桼 +趙 +鍊 +[ +㐂 +垚 +菓 +揸 +捲 +鐘 +滏 +𣇉 +爍 +輪 +燜 +鴻 +鮮 +動 +鹞 +鷗 +丄 +慶 +鉌 +翥 +飮 +腸 +⇋ +漁 +覺 +來 +熘 +昴 +翏 +鲱 +圧 +鄉 +萭 +頔 +爐 +嫚 +г +貭 +類 +聯 +幛 +輕 +訓 +鑒 +夋 +锨 +芃 +珣 +䝉 +扙 +嵐 +銷 +處 +ㄱ +語 +誘 +苝 +歸 +儀 +燒 +楿 +內 +粢 +葒 +奧 +麥 +礻 +滿 +蠔 +穵 +瞭 +態 +鱬 +榞 +硂 +鄭 +黃 +煙 +祐 +奓 +逺 +* +瑄 +獲 +聞 +薦 +讀 +這 +樣 +決 +問 +啟 +們 +執 +説 +轉 +單 +隨 +唘 +帶 +倉 +庫 +還 +贈 +尙 +皺 +■ +餅 +產 +○ +∈ +報 +狀 +楓 +賠 +琯 +嗮 +禮 +` +傳 +> +≤ +嗞 +Φ +≥ +換 +咭 +∣ +↓ +曬 +ε +応 +寫 +″ +終 +様 +純 +費 +療 +聨 +凍 +壐 +郵 +ü +黒 +∫ +製 +塊 +調 +軽 +確 +撃 +級 +馴 +Ⅲ +涇 +繹 +數 +碼 +證 +狒 +処 +劑 +< +晧 +賀 +衆 +] +櫥 +兩 +陰 +絶 +對 +鯉 +憶 +◎ +p +e +Y +蕒 +煖 +頓 +測 +試 +鼽 +僑 +碩 +妝 +帯 +≈ +鐡 +舖 +權 +喫 +倆 +ˋ +該 +悅 +ā +俫 +. +f +s +b +m +k +g +u +j +貼 +淨 +濕 +針 +適 +備 +l +/ +給 +謢 +強 +觸 +衛 +與 +⊙ +$ +緯 +變 +⑴ +⑵ +⑶ +㎏ +殺 +∩ +幚 +─ +價 +▲ +離 +ú +ó +飄 +烏 +関 +閟 +﹝ +﹞ +邏 +輯 +鍵 +驗 +訣 +導 +歷 +屆 +層 +▼ +儱 +錄 +熳 +ē +艦 +吋 +錶 +辧 +飼 +顯 +④ +禦 +販 +気 +対 +枰 +閩 +紀 +幹 +瞓 +貊 +淚 +△ +眞 +墊 +Ω +獻 +褲 +縫 +緑 +亜 +鉅 +餠 +{ +} +◆ +蘆 +薈 +█ +◇ +溫 +彈 +晳 +粧 +犸 +穩 +訊 +崬 +凖 +熥 +П +舊 +條 +紋 +圍 +Ⅳ +筆 +尷 +難 +雜 +錯 +綁 +識 +頰 +鎖 +艶 +□ +殁 +殼 +⑧ +├ +▕ +鵬 +ǐ +ō +ǒ +糝 +綱 +▎ +μ +盜 +饅 +醬 +籤 +蓋 +釀 +鹽 +據 +à +ɡ +辦 +◥ +彐 +┌ +婦 +獸 +鲩 +伱 +ī +蒟 +蒻 +齊 +袆 +腦 +寧 +凈 +妳 +煥 +詢 +偽 +謹 +啫 +鯽 +騷 +鱸 +損 +傷 +鎻 +髮 +買 +冏 +儥 +両 +﹢ +∞ +載 +喰 +z +羙 +悵 +燙 +曉 +員 +組 +徹 +艷 +痠 +鋼 +鼙 +縮 +細 +嚒 +爯 +≠ +維 +" +鱻 +壇 +厍 +帰 +浥 +犇 +薡 +軎 +² +應 +醜 +刪 +緻 +鶴 +賜 +噁 +軌 +尨 +镔 +鷺 +槗 +彌 +葚 +濛 +請 +溇 +緹 +賢 +訪 +獴 +瑅 +資 +縤 +陣 +蕟 +栢 +韻 +祼 +恁 +伢 +謝 +劃 +涑 +總 +衖 +踺 +砋 +凉 +籃 +駿 +苼 +瘋 +昽 +紡 +驊 +腎 +﹗ +響 +杋 +剛 +嚴 +禪 +歓 +槍 +傘 +檸 +檫 +炣 +勢 +鏜 +鎢 +銑 +尐 +減 +奪 +惡 +θ +僮 +婭 +臘 +ū +ì +殻 +鉄 +∑ +蛲 +焼 +緖 +續 +紹 +懮 \ No newline at end of file diff --git a/backend/ppocr/utils/dict/chinese_cht_dict.txt b/backend/ppocr/utils/dict/chinese_cht_dict.txt new file mode 100644 index 0000000..cc1aa47 --- /dev/null +++ b/backend/ppocr/utils/dict/chinese_cht_dict.txt @@ -0,0 +1,8421 @@ +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? 
+@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +\ +] +^ +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +{ +| +} +~ +¥ +® +° +± +² +´ +· +» +É +Ë +Ó +× +Ü +à +á +ä +è +é +ì +í +ò +ó +÷ +ú +ü +ā +ē +ī +ō +ū +ǐ +ǒ +ɔ +ɡ +ʌ +ˋ +Λ +Ο +Φ +Ω +α +β +ε +θ +μ +π +З +И +Й +П +Я +г +— +‖ +‘ +’ +“ +” +• +… +‧ +′ +″ +※ +℃ +№ +™ +Ⅱ +Ⅲ +Ⅳ +← +↑ +→ +↓ +⇋ +∈ +∑ +√ +∞ +∣ +∧ +∩ +∫ +∶ +≈ +≠ +≤ +≥ +⊙ +⊥ +① +② +③ +④ +⑧ +⑴ +⑵ +⑶ +─ +│ +┅ +┌ +├ +█ +▎ +▏ +▕ +■ +□ +▪ +▲ +△ +▼ +◆ +◇ +○ +◎ +● +◥ +★ +☆ +❋ +❤ +  +、 +。 +〇 +〉 +《 +》 +「 +」 +『 +』 +【 +】 +〔 +〕 +〖 +〗 +の +サ +シ +ジ +マ +ㄱ +ㆍ +㎏ +㎡ +㐂 +㐱 +㙟 +㴪 +㸃 +䖝 +䝉 +䰾 +䲁 +一 +丁 +七 +丄 +丈 +三 +上 +下 +丌 +不 +与 +丏 +丐 +丑 +且 +丕 +世 +丘 +丙 +丞 +丟 +両 +並 +丨 +丫 +中 +丰 +串 +丶 +丸 +丹 +主 +丼 +丿 +乂 +乃 +久 +么 +之 +乍 +乎 +乏 +乒 +乓 +乖 +乗 +乘 +乙 +乚 +乜 +九 +乞 +也 +乩 +乭 +乳 +乸 +乹 +乾 +亀 +亂 +亅 +了 +予 +亊 +事 +二 +亍 +云 +互 +亓 +五 +井 +亘 +些 +亜 +亞 +亟 +亠 +亡 +亢 +交 +亥 +亦 +亨 +享 +京 +亭 +亮 +亰 +亳 +亶 +亹 +人 +亻 +什 +仁 +仂 +仃 +仄 +仇 +仉 +今 +介 +仍 +仏 +仔 +仕 +他 +仗 +付 +仙 +仛 +仝 +仞 +仟 +仡 +代 +令 +以 +仨 +仫 +仮 +仰 +仲 +仳 +仵 +件 +仺 +任 +仼 +份 +仿 +企 +伃 +伈 +伉 +伊 +伋 +伍 +伎 +伏 +伐 +休 +伕 +伙 +伝 +伢 +伯 +估 +伱 +伴 +伶 +伷 +伸 +伺 +似 +伽 +伾 +佀 +佁 +佃 +但 +佇 +佈 +佉 +佋 +位 +低 +住 +佐 +佑 +体 +佔 +何 +佗 +佘 +余 +佚 +佛 +作 +佝 +佞 +佟 +你 +佣 +佤 +佧 +佩 +佬 +佯 +佰 +佳 +併 +佶 +佹 +佺 +佼 +佾 +使 +侁 +侃 +侄 +侅 +來 +侈 +侊 +例 +侍 +侏 +侑 +侖 +侗 +侘 +侚 +供 +依 +侞 +価 +侮 +侯 +侵 +侶 +侷 +侹 +便 +俁 +係 +促 +俄 +俅 +俊 +俋 +俌 +俍 +俎 +俏 +俐 +俑 +俗 +俘 +俚 +俛 +保 +俞 +俟 +俠 +信 +俬 +修 +俯 +俱 +俳 +俴 +俵 +俶 +俸 +俺 +俽 +俾 +倆 +倈 +倉 +個 +倌 +倍 +們 +倒 +倓 +倔 +倖 +倗 +倘 +候 +倚 +倜 +倞 +借 +倡 +倢 +倣 +値 +倦 +倧 +倩 +倪 +倫 +倬 +倭 +倮 +倻 +值 +偁 +偃 +假 +偈 +偉 +偊 +偌 +偍 +偎 +偏 +偓 +偕 +做 +停 +健 +偪 +偲 +側 +偵 +偶 +偷 +偸 +偽 +傀 +傃 +傅 +傈 +傉 +傍 +傑 +傒 +傕 +傖 +傘 +備 +傜 +傢 +傣 +催 +傭 +傲 +傳 +債 +傷 +傻 +傾 +僅 +僉 +僊 +働 +像 +僑 +僔 +僕 +僖 +僙 +僚 +僜 +僡 +僧 +僩 +僭 +僮 +僰 +僱 +僳 +僴 +僵 +價 +僻 +儀 +儁 +儂 +億 +儆 +儇 +儈 +儉 +儋 +儐 +儒 +儔 +儕 +儘 +儚 +儞 +償 +儡 +儥 +儦 +優 +儫 +儱 +儲 +儷 +儺 +儻 +儼 +兀 +允 +元 +兄 +充 +兆 +先 +光 +克 +兌 +免 +児 +兒 +兔 +兕 +兗 +兜 +入 +內 +全 +兩 +兪 +八 +公 +六 +兮 +共 +兵 +其 +具 +典 +兼 +兿 +冀 +冂 +円 +冇 +冉 +冊 +再 +冏 +冑 +冒 +冕 +冖 +冗 +冚 +冠 +冢 +冤 +冥 +冧 +冨 +冪 +冫 +冬 +冮 +冰 +冴 +冶 +冷 +冼 +冽 +凃 +凄 +准 +凈 +凋 +凌 +凍 +凖 +凜 +凝 +凞 +几 +凡 +処 +凪 +凬 +凰 +凱 +凳 +凵 +凶 +凸 +凹 +出 +函 +刀 +刁 +刂 +刃 +刄 +分 +切 +刈 +刊 +刎 +刑 +划 +列 +初 +判 +別 +刦 +刧 +刨 +利 +刪 +刮 +到 +制 +刷 +券 +刺 +刻 +刼 +剁 +剃 +則 +削 +剋 +剌 +前 +剎 +剏 +剔 +剖 +剛 +剝 +剡 +剣 +剩 +剪 +剮 +副 +割 +創 +剿 +劃 +劄 +劇 +劈 +劉 +劊 +劌 +劍 +劑 +劔 +力 +功 +加 +劣 +助 +努 +劫 +劬 +劭 +劵 +効 +劼 +劾 +勁 +勃 +勅 +勇 +勉 +勐 +勑 +勒 +勔 +動 +勖 +勗 +勘 +務 +勛 +勝 +勞 +募 +勢 +勣 +勤 +勦 +勰 +勱 +勲 +勳 +勵 +勷 +勸 +勺 +勻 +勾 +勿 +匂 +匄 +包 +匆 +匈 +匋 +匍 +匏 +匐 +匕 +化 +北 +匙 +匚 +匝 +匠 +匡 +匣 +匪 +匯 +匱 +匸 +匹 +匾 +匿 +區 +十 +千 +卅 +升 +午 +卉 +半 +卋 +卍 +卐 +卑 +卒 +卓 +協 +南 +博 +卜 +卞 +卟 +占 +卡 +卣 +卦 +卧 +卩 +卬 +卮 +卯 +印 +危 +卲 +即 +卵 +卷 +卸 +卹 +卺 +卻 +卽 +卿 +厄 +厓 +厔 +厙 +厚 +厝 +原 +厥 +厭 +厰 +厲 +厴 +厶 +去 +參 +叄 +又 +叉 +及 +友 +反 +収 +叔 +叕 +取 +受 +叛 +叟 +叡 +叢 +口 +古 +句 +另 +叨 +叩 +只 +叫 +召 +叭 +叮 +可 +台 +叱 +史 +右 +叵 +司 +叻 +叼 +吁 +吃 +各 +吆 +合 +吉 +吊 +吋 +同 +名 +后 +吏 +吐 +向 +吒 +吔 +吖 +君 +吝 +吞 +吟 +吠 +吡 +吥 +否 +吧 +吩 +含 +吮 +吱 +吲 +吳 +吵 +吶 +吸 +吹 +吻 +吼 +吾 +呀 +呂 +呃 +呈 +呉 +告 +呋 +呎 +呢 +呤 +呦 +周 +呱 +味 +呵 +呷 +呸 +呼 +命 +呾 +咀 +咁 +咂 +咄 +咅 +咆 +咋 +和 +咎 +咑 +咒 +咔 +咕 +咖 +咗 +咘 +咚 +咟 +咤 +咥 +咧 +咨 +咩 +咪 +咫 +咬 +咭 +咯 +咱 +咲 +咳 +咸 +咻 +咼 +咽 +咾 +咿 +哀 +品 +哂 +哄 +哆 +哇 +哈 +哉 +哌 +哎 +哏 +哐 +哖 +哚 +哞 +員 +哥 +哦 +哨 +哩 +哪 +哭 +哮 +哱 +哲 +哺 +哼 +唃 +唄 +唆 +唇 +唉 +唏 +唐 +唑 +唔 +唘 +唧 +唫 +唬 +唭 +售 +唯 +唱 +唳 +唵 +唷 +唸 +唻 +唾 +啁 +啃 +啄 +商 +啉 +啊 +啍 +問 +啓 +啖 +啚 +啜 +啞 +啟 +啡 +啣 +啤 +啥 +啦 +啪 +啫 +啯 +啰 +啱 +啲 +啵 +啶 +啷 +啻 +啼 +啾 +喀 +喂 +喃 +善 +喆 +喇 +喈 +喉 +喊 +喋 +喏 +喔 +喘 +喙 +喚 +喜 +喝 +喢 +喦 +喧 +喪 +喫 +喬 +單 +喰 +喱 +喲 +喳 +喵 +喹 +喻 +喼 +嗄 +嗅 +嗆 +嗇 +嗊 +嗎 +嗑 +嗒 +嗓 +嗔 +嗖 +嗚 +嗜 +嗝 +嗞 +嗡 +嗢 +嗣 +嗦 +嗨 +嗩 +嗪 +嗮 +嗯 +嗲 +嗶 +嗹 +嗽 +嘀 +嘅 +嘆 +嘉 +嘌 +嘍 +嘎 +嘏 +嘔 +嘗 +嘚 +嘛 +嘜 +嘞 +嘟 +嘢 +嘣 +嘥 +嘧 +嘩 +嘬 +嘮 +嘯 +嘰 +嘲 +嘴 
+嘶 +嘸 +嘹 +嘻 +嘿 +噁 +噌 +噍 +噏 +噓 +噗 +噝 +噠 +噢 +噤 +噥 +噦 +器 +噩 +噪 +噬 +噯 +噰 +噲 +噴 +噶 +噸 +噹 +噻 +嚇 +嚈 +嚎 +嚏 +嚐 +嚒 +嚓 +嚕 +嚗 +嚙 +嚞 +嚟 +嚤 +嚦 +嚧 +嚨 +嚩 +嚮 +嚳 +嚴 +嚶 +嚷 +嚼 +嚿 +囀 +囂 +囃 +囉 +囊 +囍 +囑 +囒 +囓 +囗 +囚 +四 +囝 +回 +因 +囡 +団 +囤 +囧 +囪 +囮 +囯 +困 +囲 +図 +囶 +囷 +囹 +固 +囿 +圂 +圃 +圄 +圈 +圉 +國 +圍 +圏 +園 +圓 +圖 +圗 +團 +圜 +土 +圧 +在 +圩 +圪 +圭 +圯 +地 +圳 +圻 +圾 +址 +均 +坊 +坋 +坌 +坍 +坎 +坐 +坑 +坖 +坡 +坣 +坤 +坦 +坨 +坩 +坪 +坫 +坬 +坭 +坮 +坯 +坳 +坵 +坶 +坷 +坻 +垂 +垃 +垈 +型 +垍 +垓 +垕 +垚 +垛 +垞 +垟 +垠 +垢 +垣 +垮 +垯 +垰 +垵 +垸 +垻 +垿 +埃 +埅 +埇 +埈 +埋 +埌 +城 +埏 +埒 +埔 +埕 +埗 +埜 +域 +埠 +埡 +埤 +埧 +埨 +埪 +埭 +埮 +埴 +埵 +執 +培 +基 +埻 +埼 +堀 +堂 +堃 +堅 +堆 +堇 +堈 +堉 +堊 +堍 +堖 +堝 +堡 +堤 +堦 +堪 +堮 +堯 +堰 +報 +場 +堵 +堷 +堺 +塀 +塅 +塆 +塊 +塋 +塌 +塍 +塏 +塑 +塔 +塗 +塘 +塙 +塜 +塞 +塡 +塢 +塤 +塨 +塩 +填 +塬 +塭 +塰 +塱 +塲 +塵 +塹 +塽 +塾 +墀 +境 +墅 +墉 +墊 +墎 +墓 +増 +墘 +墜 +增 +墟 +墡 +墣 +墨 +墩 +墫 +墬 +墮 +墱 +墳 +墺 +墼 +墾 +壁 +壄 +壆 +壇 +壋 +壌 +壎 +壐 +壑 +壓 +壔 +壕 +壘 +壙 +壞 +壟 +壠 +壢 +壤 +壩 +士 +壬 +壯 +壱 +壴 +壹 +壺 +壽 +夀 +夆 +変 +夊 +夋 +夌 +夏 +夔 +夕 +外 +夙 +多 +夜 +夠 +夢 +夤 +夥 +大 +天 +太 +夫 +夬 +夭 +央 +夯 +失 +夷 +夾 +奀 +奄 +奇 +奈 +奉 +奎 +奏 +奐 +契 +奓 +奔 +奕 +套 +奘 +奚 +奠 +奢 +奣 +奧 +奩 +奪 +奫 +奭 +奮 +女 +奴 +奶 +她 +好 +妀 +妁 +如 +妃 +妄 +妊 +妍 +妏 +妑 +妒 +妓 +妖 +妙 +妝 +妞 +妠 +妤 +妥 +妧 +妨 +妭 +妮 +妯 +妲 +妳 +妸 +妹 +妺 +妻 +妾 +姀 +姁 +姃 +姆 +姈 +姉 +姊 +始 +姌 +姍 +姐 +姑 +姒 +姓 +委 +姚 +姜 +姝 +姣 +姥 +姦 +姨 +姪 +姫 +姬 +姮 +姵 +姶 +姸 +姻 +姿 +威 +娃 +娉 +娋 +娌 +娍 +娎 +娑 +娖 +娘 +娛 +娜 +娟 +娠 +娣 +娥 +娩 +娫 +娳 +娶 +娸 +娼 +娽 +婀 +婁 +婆 +婉 +婊 +婑 +婕 +婚 +婢 +婦 +婧 +婪 +婭 +婯 +婷 +婺 +婻 +婼 +婿 +媃 +媄 +媊 +媐 +媒 +媓 +媖 +媗 +媚 +媛 +媜 +媞 +媧 +媭 +媯 +媲 +媳 +媺 +媼 +媽 +媾 +媿 +嫁 +嫂 +嫄 +嫈 +嫉 +嫌 +嫖 +嫘 +嫚 +嫡 +嫣 +嫦 +嫩 +嫪 +嫲 +嫳 +嫵 +嫺 +嫻 +嬅 +嬈 +嬉 +嬋 +嬌 +嬗 +嬛 +嬝 +嬡 +嬤 +嬨 +嬪 +嬬 +嬭 +嬰 +嬴 +嬸 +嬾 +嬿 +孀 +孃 +孆 +孋 +孌 +子 +孑 +孔 +孕 +孖 +字 +存 +孚 +孛 +孜 +孝 +孟 +孢 +季 +孤 +孩 +孫 +孬 +孮 +孰 +孳 +孵 +學 +孺 +孻 +孽 +孿 +宀 +它 +宅 +宇 +守 +安 +宋 +完 +宍 +宏 +宓 +宕 +宗 +官 +宙 +定 +宛 +宜 +実 +客 +宣 +室 +宥 +宦 +宧 +宮 +宰 +害 +宴 +宵 +家 +宸 +容 +宿 +寀 +寁 +寂 +寄 +寅 +密 +寇 +寈 +寊 +富 +寐 +寒 +寓 +寔 +寕 +寖 +寗 +寘 +寛 +寜 +寞 +察 +寡 +寢 +寤 +寥 +實 +寧 +寨 +審 +寫 +寬 +寮 +寯 +寰 +寳 +寵 +寶 +寸 +寺 +対 +封 +専 +尃 +射 +將 +專 +尉 +尊 +尋 +對 +導 +小 +尐 +少 +尓 +尕 +尖 +尗 +尙 +尚 +尢 +尤 +尨 +尪 +尬 +就 +尷 +尹 +尺 +尻 +尼 +尾 +尿 +局 +屁 +屄 +居 +屆 +屇 +屈 +屋 +屌 +屍 +屎 +屏 +屐 +屑 +屓 +展 +屚 +屜 +屠 +屢 +層 +履 +屬 +屭 +屯 +山 +屹 +屺 +屻 +岀 +岈 +岌 +岐 +岑 +岔 +岡 +岢 +岣 +岧 +岩 +岪 +岫 +岬 +岰 +岱 +岳 +岵 +岷 +岸 +岻 +峁 +峅 +峇 +峋 +峍 +峒 +峘 +峙 +峚 +峠 +峨 +峩 +峪 +峭 +峯 +峰 +峴 +島 +峻 +峼 +峽 +崁 +崆 +崇 +崈 +崋 +崍 +崎 +崐 +崑 +崒 +崔 +崖 +崗 +崘 +崙 +崚 +崛 +崞 +崟 +崠 +崢 +崤 +崧 +崩 +崬 +崮 +崱 +崴 +崵 +崶 +崽 +嵇 +嵊 +嵋 +嵌 +嵎 +嵐 +嵒 +嵕 +嵖 +嵗 +嵙 +嵛 +嵜 +嵨 +嵩 +嵬 +嵮 +嵯 +嵰 +嵴 +嵻 +嵿 +嶁 +嶂 +嶃 +嶄 +嶇 +嶋 +嶌 +嶍 +嶒 +嶔 +嶗 +嶝 +嶠 +嶢 +嶦 +嶧 +嶪 +嶬 +嶰 +嶲 +嶴 +嶷 +嶸 +嶺 +嶼 +嶽 +巂 +巄 +巆 +巋 +巌 +巍 +巎 +巑 +巒 +巔 +巖 +巘 +巛 +川 +州 +巡 +巢 +工 +左 +巧 +巨 +巫 +差 +巰 +己 +已 +巳 +巴 +巶 +巷 +巻 +巽 +巾 +巿 +市 +布 +帆 +希 +帑 +帔 +帕 +帖 +帘 +帙 +帚 +帛 +帝 +帡 +帢 +帥 +師 +席 +帯 +帰 +帳 +帶 +帷 +常 +帽 +幀 +幃 +幄 +幅 +幌 +幔 +幕 +幗 +幚 +幛 +幟 +幡 +幢 +幣 +幪 +幫 +干 +平 +年 +幵 +幷 +幸 +幹 +幺 +幻 +幼 +幽 +幾 +庀 +庁 +広 +庇 +床 +序 +底 +庖 +店 +庚 +府 +庠 +庢 +庥 +度 +座 +庫 +庭 +庲 +庵 +庶 +康 +庸 +庹 +庼 +庾 +廁 +廂 +廄 +廆 +廈 +廉 +廊 +廋 +廌 +廍 +廑 +廓 +廔 +廕 +廖 +廙 +廚 +廝 +廞 +廟 +廠 +廡 +廢 +廣 +廧 +廨 +廩 +廬 +廰 +廱 +廳 +延 +廷 +廸 +建 +廻 +廼 +廿 +弁 +弄 +弅 +弇 +弈 +弉 +弊 +弋 +弍 +式 +弐 +弒 +弓 +弔 +引 +弖 +弗 +弘 +弛 +弟 +弢 +弦 +弧 +弨 +弩 +弭 +弱 +張 +強 +弸 +弼 +弾 +彀 +彄 +彅 +彆 +彈 +彊 +彌 +彎 +彐 +彔 +彖 +彗 +彘 +彙 +彜 +彞 +彠 +彡 +形 +彣 +彤 +彥 +彧 +彩 +彪 +彫 +彬 +彭 +彰 +影 +彳 +彷 +役 +彼 +彿 +往 +征 +徂 +待 +徇 +很 +徉 +徊 +律 +後 +徐 +徑 +徒 +得 +徘 +徙 +徜 +從 +徠 +御 +徧 +徨 +復 +循 +徫 +徬 +徭 +微 +徳 +徴 +徵 +德 +徸 +徹 +徽 +心 +忄 +必 +忉 +忌 +忍 +忐 +忑 +忒 +志 +忘 +忙 +応 +忝 +忞 +忠 +快 +忬 +忯 +忱 +忳 +念 +忻 +忽 +忿 +怍 +怎 +怒 +怕 +怖 +怙 +怛 +思 +怠 +怡 +急 +怦 +性 +怨 +怪 +怯 +怵 +恁 +恂 +恃 +恆 +恊 +恍 +恐 +恕 +恙 +恢 +恣 +恤 +恥 +恨 +恩 +恪 +恬 +恭 +息 +恰 +恵 +恿 +悄 +悅 +悆 +悉 +悌 +悍 +悔 +悖 +悚 +悛 +悝 +悞 +悟 +悠 +患 +悧 +您 +悪 +悰 +悲 +悳 +悵 +悶 +悸 +悼 +情 +惆 +惇 +惑 +惔 +惕 +惘 +惚 +惜 +惟 +惠 +惡 +惣 +惦 +惰 +惱 +惲 +想 +惶 +惹 +惺 +愁 +愃 +愆 +愈 +愉 +愍 +意 +愐 +愒 +愔 +愕 +愚 +愛 +愜 +感 +愣 +愧 +愨 +愫 +愭 +愴 
+愷 +愼 +愾 +愿 +慄 +慈 +態 +慌 +慎 +慕 +慘 +慚 +慜 +慟 +慢 +慣 +慥 +慧 +慨 +慮 +慰 +慳 +慵 +慶 +慷 +慾 +憂 +憊 +憋 +憍 +憎 +憐 +憑 +憓 +憕 +憙 +憚 +憤 +憧 +憨 +憩 +憫 +憬 +憲 +憶 +憺 +憻 +憾 +懂 +懃 +懇 +懈 +應 +懋 +懌 +懍 +懐 +懣 +懦 +懮 +懲 +懵 +懶 +懷 +懸 +懺 +懼 +懽 +懾 +懿 +戀 +戇 +戈 +戊 +戌 +戍 +戎 +成 +我 +戒 +戔 +戕 +或 +戙 +戚 +戛 +戟 +戡 +戢 +戥 +戦 +戩 +截 +戮 +戰 +戱 +戲 +戳 +戴 +戶 +戸 +戻 +戽 +戾 +房 +所 +扁 +扆 +扇 +扈 +扉 +手 +扌 +才 +扎 +扒 +打 +扔 +托 +扙 +扛 +扞 +扣 +扥 +扦 +扭 +扮 +扯 +扳 +扶 +批 +扼 +找 +承 +技 +抃 +抄 +抇 +抉 +把 +抑 +抒 +抓 +投 +抖 +抗 +折 +抦 +披 +抬 +抱 +抵 +抹 +抻 +押 +抽 +抿 +拂 +拆 +拇 +拈 +拉 +拋 +拌 +拍 +拎 +拏 +拐 +拒 +拓 +拔 +拖 +拗 +拘 +拙 +拚 +招 +拜 +拝 +拡 +括 +拭 +拮 +拯 +拱 +拳 +拴 +拷 +拺 +拼 +拽 +拾 +拿 +持 +指 +按 +挎 +挑 +挖 +挙 +挨 +挪 +挫 +振 +挲 +挵 +挹 +挺 +挻 +挾 +捂 +捆 +捉 +捌 +捍 +捎 +捏 +捐 +捒 +捕 +捜 +捦 +捧 +捨 +捩 +捫 +捭 +捱 +捲 +捶 +捷 +捺 +捻 +掀 +掂 +掃 +掄 +掇 +授 +掉 +掌 +掏 +掐 +排 +掖 +掘 +掙 +掛 +掞 +掟 +掠 +採 +探 +掣 +接 +控 +推 +掩 +措 +掬 +掰 +掾 +揀 +揄 +揆 +揉 +揍 +描 +提 +插 +揔 +揖 +揚 +換 +握 +揪 +揭 +揮 +援 +揸 +揺 +損 +搏 +搐 +搓 +搔 +搖 +搗 +搜 +搞 +搠 +搢 +搪 +搬 +搭 +搳 +搴 +搵 +搶 +搽 +搾 +摂 +摒 +摔 +摘 +摜 +摞 +摟 +摠 +摧 +摩 +摭 +摯 +摳 +摴 +摵 +摶 +摸 +摹 +摺 +摻 +摽 +撃 +撇 +撈 +撐 +撒 +撓 +撕 +撖 +撙 +撚 +撞 +撣 +撤 +撥 +撩 +撫 +撬 +播 +撮 +撰 +撲 +撳 +撻 +撼 +撾 +撿 +擀 +擁 +擂 +擅 +擇 +擊 +擋 +操 +擎 +擒 +擔 +擘 +據 +擠 +擢 +擥 +擦 +擬 +擯 +擰 +擱 +擲 +擴 +擷 +擺 +擼 +擾 +攀 +攏 +攔 +攖 +攘 +攜 +攝 +攞 +攢 +攣 +攤 +攪 +攫 +攬 +支 +攴 +攵 +收 +攷 +攸 +改 +攻 +攽 +放 +政 +故 +效 +敍 +敎 +敏 +救 +敔 +敕 +敖 +敗 +敘 +教 +敝 +敞 +敟 +敢 +散 +敦 +敫 +敬 +敭 +敲 +整 +敵 +敷 +數 +敻 +敾 +斂 +斃 +文 +斌 +斎 +斐 +斑 +斕 +斖 +斗 +料 +斛 +斜 +斝 +斟 +斡 +斤 +斥 +斧 +斬 +斯 +新 +斷 +方 +於 +施 +斿 +旁 +旂 +旃 +旄 +旅 +旉 +旋 +旌 +旎 +族 +旖 +旗 +旙 +旛 +旡 +既 +日 +旦 +旨 +早 +旬 +旭 +旱 +旲 +旳 +旺 +旻 +旼 +旽 +旾 +旿 +昀 +昂 +昃 +昆 +昇 +昉 +昊 +昌 +昍 +明 +昏 +昐 +易 +昔 +昕 +昚 +昛 +昜 +昝 +昞 +星 +映 +昡 +昣 +昤 +春 +昧 +昨 +昪 +昫 +昭 +是 +昰 +昱 +昴 +昵 +昶 +昺 +晁 +時 +晃 +晈 +晉 +晊 +晏 +晗 +晙 +晚 +晛 +晝 +晞 +晟 +晤 +晦 +晧 +晨 +晩 +晪 +晫 +晭 +普 +景 +晰 +晳 +晴 +晶 +晷 +晸 +智 +晾 +暃 +暄 +暅 +暇 +暈 +暉 +暊 +暌 +暎 +暏 +暐 +暑 +暕 +暖 +暗 +暘 +暝 +暟 +暠 +暢 +暦 +暨 +暫 +暮 +暱 +暲 +暴 +暸 +暹 +暻 +暾 +曄 +曅 +曆 +曇 +曉 +曌 +曔 +曖 +曙 +曜 +曝 +曠 +曦 +曧 +曨 +曩 +曬 +曮 +曰 +曲 +曳 +更 +曶 +曷 +書 +曹 +曺 +曼 +曽 +曾 +替 +最 +會 +月 +有 +朊 +朋 +服 +朏 +朐 +朓 +朔 +朕 +朖 +朗 +望 +朝 +期 +朦 +朧 +木 +未 +末 +本 +札 +朱 +朴 +朵 +朶 +朽 +朿 +杁 +杉 +杋 +杌 +李 +杏 +材 +村 +杓 +杖 +杙 +杜 +杞 +束 +杠 +杣 +杤 +杧 +杬 +杭 +杯 +東 +杲 +杳 +杴 +杵 +杷 +杻 +杼 +松 +板 +极 +枇 +枉 +枋 +枏 +析 +枕 +枖 +林 +枚 +枛 +果 +枝 +枠 +枡 +枯 +枰 +枱 +枲 +枳 +架 +枷 +枸 +枹 +枼 +柁 +柃 +柄 +柉 +柊 +柎 +柏 +某 +柑 +柒 +染 +柔 +柘 +柚 +柜 +柝 +柞 +柟 +查 +柩 +柬 +柯 +柰 +柱 +柳 +柴 +柵 +柶 +柷 +査 +柾 +柿 +栃 +栄 +栐 +栒 +栓 +栜 +栝 +栞 +校 +栢 +栨 +栩 +株 +栲 +栴 +核 +根 +栻 +格 +栽 +桀 +桁 +桂 +桃 +桄 +桅 +框 +案 +桉 +桌 +桎 +桐 +桑 +桓 +桔 +桕 +桖 +桙 +桜 +桝 +桫 +桱 +桲 +桴 +桶 +桷 +桼 +桿 +梀 +梁 +梂 +梃 +梅 +梆 +梉 +梏 +梓 +梔 +梗 +梘 +條 +梟 +梠 +梢 +梣 +梧 +梨 +梫 +梭 +梯 +械 +梱 +梳 +梵 +梶 +梽 +棄 +棆 +棉 +棋 +棍 +棐 +棒 +棓 +棕 +棖 +棗 +棘 +棚 +棛 +棟 +棠 +棡 +棣 +棧 +棨 +棩 +棪 +棫 +森 +棱 +棲 +棵 +棶 +棹 +棺 +棻 +棼 +棽 +椅 +椆 +椇 +椋 +植 +椎 +椏 +椒 +椙 +椥 +椪 +椰 +椲 +椴 +椵 +椹 +椽 +椿 +楂 +楊 +楓 +楔 +楗 +楙 +楚 +楝 +楞 +楠 +楡 +楢 +楣 +楤 +楦 +楧 +楨 +楫 +業 +楮 +楯 +楳 +極 +楷 +楸 +楹 +楽 +楿 +概 +榆 +榊 +榍 +榎 +榑 +榔 +榕 +榖 +榗 +榘 +榛 +榜 +榞 +榢 +榣 +榤 +榦 +榧 +榨 +榫 +榭 +榮 +榲 +榴 +榷 +榻 +榿 +槀 +槁 +槃 +槊 +構 +槌 +槍 +槎 +槐 +槓 +槔 +槗 +様 +槙 +槤 +槩 +槭 +槰 +槱 +槲 +槳 +槺 +槻 +槼 +槽 +槿 +樀 +樁 +樂 +樅 +樆 +樊 +樋 +樑 +樓 +樗 +樘 +標 +樞 +樟 +模 +樣 +樨 +権 +樫 +樵 +樸 +樹 +樺 +樻 +樽 +樾 +橄 +橇 +橈 +橋 +橐 +橒 +橓 +橘 +橙 +橚 +機 +橡 +橢 +橪 +橫 +橿 +檀 +檄 +檇 +檉 +檊 +檎 +檐 +檔 +檗 +檜 +檞 +檠 +檡 +檢 +檣 +檦 +檨 +檫 +檬 +檯 +檳 +檵 +檸 +檻 +檽 +櫂 +櫃 +櫆 +櫈 +櫓 +櫚 +櫛 +櫞 +櫟 +櫥 +櫨 +櫪 +櫱 +櫸 +櫻 +櫾 +櫿 +欄 +欉 +權 +欏 +欒 +欖 +欞 +欠 +次 +欣 +欥 +欲 +欸 +欹 +欺 +欽 +款 +歆 +歇 +歉 +歊 +歌 +歎 +歐 +歓 +歙 +歛 +歡 +止 +正 +此 +步 +武 +歧 +歩 +歪 +歲 +歳 +歴 +歷 +歸 +歹 +死 +歿 +殂 +殃 +殄 +殆 +殉 +殊 +殑 +殖 +殘 +殛 +殞 +殟 +殤 +殭 +殮 +殯 +殲 +殳 +段 +殷 +殺 +殻 +殼 +殿 +毀 +毅 +毆 +毉 +毋 +毌 +母 +毎 +每 +毐 +毒 +毓 +比 +毖 +毗 +毘 +毛 +毫 +毬 +毯 +毴 +毸 +毽 +毿 +氂 +氈 +氍 +氏 +氐 +民 +氓 +氖 +気 +氘 +氙 +氚 +氛 +氟 +氣 +氦 +氧 +氨 +氪 +氫 +氬 +氮 +氯 +氰 +水 +氵 +氷 +永 +氹 +氻 +氽 +氾 +汀 +汁 +求 +汊 +汎 +汐 +汕 +汗 +汛 +汜 +汝 +汞 +江 +池 +污 +汧 +汨 +汩 +汪 +汭 +汰 +汲 +汴 +汶 +決 +汽 +汾 +沁 +沂 +沃 +沄 +沅 
+沆 +沇 +沈 +沉 +沌 +沍 +沏 +沐 +沒 +沓 +沔 +沖 +沘 +沙 +沚 +沛 +沜 +沢 +沨 +沫 +沭 +沮 +沯 +沱 +河 +沸 +油 +沺 +治 +沼 +沽 +沾 +沿 +況 +泂 +泄 +泆 +泇 +泉 +泊 +泌 +泐 +泓 +泔 +法 +泖 +泗 +泚 +泛 +泠 +泡 +波 +泣 +泥 +泩 +泫 +泮 +泯 +泰 +泱 +泳 +泵 +洄 +洋 +洌 +洎 +洗 +洙 +洛 +洞 +洢 +洣 +洤 +津 +洨 +洩 +洪 +洮 +洱 +洲 +洳 +洵 +洸 +洹 +洺 +活 +洽 +派 +流 +浄 +浙 +浚 +浛 +浜 +浞 +浟 +浠 +浡 +浣 +浤 +浥 +浦 +浩 +浪 +浮 +浯 +浴 +浵 +海 +浸 +浹 +涅 +涇 +消 +涉 +涌 +涎 +涑 +涓 +涔 +涕 +涙 +涪 +涫 +涮 +涯 +液 +涵 +涸 +涼 +涿 +淄 +淅 +淆 +淇 +淋 +淌 +淍 +淎 +淏 +淑 +淓 +淖 +淘 +淙 +淚 +淛 +淝 +淞 +淠 +淡 +淤 +淥 +淦 +淨 +淩 +淪 +淫 +淬 +淮 +淯 +淰 +深 +淳 +淵 +淶 +混 +淸 +淹 +淺 +添 +淼 +淽 +渃 +清 +済 +渉 +渋 +渕 +渙 +渚 +減 +渝 +渟 +渠 +渡 +渣 +渤 +渥 +渦 +渫 +測 +渭 +港 +渲 +渴 +游 +渺 +渼 +渽 +渾 +湃 +湄 +湉 +湊 +湍 +湓 +湔 +湖 +湘 +湛 +湜 +湞 +湟 +湣 +湥 +湧 +湫 +湮 +湯 +湳 +湴 +湼 +満 +溁 +溇 +溈 +溉 +溋 +溎 +溏 +源 +準 +溙 +溜 +溝 +溟 +溢 +溥 +溦 +溧 +溪 +溫 +溯 +溱 +溲 +溴 +溵 +溶 +溺 +溼 +滀 +滁 +滂 +滄 +滅 +滇 +滈 +滉 +滋 +滌 +滎 +滏 +滑 +滓 +滔 +滕 +滘 +滙 +滝 +滬 +滯 +滲 +滴 +滷 +滸 +滹 +滻 +滽 +滾 +滿 +漁 +漂 +漆 +漇 +漈 +漎 +漏 +漓 +演 +漕 +漚 +漠 +漢 +漣 +漩 +漪 +漫 +漬 +漯 +漱 +漲 +漳 +漴 +漵 +漷 +漸 +漼 +漾 +漿 +潁 +潑 +潔 +潘 +潛 +潞 +潟 +潢 +潤 +潭 +潮 +潯 +潰 +潲 +潺 +潼 +潽 +潾 +潿 +澀 +澁 +澂 +澄 +澆 +澇 +澈 +澉 +澋 +澌 +澍 +澎 +澔 +澗 +澠 +澡 +澣 +澤 +澥 +澧 +澪 +澮 +澯 +澱 +澳 +澶 +澹 +澻 +激 +濁 +濂 +濃 +濉 +濊 +濋 +濕 +濘 +濙 +濛 +濞 +濟 +濠 +濡 +濤 +濫 +濬 +濮 +濯 +濰 +濱 +濲 +濶 +濺 +濼 +濾 +瀁 +瀅 +瀆 +瀉 +瀍 +瀏 +瀑 +瀔 +瀕 +瀘 +瀚 +瀛 +瀝 +瀞 +瀟 +瀠 +瀣 +瀦 +瀧 +瀨 +瀬 +瀰 +瀲 +瀴 +瀶 +瀹 +瀾 +灃 +灊 +灌 +灑 +灘 +灝 +灞 +灡 +灣 +灤 +灧 +火 +灰 +灴 +灸 +灼 +災 +炁 +炅 +炆 +炊 +炎 +炒 +炔 +炕 +炘 +炙 +炟 +炣 +炤 +炫 +炬 +炭 +炮 +炯 +炱 +炲 +炳 +炷 +炸 +為 +炻 +烈 +烉 +烊 +烋 +烏 +烒 +烔 +烘 +烙 +烜 +烝 +烤 +烯 +烱 +烴 +烷 +烹 +烺 +烽 +焃 +焄 +焉 +焊 +焌 +焓 +焗 +焙 +焚 +焜 +焞 +無 +焦 +焯 +焰 +焱 +焴 +然 +焻 +焼 +焿 +煇 +煉 +煊 +煌 +煎 +煐 +煒 +煔 +煕 +煖 +煙 +煚 +煜 +煞 +煠 +煤 +煥 +煦 +照 +煨 +煩 +煬 +煮 +煲 +煳 +煵 +煶 +煸 +煽 +熄 +熅 +熇 +熈 +熊 +熏 +熒 +熔 +熖 +熗 +熘 +熙 +熜 +熟 +熠 +熤 +熥 +熨 +熬 +熯 +熱 +熲 +熳 +熵 +熹 +熺 +熼 +熾 +熿 +燁 +燃 +燄 +燈 +燉 +燊 +燎 +燏 +燐 +燒 +燔 +燕 +燘 +燙 +燚 +燜 +燝 +營 +燥 +燦 +燧 +燫 +燬 +燭 +燮 +燴 +燹 +燻 +燼 +燾 +燿 +爀 +爆 +爌 +爍 +爐 +爔 +爚 +爛 +爝 +爨 +爪 +爬 +爭 +爯 +爰 +爲 +爵 +父 +爸 +爹 +爺 +爻 +爽 +爾 +爿 +牁 +牂 +牆 +片 +版 +牌 +牒 +牕 +牖 +牘 +牙 +牛 +牝 +牟 +牠 +牡 +牢 +牧 +物 +牯 +牲 +特 +牻 +牼 +牽 +犀 +犁 +犂 +犇 +犍 +犎 +犖 +犛 +犢 +犧 +犨 +犬 +犯 +犰 +犴 +犽 +狀 +狂 +狄 +狍 +狎 +狐 +狒 +狓 +狗 +狙 +狛 +狟 +狠 +狡 +狦 +狨 +狩 +狳 +狶 +狷 +狸 +狹 +狻 +狼 +猁 +猄 +猇 +猊 +猗 +猙 +猛 +猜 +猝 +猞 +猢 +猥 +猨 +猩 +猳 +猴 +猶 +猷 +猺 +猻 +猾 +猿 +獁 +獃 +獄 +獅 +獇 +獎 +獏 +獐 +獒 +獠 +獢 +獣 +獨 +獬 +獮 +獯 +獰 +獲 +獴 +獵 +獷 +獸 +獺 +獻 +獼 +獾 +玀 +玄 +玆 +率 +玉 +王 +玎 +玏 +玓 +玕 +玖 +玗 +玘 +玙 +玟 +玠 +玡 +玢 +玥 +玧 +玨 +玩 +玫 +玭 +玲 +玳 +玶 +玷 +玹 +玻 +玾 +珀 +珂 +珅 +珈 +珉 +珊 +珌 +珍 +珎 +珏 +珖 +珙 +珝 +珞 +珠 +珡 +珣 +珤 +珥 +珦 +珧 +珩 +珪 +班 +珮 +珵 +珹 +珺 +珽 +現 +琁 +球 +琄 +琅 +理 +琇 +琉 +琊 +琍 +琎 +琚 +琛 +琡 +琢 +琤 +琥 +琦 +琨 +琪 +琬 +琮 +琯 +琰 +琱 +琳 +琴 +琵 +琶 +琹 +琺 +琿 +瑀 +瑁 +瑂 +瑄 +瑅 +瑆 +瑈 +瑊 +瑋 +瑑 +瑒 +瑕 +瑗 +瑙 +瑚 +瑛 +瑜 +瑝 +瑞 +瑟 +瑠 +瑢 +瑣 +瑤 +瑥 +瑧 +瑨 +瑩 +瑪 +瑭 +瑯 +瑰 +瑱 +瑳 +瑴 +瑺 +瑾 +璀 +璁 +璃 +璄 +璆 +璇 +璈 +璉 +璋 +璌 +璐 +璕 +璘 +璙 +璚 +璜 +璞 +璟 +璠 +璡 +璣 +璥 +璦 +璧 +璨 +璩 +璪 +璫 +璬 +璮 +環 +璱 +璵 +璸 +璹 +璽 +璿 +瓈 +瓊 +瓌 +瓏 +瓑 +瓔 +瓖 +瓘 +瓚 +瓛 +瓜 +瓞 +瓠 +瓢 +瓣 +瓤 +瓦 +瓮 +瓴 +瓶 +瓷 +瓿 +甂 +甄 +甌 +甍 +甑 +甕 +甘 +甙 +甚 +甜 +生 +甡 +產 +産 +甥 +甦 +用 +甩 +甪 +甫 +甬 +甯 +田 +由 +甲 +申 +男 +甸 +甹 +町 +甾 +畀 +畇 +畈 +畊 +畋 +界 +畎 +畏 +畐 +畑 +畔 +留 +畜 +畝 +畠 +畢 +略 +畦 +畧 +番 +畫 +畬 +畯 +異 +畲 +畳 +畵 +當 +畷 +畸 +畹 +畿 +疃 +疆 +疇 +疊 +疋 +疌 +疍 +疏 +疑 +疒 +疕 +疙 +疚 +疝 +疣 +疤 +疥 +疫 +疲 +疳 +疵 +疸 +疹 +疼 +疽 +疾 +痂 +病 +症 +痊 +痍 +痔 +痕 +痘 +痙 +痛 +痞 +痟 +痠 +痢 +痣 +痤 +痧 +痩 +痰 +痱 +痲 +痴 +痹 +痺 +痿 +瘀 +瘁 +瘊 +瘋 +瘍 +瘓 +瘙 +瘜 +瘞 +瘟 +瘠 +瘡 +瘢 +瘤 +瘦 +瘧 +瘩 +瘰 +瘴 +瘺 +癀 +療 +癆 +癇 +癌 +癒 +癖 +癘 +癜 +癟 +癡 +癢 +癤 +癥 +癩 +癬 +癭 +癮 +癯 +癰 +癱 +癲 +癸 +発 +登 +發 +白 +百 +皂 +的 +皆 +皇 +皈 +皋 +皎 +皐 +皓 +皖 +皙 +皚 +皛 +皝 +皞 +皮 +皰 +皴 +皷 +皸 +皺 +皿 +盂 +盃 +盅 +盆 +盈 +益 +盋 +盌 +盎 +盒 +盔 +盛 +盜 +盞 +盟 +盡 +監 +盤 +盥 +盦 +盧 +盨 +盩 +盪 +盫 +目 +盯 +盱 +盲 +直 +盷 +相 +盹 +盺 +盼 +盾 +眀 +省 +眉 +看 +県 +眙 +眛 +眜 +眞 +真 +眠 +眥 +眨 +眩 +眭 +眯 +眵 +眶 +眷 +眸 +眺 +眼 +眾 +着 +睇 +睛 +睜 +睞 +睡 +睢 +督 +睥 +睦 +睨 +睪 +睫 +睭 +睹 +睺 +睽 +睾 +睿 +瞄 +瞅 +瞋 +瞌 +瞎 +瞑 +瞓 +瞞 +瞢 +瞥 +瞧 +瞪 +瞫 +瞬 +瞭 
+瞰 +瞳 +瞻 +瞼 +瞽 +瞿 +矇 +矍 +矗 +矚 +矛 +矜 +矞 +矢 +矣 +知 +矧 +矩 +短 +矮 +矯 +石 +矸 +矽 +砂 +砋 +砌 +砍 +砒 +研 +砝 +砢 +砥 +砦 +砧 +砩 +砫 +砭 +砮 +砯 +砰 +砲 +砳 +破 +砵 +砷 +砸 +砼 +硂 +硃 +硅 +硇 +硏 +硐 +硒 +硓 +硚 +硜 +硝 +硤 +硨 +硫 +硬 +硭 +硯 +硼 +碁 +碇 +碉 +碌 +碎 +碑 +碓 +碕 +碗 +碘 +碚 +碟 +碡 +碣 +碧 +碩 +碪 +碭 +碰 +碲 +碳 +碴 +碶 +碸 +確 +碻 +碼 +碽 +碾 +磁 +磅 +磊 +磋 +磐 +磔 +磕 +磘 +磙 +磚 +磜 +磡 +磨 +磪 +磬 +磯 +磱 +磲 +磵 +磷 +磺 +磻 +磾 +礁 +礄 +礎 +礐 +礑 +礒 +礙 +礠 +礦 +礪 +礫 +礬 +礮 +礱 +礴 +示 +礻 +礽 +社 +祀 +祁 +祂 +祆 +祇 +祈 +祉 +祋 +祏 +祐 +祓 +祕 +祖 +祗 +祙 +祚 +祛 +祜 +祝 +神 +祟 +祠 +祥 +祧 +票 +祭 +祹 +祺 +祼 +祿 +禁 +禃 +禇 +禍 +禎 +福 +禑 +禓 +禔 +禕 +禘 +禛 +禟 +禠 +禤 +禦 +禧 +禨 +禩 +禪 +禮 +禰 +禱 +禵 +禹 +禺 +禼 +禽 +禾 +禿 +秀 +私 +秈 +秉 +秋 +科 +秒 +秕 +秘 +租 +秠 +秣 +秤 +秦 +秧 +秩 +秭 +秳 +秸 +移 +稀 +稅 +稈 +稉 +程 +稍 +稑 +稔 +稗 +稘 +稙 +稚 +稜 +稞 +稟 +稠 +種 +稱 +稲 +稷 +稹 +稺 +稻 +稼 +稽 +稾 +稿 +穀 +穂 +穆 +穈 +穉 +穌 +積 +穎 +穗 +穟 +穠 +穡 +穢 +穣 +穩 +穫 +穰 +穴 +穵 +究 +穹 +空 +穿 +突 +窄 +窅 +窈 +窋 +窒 +窕 +窖 +窗 +窘 +窟 +窠 +窣 +窨 +窩 +窪 +窮 +窯 +窰 +窶 +窺 +窿 +竄 +竅 +竇 +竈 +竊 +立 +竑 +站 +竜 +竟 +章 +竣 +童 +竦 +竩 +竭 +端 +競 +竹 +竺 +竻 +竿 +笄 +笆 +笈 +笏 +笑 +笘 +笙 +笛 +笞 +笠 +笥 +符 +笨 +笩 +笪 +第 +笭 +笮 +笯 +笱 +笳 +笹 +筅 +筆 +等 +筊 +筋 +筌 +筍 +筏 +筐 +筒 +答 +策 +筘 +筠 +筥 +筦 +筧 +筬 +筭 +筱 +筲 +筳 +筵 +筶 +筷 +筻 +箆 +箇 +箋 +箍 +箏 +箐 +箑 +箒 +箔 +箕 +算 +箜 +管 +箬 +箭 +箱 +箴 +箸 +節 +篁 +範 +篆 +篇 +築 +篊 +篋 +篌 +篔 +篙 +篝 +篠 +篡 +篤 +篥 +篦 +篩 +篪 +篭 +篯 +篳 +篷 +簀 +簃 +簇 +簉 +簋 +簍 +簑 +簕 +簗 +簞 +簠 +簡 +簧 +簪 +簫 +簷 +簸 +簹 +簺 +簽 +簾 +簿 +籀 +籃 +籌 +籍 +籐 +籙 +籛 +籜 +籝 +籟 +籠 +籣 +籤 +籥 +籪 +籬 +籮 +籲 +米 +籽 +籾 +粄 +粉 +粍 +粑 +粒 +粕 +粗 +粘 +粟 +粢 +粥 +粦 +粧 +粩 +粱 +粲 +粳 +粵 +粹 +粼 +粽 +精 +粿 +糀 +糅 +糊 +糌 +糍 +糎 +糕 +糖 +糙 +糜 +糝 +糞 +糟 +糠 +糢 +糧 +糬 +糯 +糰 +糴 +糶 +糸 +糹 +糺 +系 +糾 +紀 +紂 +約 +紅 +紆 +紇 +紈 +紉 +紊 +紋 +納 +紐 +紑 +紓 +純 +紕 +紗 +紘 +紙 +級 +紛 +紜 +紝 +紞 +素 +紡 +索 +紫 +紮 +累 +細 +紱 +紲 +紳 +紵 +紹 +紺 +紿 +終 +絃 +組 +絆 +経 +絎 +結 +絕 +絛 +絜 +絞 +絡 +絢 +給 +絨 +絪 +絮 +統 +絲 +絳 +絵 +絶 +絹 +絺 +綁 +綃 +綈 +綉 +綎 +綏 +經 +綖 +継 +続 +綜 +綝 +綞 +綠 +綢 +綣 +綦 +綧 +綫 +綬 +維 +綮 +綰 +綱 +網 +綳 +綴 +綸 +綺 +綻 +綽 +綾 +綿 +緁 +緃 +緄 +緈 +緊 +緋 +総 +緑 +緒 +緖 +緘 +線 +緜 +緝 +緞 +締 +緡 +緣 +緤 +編 +緩 +緬 +緯 +緱 +緲 +練 +緹 +緻 +縂 +縄 +縈 +縉 +縊 +縕 +縛 +縝 +縞 +縠 +縡 +縣 +縤 +縫 +縮 +縯 +縱 +縴 +縵 +縷 +縹 +縻 +總 +績 +繁 +繃 +繆 +繇 +繒 +織 +繕 +繖 +繙 +繚 +繞 +繡 +繩 +繪 +繫 +繭 +繰 +繳 +繹 +繻 +繼 +繽 +繾 +纁 +纂 +纈 +續 +纍 +纏 +纓 +纔 +纕 +纖 +纘 +纛 +纜 +缐 +缶 +缸 +缺 +缽 +罃 +罄 +罅 +罈 +罉 +罌 +罍 +罐 +罔 +罕 +罘 +罟 +罡 +罨 +罩 +罪 +置 +罰 +罱 +署 +罳 +罵 +罶 +罷 +罹 +罽 +羂 +羅 +羆 +羈 +羊 +羋 +羌 +美 +羔 +羕 +羗 +羙 +羚 +羞 +羡 +羣 +群 +羥 +羧 +羨 +義 +羯 +羰 +羱 +羲 +羸 +羹 +羽 +羿 +翀 +翁 +翂 +翃 +翅 +翊 +翌 +翎 +翏 +習 +翔 +翕 +翙 +翜 +翟 +翠 +翡 +翥 +翦 +翩 +翬 +翮 +翰 +翱 +翳 +翹 +翻 +翼 +耀 +老 +考 +耄 +者 +耆 +而 +耍 +耎 +耐 +耑 +耒 +耔 +耕 +耗 +耘 +耙 +耜 +耦 +耨 +耬 +耳 +耵 +耶 +耷 +耽 +耿 +聃 +聆 +聊 +聒 +聖 +聘 +聚 +聞 +聟 +聨 +聯 +聰 +聱 +聲 +聳 +聴 +聶 +職 +聽 +聾 +聿 +肄 +肅 +肆 +肇 +肉 +肋 +肌 +肏 +肖 +肘 +肚 +肛 +肜 +肝 +肟 +股 +肢 +肥 +肩 +肪 +肫 +肯 +肱 +育 +肸 +肹 +肺 +肼 +肽 +胂 +胃 +胄 +胅 +胇 +胊 +背 +胍 +胎 +胖 +胗 +胙 +胚 +胛 +胝 +胞 +胡 +胤 +胥 +胬 +胭 +胰 +胱 +胳 +胴 +胸 +胺 +胼 +能 +脂 +脅 +脆 +脇 +脈 +脊 +脒 +脖 +脘 +脛 +脣 +脩 +脫 +脬 +脭 +脯 +脲 +脳 +脷 +脹 +脾 +腆 +腈 +腊 +腋 +腌 +腎 +腐 +腑 +腓 +腔 +腕 +腥 +腦 +腧 +腩 +腫 +腮 +腰 +腱 +腳 +腴 +腸 +腹 +腺 +腿 +膀 +膂 +膈 +膊 +膏 +膚 +膛 +膜 +膝 +膠 +膣 +膥 +膦 +膨 +膩 +膮 +膳 +膺 +膽 +膾 +膿 +臀 +臂 +臃 +臆 +臉 +臊 +臍 +臏 +臘 +臚 +臞 +臟 +臠 +臣 +臧 +臨 +自 +臭 +臯 +至 +致 +臺 +臻 +臼 +臾 +舂 +舅 +與 +興 +舉 +舊 +舌 +舍 +舎 +舒 +舔 +舖 +舘 +舛 +舜 +舞 +舟 +舢 +舥 +舨 +舩 +航 +舫 +般 +舲 +舵 +舶 +舷 +舸 +船 +舺 +艅 +艇 +艉 +艋 +艎 +艏 +艔 +艘 +艙 +艚 +艦 +艮 +良 +艱 +色 +艶 +艷 +艸 +艽 +艾 +艿 +芃 +芊 +芋 +芍 +芎 +芑 +芒 +芘 +芙 +芛 +芝 +芡 +芥 +芨 +芩 +芪 +芫 +芬 +芭 +芮 +芯 +花 +芳 +芴 +芷 +芸 +芹 +芻 +芽 +芾 +苄 +苅 +苑 +苒 +苓 +苔 +苕 +苗 +苛 +苜 +苝 +苞 +苟 +苡 +苣 +苤 +若 +苦 +苧 +苪 +苫 +苯 +英 +苳 +苴 +苷 +苺 +苻 +苼 +苾 +茀 +茁 +茂 +范 +茄 +茅 +茆 +茇 +茈 +茉 +茌 +茗 +茘 +茚 +茛 +茜 +茝 +茨 +茫 +茬 +茭 +茮 +茯 +茱 +茲 +茴 +茵 +茶 +茷 +茸 +茹 +茺 +茼 +荀 +荃 +荅 +荇 +草 +荊 +荎 +荏 +荒 +荔 +荖 +荘 +荳 +荷 +荸 +荻 +荼 +荽 +莆 +莉 +莊 +莎 +莒 +莓 +莕 +莖 +莘 +莙 +莛 +莜 +莞 +莠 +莢 +莧 +莨 +莩 +莪 +莫 +莽 +莿 +菀 +菁 +菅 +菇 +菈 +菉 +菊 +菌 +菍 +菏 +菑 +菓 +菔 +菖 +菘 +菜 +菝 +菟 +菠 +菡 +菥 +菩 +菪 +菫 +華 +菰 +菱 +菲 +菴 +菶 +菸 +菹 +菺 +菼 +菽 +菾 +萁 +萃 +萄 
+萇 +萊 +萌 +萍 +萎 +萐 +萘 +萜 +萠 +萡 +萣 +萩 +萬 +萭 +萱 +萵 +萸 +萹 +萼 +落 +葃 +葆 +葉 +葊 +葎 +葑 +葒 +著 +葙 +葚 +葛 +葜 +葝 +葡 +董 +葦 +葩 +葫 +葬 +葭 +葯 +葰 +葳 +葵 +葶 +葷 +葺 +蒂 +蒄 +蒍 +蒎 +蒐 +蒓 +蒔 +蒗 +蒙 +蒜 +蒞 +蒟 +蒡 +蒢 +蒤 +蒧 +蒨 +蒭 +蒯 +蒲 +蒴 +蒸 +蒹 +蒺 +蒻 +蒼 +蒽 +蒾 +蒿 +蓀 +蓁 +蓂 +蓄 +蓆 +蓉 +蓋 +蓍 +蓑 +蓓 +蓖 +蓘 +蓚 +蓧 +蓨 +蓪 +蓬 +蓭 +蓮 +蓯 +蓳 +蓼 +蓽 +蓿 +蔆 +蔎 +蔑 +蔓 +蔔 +蔕 +蔗 +蔘 +蔚 +蔝 +蔞 +蔡 +蔣 +蔥 +蔦 +蔬 +蔭 +蔴 +蔵 +蔻 +蔽 +蕁 +蕃 +蕅 +蕈 +蕉 +蕊 +蕎 +蕑 +蕒 +蕖 +蕘 +蕙 +蕚 +蕟 +蕡 +蕢 +蕤 +蕨 +蕩 +蕪 +蕭 +蕷 +蕹 +蕺 +蕻 +蕾 +薀 +薄 +薆 +薇 +薈 +薊 +薌 +薏 +薐 +薑 +薔 +薗 +薘 +薙 +薛 +薜 +薞 +薟 +薡 +薦 +薨 +薩 +薪 +薫 +薬 +薯 +薰 +薲 +薷 +薸 +薹 +薺 +薾 +薿 +藁 +藉 +藍 +藎 +藏 +藐 +藔 +藕 +藜 +藝 +藟 +藤 +藥 +藦 +藨 +藩 +藪 +藶 +藸 +藹 +藺 +藻 +藿 +蘂 +蘄 +蘅 +蘆 +蘇 +蘊 +蘋 +蘐 +蘑 +蘓 +蘗 +蘘 +蘚 +蘞 +蘢 +蘧 +蘩 +蘭 +蘵 +蘶 +蘸 +蘼 +蘿 +虉 +虎 +虐 +虓 +虔 +處 +虖 +虛 +虜 +虞 +號 +虢 +虧 +虨 +虯 +虱 +虵 +虹 +虺 +虻 +蚆 +蚊 +蚋 +蚌 +蚍 +蚓 +蚖 +蚜 +蚝 +蚡 +蚢 +蚣 +蚤 +蚧 +蚨 +蚩 +蚪 +蚯 +蚱 +蚴 +蚵 +蚶 +蚺 +蚼 +蛀 +蛄 +蛇 +蛉 +蛋 +蛍 +蛐 +蛑 +蛔 +蛙 +蛛 +蛞 +蛟 +蛤 +蛭 +蛯 +蛸 +蛹 +蛺 +蛻 +蛾 +蜀 +蜂 +蜃 +蜆 +蜇 +蜈 +蜉 +蜊 +蜍 +蜑 +蜒 +蜓 +蜘 +蜚 +蜛 +蜜 +蜞 +蜢 +蜣 +蜥 +蜨 +蜮 +蜯 +蜱 +蜴 +蜷 +蜻 +蜾 +蜿 +蝀 +蝌 +蝍 +蝎 +蝓 +蝕 +蝗 +蝘 +蝙 +蝚 +蝟 +蝠 +蝣 +蝤 +蝦 +蝨 +蝮 +蝯 +蝰 +蝲 +蝴 +蝶 +蝸 +蝽 +螂 +螃 +螄 +螅 +螈 +螋 +融 +螐 +螔 +螞 +螟 +螠 +螢 +螣 +螥 +螫 +螭 +螯 +螳 +螶 +螺 +螻 +螽 +螾 +蟀 +蟄 +蟅 +蟆 +蟊 +蟋 +蟌 +蟎 +蟑 +蟒 +蟜 +蟠 +蟥 +蟪 +蟫 +蟬 +蟯 +蟲 +蟳 +蟴 +蟶 +蟹 +蟻 +蟾 +蠂 +蠃 +蠄 +蠅 +蠆 +蠊 +蠋 +蠍 +蠐 +蠑 +蠓 +蠔 +蠕 +蠖 +蠘 +蠙 +蠟 +蠡 +蠢 +蠣 +蠱 +蠲 +蠵 +蠶 +蠷 +蠹 +蠻 +血 +衂 +衆 +行 +衍 +衎 +術 +衕 +衖 +街 +衙 +衚 +衛 +衜 +衝 +衞 +衡 +衢 +衣 +表 +衩 +衫 +衰 +衲 +衷 +衽 +衾 +衿 +袁 +袂 +袈 +袋 +袍 +袓 +袖 +袛 +袞 +袤 +袪 +被 +袱 +袴 +袾 +裁 +裂 +裊 +裎 +裒 +裔 +裕 +裖 +裘 +裙 +補 +裝 +裟 +裡 +裨 +裬 +裱 +裳 +裴 +裵 +裸 +裹 +製 +裾 +裿 +褀 +褂 +複 +褌 +褍 +褎 +褐 +褒 +褓 +褔 +褘 +褙 +褚 +褞 +褥 +褧 +褪 +褫 +褭 +褲 +褶 +褸 +褻 +襄 +襌 +襖 +襞 +襟 +襠 +襤 +襦 +襪 +襯 +襲 +襴 +襶 +襻 +襾 +西 +要 +覃 +覆 +覇 +覈 +見 +覌 +規 +覓 +視 +覚 +覡 +覦 +覧 +親 +覬 +覲 +観 +覺 +覽 +覿 +觀 +角 +觔 +觙 +觚 +觜 +解 +觭 +觱 +觴 +觶 +觸 +觿 +言 +訁 +訂 +訃 +訇 +計 +訊 +訌 +討 +訏 +訐 +訒 +訓 +訔 +訕 +訖 +託 +記 +訛 +訝 +訟 +訣 +訥 +訪 +設 +許 +訴 +訶 +診 +註 +証 +訾 +詁 +詆 +詈 +詐 +詒 +詔 +評 +詛 +詞 +詠 +詡 +詢 +詣 +詥 +試 +詧 +詩 +詫 +詭 +詮 +詰 +話 +該 +詳 +詵 +詹 +詼 +誄 +誅 +誇 +誌 +認 +誒 +誓 +誕 +誘 +語 +誠 +誡 +誣 +誤 +誥 +誦 +誨 +說 +説 +読 +誰 +課 +誴 +誹 +誼 +誾 +調 +談 +請 +諍 +諏 +諒 +論 +諗 +諜 +諟 +諠 +諡 +諤 +諦 +諧 +諪 +諫 +諭 +諮 +諱 +諲 +諳 +諴 +諶 +諷 +諸 +諺 +諼 +諾 +謀 +謁 +謂 +謄 +謇 +謊 +謌 +謎 +謏 +謐 +謔 +謖 +謗 +謙 +謚 +講 +謜 +謝 +謠 +謢 +謤 +謨 +謩 +謫 +謬 +謳 +謹 +謾 +證 +譏 +譓 +譔 +識 +譙 +譚 +譜 +譞 +警 +譫 +譬 +譭 +譯 +議 +譲 +譳 +譴 +護 +譽 +譿 +讀 +讃 +變 +讌 +讎 +讓 +讖 +讙 +讚 +讜 +讞 +谷 +谿 +豁 +豆 +豇 +豈 +豉 +豊 +豌 +豎 +豐 +豔 +豕 +豚 +象 +豢 +豨 +豪 +豫 +豬 +豳 +豸 +豹 +豺 +豿 +貂 +貅 +貉 +貊 +貌 +貐 +貒 +貓 +貔 +貘 +貝 +貞 +負 +財 +貢 +貤 +貧 +貨 +販 +貪 +貫 +責 +貭 +貮 +貯 +貲 +貳 +貴 +貶 +買 +貸 +貺 +費 +貼 +貽 +貿 +賀 +賁 +賂 +賃 +賄 +資 +賈 +賊 +賑 +賒 +賓 +賔 +賕 +賚 +賜 +賞 +賠 +賡 +賢 +賣 +賤 +賦 +賨 +質 +賬 +賭 +賴 +賹 +賺 +賻 +購 +賽 +賾 +贄 +贅 +贇 +贈 +贊 +贌 +贍 +贏 +贓 +贔 +贖 +贛 +赤 +赦 +赧 +赫 +赬 +赭 +走 +赳 +赴 +起 +趁 +超 +越 +趐 +趕 +趖 +趙 +趟 +趣 +趨 +足 +趴 +趵 +趺 +趼 +趾 +跅 +跆 +跋 +跌 +跏 +跑 +跖 +跗 +跛 +距 +跟 +跡 +跣 +跤 +跨 +跩 +跪 +路 +跳 +踎 +踏 +踐 +踝 +踞 +踢 +踩 +踰 +踴 +踹 +踺 +蹂 +蹄 +蹇 +蹈 +蹉 +蹊 +蹋 +蹕 +蹙 +蹟 +蹠 +蹤 +蹦 +蹬 +蹭 +蹯 +蹲 +蹴 +蹶 +蹺 +蹻 +蹼 +躁 +躂 +躄 +躉 +躋 +躍 +躑 +躒 +躔 +躝 +躪 +身 +躬 +躰 +躲 +躺 +軀 +車 +軋 +軌 +軍 +軎 +軒 +軔 +軛 +軟 +転 +軫 +軲 +軸 +軹 +軺 +軻 +軼 +軽 +軾 +較 +輄 +輅 +載 +輋 +輒 +輓 +輔 +輕 +輛 +輝 +輞 +輟 +輥 +輦 +輩 +輪 +輬 +輭 +輯 +輶 +輸 +輻 +輾 +輿 +轀 +轂 +轄 +轅 +轆 +轉 +轍 +轎 +轘 +轝 +轟 +轤 +辛 +辜 +辟 +辣 +辦 +辧 +辨 +辭 +辮 +辯 +辰 +辱 +農 +辵 +辺 +辻 +込 +迂 +迄 +迅 +迎 +近 +返 +迢 +迤 +迥 +迦 +迪 +迫 +迭 +迮 +述 +迴 +迵 +迷 +迸 +迺 +追 +退 +送 +逃 +逄 +逅 +逆 +逈 +逋 +逌 +逍 +逎 +透 +逐 +逑 +途 +逕 +逖 +逗 +這 +通 +逛 +逝 +逞 +速 +造 +逢 +連 +逤 +逨 +逮 +逯 +進 +逴 +逵 +逸 +逹 +逺 +逼 +逾 +遁 +遂 +遄 +遇 +遊 +運 +遍 +過 +遏 +遐 +遒 +道 +達 +違 +遘 +遙 +遛 +遜 +遞 +遠 +遢 +遣 +遨 +適 +遭 +遮 +遯 +遲 +遴 +遵 +遶 +遷 +選 +遹 +遺 +遼 +避 +邀 +邁 +邂 +邃 +還 +邇 +邈 +邉 +邊 +邋 +邏 +邑 +邕 +邗 +邙 +邛 +邠 +邡 +邢 +那 +邦 +邨 +邪 +邯 +邰 +邱 +邲 +邳 +邴 +邵 +邸 +邽 +邾 +郁 +郃 +郄 +郅 +郇 +郊 +郋 +郎 +郗 +郛 +郜 +郝 +郞 +郟 +郡 +郢 +郤 +部 +郪 +郫 +郭 +郯 +郳 +郴 +郵 +郷 +都 +郾 +郿 +鄂 +鄃 +鄄 +鄆 +鄉 +鄋 +鄑 +鄒 +鄔 +鄖 +鄗 +鄘 +鄙 +鄚 +鄜 +鄞 +鄠 
+鄢 +鄣 +鄤 +鄧 +鄩 +鄫 +鄭 +鄯 +鄰 +鄱 +鄲 +鄳 +鄴 +鄺 +酃 +酆 +酈 +酉 +酊 +酋 +酌 +配 +酎 +酏 +酐 +酒 +酔 +酗 +酚 +酞 +酡 +酢 +酣 +酥 +酩 +酪 +酬 +酮 +酯 +酰 +酴 +酵 +酶 +酷 +酸 +酺 +酼 +醁 +醂 +醃 +醅 +醇 +醉 +醋 +醌 +醍 +醐 +醒 +醚 +醛 +醜 +醞 +醢 +醣 +醪 +醫 +醬 +醮 +醯 +醴 +醺 +醾 +醿 +釀 +釁 +釆 +采 +釉 +釋 +里 +重 +野 +量 +釐 +金 +釒 +釓 +釔 +釕 +釗 +釘 +釙 +釚 +釜 +針 +釣 +釤 +釦 +釧 +釩 +釪 +釭 +釴 +釵 +釷 +釹 +釺 +鈀 +鈁 +鈄 +鈇 +鈈 +鈉 +鈊 +鈍 +鈏 +鈐 +鈑 +鈔 +鈕 +鈖 +鈞 +鈢 +鈣 +鈥 +鈦 +鈫 +鈮 +鈰 +鈳 +鈴 +鈷 +鈸 +鈹 +鈺 +鈾 +鈿 +鉀 +鉄 +鉅 +鉆 +鉈 +鉉 +鉋 +鉌 +鉍 +鉏 +鉑 +鉓 +鉗 +鉚 +鉛 +鉞 +鉟 +鉤 +鉦 +鉬 +鉭 +鉲 +鉶 +鉷 +鉸 +鉻 +鉾 +鉿 +銀 +銂 +銃 +銅 +銋 +銍 +銑 +銓 +銕 +銖 +銘 +銚 +銜 +銠 +銣 +銥 +銦 +銨 +銩 +銪 +銫 +銬 +銭 +銱 +銲 +銳 +銶 +銷 +銹 +銻 +銼 +銾 +鋁 +鋅 +鋆 +鋇 +鋌 +鋏 +鋐 +鋒 +鋕 +鋗 +鋙 +鋡 +鋤 +鋥 +鋦 +鋨 +鋪 +鋮 +鋯 +鋰 +鋱 +鋳 +鋶 +鋸 +鋹 +鋼 +錀 +錄 +錏 +錐 +錒 +錕 +錘 +錚 +錞 +錟 +錠 +錡 +錢 +錦 +錨 +錫 +錬 +錮 +錯 +錳 +錶 +錸 +錻 +鍀 +鍇 +鍈 +鍉 +鍊 +鍋 +鍍 +鍏 +鍔 +鍘 +鍛 +鍝 +鍟 +鍠 +鍥 +鍩 +鍬 +鍱 +鍳 +鍵 +鍶 +鍷 +鍺 +鍼 +鍾 +鎂 +鎅 +鎊 +鎌 +鎏 +鎓 +鎔 +鎖 +鎗 +鎘 +鎚 +鎛 +鎢 +鎣 +鎦 +鎧 +鎪 +鎬 +鎭 +鎮 +鎰 +鎳 +鎵 +鎻 +鏃 +鏇 +鏈 +鏊 +鏌 +鏐 +鏑 +鏓 +鏖 +鏗 +鏘 +鏜 +鏝 +鏞 +鏟 +鏡 +鏢 +鏤 +鏦 +鏳 +鏴 +鏵 +鏷 +鏻 +鏽 +鐃 +鐇 +鐈 +鐓 +鐔 +鐘 +鐙 +鐠 +鐡 +鐤 +鐦 +鐧 +鐫 +鐬 +鐭 +鐮 +鐲 +鐳 +鐵 +鐸 +鐺 +鐽 +鐿 +鑀 +鑁 +鑂 +鑄 +鑅 +鑊 +鑌 +鑑 +鑒 +鑛 +鑠 +鑣 +鑨 +鑪 +鑫 +鑭 +鑰 +鑲 +鑴 +鑷 +鑼 +鑽 +鑾 +鑿 +長 +門 +閂 +閃 +閆 +閉 +開 +閎 +閏 +閑 +閒 +間 +閔 +閘 +閜 +閞 +閟 +関 +閣 +閥 +閦 +閨 +閩 +閬 +閭 +閰 +閱 +閶 +閹 +閻 +閼 +閾 +閿 +闆 +闇 +闈 +闊 +闋 +闌 +闍 +闐 +闓 +闔 +闕 +闖 +闘 +關 +闞 +闡 +闢 +闥 +阜 +阝 +阡 +阪 +阭 +阮 +阯 +阱 +防 +阻 +阿 +陀 +陁 +陂 +附 +陋 +陌 +降 +限 +陔 +陘 +陛 +陜 +陝 +陞 +陟 +陡 +院 +陣 +除 +陪 +陬 +陰 +陲 +陳 +陵 +陶 +陷 +陸 +険 +陽 +隄 +隅 +隆 +隈 +隊 +隋 +隍 +階 +隔 +隕 +隗 +隘 +隙 +際 +障 +隣 +隧 +隨 +險 +隰 +隱 +隲 +隳 +隴 +隷 +隸 +隹 +隻 +隼 +雀 +雁 +雄 +雅 +集 +雇 +雉 +雋 +雌 +雍 +雎 +雑 +雒 +雕 +雖 +雙 +雛 +雜 +雝 +雞 +離 +難 +雨 +雩 +雪 +雫 +雯 +雱 +雲 +零 +雷 +雹 +電 +需 +霄 +霅 +霆 +震 +霈 +霉 +霊 +霍 +霎 +霏 +霑 +霓 +霖 +霙 +霜 +霞 +霤 +霧 +霨 +霰 +露 +霶 +霸 +霹 +霽 +霾 +靁 +靂 +靄 +靈 +靉 +靑 +青 +靖 +靚 +靛 +靜 +非 +靠 +靡 +面 +革 +靫 +靬 +靭 +靳 +靴 +靶 +靺 +靼 +鞅 +鞆 +鞋 +鞍 +鞏 +鞘 +鞞 +鞠 +鞣 +鞥 +鞦 +鞨 +鞭 +鞮 +鞴 +韁 +韃 +韆 +韋 +韌 +韑 +韓 +韙 +韜 +韞 +韠 +韡 +韭 +韮 +音 +韶 +韺 +韻 +韾 +響 +頁 +頂 +頃 +項 +順 +須 +頊 +頌 +頍 +頎 +頏 +預 +頑 +頒 +頓 +頔 +頗 +領 +頜 +頠 +頡 +頤 +頦 +頫 +頭 +頰 +頴 +頵 +頷 +頸 +頹 +頻 +頼 +顆 +題 +額 +顎 +顏 +顒 +顓 +顔 +顕 +顗 +願 +顙 +顛 +類 +顥 +顧 +顫 +顯 +顰 +顱 +顳 +顴 +風 +颮 +颯 +颱 +颶 +颺 +颼 +飄 +飆 +飈 +飛 +食 +飠 +飡 +飢 +飥 +飩 +飪 +飫 +飬 +飭 +飮 +飯 +飲 +飴 +飼 +飽 +飾 +餃 +餄 +餅 +餉 +養 +餌 +餎 +餐 +餒 +餓 +餗 +餘 +餚 +餛 +餞 +餠 +餡 +館 +餮 +餵 +餺 +餾 +餿 +饃 +饅 +饋 +饌 +饑 +饒 +饕 +饗 +饞 +饟 +饢 +首 +馗 +馘 +香 +馛 +馥 +馦 +馨 +馬 +馭 +馮 +馯 +馱 +馳 +馴 +馼 +駁 +駄 +駅 +駆 +駐 +駑 +駒 +駔 +駕 +駘 +駙 +駛 +駝 +駟 +駢 +駭 +駰 +駱 +駿 +騁 +騂 +騄 +騅 +騋 +騎 +騏 +験 +騖 +騙 +騤 +騨 +騫 +騭 +騮 +騰 +騶 +騷 +騾 +驁 +驃 +驄 +驅 +驊 +驌 +驍 +驎 +驒 +驕 +驗 +驚 +驛 +驟 +驢 +驤 +驥 +驩 +驪 +骨 +骯 +骰 +骶 +骷 +骸 +骼 +髀 +髂 +髎 +髏 +髑 +髒 +髓 +體 +高 +髙 +髡 +髦 +髪 +髭 +髮 +髯 +髲 +髷 +髹 +髻 +鬃 +鬄 +鬅 +鬆 +鬍 +鬚 +鬟 +鬢 +鬣 +鬥 +鬧 +鬨 +鬩 +鬪 +鬬 +鬮 +鬯 +鬱 +鬲 +鬹 +鬻 +鬼 +魁 +魂 +魃 +魄 +魅 +魈 +魋 +魍 +魎 +魏 +魔 +魕 +魘 +魚 +魛 +魞 +魟 +魣 +魨 +魩 +魮 +魯 +魴 +魷 +鮀 +鮁 +鮃 +鮄 +鮊 +鮋 +鮍 +鮐 +鮑 +鮒 +鮓 +鮗 +鮜 +鮟 +鮠 +鮡 +鮣 +鮨 +鮪 +鮫 +鮭 +鮮 +鮰 +鮸 +鮹 +鮻 +鯀 +鯁 +鯃 +鯇 +鯉 +鯊 +鯏 +鯒 +鯓 +鯔 +鯕 +鯖 +鯗 +鯙 +鯛 +鯡 +鯢 +鯤 +鯧 +鯨 +鯪 +鯭 +鯮 +鯰 +鯶 +鯷 +鯻 +鯽 +鯿 +鰂 +鰃 +鰆 +鰈 +鰉 +鰍 +鰏 +鰒 +鰓 +鰕 +鰗 +鰛 +鰜 +鰟 +鰣 +鰤 +鰧 +鰨 +鰩 +鰭 +鰮 +鰱 +鰲 +鰳 +鰶 +鰷 +鰹 +鰺 +鰻 +鰼 +鰾 +鱀 +鱂 +鱅 +鱇 +鱈 +鱉 +鱊 +鱒 +鱓 +鱔 +鱖 +鱗 +鱘 +鱚 +鱝 +鱟 +鱠 +鱣 +鱥 +鱧 +鱨 +鱬 +鱮 +鱰 +鱲 +鱵 +鱷 +鱸 +鱺 +鱻 +鳥 +鳧 +鳩 +鳯 +鳰 +鳳 +鳴 +鳶 +鳽 +鴆 +鴇 +鴉 +鴒 +鴓 +鴕 +鴗 +鴛 +鴝 +鴞 +鴟 +鴡 +鴣 +鴦 +鴨 +鴫 +鴯 +鴰 +鴴 +鴻 +鴿 +鵂 +鵄 +鵎 +鵐 +鵑 +鵒 +鵓 +鵙 +鵜 +鵝 +鵞 +鵟 +鵠 +鵡 +鵪 +鵬 +鵯 +鵰 +鵲 +鵵 +鵼 +鵾 +鶆 +鶇 +鶉 +鶏 +鶒 +鶓 +鶘 +鶚 +鶡 +鶥 +鶩 +鶬 +鶯 +鶲 +鶴 +鶹 +鶺 +鶻 +鶼 +鶿 +鷂 +鷄 +鷉 +鷎 +鷓 +鷗 +鷙 +鷚 +鷟 +鷥 +鷦 +鷫 +鷯 +鷲 +鷳 +鷸 +鷹 +鷺 +鸊 +鸌 +鸐 +鸑 +鸕 +鸘 +鸚 +鸛 +鸜 +鸝 +鸞 +鹮 +鹵 +鹹 +鹼 +鹽 +鹿 +麂 +麅 +麇 +麈 +麊 +麋 +麐 +麒 +麓 +麗 +麝 +麞 +麟 +麥 +麩 +麪 +麯 +麴 +麵 +麹 +麺 +麻 +麼 +麽 +麾 +麿 +黁 +黃 +黇 +黌 +黍 +黎 +黏 +黐 +黑 +黒 +黔 +默 +黙 +黛 +黜 +黝 +點 +黟 +黥 +黧 +黨 +黯 +黴 +黶 +黻 +黼 +黽 +黿 +鼂 +鼇 +鼈 +鼉 +鼎 +鼐 +鼒 +鼓 +鼕 +鼙 +鼠 +鼢 +鼩 +鼬 +鼯 +鼱 +鼴 +鼷 +鼻 +鼽 +鼾 +齊 +齋 +齒 +齕 +齡 +齣 +齦 +齧 +齲 +齶 +龍 +龎 +龐 +龑 +龔 +龕 +龜 
+龝 +龠 +龢 +郎 +凉 +﹑ +﹗ +﹝ +﹞ +﹢ +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? +A +B +C +D +E +F +G +H +I +K +L +M +N +O +P +R +S +T +U +V +W +Y +Z +[ +] +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +r +s +t +u +z +{ +| +} +~ +¥ +𣇉 + diff --git a/backend/ppocr/utils/dict/cyrillic_dict.txt b/backend/ppocr/utils/dict/cyrillic_dict.txt new file mode 100644 index 0000000..2b6f664 --- /dev/null +++ b/backend/ppocr/utils/dict/cyrillic_dict.txt @@ -0,0 +1,163 @@ + +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +Ё +Є +І +Ј +Љ +Ў +А +Б +В +Г +Д +Е +Ж +З +И +Й +К +Л +М +Н +О +П +Р +С +Т +У +Ф +Х +Ц +Ч +Ш +Щ +Ъ +Ы +Ь +Э +Ю +Я +а +б +в +г +д +е +ж +з +и +й +к +л +м +н +о +п +р +с +т +у +ф +х +ц +ч +ш +щ +ъ +ы +ь +э +ю +я +ё +ђ +є +і +ј +љ +њ +ћ +ў +џ +Ґ +ґ diff --git a/backend/ppocr/utils/dict/devanagari_dict.txt b/backend/ppocr/utils/dict/devanagari_dict.txt new file mode 100644 index 0000000..f559230 --- /dev/null +++ b/backend/ppocr/utils/dict/devanagari_dict.txt @@ -0,0 +1,167 @@ + +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +ँ +ं +ः +अ +आ +इ +ई +उ +ऊ +ऋ +ए +ऐ +ऑ +ओ +औ +क +ख +ग +घ +ङ +च +छ +ज +झ +ञ +ट +ठ +ड +ढ +ण +त +थ +द +ध +न +ऩ +प +फ +ब +भ +म +य +र +ऱ +ल +ळ +व +श +ष +स +ह +़ +ा +ि +ी +ु +ू +ृ +ॅ +े +ै +ॉ +ो +ौ +् +॒ +क़ +ख़ +ग़ +ज़ +ड़ +ढ़ +फ़ +ॠ +। +० +१ +२ +३ +४ +५ +६ +७ +८ +९ +॰ diff --git a/backend/ppocr/utils/dict/en_dict.txt b/backend/ppocr/utils/dict/en_dict.txt new file mode 100644 index 0000000..7677d31 --- /dev/null +++ b/backend/ppocr/utils/dict/en_dict.txt @@ -0,0 +1,95 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +\ +] +^ +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +{ +| +} +~ +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ + diff --git a/backend/ppocr/utils/dict/es_dict.txt b/backend/ppocr/utils/dict/es_dict.txt new file mode 100644 index 0000000..f195f1e --- /dev/null +++ b/backend/ppocr/utils/dict/es_dict.txt @@ -0,0 +1,110 @@ +x +i +_ +m +g +/ +1 +0 +I +L +S +V +R +C +2 +v +a +l +3 +6 +4 +5 +. +j +p + +Q +u +e +r +o +8 +7 +n +c +9 +t +b +é +q +d +ó +y +F +s +, +O +í +T +f +" +U +M +h +: +P +H +A +E +D +z +N +á +ñ +ú +% +; +è ++ +Y +- +B +G +( +) +¿ +? +w +¡ +! +X +É +K +k +Á +ü +Ú +« +» +J +' +ö +W +Z +º +Ö +­ +[ +] +Ç +ç +à +ä +û +ò +Í +ê +ô +ø +ª diff --git a/backend/ppocr/utils/dict/fa_dict.txt b/backend/ppocr/utils/dict/fa_dict.txt new file mode 100644 index 0000000..2328fbd --- /dev/null +++ b/backend/ppocr/utils/dict/fa_dict.txt @@ -0,0 +1,136 @@ +f +a +_ +i +m +g +/ +1 +3 +I +L +S +V +R +C +2 +0 +v +l +6 +8 +5 +. +j +p +و +د +ر +ك +ن +ش +ه +ا +4 +9 +ی +ج +ِ +7 +غ +ل +س +ز +ّ +ت +ک +گ +ي +م +ب +ف +چ +خ +ق +ژ +آ +ص +پ +َ +ع +ئ +ح +ٔ +ض +ُ +ذ +أ +ى +ط +ظ +ث +ة +ً +ء +ؤ +ْ +ۀ +إ +ٍ +ٌ +ٰ +ٓ +ٱ +s +c +e +n +w +N +E +W +Y +D +O +H +A +d +z +r +T +G +o +t +x +h +b +B +M +Z +u +P +F +y +q +U +K +k +J +Q +' +X +# +? +% +$ +, +: +& +! 
+- +( +É +@ +é ++ + diff --git a/backend/ppocr/utils/dict/french_dict.txt b/backend/ppocr/utils/dict/french_dict.txt new file mode 100644 index 0000000..e8f657d --- /dev/null +++ b/backend/ppocr/utils/dict/french_dict.txt @@ -0,0 +1,136 @@ +f +e +n +c +h +_ +i +m +g +/ +r +v +a +l +t +w +o +d +6 +1 +. +p +B +u +2 +à +3 +R +y +4 +U +E +A +5 +P +O +S +T +D +7 +Z +8 +I +N +L +G +M +H +0 +J +K +- +9 +F +C +V +é +X +' +s +Q +: +è +x +b +Y +Œ +É +z +W +Ç +È +k +Ô +ô +€ +À +Ê +q +ù +° +ê +î +* + +j +" +, +â +% +û +ç +ü +? +! +; +ö +( +) +ï +º +ó +ø +å ++ +™ +á +Ë +< +² +Á +Î +& +@ +œ +ε +Ü +ë +[ +] +í +ò +Ö +ä +ß +« +» +ú +ñ +æ +µ +³ +Å +$ +# + diff --git a/backend/ppocr/utils/dict/german_dict.txt b/backend/ppocr/utils/dict/german_dict.txt new file mode 100644 index 0000000..5e121af --- /dev/null +++ b/backend/ppocr/utils/dict/german_dict.txt @@ -0,0 +1,143 @@ + +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; += +> +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +] +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +£ +§ +­ +° +´ +µ +· +º +¿ +Á +Ä +Å +É +Ï +Ô +Ö +Ü +ß +à +á +â +ã +ä +å +æ +ç +è +é +ê +ë +í +ï +ñ +ò +ó +ô +ö +ø +ù +ú +û +ü +ō +Š +Ÿ +ʒ +β +δ +з +Ṡ +‘ +€ +© +ª +« +¬ diff --git a/backend/ppocr/utils/dict/hi_dict.txt b/backend/ppocr/utils/dict/hi_dict.txt new file mode 100644 index 0000000..8dfedb5 --- /dev/null +++ b/backend/ppocr/utils/dict/hi_dict.txt @@ -0,0 +1,162 @@ + +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +ँ +ं +ः +अ +आ +इ +ई +उ +ऊ +ऋ +ए +ऐ +ऑ +ओ +औ +क +ख +ग +घ +ङ +च +छ +ज +झ +ञ +ट +ठ +ड +ढ +ण +त +थ +द +ध +न +प +फ +ब +भ +म +य +र +ल +ळ +व +श +ष +स +ह +़ +ा +ि +ी +ु +ू +ृ +ॅ +े +ै +ॉ +ो +ौ +् +क़ +ख़ +ग़ +ज़ +ड़ +ढ़ +फ़ +० +१ +२ +३ +४ +५ +६ +७ +८ +९ +॰ diff --git a/backend/ppocr/utils/dict/it_dict.txt b/backend/ppocr/utils/dict/it_dict.txt new file mode 100644 index 0000000..e692c6d --- /dev/null +++ b/backend/ppocr/utils/dict/it_dict.txt @@ -0,0 +1,118 @@ +i +t +_ +m +g +/ +5 +I +L +S +V +R +C +2 +0 +1 +v +a +l +7 +8 +9 +6 +. +j +p + +e +r +o +d +s +n +3 +4 +P +u +c +A +- +, +" +z +h +f +b +q +ì +' +à +O +è +G +ù +é +ò +; +F +E +B +N +H +k +: +U +T +X +D +K +? +[ +M +­ +x +y +( +) +W +ö +º +w +] +Q +J ++ +ü +! +È +á +% += +» +ñ +Ö +Y +ä +í +Z +« +@ +ó +ø +ï +ú +ê +ç +Á +É +Å +ß +{ +} +& +` +û +î +# +$ diff --git a/backend/ppocr/utils/dict/japan_dict.txt b/backend/ppocr/utils/dict/japan_dict.txt new file mode 100644 index 0000000..339d4b8 --- /dev/null +++ b/backend/ppocr/utils/dict/japan_dict.txt @@ -0,0 +1,4399 @@ +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? 
+A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +] +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +© +° +² +´ +½ +Á +Ä +Å +Ç +È +É +Í +Ó +Ö +× +Ü +ß +à +á +â +ã +ä +å +æ +ç +è +é +ê +ë +í +ð +ñ +ò +ó +ô +õ +ö +ø +ú +û +ü +ý +ā +ă +ą +ć +Č +č +đ +ē +ė +ę +ğ +ī +ı +Ł +ł +ń +ň +ō +ř +Ş +ş +Š +š +ţ +ū +ż +Ž +ž +Ș +ș +ț +Δ +α +λ +μ +φ +Г +О +а +в +л +о +р +с +т +я +ồ +​ +— +― +’ +“ +” +… +℃ +→ +∇ +− +■ +☆ +  +、 +。 +々 +〆 +〈 +〉 +「 +」 +『 +』 +〔 +〕 +〜 +ぁ +あ +ぃ +い +う +ぇ +え +ぉ +お +か +が +き +ぎ +く +ぐ +け +げ +こ +ご +さ +ざ +し +じ +す +ず +せ +ぜ +そ +ぞ +た +だ +ち +ぢ +っ +つ +づ +て +で +と +ど +な +に +ぬ +ね +の +は +ば +ぱ +ひ +び +ぴ +ふ +ぶ +ぷ +へ +べ +ぺ +ほ +ぼ +ぽ +ま +み +む +め +も +ゃ +や +ゅ +ゆ +ょ +よ +ら +り +る +れ +ろ +わ +ゑ +を +ん +ゝ +ゞ +ァ +ア +ィ +イ +ゥ +ウ +ェ +エ +ォ +オ +カ +ガ +キ +ギ +ク +グ +ケ +ゲ +コ +ゴ +サ +ザ +シ +ジ +ス +ズ +セ +ゼ +ソ +ゾ +タ +ダ +チ +ヂ +ッ +ツ +ヅ +テ +デ +ト +ド +ナ +ニ +ヌ +ネ +ノ +ハ +バ +パ +ヒ +ビ +ピ +フ +ブ +プ +ヘ +ベ +ペ +ホ +ボ +ポ +マ +ミ +ム +メ +モ +ャ +ヤ +ュ +ユ +ョ +ヨ +ラ +リ +ル +レ +ロ +ワ +ヰ +ン +ヴ +ヵ +ヶ +・ +ー +㈱ +一 +丁 +七 +万 +丈 +三 +上 +下 +不 +与 +丑 +且 +世 +丘 +丙 +丞 +両 +並 +中 +串 +丸 +丹 +主 +丼 +丿 +乃 +久 +之 +乎 +乏 +乗 +乘 +乙 +九 +乞 +也 +乱 +乳 +乾 +亀 +了 +予 +争 +事 +二 +于 +互 +五 +井 +亘 +亙 +些 +亜 +亟 +亡 +交 +亥 +亦 +亨 +享 +京 +亭 +亮 +人 +什 +仁 +仇 +今 +介 +仍 +仏 +仔 +仕 +他 +仗 +付 +仙 +代 +令 +以 +仮 +仰 +仲 +件 +任 +企 +伊 +伍 +伎 +伏 +伐 +休 +会 +伝 +伯 +估 +伴 +伶 +伸 +伺 +似 +伽 +佃 +但 +位 +低 +住 +佐 +佑 +体 +何 +余 +佚 +佛 +作 +佩 +佳 +併 +佶 +使 +侈 +例 +侍 +侏 +侑 +侘 +供 +依 +侠 +価 +侮 +侯 +侵 +侶 +便 +係 +促 +俄 +俊 +俔 +俗 +俘 +保 +信 +俣 +俤 +修 +俯 +俳 +俵 +俸 +俺 +倉 +個 +倍 +倒 +候 +借 +倣 +値 +倫 +倭 +倶 +倹 +偃 +假 +偈 +偉 +偏 +偐 +偕 +停 +健 +側 +偵 +偶 +偽 +傀 +傅 +傍 +傑 +傘 +備 +催 +傭 +傲 +傳 +債 +傷 +傾 +僊 +働 +像 +僑 +僕 +僚 +僧 +僭 +僮 +儀 +億 +儇 +儒 +儛 +償 +儡 +優 +儲 +儺 +儼 +兀 +允 +元 +兄 +充 +兆 +先 +光 +克 +兌 +免 +兎 +児 +党 +兜 +入 +全 +八 +公 +六 +共 +兵 +其 +具 +典 +兼 +内 +円 +冊 +再 +冑 +冒 +冗 +写 +冠 +冤 +冥 +冨 +冬 +冲 +决 +冶 +冷 +准 +凉 +凋 +凌 +凍 +凛 +凝 +凞 +几 +凡 +処 +凪 +凰 +凱 +凶 +凸 +凹 +出 +函 +刀 +刃 +分 +切 +刈 +刊 +刎 +刑 +列 +初 +判 +別 +利 +刪 +到 +制 +刷 +券 +刹 +刺 +刻 +剃 +則 +削 +剋 +前 +剖 +剛 +剣 +剤 +剥 +剪 +副 +剰 +割 +創 +剽 +劇 +劉 +劔 +力 +功 +加 +劣 +助 +努 +劫 +劭 +励 +労 +効 +劾 +勃 +勅 +勇 +勉 +勒 +動 +勘 +務 +勝 +募 +勢 +勤 +勧 +勲 +勺 +勾 +勿 +匁 +匂 +包 +匏 +化 +北 +匙 +匝 +匠 +匡 +匣 +匯 +匲 +匹 +区 +医 +匿 +十 +千 +升 +午 +卉 +半 +卍 +卑 +卒 +卓 +協 +南 +単 +博 +卜 +占 +卦 +卯 +印 +危 +即 +却 +卵 +卸 +卿 +厄 +厚 +原 +厠 +厨 +厩 +厭 +厳 +去 +参 +又 +叉 +及 +友 +双 +反 +収 +叔 +取 +受 +叙 +叛 +叟 +叡 +叢 +口 +古 +句 +叩 +只 +叫 +召 +可 +台 +叱 +史 +右 +叶 +号 +司 +吃 +各 +合 +吉 +吊 +同 +名 +后 +吏 +吐 +向 +君 +吝 +吟 +吠 +否 +含 +吸 +吹 +吻 +吽 +吾 +呂 +呆 +呈 +呉 +告 +呑 +周 +呪 +呰 +味 +呼 +命 +咀 +咄 +咋 +和 +咒 +咫 +咲 +咳 +咸 +哀 +品 +哇 +哉 +員 +哨 +哩 +哭 +哲 +哺 +唄 +唆 +唇 +唐 +唖 +唯 +唱 +唳 +唸 +唾 +啄 +商 +問 +啓 +啼 +善 +喋 +喚 +喜 +喝 +喧 +喩 +喪 +喫 +喬 +單 +喰 +営 +嗅 +嗇 +嗔 +嗚 +嗜 +嗣 +嘆 +嘉 +嘗 +嘘 +嘩 +嘯 +嘱 +嘲 +嘴 +噂 +噌 +噛 +器 +噴 +噺 +嚆 +嚢 +囀 +囃 +囉 +囚 +四 +回 +因 +団 +困 +囲 +図 +固 +国 +圀 +圃 +國 +圏 +園 +圓 +團 +圜 +土 +圧 +在 +圭 +地 +址 +坂 +均 +坊 +坐 +坑 +坡 +坤 +坦 +坪 +垂 +型 +垢 +垣 +埃 +埋 +城 +埒 +埔 +域 +埠 +埴 +埵 +執 +培 +基 +埼 +堀 +堂 +堅 +堆 +堕 +堤 +堪 +堯 +堰 +報 +場 +堵 +堺 +塀 +塁 +塊 +塑 +塔 +塗 +塘 +塙 +塚 +塞 +塩 +填 +塵 +塾 +境 +墉 +墓 +増 +墜 +墟 +墨 +墳 +墺 +墻 +墾 +壁 +壇 +壊 +壌 +壕 +士 +壬 +壮 +声 +壱 +売 +壷 +壹 +壺 +壽 +変 +夏 +夕 +外 +夙 +多 +夜 +夢 +夥 +大 +天 +太 +夫 +夬 +夭 +央 +失 +夷 +夾 +奄 +奇 +奈 +奉 +奎 +奏 +契 +奔 +奕 +套 +奘 +奠 +奢 +奥 +奨 +奪 +奮 +女 +奴 +奸 +好 +如 +妃 +妄 +妊 +妍 +妓 +妖 +妙 +妥 +妨 +妬 +妲 +妹 +妻 +妾 +姉 +始 +姐 +姓 +委 +姚 +姜 +姞 +姥 +姦 +姨 +姪 +姫 +姶 +姻 +姿 +威 +娑 +娘 +娟 +娠 +娩 +娯 +娼 +婆 +婉 +婚 +婢 +婦 +婬 +婿 +媄 +媒 +媓 +媚 +媛 +媞 +媽 +嫁 +嫄 +嫉 +嫌 +嫐 +嫗 +嫡 +嬉 +嬌 +嬢 +嬪 +嬬 +嬾 +孁 +子 +孔 +字 +存 +孚 +孝 +孟 +季 +孤 +学 +孫 +孵 +學 +宅 +宇 +守 +安 +宋 +完 +宍 +宏 +宕 +宗 +官 +宙 +定 +宛 +宜 +宝 +実 +客 +宣 +室 +宥 +宮 +宰 +害 +宴 +宵 +家 +宸 +容 +宿 +寂 +寄 +寅 +密 +寇 +富 +寒 +寓 +寔 +寛 +寝 +察 +寡 +實 +寧 +審 +寮 +寵 +寶 +寸 +寺 +対 +寿 +封 +専 +射 +将 +尉 +尊 +尋 +對 +導 +小 +少 +尖 +尚 +尤 +尪 +尭 +就 +尹 +尺 +尻 +尼 +尽 +尾 +尿 +局 +居 +屈 +届 +屋 +屍 +屎 +屏 +屑 +屓 +展 +属 +屠 +層 +履 
+屯 +山 +岐 +岑 +岡 +岩 +岫 +岬 +岳 +岷 +岸 +峠 +峡 +峨 +峯 +峰 +島 +峻 +崇 +崋 +崎 +崑 +崖 +崗 +崛 +崩 +嵌 +嵐 +嵩 +嵯 +嶂 +嶋 +嶠 +嶺 +嶼 +嶽 +巀 +巌 +巒 +巖 +川 +州 +巡 +巣 +工 +左 +巧 +巨 +巫 +差 +己 +巳 +巴 +巷 +巻 +巽 +巾 +市 +布 +帆 +希 +帖 +帚 +帛 +帝 +帥 +師 +席 +帯 +帰 +帳 +帷 +常 +帽 +幄 +幅 +幇 +幌 +幔 +幕 +幟 +幡 +幢 +幣 +干 +平 +年 +并 +幸 +幹 +幻 +幼 +幽 +幾 +庁 +広 +庄 +庇 +床 +序 +底 +庖 +店 +庚 +府 +度 +座 +庫 +庭 +庵 +庶 +康 +庸 +廂 +廃 +廉 +廊 +廓 +廟 +廠 +廣 +廬 +延 +廷 +建 +廻 +廼 +廿 +弁 +弄 +弉 +弊 +弌 +式 +弐 +弓 +弔 +引 +弖 +弗 +弘 +弛 +弟 +弥 +弦 +弧 +弱 +張 +強 +弼 +弾 +彈 +彊 +彌 +彎 +当 +彗 +彙 +彝 +形 +彦 +彩 +彫 +彬 +彭 +彰 +影 +彷 +役 +彼 +往 +征 +徂 +径 +待 +律 +後 +徐 +徑 +徒 +従 +得 +徠 +御 +徧 +徨 +復 +循 +徭 +微 +徳 +徴 +德 +徹 +徽 +心 +必 +忉 +忌 +忍 +志 +忘 +忙 +応 +忠 +快 +忯 +念 +忻 +忽 +忿 +怒 +怖 +思 +怠 +怡 +急 +性 +怨 +怪 +怯 +恂 +恋 +恐 +恒 +恕 +恣 +恤 +恥 +恨 +恩 +恬 +恭 +息 +恵 +悉 +悌 +悍 +悔 +悟 +悠 +患 +悦 +悩 +悪 +悲 +悼 +情 +惇 +惑 +惚 +惜 +惟 +惠 +惣 +惧 +惨 +惰 +想 +惹 +惺 +愈 +愉 +愍 +意 +愔 +愚 +愛 +感 +愷 +愿 +慈 +態 +慌 +慎 +慕 +慢 +慣 +慧 +慨 +慮 +慰 +慶 +憂 +憎 +憐 +憑 +憙 +憤 +憧 +憩 +憬 +憲 +憶 +憾 +懇 +應 +懌 +懐 +懲 +懸 +懺 +懽 +懿 +戈 +戊 +戌 +戎 +成 +我 +戒 +戔 +或 +戚 +戟 +戦 +截 +戮 +戯 +戴 +戸 +戻 +房 +所 +扁 +扇 +扈 +扉 +手 +才 +打 +払 +托 +扮 +扱 +扶 +批 +承 +技 +抄 +把 +抑 +抓 +投 +抗 +折 +抜 +択 +披 +抱 +抵 +抹 +押 +抽 +担 +拇 +拈 +拉 +拍 +拏 +拐 +拒 +拓 +拘 +拙 +招 +拝 +拠 +拡 +括 +拭 +拳 +拵 +拶 +拾 +拿 +持 +挂 +指 +按 +挑 +挙 +挟 +挨 +振 +挺 +挽 +挿 +捉 +捕 +捗 +捜 +捧 +捨 +据 +捺 +捻 +掃 +掄 +授 +掌 +排 +掖 +掘 +掛 +掟 +採 +探 +掣 +接 +控 +推 +掩 +措 +掬 +掲 +掴 +掻 +掾 +揃 +揄 +揆 +揉 +描 +提 +揖 +揚 +換 +握 +揮 +援 +揶 +揺 +損 +搦 +搬 +搭 +携 +搾 +摂 +摘 +摩 +摸 +摺 +撃 +撒 +撞 +撤 +撥 +撫 +播 +撮 +撰 +撲 +撹 +擁 +操 +擔 +擦 +擬 +擾 +攘 +攝 +攣 +支 +收 +改 +攻 +放 +政 +故 +敏 +救 +敗 +教 +敢 +散 +敦 +敬 +数 +整 +敵 +敷 +斂 +文 +斉 +斎 +斐 +斑 +斗 +料 +斜 +斟 +斤 +斥 +斧 +斬 +断 +斯 +新 +方 +於 +施 +旁 +旅 +旋 +旌 +族 +旗 +旛 +无 +旡 +既 +日 +旦 +旧 +旨 +早 +旬 +旭 +旺 +旻 +昂 +昆 +昇 +昉 +昌 +明 +昏 +易 +昔 +星 +映 +春 +昧 +昨 +昪 +昭 +是 +昵 +昼 +晁 +時 +晃 +晋 +晏 +晒 +晟 +晦 +晧 +晩 +普 +景 +晴 +晶 +智 +暁 +暇 +暈 +暉 +暑 +暖 +暗 +暘 +暢 +暦 +暫 +暮 +暲 +暴 +暹 +暾 +曄 +曇 +曉 +曖 +曙 +曜 +曝 +曠 +曰 +曲 +曳 +更 +書 +曹 +曼 +曽 +曾 +替 +最 +會 +月 +有 +朋 +服 +朏 +朔 +朕 +朗 +望 +朝 +期 +朧 +木 +未 +末 +本 +札 +朱 +朴 +机 +朽 +杁 +杉 +李 +杏 +材 +村 +杓 +杖 +杜 +杞 +束 +条 +杢 +杣 +来 +杭 +杮 +杯 +東 +杲 +杵 +杷 +杼 +松 +板 +枅 +枇 +析 +枓 +枕 +林 +枚 +果 +枝 +枠 +枡 +枢 +枯 +枳 +架 +柄 +柊 +柏 +某 +柑 +染 +柔 +柘 +柚 +柯 +柱 +柳 +柴 +柵 +査 +柾 +柿 +栂 +栃 +栄 +栖 +栗 +校 +株 +栲 +栴 +核 +根 +栻 +格 +栽 +桁 +桂 +桃 +框 +案 +桐 +桑 +桓 +桔 +桜 +桝 +桟 +桧 +桴 +桶 +桾 +梁 +梅 +梆 +梓 +梔 +梗 +梛 +條 +梟 +梢 +梧 +梨 +械 +梱 +梲 +梵 +梶 +棄 +棋 +棒 +棗 +棘 +棚 +棟 +棠 +森 +棲 +棹 +棺 +椀 +椅 +椋 +植 +椎 +椏 +椒 +椙 +検 +椥 +椹 +椿 +楊 +楓 +楕 +楚 +楞 +楠 +楡 +楢 +楨 +楪 +楫 +業 +楮 +楯 +楳 +極 +楷 +楼 +楽 +概 +榊 +榎 +榕 +榛 +榜 +榮 +榱 +榴 +槃 +槇 +槊 +構 +槌 +槍 +槐 +様 +槙 +槻 +槽 +槿 +樂 +樋 +樓 +樗 +標 +樟 +模 +権 +横 +樫 +樵 +樹 +樺 +樽 +橇 +橋 +橘 +機 +橿 +檀 +檄 +檎 +檐 +檗 +檜 +檣 +檥 +檬 +檮 +檸 +檻 +櫃 +櫓 +櫛 +櫟 +櫨 +櫻 +欄 +欅 +欠 +次 +欣 +欧 +欲 +欺 +欽 +款 +歌 +歎 +歓 +止 +正 +此 +武 +歩 +歪 +歯 +歳 +歴 +死 +殆 +殉 +殊 +残 +殖 +殯 +殴 +段 +殷 +殺 +殻 +殿 +毀 +毅 +母 +毎 +毒 +比 +毘 +毛 +毫 +毬 +氈 +氏 +民 +気 +水 +氷 +永 +氾 +汀 +汁 +求 +汎 +汐 +汗 +汚 +汝 +江 +池 +汪 +汰 +汲 +決 +汽 +沂 +沃 +沅 +沆 +沈 +沌 +沐 +沓 +沖 +沙 +没 +沢 +沱 +河 +沸 +油 +治 +沼 +沽 +沿 +況 +泉 +泊 +泌 +法 +泗 +泡 +波 +泣 +泥 +注 +泯 +泰 +泳 +洋 +洒 +洗 +洛 +洞 +津 +洩 +洪 +洲 +洸 +洹 +活 +洽 +派 +流 +浄 +浅 +浙 +浚 +浜 +浣 +浦 +浩 +浪 +浮 +浴 +海 +浸 +涅 +消 +涌 +涙 +涛 +涯 +液 +涵 +涼 +淀 +淄 +淆 +淇 +淋 +淑 +淘 +淡 +淤 +淨 +淫 +深 +淳 +淵 +混 +淹 +添 +清 +済 +渉 +渋 +渓 +渕 +渚 +減 +渟 +渠 +渡 +渤 +渥 +渦 +温 +渫 +測 +港 +游 +渾 +湊 +湖 +湘 +湛 +湧 +湫 +湯 +湾 +湿 +満 +源 +準 +溜 +溝 +溢 +溥 +溪 +溶 +溺 +滄 +滅 +滋 +滌 +滑 +滕 +滝 +滞 +滴 +滸 +滹 +滿 +漁 +漂 +漆 +漉 +漏 +漑 +演 +漕 +漠 +漢 +漣 +漫 +漬 +漱 +漸 +漿 +潅 +潔 +潙 +潜 +潟 +潤 +潭 +潮 +潰 +潴 +澁 +澂 +澄 +澎 +澗 +澤 +澪 +澱 +澳 +激 +濁 +濃 +濟 +濠 +濡 +濤 +濫 +濯 +濱 +濾 +瀉 +瀋 +瀑 +瀕 +瀞 +瀟 +瀧 +瀬 +瀾 +灌 +灑 +灘 +火 +灯 +灰 +灸 +災 +炉 +炊 +炎 +炒 +炭 +炮 +炷 +点 +為 +烈 +烏 +烙 +烝 +烹 +焔 +焙 +焚 +無 +焦 +然 +焼 +煇 +煉 +煌 +煎 +煕 +煙 +煤 +煥 +照 +煩 +煬 +煮 +煽 +熈 +熊 +熙 +熟 +熨 +熱 +熹 +熾 +燃 +燈 +燎 +燔 +燕 +燗 +燥 +燭 +燻 +爆 +爐 +爪 +爬 +爲 +爵 +父 +爺 +爼 +爽 +爾 +片 +版 +牌 +牒 +牘 +牙 +牛 +牝 +牟 +牡 +牢 +牧 +物 +牲 +特 +牽 +犂 +犠 +犬 +犯 +状 +狂 +狄 +狐 +狗 +狙 +狛 +狡 +狩 +独 +狭 +狷 +狸 +狼 +猊 
+猛 +猟 +猥 +猨 +猩 +猪 +猫 +献 +猴 +猶 +猷 +猾 +猿 +獄 +獅 +獏 +獣 +獲 +玄 +玅 +率 +玉 +王 +玖 +玩 +玲 +珀 +珂 +珈 +珉 +珊 +珍 +珎 +珞 +珠 +珣 +珥 +珪 +班 +現 +球 +理 +琉 +琢 +琥 +琦 +琮 +琲 +琳 +琴 +琵 +琶 +瑁 +瑋 +瑙 +瑚 +瑛 +瑜 +瑞 +瑠 +瑤 +瑩 +瑪 +瑳 +瑾 +璃 +璋 +璜 +璞 +璧 +璨 +環 +璵 +璽 +璿 +瓊 +瓔 +瓜 +瓢 +瓦 +瓶 +甍 +甑 +甕 +甘 +甚 +甞 +生 +産 +甥 +用 +甫 +田 +由 +甲 +申 +男 +町 +画 +界 +畏 +畑 +畔 +留 +畜 +畝 +畠 +畢 +略 +番 +異 +畳 +當 +畷 +畸 +畺 +畿 +疆 +疇 +疋 +疎 +疏 +疑 +疫 +疱 +疲 +疹 +疼 +疾 +病 +症 +痒 +痔 +痕 +痘 +痙 +痛 +痢 +痩 +痴 +痺 +瘍 +瘡 +瘧 +療 +癇 +癌 +癒 +癖 +癡 +癪 +発 +登 +白 +百 +的 +皆 +皇 +皋 +皐 +皓 +皮 +皺 +皿 +盂 +盃 +盆 +盈 +益 +盒 +盗 +盛 +盞 +盟 +盡 +監 +盤 +盥 +盧 +目 +盲 +直 +相 +盾 +省 +眉 +看 +県 +眞 +真 +眠 +眷 +眺 +眼 +着 +睡 +督 +睦 +睨 +睿 +瞋 +瞑 +瞞 +瞬 +瞭 +瞰 +瞳 +瞻 +瞼 +瞿 +矍 +矛 +矜 +矢 +知 +矧 +矩 +短 +矮 +矯 +石 +砂 +砌 +研 +砕 +砥 +砦 +砧 +砲 +破 +砺 +硝 +硫 +硬 +硯 +碁 +碇 +碌 +碑 +碓 +碕 +碗 +碣 +碧 +碩 +確 +碾 +磁 +磐 +磔 +磧 +磨 +磬 +磯 +礁 +礎 +礒 +礙 +礫 +礬 +示 +礼 +社 +祀 +祁 +祇 +祈 +祉 +祐 +祓 +祕 +祖 +祗 +祚 +祝 +神 +祟 +祠 +祢 +祥 +票 +祭 +祷 +祺 +禁 +禄 +禅 +禊 +禍 +禎 +福 +禔 +禖 +禛 +禦 +禧 +禮 +禰 +禹 +禽 +禿 +秀 +私 +秋 +科 +秒 +秘 +租 +秤 +秦 +秩 +称 +移 +稀 +程 +税 +稔 +稗 +稙 +稚 +稜 +稠 +種 +稱 +稲 +稷 +稻 +稼 +稽 +稿 +穀 +穂 +穆 +積 +穎 +穏 +穗 +穜 +穢 +穣 +穫 +穴 +究 +空 +突 +窃 +窄 +窒 +窓 +窟 +窠 +窩 +窪 +窮 +窯 +竃 +竄 +竈 +立 +站 +竜 +竝 +竟 +章 +童 +竪 +竭 +端 +竴 +競 +竹 +竺 +竽 +竿 +笄 +笈 +笏 +笑 +笙 +笛 +笞 +笠 +笥 +符 +第 +笹 +筅 +筆 +筇 +筈 +等 +筋 +筌 +筍 +筏 +筐 +筑 +筒 +答 +策 +筝 +筥 +筧 +筬 +筮 +筯 +筰 +筵 +箆 +箇 +箋 +箏 +箒 +箔 +箕 +算 +箙 +箜 +管 +箪 +箭 +箱 +箸 +節 +篁 +範 +篆 +篇 +築 +篋 +篌 +篝 +篠 +篤 +篥 +篦 +篩 +篭 +篳 +篷 +簀 +簒 +簡 +簧 +簪 +簫 +簺 +簾 +簿 +籀 +籃 +籌 +籍 +籐 +籟 +籠 +籤 +籬 +米 +籾 +粂 +粉 +粋 +粒 +粕 +粗 +粘 +粛 +粟 +粥 +粧 +粮 +粳 +精 +糊 +糖 +糜 +糞 +糟 +糠 +糧 +糯 +糸 +糺 +系 +糾 +紀 +約 +紅 +紋 +納 +紐 +純 +紗 +紘 +紙 +級 +紛 +素 +紡 +索 +紫 +紬 +累 +細 +紳 +紵 +紹 +紺 +絁 +終 +絃 +組 +絅 +経 +結 +絖 +絞 +絡 +絣 +給 +統 +絲 +絵 +絶 +絹 +絽 +綏 +經 +継 +続 +綜 +綟 +綬 +維 +綱 +網 +綴 +綸 +綺 +綽 +綾 +綿 +緊 +緋 +総 +緑 +緒 +線 +締 +緥 +編 +緩 +緬 +緯 +練 +緻 +縁 +縄 +縅 +縒 +縛 +縞 +縢 +縣 +縦 +縫 +縮 +縹 +總 +績 +繁 +繊 +繋 +繍 +織 +繕 +繝 +繦 +繧 +繰 +繹 +繼 +纂 +纈 +纏 +纐 +纒 +纛 +缶 +罔 +罠 +罧 +罪 +置 +罰 +署 +罵 +罷 +罹 +羂 +羅 +羆 +羇 +羈 +羊 +羌 +美 +群 +羨 +義 +羯 +羲 +羹 +羽 +翁 +翅 +翌 +習 +翔 +翛 +翠 +翡 +翫 +翰 +翺 +翻 +翼 +耀 +老 +考 +者 +耆 +而 +耐 +耕 +耗 +耨 +耳 +耶 +耽 +聊 +聖 +聘 +聚 +聞 +聟 +聡 +聨 +聯 +聰 +聲 +聴 +職 +聾 +肄 +肆 +肇 +肉 +肋 +肌 +肖 +肘 +肛 +肝 +股 +肢 +肥 +肩 +肪 +肯 +肱 +育 +肴 +肺 +胃 +胆 +背 +胎 +胖 +胚 +胝 +胞 +胡 +胤 +胱 +胴 +胸 +能 +脂 +脅 +脆 +脇 +脈 +脊 +脚 +脛 +脩 +脱 +脳 +腋 +腎 +腐 +腑 +腔 +腕 +腫 +腰 +腱 +腸 +腹 +腺 +腿 +膀 +膏 +膚 +膜 +膝 +膠 +膣 +膨 +膩 +膳 +膵 +膾 +膿 +臂 +臆 +臈 +臍 +臓 +臘 +臚 +臣 +臥 +臨 +自 +臭 +至 +致 +臺 +臼 +舂 +舅 +與 +興 +舌 +舍 +舎 +舒 +舖 +舗 +舘 +舜 +舞 +舟 +舩 +航 +般 +舳 +舶 +船 +艇 +艘 +艦 +艮 +良 +色 +艶 +芋 +芒 +芙 +芝 +芥 +芦 +芬 +芭 +芯 +花 +芳 +芸 +芹 +芻 +芽 +芿 +苅 +苑 +苔 +苗 +苛 +苞 +苡 +若 +苦 +苧 +苫 +英 +苴 +苻 +茂 +范 +茄 +茅 +茎 +茗 +茘 +茜 +茨 +茲 +茵 +茶 +茸 +茹 +草 +荊 +荏 +荒 +荘 +荷 +荻 +荼 +莞 +莪 +莫 +莬 +莱 +莵 +莽 +菅 +菊 +菌 +菓 +菖 +菘 +菜 +菟 +菩 +菫 +華 +菱 +菴 +萄 +萊 +萌 +萍 +萎 +萠 +萩 +萬 +萱 +落 +葉 +著 +葛 +葡 +董 +葦 +葩 +葬 +葭 +葱 +葵 +葺 +蒋 +蒐 +蒔 +蒙 +蒟 +蒡 +蒲 +蒸 +蒻 +蒼 +蒿 +蓄 +蓆 +蓉 +蓋 +蓑 +蓬 +蓮 +蓼 +蔀 +蔑 +蔓 +蔚 +蔡 +蔦 +蔬 +蔭 +蔵 +蔽 +蕃 +蕉 +蕊 +蕎 +蕨 +蕩 +蕪 +蕭 +蕾 +薄 +薇 +薊 +薔 +薗 +薙 +薛 +薦 +薨 +薩 +薪 +薫 +薬 +薭 +薮 +藁 +藉 +藍 +藏 +藐 +藝 +藤 +藩 +藪 +藷 +藹 +藺 +藻 +蘂 +蘆 +蘇 +蘊 +蘭 +虎 +虐 +虔 +虚 +虜 +虞 +號 +虫 +虹 +虻 +蚊 +蚕 +蛇 +蛉 +蛍 +蛎 +蛙 +蛛 +蛟 +蛤 +蛭 +蛮 +蛸 +蛹 +蛾 +蜀 +蜂 +蜃 +蜆 +蜊 +蜘 +蜜 +蜷 +蜻 +蝉 +蝋 +蝕 +蝙 +蝠 +蝦 +蝶 +蝿 +螂 +融 +螣 +螺 +蟄 +蟇 +蟠 +蟷 +蟹 +蟻 +蠢 +蠣 +血 +衆 +行 +衍 +衒 +術 +街 +衙 +衛 +衝 +衞 +衡 +衢 +衣 +表 +衫 +衰 +衵 +衷 +衽 +衾 +衿 +袁 +袈 +袋 +袍 +袒 +袖 +袙 +袞 +袢 +被 +袰 +袱 +袴 +袷 +袿 +裁 +裂 +裃 +装 +裏 +裔 +裕 +裘 +裙 +補 +裟 +裡 +裲 +裳 +裴 +裸 +裹 +製 +裾 +褂 +褄 +複 +褌 +褐 +褒 +褥 +褪 +褶 +褻 +襄 +襖 +襞 +襟 +襠 +襦 +襪 +襲 +襴 +襷 +西 +要 +覆 +覇 +覈 +見 +規 +視 +覗 +覚 +覧 +親 +覲 +観 +覺 +觀 +角 +解 +触 +言 +訂 +計 +討 +訓 +託 +記 +訛 +訟 +訢 +訥 +訪 +設 +許 +訳 +訴 +訶 +診 +註 +証 +詐 +詔 +評 +詛 +詞 +詠 +詢 +詣 +試 +詩 +詫 +詮 +詰 +話 +該 +詳 +誄 +誅 +誇 +誉 +誌 +認 +誓 +誕 +誘 +語 +誠 +誡 +誣 +誤 +誥 +誦 +説 +読 +誰 +課 +誼 +誾 +調 +談 +請 +諌 +諍 +諏 +諒 +論 +諚 +諜 +諟 +諡 +諦 +諧 +諫 +諭 +諮 +諱 +諶 +諷 +諸 +諺 +諾 +謀 +謄 +謌 +謎 +謗 +謙 +謚 +講 +謝 +謡 +謫 +謬 +謹 +證 +識 +譚 +譛 +譜 +警 +譬 +譯 
+議 +譲 +譴 +護 +讀 +讃 +讐 +讒 +谷 +谿 +豅 +豆 +豊 +豎 +豐 +豚 +象 +豪 +豫 +豹 +貌 +貝 +貞 +負 +財 +貢 +貧 +貨 +販 +貪 +貫 +責 +貯 +貰 +貴 +買 +貸 +費 +貼 +貿 +賀 +賁 +賂 +賃 +賄 +資 +賈 +賊 +賎 +賑 +賓 +賛 +賜 +賞 +賠 +賢 +賣 +賤 +賦 +質 +賭 +購 +賽 +贄 +贅 +贈 +贋 +贔 +贖 +赤 +赦 +走 +赴 +起 +超 +越 +趙 +趣 +足 +趺 +趾 +跋 +跏 +距 +跡 +跨 +跪 +路 +跳 +践 +踊 +踏 +踐 +踞 +踪 +踵 +蹄 +蹉 +蹊 +蹟 +蹲 +蹴 +躅 +躇 +躊 +躍 +躑 +躙 +躪 +身 +躬 +躯 +躰 +車 +軋 +軌 +軍 +軒 +軟 +転 +軸 +軻 +軽 +軾 +較 +載 +輌 +輔 +輜 +輝 +輦 +輩 +輪 +輯 +輸 +輿 +轄 +轍 +轟 +轢 +辛 +辞 +辟 +辥 +辦 +辨 +辰 +辱 +農 +辺 +辻 +込 +迂 +迅 +迎 +近 +返 +迢 +迦 +迪 +迫 +迭 +述 +迷 +迹 +追 +退 +送 +逃 +逅 +逆 +逍 +透 +逐 +逓 +途 +逕 +逗 +這 +通 +逝 +逞 +速 +造 +逢 +連 +逮 +週 +進 +逸 +逼 +遁 +遂 +遅 +遇 +遊 +運 +遍 +過 +遐 +道 +達 +違 +遙 +遜 +遠 +遡 +遣 +遥 +適 +遭 +遮 +遯 +遵 +遷 +選 +遺 +遼 +避 +邀 +邁 +邂 +邃 +還 +邇 +邉 +邊 +邑 +那 +邦 +邨 +邪 +邯 +邵 +邸 +郁 +郊 +郎 +郡 +郢 +部 +郭 +郴 +郵 +郷 +都 +鄂 +鄙 +鄭 +鄰 +鄲 +酉 +酋 +酌 +配 +酎 +酒 +酔 +酢 +酥 +酪 +酬 +酵 +酷 +酸 +醍 +醐 +醒 +醗 +醜 +醤 +醪 +醵 +醸 +采 +釈 +釉 +釋 +里 +重 +野 +量 +釐 +金 +釘 +釜 +針 +釣 +釧 +釿 +鈍 +鈎 +鈐 +鈔 +鈞 +鈦 +鈴 +鈷 +鈸 +鈿 +鉄 +鉇 +鉉 +鉋 +鉛 +鉢 +鉤 +鉦 +鉱 +鉾 +銀 +銃 +銅 +銈 +銑 +銕 +銘 +銚 +銜 +銭 +鋏 +鋒 +鋤 +鋭 +鋲 +鋳 +鋸 +鋺 +鋼 +錆 +錍 +錐 +錘 +錠 +錣 +錦 +錫 +錬 +錯 +録 +錵 +鍋 +鍍 +鍑 +鍔 +鍛 +鍬 +鍮 +鍵 +鍼 +鍾 +鎌 +鎖 +鎗 +鎚 +鎧 +鎬 +鎮 +鎰 +鎹 +鏃 +鏑 +鏡 +鐃 +鐇 +鐐 +鐔 +鐘 +鐙 +鐚 +鐡 +鐵 +鐸 +鑁 +鑊 +鑑 +鑒 +鑚 +鑠 +鑢 +鑰 +鑵 +鑷 +鑼 +鑽 +鑿 +長 +門 +閃 +閇 +閉 +開 +閏 +閑 +間 +閔 +閘 +関 +閣 +閤 +閥 +閦 +閨 +閬 +閲 +閻 +閼 +閾 +闇 +闍 +闔 +闕 +闘 +關 +闡 +闢 +闥 +阜 +阪 +阮 +阯 +防 +阻 +阿 +陀 +陂 +附 +陌 +降 +限 +陛 +陞 +院 +陣 +除 +陥 +陪 +陬 +陰 +陳 +陵 +陶 +陸 +険 +陽 +隅 +隆 +隈 +隊 +隋 +階 +随 +隔 +際 +障 +隠 +隣 +隧 +隷 +隻 +隼 +雀 +雁 +雄 +雅 +集 +雇 +雉 +雊 +雋 +雌 +雍 +雑 +雖 +雙 +雛 +離 +難 +雨 +雪 +雫 +雰 +雲 +零 +雷 +雹 +電 +需 +震 +霊 +霍 +霖 +霜 +霞 +霧 +霰 +露 +靈 +青 +靖 +静 +靜 +非 +面 +革 +靫 +靭 +靱 +靴 +靺 +鞁 +鞄 +鞆 +鞋 +鞍 +鞏 +鞘 +鞠 +鞨 +鞭 +韋 +韓 +韜 +韮 +音 +韶 +韻 +響 +頁 +頂 +頃 +項 +順 +須 +頌 +預 +頑 +頒 +頓 +領 +頚 +頬 +頭 +頴 +頸 +頻 +頼 +顆 +題 +額 +顎 +顔 +顕 +顗 +願 +顛 +類 +顧 +顯 +風 +飛 +食 +飢 +飩 +飫 +飯 +飲 +飴 +飼 +飽 +飾 +餃 +餅 +餉 +養 +餌 +餐 +餓 +餘 +餝 +餡 +館 +饂 +饅 +饉 +饋 +饌 +饒 +饗 +首 +馗 +香 +馨 +馬 +馳 +馴 +駄 +駅 +駆 +駈 +駐 +駒 +駕 +駝 +駿 +騁 +騎 +騏 +騒 +験 +騙 +騨 +騰 +驕 +驚 +驛 +驢 +骨 +骸 +髄 +體 +高 +髙 +髢 +髪 +髭 +髮 +髷 +髻 +鬘 +鬚 +鬢 +鬨 +鬯 +鬱 +鬼 +魁 +魂 +魄 +魅 +魏 +魔 +魚 +魯 +鮎 +鮑 +鮒 +鮪 +鮫 +鮭 +鮮 +鯉 +鯔 +鯖 +鯛 +鯨 +鯰 +鯱 +鰐 +鰒 +鰭 +鰯 +鰰 +鰹 +鰻 +鱈 +鱒 +鱗 +鱧 +鳥 +鳩 +鳰 +鳳 +鳴 +鳶 +鴈 +鴉 +鴎 +鴛 +鴟 +鴦 +鴨 +鴫 +鴻 +鵄 +鵜 +鵞 +鵡 +鵬 +鵲 +鵺 +鶉 +鶏 +鶯 +鶴 +鷄 +鷙 +鷲 +鷹 +鷺 +鸚 +鸞 +鹸 +鹽 +鹿 +麁 +麒 +麓 +麗 +麝 +麞 +麟 +麦 +麩 +麹 +麺 +麻 +麾 +麿 +黄 +黌 +黍 +黒 +黙 +黛 +黠 +鼈 +鼉 +鼎 +鼓 +鼠 +鼻 +齊 +齋 +齟 +齢 +齬 +龍 +龕 +龗 +! +# +% +& +( +) ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; += +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +R +S +T +U +V +W +X +Z +a +c +d +e +f +h +i +j +k +l +m +n +o +p +r +s +t +u +y +z +~ +・ + diff --git a/backend/ppocr/utils/dict/ka_dict.txt b/backend/ppocr/utils/dict/ka_dict.txt new file mode 100644 index 0000000..d506b69 --- /dev/null +++ b/backend/ppocr/utils/dict/ka_dict.txt @@ -0,0 +1,153 @@ +k +a +_ +i +m +g +/ +1 +2 +I +L +S +V +R +C +0 +v +l +6 +4 +8 +. +j +p +ಗ +ು +ಣ +ಪ +ಡ +ಿ +ಸ +ಲ +ಾ +ದ +್ +7 +5 +3 +ವ +ಷ +ಬ +ಹ +ೆ +9 +ಅ +ಳ +ನ +ರ +ಉ +ಕ +ಎ +ೇ +ಂ +ೈ +ೊ +ೀ +ಯ +ೋ +ತ +ಶ +ಭ +ಧ +ಚ +ಜ +ೂ +ಮ +ಒ +ೃ +ಥ +ಇ +ಟ +ಖ +ಆ +ಞ +ಫ +- +ಢ +ಊ +ಓ +ಐ +ಃ +ಘ +ಝ +ೌ +ಠ +ಛ +ಔ +ಏ +ಈ +ಋ +೨ +೦ +೧ +೮ +೯ +೪ +, +೫ +೭ +೩ +೬ +ಙ +s +c +e +n +w +o +u +t +d +E +A +T +B +Z +N +G +O +q +z +r +x +P +K +M +J +U +D +f +F +h +b +W +Y +y +H +X +Q +' +# +& +! +@ +$ +: +% +é +É +( +? 
++ + diff --git a/backend/ppocr/utils/dict/kie_dict/xfund_class_list.txt b/backend/ppocr/utils/dict/kie_dict/xfund_class_list.txt new file mode 100644 index 0000000..faded9f --- /dev/null +++ b/backend/ppocr/utils/dict/kie_dict/xfund_class_list.txt @@ -0,0 +1,4 @@ +OTHER +QUESTION +ANSWER +HEADER diff --git a/backend/ppocr/utils/dict/kn_dict.txt b/backend/ppocr/utils/dict/kn_dict.txt new file mode 100644 index 0000000..33d605c --- /dev/null +++ b/backend/ppocr/utils/dict/kn_dict.txt @@ -0,0 +1,153 @@ +k +a +_ +i +m +g +/ +1 +2 +I +L +S +V +R +C +0 +v +l +6 +4 +8 +. +j +p +ಗ +ು +ಣ +ಪ +ಡ +ಿ +ಸ +ಲ +ಾ +ದ +್ +7 +5 +3 +ವ +ಷ +ಬ +ಹ +ೆ +9 +ಅ +ಳ +ನ +ರ +ಉ +ಕ +ಎ +ೇ +ಂ +ೈ +ೊ +ೀ +ಯ +ೋ +ತ +ಶ +ಭ +ಧ +ಚ +ಜ +ೂ +ಮ +ಒ +ೃ +ಥ +ಇ +ಟ +ಖ +ಆ +ಞ +ಫ +- +ಢ +ಊ +ಓ +ಐ +ಃ +ಘ +ಝ +ೌ +ಠ +ಛ +ಔ +ಏ +ಈ +ಋ +೨ +೦ +೧ +೮ +೯ +೪ +, +೫ +೭ +೩ +೬ +ಙ +s +c +e +n +w +o +u +t +d +E +A +T +B +Z +N +G +O +q +z +r +x +P +K +M +J +U +D +f +F +h +b +W +Y +y +H +X +Q +' +# +& +! +@ +$ +: +% +é +É +( +? ++ + diff --git a/backend/ppocr/utils/dict/korean_dict.txt b/backend/ppocr/utils/dict/korean_dict.txt new file mode 100644 index 0000000..a13899f --- /dev/null +++ b/backend/ppocr/utils/dict/korean_dict.txt @@ -0,0 +1,3688 @@ +! +" +# +$ +% +& +' +* ++ +- +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +\ +] +^ +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +{ +| +} +~ +© +° +² +½ +Á +Ä +Å +Ç +É +Í +Î +Ó +Ö +× +Ü +ß +à +á +â +ã +ä +å +æ +ç +è +é +ê +ë +ì +í +î +ï +ð +ñ +ò +ó +ô +õ +ö +ø +ú +û +ü +ý +ā +ă +ą +ć +Č +č +đ +ē +ė +ę +ě +ğ +ī +İ +ı +Ł +ł +ń +ň +ō +ř +Ş +ş +Š +š +ţ +ū +ź +ż +Ž +ž +Ș +ș +Α +Δ +α +λ +φ +Г +О +а +в +л +о +р +с +т +я +​ +’ +“ +” +→ +∇ +∼ +「 +」 +ア +カ +グ +ニ +ラ +ン +ㄱ +ㄴ +ㄷ +ㄸ +ㄹ +ㅂ +ㅅ +ㅆ +ㅇ +ㅈ +ㅊ +ㅋ +ㅌ +ㅎ +ㅓ +ㅜ +ㅣ +一 +丁 +七 +三 +上 +下 +不 +丑 +世 +丘 +丞 +中 +丸 +丹 +主 +乃 +久 +之 +乎 +乘 +九 +也 +乳 +乾 +事 +二 +云 +互 +五 +井 +亞 +亡 +交 +亥 +亨 +享 +京 +亭 +人 +仁 +今 +他 +仙 +代 +令 +以 +仰 +仲 +件 +任 +企 +伊 +伍 +伎 +伏 +伐 +休 +伯 +伴 +伸 +佃 +佈 +位 +低 +住 +佐 +何 +佛 +作 +使 +來 +供 +依 +侯 +侵 +侶 +便 +俗 +保 +俠 +信 +修 +俱 +俳 +倉 +個 +倍 +倒 +候 +借 +値 +倫 +倭 +假 +偈 +偉 +偏 +停 +偶 +傅 +傑 +傳 +傷 +傾 +像 +僞 +僥 +僧 +價 +儀 +儉 +儒 +優 +儼 +兀 +允 +元 +兆 +先 +光 +克 +兒 +入 +內 +全 +八 +公 +六 +共 +兵 +其 +具 +典 +兼 +再 +冠 +冥 +冶 +准 +凞 +凡 +凱 +出 +函 +刀 +分 +刊 +刑 +列 +初 +判 +別 +利 +到 +制 +券 +刺 +刻 +則 +前 +剛 +副 +創 +劃 +劑 +力 +功 +加 +劣 +助 +劫 +勇 +動 +務 +勝 +勢 +勳 +勸 +匈 +化 +北 +匠 +區 +十 +千 +午 +半 +卍 +卑 +卒 +卓 +南 +博 +卜 +占 +卦 +印 +危 +卵 +卷 +卽 +卿 +厄 +原 +厦 +去 +參 +又 +叉 +友 +反 +叔 +受 +口 +古 +句 +可 +台 +史 +右 +司 +各 +合 +吉 +同 +名 +后 +吏 +吐 +君 +吠 +吳 +呂 +告 +周 +味 +呵 +命 +和 +咳 +咸 +咽 +哀 +品 +哨 +哮 +哲 +唐 +唯 +唱 +商 +問 +啼 +善 +喆 +喉 +喜 +喩 +喪 +嘗 +器 +嚴 +囊 +四 +回 +因 +困 +固 +圈 +國 +圍 +園 +圓 +圖 +團 +土 +在 +地 +均 +坊 +坐 +坑 +坵 +型 +垢 +城 +域 +埴 +執 +培 +基 +堂 +堅 +堆 +堤 +堯 +報 +場 +塔 +塚 +塞 +塵 +境 +墜 +墟 +墨 +墳 +墾 +壁 +壇 +壓 +壤 +士 +壬 +壯 +壺 +壽 +夏 +夕 +外 +多 +夜 +夢 +大 +天 +太 +夫 +央 +失 +夷 +奄 +奇 +奉 +奎 +奏 +契 +奔 +奮 +女 +奴 +好 +如 +妄 +妊 +妖 +妙 +始 +姑 +姓 +姚 +姜 +威 +婆 +婚 +婦 +媒 +媚 +子 +孔 +字 +存 +孝 +孟 +季 +孤 +孫 +學 +孺 +宇 +守 +安 +宋 +宗 +官 +宙 +定 +客 +宣 +室 +宮 +害 +家 +容 +寂 +寃 +寄 +寅 +密 +寇 +富 +寒 +寓 +實 +審 +寫 +寬 +寶 +寸 +寺 +封 +將 +專 +尊 +對 +小 +少 +尙 +尹 +尼 +尿 +局 +居 +屈 +屋 +屍 +屎 +屛 +層 +屬 +山 +岐 +岡 +岩 +岳 +岸 +峙 +峰 +島 +峻 +峽 +崇 +崔 +崖 +崩 +嶋 +巖 +川 +州 +巢 +工 +左 +巧 +巨 +巫 +差 +己 +巷 +市 +布 +帝 +師 +帶 +常 +帽 +幕 +干 +平 +年 +幹 +幻 +幼 +幽 +庇 +序 +店 +府 +度 +座 +庫 +庭 +康 +廟 +廣 +廳 +延 +廷 +建 +廻 +弁 +式 +弑 +弓 +引 +弘 +弟 +弱 +張 +强 +弼 +彌 +彛 +形 +彬 +影 +役 +彼 +彿 +往 +征 +待 +律 +後 +徐 +徑 +得 +從 +循 +微 +德 +徹 +心 +必 +忌 +忍 +志 +忠 +思 +怡 +急 +性 +恐 +恒 +恨 +恩 +悅 +悖 +患 +悲 +情 +惑 +惟 +惠 +惡 +想 +惺 +愁 +意 +愚 +愛 +感 +愼 +慈 +態 +慕 +慣 +慧 +慾 +憂 +憤 +憺 +應 +懸 +戎 +成 +我 +戟 +戮 +戰 +戴 +戶 +房 +所 +手 +才 +打 +批 +承 +技 +抄 +把 +抗 +抱 +抽 +拇 +拓 +拘 +拙 
+拜 +拾 +持 +指 +捌 +捨 +捿 +授 +掌 +排 +接 +推 +提 +揚 +揭 +援 +損 +搗 +摩 +播 +操 +擒 +擔 +擘 +據 +擧 +攘 +攝 +攬 +支 +改 +攻 +放 +政 +故 +敍 +敎 +救 +敗 +散 +敬 +整 +數 +文 +斗 +料 +斛 +斜 +斧 +斯 +新 +斷 +方 +於 +施 +旋 +族 +旗 +日 +旨 +早 +旱 +昌 +明 +易 +昔 +星 +春 +昧 +昭 +是 +時 +晉 +晋 +晩 +普 +景 +晴 +晶 +智 +暈 +暑 +暗 +暘 +曉 +曜 +曠 +曦 +曰 +曲 +書 +曹 +曼 +曾 +最 +會 +月 +有 +朋 +服 +望 +朝 +期 +木 +未 +末 +本 +朱 +朴 +李 +材 +村 +杖 +杜 +杞 +杭 +杯 +東 +松 +板 +林 +果 +枝 +枯 +枰 +枾 +柏 +柑 +柱 +栗 +校 +栢 +核 +根 +格 +桀 +桂 +案 +桎 +桑 +桓 +桔 +梁 +梏 +梓 +梗 +條 +梨 +梵 +棗 +棟 +森 +植 +椒 +楊 +楓 +楚 +業 +楮 +極 +榮 +槃 +槍 +樂 +樓 +樗 +樣 +樸 +樹 +樺 +樽 +橄 +橋 +橘 +機 +橡 +檀 +檎 +權 +欌 +欖 +次 +欲 +歌 +歐 +止 +正 +此 +步 +武 +歲 +歸 +死 +殖 +段 +殷 +殺 +殿 +毅 +母 +毒 +比 +毛 +氏 +民 +氣 +水 +永 +求 +汎 +汗 +江 +池 +沅 +沒 +沖 +沙 +沛 +河 +油 +治 +沼 +沿 +泉 +泊 +法 +泗 +泡 +波 +注 +泰 +洋 +洙 +洛 +洞 +津 +洲 +活 +派 +流 +浅 +浦 +浮 +浴 +海 +涅 +涇 +消 +涌 +液 +淑 +淡 +淨 +淫 +深 +淳 +淵 +淸 +渠 +渡 +游 +渾 +湖 +湯 +源 +溪 +溫 +溶 +滄 +滅 +滋 +滯 +滿 +漁 +漆 +漢 +漫 +漸 +潑 +潤 +潭 +澄 +澎 +澤 +澳 +澹 +濁 +濕 +濟 +濤 +濯 +瀋 +瀝 +灣 +火 +灰 +灸 +災 +炎 +炭 +点 +烈 +烏 +烙 +焚 +無 +焦 +然 +煌 +煎 +照 +煬 +煮 +熟 +熱 +燁 +燈 +燔 +燕 +燥 +燧 +燮 +爲 +爵 +父 +片 +版 +牌 +牛 +牝 +牟 +牡 +物 +特 +犧 +犬 +狀 +狗 +猥 +猩 +猪 +獨 +獵 +獸 +獻 +玄 +玉 +王 +玲 +珍 +珠 +珪 +班 +現 +球 +理 +琴 +瑞 +瑟 +瑪 +璃 +璋 +璽 +瓜 +瓦 +甑 +甘 +生 +産 +用 +甫 +田 +由 +甲 +申 +男 +界 +畏 +留 +畜 +畢 +略 +番 +異 +畵 +當 +畸 +疏 +疑 +疫 +疹 +疼 +病 +症 +痔 +痛 +痺 +瘀 +瘍 +瘡 +療 +癌 +癖 +登 +發 +白 +百 +的 +皆 +皇 +皮 +盂 +盆 +益 +盛 +盜 +盟 +盡 +盤 +盧 +目 +直 +相 +省 +看 +眞 +眼 +睡 +督 +瞋 +矢 +矣 +知 +短 +石 +破 +碍 +碑 +磁 +磨 +磬 +示 +社 +祇 +祖 +祝 +神 +祥 +祭 +祺 +禁 +禅 +禍 +福 +禦 +禪 +禮 +禹 +禽 +禾 +秀 +私 +秉 +秋 +科 +秘 +秤 +秦 +秩 +移 +稀 +稗 +種 +稱 +稷 +稼 +稽 +穀 +穆 +積 +空 +窮 +竅 +立 +章 +童 +竭 +端 +竹 +笑 +符 +第 +筆 +等 +筍 +答 +策 +箋 +箕 +管 +箱 +節 +篇 +簡 +米 +粉 +粘 +粥 +精 +糖 +糞 +系 +紀 +紂 +約 +紅 +紋 +純 +紙 +級 +素 +索 +紫 +紬 +累 +細 +紳 +終 +組 +結 +絡 +統 +絲 +絶 +絹 +經 +綠 +維 +綱 +網 +綸 +綽 +緖 +線 +緣 +緯 +縣 +縱 +總 +織 +繡 +繩 +繪 +繭 +纂 +續 +罕 +置 +罰 +羅 +羊 +美 +群 +義 +羽 +翁 +習 +翟 +老 +考 +者 +而 +耐 +耕 +耳 +聃 +聖 +聞 +聰 +聲 +職 +肇 +肉 +肖 +肝 +股 +肥 +育 +肺 +胃 +胎 +胚 +胞 +胡 +胥 +能 +脂 +脈 +脚 +脛 +脣 +脩 +脫 +脯 +脾 +腋 +腎 +腫 +腸 +腹 +膜 +膠 +膨 +膽 +臆 +臟 +臣 +臥 +臨 +自 +至 +致 +臺 +臼 +臾 +與 +興 +舊 +舌 +舍 +舒 +舜 +舟 +般 +船 +艦 +良 +色 +芋 +花 +芳 +芽 +苑 +苔 +苕 +苛 +苞 +若 +苦 +英 +茂 +茵 +茶 +茹 +荀 +荇 +草 +荒 +荷 +莊 +莫 +菊 +菌 +菜 +菩 +菫 +華 +菴 +菽 +萊 +萍 +萬 +落 +葉 +著 +葛 +董 +葬 +蒙 +蒜 +蒲 +蒸 +蒿 +蓮 +蔓 +蔘 +蔡 +蔬 +蕃 +蕉 +蕓 +薄 +薑 +薛 +薩 +薪 +薺 +藏 +藝 +藤 +藥 +藩 +藻 +蘆 +蘇 +蘊 +蘚 +蘭 +虎 +處 +虛 +虞 +虹 +蜀 +蜂 +蜜 +蝕 +蝶 +融 +蟬 +蟲 +蠶 +蠻 +血 +衆 +行 +術 +衛 +衡 +衣 +表 +袁 +裔 +裕 +裙 +補 +製 +複 +襄 +西 +要 +見 +視 +親 +覺 +觀 +角 +解 +言 +訂 +訊 +訓 +託 +記 +訣 +設 +診 +註 +評 +詩 +話 +詵 +誅 +誌 +認 +誕 +語 +誠 +誤 +誥 +誦 +說 +調 +談 +諍 +論 +諡 +諫 +諭 +諸 +謙 +講 +謝 +謠 +證 +識 +譚 +譜 +譯 +議 +護 +讀 +變 +谷 +豆 +豊 +豚 +象 +豪 +豫 +貝 +貞 +財 +貧 +貨 +貪 +貫 +貴 +貸 +費 +資 +賊 +賓 +賞 +賢 +賣 +賦 +質 +贍 +赤 +赫 +走 +起 +超 +越 +趙 +趣 +趨 +足 +趾 +跋 +跡 +路 +踏 +蹟 +身 +躬 +車 +軍 +軒 +軟 +載 +輓 +輕 +輪 +輯 +輸 +輻 +輿 +轅 +轉 +辨 +辭 +辯 +辰 +農 +近 +迦 +述 +追 +逆 +透 +逐 +通 +逝 +造 +逢 +連 +進 +逵 +遂 +遊 +運 +遍 +過 +道 +達 +遠 +遡 +適 +遷 +選 +遺 +遽 +還 +邊 +邑 +那 +邪 +郞 +郡 +部 +都 +鄒 +鄕 +鄭 +鄲 +配 +酒 +酸 +醉 +醫 +醯 +釋 +里 +重 +野 +量 +釐 +金 +針 +鈍 +鈴 +鉞 +銀 +銅 +銘 +鋼 +錄 +錢 +錦 +鎭 +鏡 +鐘 +鐵 +鑑 +鑛 +長 +門 +閃 +開 +間 +閔 +閣 +閥 +閭 +閻 +闕 +關 +阪 +防 +阿 +陀 +降 +限 +陝 +院 +陰 +陳 +陵 +陶 +陸 +陽 +隆 +隊 +隋 +階 +際 +障 +隣 +隨 +隱 +隷 +雀 +雄 +雅 +集 +雇 +雌 +雖 +雙 +雜 +離 +難 +雨 +雪 +雲 +電 +霜 +露 +靈 +靑 +靖 +靜 +非 +面 +革 +靴 +鞏 +韓 +音 +韶 +韻 +順 +須 +頊 +頌 +領 +頭 +顔 +願 +顚 +類 +顯 +風 +飛 +食 +飢 +飮 +飯 +飾 +養 +餓 +餘 +首 +香 +馨 +馬 +駒 +騫 +騷 +驕 +骨 +骸 +髓 +體 +高 +髥 +髮 +鬪 +鬱 +鬼 +魏 +魔 +魚 +魯 +鮮 +鰍 +鰐 +鳥 +鳧 +鳳 +鴨 +鵲 +鶴 +鷄 +鷹 +鹽 +鹿 +麗 +麥 +麻 +黃 +黑 +默 +點 +黨 +鼎 +齊 +齋 +齒 +龍 +龜 +가 +각 +간 +갇 +갈 +갉 +감 +갑 +값 +갓 +갔 +강 +갖 +갗 +같 +갚 +갛 +개 +객 +갠 +갤 +갬 +갭 +갯 +갰 +갱 +갸 +걀 +걔 +걘 +거 +걱 +건 +걷 +걸 +검 +겁 +것 +겄 +겅 +겆 +겉 +겊 +겋 +게 +겐 +겔 +겟 +겠 +겡 +겨 +격 +겪 +견 +결 +겸 +겹 +겻 +겼 +경 +곁 +계 +곕 +곗 +고 +곡 +곤 +곧 +골 +곪 +곬 +곯 +곰 +곱 +곳 +공 +곶 +과 +곽 +관 +괄 +괌 +광 +괘 +괜 +괭 +괴 +괸 +굉 +교 +구 +국 +군 +굳 +굴 +굵 +굶 +굼 +굽 +굿 +궁 +궂 +궈 +권 +궐 +궜 +궝 +궤 +귀 +귄 +귈 +귓 +규 +균 +귤 +그 +극 +근 +글 +긁 +금 +급 
+긋 +긍 +기 +긴 +길 +김 +깁 +깃 +깅 +깊 +까 +깍 +깎 +깐 +깔 +깜 +깝 +깟 +깡 +깥 +깨 +깬 +깰 +깻 +깼 +깽 +꺄 +꺼 +꺽 +꺾 +껀 +껄 +껌 +껍 +껏 +껐 +껑 +께 +껴 +꼈 +꼍 +꼐 +꼬 +꼭 +꼴 +꼼 +꼽 +꼿 +꽁 +꽂 +꽃 +꽉 +꽝 +꽤 +꽥 +꾀 +꾜 +꾸 +꾹 +꾼 +꿀 +꿇 +꿈 +꿉 +꿋 +꿍 +꿎 +꿔 +꿨 +꿩 +꿰 +꿴 +뀄 +뀌 +뀐 +뀔 +뀜 +뀝 +끄 +끈 +끊 +끌 +끓 +끔 +끕 +끗 +끙 +끝 +끼 +끽 +낀 +낄 +낌 +낍 +낏 +낑 +나 +낙 +낚 +난 +낟 +날 +낡 +남 +납 +낫 +났 +낭 +낮 +낯 +낱 +낳 +내 +낵 +낸 +낼 +냄 +냅 +냇 +냈 +냉 +냐 +냔 +냘 +냥 +너 +넉 +넋 +넌 +널 +넓 +넘 +넙 +넛 +넜 +넝 +넣 +네 +넥 +넨 +넬 +넴 +넵 +넷 +넸 +넹 +녀 +녁 +년 +념 +녔 +녕 +녘 +녜 +노 +녹 +논 +놀 +놈 +놋 +농 +높 +놓 +놔 +놨 +뇌 +뇨 +뇩 +뇽 +누 +눅 +눈 +눌 +눔 +눕 +눗 +눠 +눴 +뉘 +뉜 +뉩 +뉴 +늄 +늅 +늉 +느 +늑 +는 +늘 +늙 +늠 +늡 +능 +늦 +늪 +늬 +니 +닉 +닌 +닐 +님 +닙 +닛 +닝 +닢 +다 +닥 +닦 +단 +닫 +달 +닭 +닮 +닯 +닳 +담 +답 +닷 +당 +닻 +닿 +대 +댁 +댄 +댈 +댐 +댑 +댓 +댔 +댕 +댜 +더 +덕 +덖 +던 +덜 +덟 +덤 +덥 +덧 +덩 +덫 +덮 +데 +덱 +덴 +델 +뎀 +뎃 +뎅 +뎌 +뎠 +뎨 +도 +독 +돈 +돋 +돌 +돔 +돕 +돗 +동 +돛 +돝 +돼 +됐 +되 +된 +될 +됨 +됩 +됴 +두 +둑 +둔 +둘 +둠 +둡 +둣 +둥 +둬 +뒀 +뒤 +뒬 +뒷 +뒹 +듀 +듈 +듐 +드 +득 +든 +듣 +들 +듦 +듬 +듭 +듯 +등 +듸 +디 +딕 +딘 +딛 +딜 +딤 +딥 +딧 +딨 +딩 +딪 +따 +딱 +딴 +딸 +땀 +땄 +땅 +때 +땐 +땔 +땜 +땝 +땠 +땡 +떠 +떡 +떤 +떨 +떫 +떰 +떱 +떳 +떴 +떵 +떻 +떼 +떽 +뗀 +뗄 +뗍 +뗏 +뗐 +뗑 +또 +똑 +똘 +똥 +뙤 +뚜 +뚝 +뚤 +뚫 +뚱 +뛰 +뛴 +뛸 +뜀 +뜁 +뜨 +뜩 +뜬 +뜯 +뜰 +뜸 +뜻 +띄 +띈 +띌 +띔 +띕 +띠 +띤 +띨 +띱 +띵 +라 +락 +란 +랄 +람 +랍 +랏 +랐 +랑 +랒 +랗 +래 +랙 +랜 +랠 +램 +랩 +랫 +랬 +랭 +랴 +략 +량 +러 +럭 +런 +럴 +럼 +럽 +럿 +렀 +렁 +렇 +레 +렉 +렌 +렐 +렘 +렙 +렛 +렝 +려 +력 +련 +렬 +렴 +렵 +렷 +렸 +령 +례 +로 +록 +론 +롤 +롬 +롭 +롯 +롱 +롸 +롹 +뢰 +뢴 +뢸 +룃 +료 +룐 +룡 +루 +룩 +룬 +룰 +룸 +룹 +룻 +룽 +뤄 +뤘 +뤼 +류 +륙 +륜 +률 +륨 +륭 +르 +륵 +른 +를 +름 +릅 +릇 +릉 +릎 +리 +릭 +린 +릴 +림 +립 +릿 +링 +마 +막 +만 +많 +맏 +말 +맑 +맘 +맙 +맛 +망 +맞 +맡 +맣 +매 +맥 +맨 +맬 +맴 +맵 +맷 +맸 +맹 +맺 +먀 +먁 +머 +먹 +먼 +멀 +멈 +멋 +멍 +멎 +메 +멕 +멘 +멜 +멤 +멥 +멧 +멩 +며 +멱 +면 +멸 +몄 +명 +몇 +모 +목 +몫 +몬 +몰 +몸 +몹 +못 +몽 +뫼 +묘 +무 +묵 +묶 +문 +묻 +물 +묽 +뭄 +뭅 +뭇 +뭉 +뭍 +뭏 +뭐 +뭔 +뭘 +뭡 +뭣 +뮈 +뮌 +뮐 +뮤 +뮬 +므 +믈 +믐 +미 +믹 +민 +믿 +밀 +밈 +밉 +밋 +밌 +밍 +및 +밑 +바 +박 +밖 +반 +받 +발 +밝 +밟 +밤 +밥 +밧 +방 +밭 +배 +백 +밴 +밸 +뱀 +뱁 +뱃 +뱄 +뱅 +뱉 +뱍 +뱐 +버 +벅 +번 +벌 +범 +법 +벗 +벙 +벚 +베 +벡 +벤 +벨 +벰 +벱 +벳 +벵 +벼 +벽 +변 +별 +볍 +볏 +볐 +병 +볕 +보 +복 +볶 +본 +볼 +봄 +봅 +봇 +봉 +봐 +봤 +뵈 +뵐 +뵙 +부 +북 +분 +붇 +불 +붉 +붐 +붓 +붕 +붙 +뷔 +뷰 +뷴 +뷸 +브 +븐 +블 +비 +빅 +빈 +빌 +빔 +빕 +빗 +빙 +빚 +빛 +빠 +빡 +빤 +빨 +빳 +빴 +빵 +빻 +빼 +빽 +뺀 +뺄 +뺌 +뺏 +뺐 +뺑 +뺨 +뻐 +뻑 +뻔 +뻗 +뻘 +뻣 +뻤 +뻥 +뻬 +뼈 +뼉 +뼘 +뽀 +뽈 +뽐 +뽑 +뽕 +뾰 +뿌 +뿍 +뿐 +뿔 +뿜 +쁘 +쁜 +쁠 +쁨 +삐 +삔 +삘 +사 +삭 +삯 +산 +살 +삵 +삶 +삼 +삽 +삿 +샀 +상 +샅 +새 +색 +샌 +샐 +샘 +샙 +샛 +샜 +생 +샤 +샨 +샬 +샴 +샵 +샷 +샹 +서 +석 +섞 +선 +섣 +설 +섬 +섭 +섯 +섰 +성 +섶 +세 +섹 +센 +셀 +셈 +셉 +셋 +셌 +셍 +셔 +션 +셜 +셨 +셰 +셴 +셸 +소 +속 +손 +솔 +솜 +솝 +솟 +송 +솥 +쇄 +쇠 +쇤 +쇳 +쇼 +숀 +숄 +숍 +수 +숙 +순 +숟 +술 +숨 +숩 +숫 +숭 +숯 +숱 +숲 +숴 +쉐 +쉘 +쉬 +쉭 +쉰 +쉴 +쉼 +쉽 +슈 +슐 +슘 +슛 +슝 +스 +슥 +슨 +슬 +슭 +슴 +습 +슷 +승 +시 +식 +신 +싣 +실 +싫 +심 +십 +싯 +싱 +싶 +싸 +싹 +싼 +쌀 +쌈 +쌉 +쌌 +쌍 +쌓 +쌔 +쌘 +쌩 +써 +썩 +썬 +썰 +썸 +썹 +썼 +썽 +쎄 +쎈 +쏘 +쏙 +쏜 +쏟 +쏠 +쏭 +쏴 +쐈 +쐐 +쐬 +쑤 +쑥 +쑨 +쒀 +쒔 +쓰 +쓱 +쓴 +쓸 +씀 +씁 +씌 +씨 +씩 +씬 +씰 +씸 +씹 +씻 +씽 +아 +악 +안 +앉 +않 +알 +앎 +앓 +암 +압 +앗 +았 +앙 +앞 +애 +액 +앤 +앨 +앰 +앱 +앳 +앴 +앵 +야 +약 +얀 +얄 +얇 +얌 +얍 +얏 +양 +얕 +얗 +얘 +얜 +어 +억 +언 +얹 +얻 +얼 +얽 +엄 +업 +없 +엇 +었 +엉 +엊 +엌 +엎 +에 +엑 +엔 +엘 +엠 +엡 +엣 +엥 +여 +역 +엮 +연 +열 +엷 +염 +엽 +엾 +엿 +였 +영 +옅 +옆 +옇 +예 +옌 +옐 +옙 +옛 +오 +옥 +온 +올 +옭 +옮 +옳 +옴 +옵 +옷 +옹 +옻 +와 +왁 +완 +왈 +왑 +왓 +왔 +왕 +왜 +왠 +왱 +외 +왼 +요 +욕 +욘 +욜 +욤 +용 +우 +욱 +운 +울 +움 +웁 +웃 +웅 +워 +웍 +원 +월 +웜 +웠 +웡 +웨 +웬 +웰 +웸 +웹 +위 +윅 +윈 +윌 +윔 +윗 +윙 +유 +육 +윤 +율 +윱 +윳 +융 +으 +윽 +은 +을 +읊 +음 +읍 +응 +의 +읜 +읠 +이 +익 +인 +일 +읽 +잃 +임 +입 +잇 +있 +잉 +잊 +잎 +자 +작 +잔 +잖 +잘 +잠 +잡 +잣 +잤 +장 +잦 +재 +잭 +잰 +잴 +잽 +잿 +쟀 +쟁 +쟈 +쟉 +쟤 +저 +적 +전 +절 +젊 +점 +접 +젓 +정 +젖 +제 +젝 +젠 +젤 +젬 +젭 +젯 +져 +젼 +졀 +졌 +졍 +조 +족 +존 +졸 +좀 +좁 +종 +좇 +좋 +좌 +좍 +좽 +죄 +죠 +죤 +주 +죽 +준 +줄 +줌 +줍 +줏 +중 +줘 +줬 +쥐 +쥔 +쥘 +쥬 +쥴 +즈 +즉 +즌 +즐 +즘 +즙 +증 +지 +직 +진 +짇 +질 +짊 +짐 +집 +짓 +징 +짖 +짙 +짚 +짜 +짝 +짠 +짢 +짤 +짧 +짬 +짭 +짰 +짱 +째 +짹 +짼 +쨀 +쨉 +쨋 +쨌 +쨍 +쩄 +쩌 +쩍 +쩐 +쩔 +쩜 +쩝 +쩡 +쩨 +쪄 +쪘 +쪼 +쪽 +쪾 +쫀 +쫄 +쫑 
+쫓 +쫙 +쬐 +쭈 +쭉 +쭐 +쭙 +쯔 +쯤 +쯧 +찌 +찍 +찐 +찔 +찜 +찝 +찡 +찢 +찧 +차 +착 +찬 +찮 +찰 +참 +찹 +찻 +찼 +창 +찾 +채 +책 +챈 +챌 +챔 +챕 +챗 +챘 +챙 +챠 +챤 +처 +척 +천 +철 +첨 +첩 +첫 +청 +체 +첵 +첸 +첼 +쳄 +쳇 +쳉 +쳐 +쳔 +쳤 +초 +촉 +촌 +촘 +촛 +총 +촨 +촬 +최 +쵸 +추 +축 +춘 +출 +춤 +춥 +춧 +충 +춰 +췄 +췌 +취 +췬 +츄 +츠 +측 +츨 +츰 +층 +치 +칙 +친 +칠 +칡 +침 +칩 +칫 +칭 +카 +칵 +칸 +칼 +캄 +캅 +캇 +캉 +캐 +캔 +캘 +캠 +캡 +캣 +캤 +캥 +캬 +커 +컥 +컨 +컫 +컬 +컴 +컵 +컷 +컸 +컹 +케 +켄 +켈 +켐 +켓 +켕 +켜 +켠 +켤 +켭 +켯 +켰 +코 +콕 +콘 +콜 +콤 +콥 +콧 +콩 +콰 +콱 +콴 +콸 +쾅 +쾌 +쾡 +쾨 +쾰 +쿄 +쿠 +쿡 +쿤 +쿨 +쿰 +쿵 +쿼 +퀀 +퀄 +퀘 +퀭 +퀴 +퀵 +퀸 +퀼 +큐 +큘 +크 +큰 +클 +큼 +큽 +키 +킥 +킨 +킬 +킴 +킵 +킷 +킹 +타 +탁 +탄 +탈 +탉 +탐 +탑 +탓 +탔 +탕 +태 +택 +탠 +탤 +탬 +탭 +탯 +탰 +탱 +터 +턱 +턴 +털 +텀 +텁 +텃 +텄 +텅 +테 +텍 +텐 +텔 +템 +텝 +텡 +텨 +톈 +토 +톡 +톤 +톨 +톰 +톱 +톳 +통 +퇴 +툇 +투 +툭 +툰 +툴 +툼 +퉁 +퉈 +퉜 +튀 +튄 +튈 +튕 +튜 +튠 +튤 +튬 +트 +특 +튼 +튿 +틀 +틈 +틉 +틋 +틔 +티 +틱 +틴 +틸 +팀 +팁 +팅 +파 +팍 +팎 +판 +팔 +팜 +팝 +팟 +팠 +팡 +팥 +패 +팩 +팬 +팰 +팸 +팻 +팼 +팽 +퍼 +퍽 +펀 +펄 +펌 +펍 +펐 +펑 +페 +펙 +펜 +펠 +펨 +펩 +펫 +펭 +펴 +편 +펼 +폄 +폈 +평 +폐 +포 +폭 +폰 +폴 +폼 +폿 +퐁 +표 +푭 +푸 +푹 +푼 +풀 +품 +풋 +풍 +퓨 +퓬 +퓰 +퓸 +프 +픈 +플 +픔 +픕 +피 +픽 +핀 +필 +핌 +핍 +핏 +핑 +하 +학 +한 +할 +핥 +함 +합 +핫 +항 +해 +핵 +핸 +핼 +햄 +햅 +햇 +했 +행 +햐 +향 +헀 +허 +헉 +헌 +헐 +험 +헙 +헛 +헝 +헤 +헥 +헨 +헬 +헴 +헵 +헷 +헹 +혀 +혁 +현 +혈 +혐 +협 +혓 +혔 +형 +혜 +호 +혹 +혼 +홀 +홈 +홉 +홋 +홍 +홑 +화 +확 +환 +활 +홧 +황 +홰 +홱 +횃 +회 +획 +횝 +횟 +횡 +효 +후 +훅 +훈 +훌 +훑 +훔 +훗 +훤 +훨 +훼 +휄 +휑 +휘 +휙 +휜 +휠 +휩 +휭 +휴 +휼 +흄 +흉 +흐 +흑 +흔 +흘 +흙 +흠 +흡 +흣 +흥 +흩 +희 +흰 +흽 +히 +힉 +힌 +힐 +힘 +힙 +힝 +車 +滑 +金 +奈 +羅 +洛 +卵 +欄 +蘭 +郎 +來 +盧 +老 +魯 +綠 +鹿 +論 +雷 +樓 +縷 +凌 +樂 +不 +參 +葉 +沈 +若 +兩 +凉 +梁 +呂 +女 +廬 +麗 +黎 +曆 +歷 +戀 +蓮 +連 +列 +烈 +裂 +念 +獵 +靈 +領 +例 +禮 +醴 +惡 +尿 +料 +遼 +龍 +暈 +柳 +流 +類 +六 +陸 +倫 +律 +栗 +利 +李 +梨 +理 +離 +燐 +林 +臨 +立 +茶 +切 +宅 + diff --git a/backend/ppocr/utils/dict/latin_dict.txt b/backend/ppocr/utils/dict/latin_dict.txt new file mode 100644 index 0000000..e166bf3 --- /dev/null +++ b/backend/ppocr/utils/dict/latin_dict.txt @@ -0,0 +1,185 @@ + +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? 
+@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +] +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +{ +} +¡ +£ +§ +ª +« +­ +° +² +³ +´ +µ +· +º +» +¿ +À +Á + +Ä +Å +Ç +È +É +Ê +Ë +Ì +Í +Î +Ï +Ò +Ó +Ô +Õ +Ö +Ú +Ü +Ý +ß +à +á +â +ã +ä +å +æ +ç +è +é +ê +ë +ì +í +î +ï +ñ +ò +ó +ô +õ +ö +ø +ù +ú +û +ü +ý +ą +Ć +ć +Č +č +Đ +đ +ę +ı +Ł +ł +ō +Œ +œ +Š +š +Ÿ +Ž +ž +ʒ +β +δ +ε +з +Ṡ +‘ +€ +™ diff --git a/backend/ppocr/utils/dict/layout_dict/layout_cdla_dict.txt b/backend/ppocr/utils/dict/layout_dict/layout_cdla_dict.txt new file mode 100644 index 0000000..8be0f48 --- /dev/null +++ b/backend/ppocr/utils/dict/layout_dict/layout_cdla_dict.txt @@ -0,0 +1,10 @@ +text +title +figure +figure_caption +table +table_caption +header +footer +reference +equation \ No newline at end of file diff --git a/backend/ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt b/backend/ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt new file mode 100644 index 0000000..ca6acf4 --- /dev/null +++ b/backend/ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt @@ -0,0 +1,5 @@ +text +title +list +table +figure \ No newline at end of file diff --git a/backend/ppocr/utils/dict/layout_dict/layout_table_dict.txt b/backend/ppocr/utils/dict/layout_dict/layout_table_dict.txt new file mode 100644 index 0000000..faea15e --- /dev/null +++ b/backend/ppocr/utils/dict/layout_dict/layout_table_dict.txt @@ -0,0 +1 @@ +table \ No newline at end of file diff --git a/backend/ppocr/utils/dict/mr_dict.txt b/backend/ppocr/utils/dict/mr_dict.txt new file mode 100644 index 0000000..283b150 --- /dev/null +++ b/backend/ppocr/utils/dict/mr_dict.txt @@ -0,0 +1,153 @@ + +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +ँ +ं +ः +अ +आ +इ +ई +उ +ऊ +ए +ऐ +ऑ +ओ +औ +क +ख +ग +घ +च +छ +ज +झ +ञ +ट +ठ +ड +ढ +ण +त +थ +द +ध +न +प +फ +ब +भ +म +य +र +ऱ +ल +ळ +व +श +ष +स +ह +़ +ा +ि +ी +ु +ू +ृ +ॅ +े +ै +ॉ +ो +ौ +् +० +१ +२ +३ +४ +५ +६ +७ +८ +९ diff --git a/backend/ppocr/utils/dict/ne_dict.txt b/backend/ppocr/utils/dict/ne_dict.txt new file mode 100644 index 0000000..5a7df95 --- /dev/null +++ b/backend/ppocr/utils/dict/ne_dict.txt @@ -0,0 +1,153 @@ + +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +ः +अ +आ +इ +ई +उ +ऊ +ऋ +ए +ऐ +ओ +औ +क +ख +ग +घ +ङ +च +छ +ज +झ +ञ +ट +ठ +ड +ढ +ण +त +थ +द +ध +न +ऩ +प +फ +ब +भ +म +य +र +ऱ +ल +व +श +ष +स +ह +़ +ा +ि +ी +ु +ू +ृ +े +ै +ो +ौ +् +॒ +ॠ +। +० +१ +२ +३ +४ +५ +६ +७ +८ +९ diff --git a/backend/ppocr/utils/dict/oc_dict.txt b/backend/ppocr/utils/dict/oc_dict.txt new file mode 100644 index 0000000..e88af8b --- /dev/null +++ b/backend/ppocr/utils/dict/oc_dict.txt @@ -0,0 +1,96 @@ +o +c +_ +i +m +g +/ +2 +0 +I +L +S +V +R +C +1 +v +a +l +4 +3 +. +j +p +r +e +è +t +9 +7 +5 +8 +n +' +b +s +6 +q +u +á +d +ò +à +h +z +f +ï +í +A +ç +x +ó +é +P +O +Ò +ü +k +À +F +- +ú +­ +æ +Á +D +E +w +K +T +N +y +U +Z +G +B +J +H +M +W +Y +X +Q +% +$ +, +@ +& +! +: +( +# +? 
++ +É + diff --git a/backend/ppocr/utils/dict/pt_dict.txt b/backend/ppocr/utils/dict/pt_dict.txt new file mode 100644 index 0000000..9500fae --- /dev/null +++ b/backend/ppocr/utils/dict/pt_dict.txt @@ -0,0 +1,130 @@ +p +u +_ +i +m +g +/ +8 +I +L +S +V +R +C +2 +0 +1 +v +a +l +6 +7 +4 +5 +. +j + +q +e +s +t +ã +o +x +9 +c +n +r +z +ç +õ +3 +A +U +d +º +ô +­ +, +E +; +ó +á +b +D +? +ú +ê +- +h +P +f +à +N +í +O +M +G +É +é +â +F +: +T +Á +" +Q +) +W +J +B +H +( +ö +% +Ö +« +w +K +y +! +k +] +' +Z ++ +Ç +Õ +Y +À +X +µ +» +ª +Í +ü +ä +´ +è +ñ +ß +ï +Ú +ë +Ô +Ï +Ó +[ +Ì +< + +ò +§ +³ +ø +å +# +$ +& +@ diff --git a/backend/ppocr/utils/dict/pu_dict.txt b/backend/ppocr/utils/dict/pu_dict.txt new file mode 100644 index 0000000..9500fae --- /dev/null +++ b/backend/ppocr/utils/dict/pu_dict.txt @@ -0,0 +1,130 @@ +p +u +_ +i +m +g +/ +8 +I +L +S +V +R +C +2 +0 +1 +v +a +l +6 +7 +4 +5 +. +j + +q +e +s +t +ã +o +x +9 +c +n +r +z +ç +õ +3 +A +U +d +º +ô +­ +, +E +; +ó +á +b +D +? +ú +ê +- +h +P +f +à +N +í +O +M +G +É +é +â +F +: +T +Á +" +Q +) +W +J +B +H +( +ö +% +Ö +« +w +K +y +! +k +] +' +Z ++ +Ç +Õ +Y +À +X +µ +» +ª +Í +ü +ä +´ +è +ñ +ß +ï +Ú +ë +Ô +Ï +Ó +[ +Ì +< + +ò +§ +³ +ø +å +# +$ +& +@ diff --git a/backend/ppocr/utils/dict/rs_cyrillic_dict.txt b/backend/ppocr/utils/dict/rs_cyrillic_dict.txt new file mode 100644 index 0000000..95dd463 --- /dev/null +++ b/backend/ppocr/utils/dict/rs_cyrillic_dict.txt @@ -0,0 +1,134 @@ +r +s +c +_ +i +m +g +/ +5 +I +L +S +V +R +C +2 +0 +1 +v +a +l +9 +7 +8 +. +j +p +м +а +с +и +р +ћ +е +ш +3 +4 +о +г +н +з +в +л +6 +т +ж +у +к +п +њ +д +ч +С +ј +ф +ц +љ +х +О +И +А +б +Ш +К +ђ +џ +М +В +З +Д +Р +У +Н +Т +Б +? +П +Х +Ј +Ц +Г +Љ +Л +Ф +e +n +w +E +F +A +N +f +o +b +M +G +t +y +W +k +P +u +H +B +T +z +h +O +Y +d +U +K +D +x +X +J +Z +Q +q +' +- +@ +é +# +! +, +% +$ +: +& ++ +( +É + diff --git a/backend/ppocr/utils/dict/rs_dict.txt b/backend/ppocr/utils/dict/rs_dict.txt new file mode 100644 index 0000000..d1ce46d --- /dev/null +++ b/backend/ppocr/utils/dict/rs_dict.txt @@ -0,0 +1,91 @@ +r +s +_ +i +m +g +/ +1 +I +L +S +V +R +C +2 +0 +v +a +l +7 +5 +8 +6 +. +j +p + +t +d +9 +3 +e +š +4 +k +u +ć +c +n +đ +o +z +č +b +ž +f +Z +T +h +M +F +O +Š +B +H +A +E +Đ +Ž +D +P +G +Č +K +U +N +J +Ć +w +y +W +x +Y +X +q +Q +# +& +$ +, +- +% +' +@ +! +: +? +( +É +é ++ diff --git a/backend/ppocr/utils/dict/rs_latin_dict.txt b/backend/ppocr/utils/dict/rs_latin_dict.txt new file mode 100644 index 0000000..d1ce46d --- /dev/null +++ b/backend/ppocr/utils/dict/rs_latin_dict.txt @@ -0,0 +1,91 @@ +r +s +_ +i +m +g +/ +1 +I +L +S +V +R +C +2 +0 +v +a +l +7 +5 +8 +6 +. +j +p + +t +d +9 +3 +e +š +4 +k +u +ć +c +n +đ +o +z +č +b +ž +f +Z +T +h +M +F +O +Š +B +H +A +E +Đ +Ž +D +P +G +Č +K +U +N +J +Ć +w +y +W +x +Y +X +q +Q +# +& +$ +, +- +% +' +@ +! +: +? +( +É +é ++ diff --git a/backend/ppocr/utils/dict/rsc_dict.txt b/backend/ppocr/utils/dict/rsc_dict.txt new file mode 100644 index 0000000..95dd463 --- /dev/null +++ b/backend/ppocr/utils/dict/rsc_dict.txt @@ -0,0 +1,134 @@ +r +s +c +_ +i +m +g +/ +5 +I +L +S +V +R +C +2 +0 +1 +v +a +l +9 +7 +8 +. +j +p +м +а +с +и +р +ћ +е +ш +3 +4 +о +г +н +з +в +л +6 +т +ж +у +к +п +њ +д +ч +С +ј +ф +ц +љ +х +О +И +А +б +Ш +К +ђ +џ +М +В +З +Д +Р +У +Н +Т +Б +? +П +Х +Ј +Ц +Г +Љ +Л +Ф +e +n +w +E +F +A +N +f +o +b +M +G +t +y +W +k +P +u +H +B +T +z +h +O +Y +d +U +K +D +x +X +J +Z +Q +q +' +- +@ +é +# +! 
+, +% +$ +: +& ++ +( +É + diff --git a/backend/ppocr/utils/dict/ru_dict.txt b/backend/ppocr/utils/dict/ru_dict.txt new file mode 100644 index 0000000..aff9c16 --- /dev/null +++ b/backend/ppocr/utils/dict/ru_dict.txt @@ -0,0 +1,163 @@ + +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +Ё +Є +І +Ј +Љ +Ў +А +Б +В +Г +Д +Е +Ж +З +И +Й +К +Л +М +Н +О +П +Р +С +Т +У +Ф +Х +Ц +Ч +Ш +Щ +Ъ +Ы +Ь +Э +Ю +Я +а +б +в +г +д +е +ж +з +и +й +к +л +м +н +о +п +р +с +т +у +ф +х +ц +ч +ш +щ +ъ +ы +ь +э +ю +я +ё +ђ +є +і +ј +љ +њ +ћ +ў +џ +Ґ +ґ diff --git a/backend/ppocr/utils/dict/spin_dict.txt b/backend/ppocr/utils/dict/spin_dict.txt new file mode 100644 index 0000000..8ee8347 --- /dev/null +++ b/backend/ppocr/utils/dict/spin_dict.txt @@ -0,0 +1,68 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +: +( +' +- +, +% +> +. +[ +? +) +" += +_ +* +] +; +& ++ +$ +@ +/ +| +! +< +# +` +{ +~ +\ +} +^ \ No newline at end of file diff --git a/backend/ppocr/utils/dict/ta_dict.txt b/backend/ppocr/utils/dict/ta_dict.txt new file mode 100644 index 0000000..19d8189 --- /dev/null +++ b/backend/ppocr/utils/dict/ta_dict.txt @@ -0,0 +1,128 @@ +t +a +_ +i +m +g +/ +3 +I +L +S +V +R +C +2 +0 +1 +v +l +9 +7 +8 +. +j +p +ப +ூ +த +ம +ி +வ +ர +் +ந +ோ +ன +6 +ஆ +ற +ல +5 +ள +ா +ொ +ழ +ு +4 +ெ +ண +க +ட +ை +ே +ச +ய +ஒ +இ +அ +ங +உ +ீ +ஞ +எ +ஓ +ஃ +ஜ +ஷ +ஸ +ஏ +ஊ +ஹ +ஈ +ஐ +ௌ +ஔ +s +c +e +n +w +F +T +O +P +K +A +N +G +Y +E +M +H +U +B +o +b +D +d +r +W +u +y +f +X +k +q +h +J +z +Z +Q +x +- +' +$ +, +% +@ +é +! +# ++ +É +& +: +( +? + diff --git a/backend/ppocr/utils/dict/table_dict.txt b/backend/ppocr/utils/dict/table_dict.txt new file mode 100644 index 0000000..2ef028c --- /dev/null +++ b/backend/ppocr/utils/dict/table_dict.txt @@ -0,0 +1,277 @@ +← + +☆ +─ +α + + +⋅ +$ +ω +ψ +χ +( +υ +≥ +σ +, +ρ +ε +0 +■ +4 +8 +✗ +b +< +✓ +Ψ +Ω +€ +D +3 +Π +H +║ + +L +Φ +Χ +θ +P +κ +λ +μ +T +ξ +X +β +γ +δ +\ +ζ +η +` +d + +h +f +l +Θ +p +√ +t + +x +Β +Γ +Δ +| +ǂ +ɛ +j +̧ +➢ +⁡ +̌ +′ +« +△ +▲ +# + +' +Ι ++ +¶ +/ +▼ +⇑ +□ +· +7 +▪ +; +? +➔ +∩ +C +÷ +G +⇒ +K + +O +S +С +W +Α +[ +○ +_ +● +‡ +c +z +g + +o + +〈 +〉 +s +⩽ +w +φ +ʹ +{ +» +∣ +̆ +e +ˆ +∈ +τ +◆ +ι +∅ +∆ +∙ +∘ +Ø +ß +✔ +∞ +∑ +− +× +◊ +∗ +∖ +˃ +˂ +∫ +" +i +& +π +↔ +* +∥ +æ +∧ +. +⁄ +ø +Q +∼ +6 +⁎ +: +★ +> +a +B +≈ +F +J +̄ +N +♯ +R +V + +― +Z +♣ +^ +¤ +¥ +§ + +¢ +£ +≦ +­ +≤ +‖ +Λ +© +n +↓ +→ +↑ +r +° +± +v + +♂ +k +♀ +~ +ᅟ +̇ +@ +” +♦ +ł +® +⊕ +„ +! + +% +⇓ +) +- +1 +5 +9 += +А +A +‰ +⋆ +Σ +E +◦ +I +※ +M +m +̨ +⩾ +† + +• +U +Y +
 +] +̸ +2 +‐ +– +‒ +̂ +— +̀ +́ +’ +‘ +⋮ +⋯ +̊ +“ +̈ +≧ +q +u +ı +y + +​ +̃ +} +ν diff --git a/backend/ppocr/utils/dict/table_master_structure_dict.txt b/backend/ppocr/utils/dict/table_master_structure_dict.txt new file mode 100644 index 0000000..95ab253 --- /dev/null +++ b/backend/ppocr/utils/dict/table_master_structure_dict.txt @@ -0,0 +1,39 @@ + + + + + + + + + + + colspan="2" + colspan="3" + + + rowspan="2" + colspan="4" + colspan="6" + rowspan="3" + colspan="9" + colspan="10" + colspan="7" + rowspan="4" + rowspan="5" + rowspan="9" + colspan="8" + rowspan="8" + rowspan="6" + rowspan="7" + rowspan="10" + + + + + + + + diff --git a/backend/ppocr/utils/dict/table_structure_dict.txt b/backend/ppocr/utils/dict/table_structure_dict.txt new file mode 100644 index 0000000..8edb10b --- /dev/null +++ b/backend/ppocr/utils/dict/table_structure_dict.txt @@ -0,0 +1,28 @@ + + + + + + + + + + colspan="2" + colspan="3" + rowspan="2" + colspan="4" + colspan="6" + rowspan="3" + colspan="9" + colspan="10" + colspan="7" + rowspan="4" + rowspan="5" + rowspan="9" + colspan="8" + rowspan="8" + rowspan="6" + rowspan="7" + rowspan="10" \ No newline at end of file diff --git a/backend/ppocr/utils/dict/table_structure_dict_ch.txt b/backend/ppocr/utils/dict/table_structure_dict_ch.txt new file mode 100644 index 0000000..0c59c0e --- /dev/null +++ b/backend/ppocr/utils/dict/table_structure_dict_ch.txt @@ -0,0 +1,48 @@ + + + + + + + + + + colspan="2" + colspan="3" + colspan="4" + colspan="5" + colspan="6" + colspan="7" + colspan="8" + colspan="9" + colspan="10" + colspan="11" + colspan="12" + colspan="13" + colspan="14" + colspan="15" + colspan="16" + colspan="17" + colspan="18" + colspan="19" + colspan="20" + rowspan="2" + rowspan="3" + rowspan="4" + rowspan="5" + rowspan="6" + rowspan="7" + rowspan="8" + rowspan="9" + rowspan="10" + rowspan="11" + rowspan="12" + rowspan="13" + rowspan="14" + rowspan="15" + rowspan="16" + rowspan="17" + rowspan="18" + rowspan="19" + rowspan="20" diff --git a/backend/ppocr/utils/dict/te_dict.txt b/backend/ppocr/utils/dict/te_dict.txt new file mode 100644 index 0000000..83d74cc --- /dev/null +++ b/backend/ppocr/utils/dict/te_dict.txt @@ -0,0 +1,151 @@ +t +e +_ +i +m +g +/ +5 +I +L +S +V +R +C +2 +0 +1 +v +a +l +3 +4 +8 +9 +. +j +p +త +ె +ర +క +్ +ి +ం +చ +ే +ద +ు +7 +6 +ఉ +ా +మ +ట +ో +వ +ప +ల +శ +ఆ +య +ై +భ +' +ీ +గ +ూ +డ +ధ +హ +న +జ +స +[ +‌ +ష +అ +ణ +ఫ +బ +ఎ +; +ళ +థ +ొ +ఠ +ృ +ఒ +ఇ +ః +ఊ +ఖ +- +ఐ +ఘ +ౌ +ఏ +ఈ +ఛ +, +ఓ +ఞ +| +? +: +ఢ +" +( +” +! ++ +) +* += +& +“ +€ +] +£ +$ +s +c +n +w +k +J +G +u +d +r +E +o +h +y +b +f +B +M +O +T +N +D +P +A +F +x +W +Y +U +H +K +X +z +Z +Q +q +É +% +# +@ +é diff --git a/backend/ppocr/utils/dict/ug_dict.txt b/backend/ppocr/utils/dict/ug_dict.txt new file mode 100644 index 0000000..77602f2 --- /dev/null +++ b/backend/ppocr/utils/dict/ug_dict.txt @@ -0,0 +1,114 @@ +u +g +_ +i +m +/ +1 +I +L +S +V +R +C +2 +0 +v +a +l +8 +5 +3 +6 +9 +. +j +p + +ق +ا +پ +ل +4 +7 +ئ +ى +ش +ت +ي +ك +د +ف +ر +و +ن +ب +ە +خ +ې +چ +ۇ +ز +س +م +ۋ +گ +ڭ +ۆ +ۈ +ج +غ +ھ +ژ +s +c +e +n +w +P +E +D +U +d +r +b +y +B +o +O +Y +N +T +k +t +h +A +H +F +z +W +K +G +M +f +Z +X +Q +J +x +q +- +! +% +# +? +: +$ +, +& +' +É +@ +é +( ++ diff --git a/backend/ppocr/utils/dict/uk_dict.txt b/backend/ppocr/utils/dict/uk_dict.txt new file mode 100644 index 0000000..c5ffc0a --- /dev/null +++ b/backend/ppocr/utils/dict/uk_dict.txt @@ -0,0 +1,142 @@ +u +k +_ +i +m +g +/ +1 +6 +I +L +S +V +R +C +2 +0 +v +a +l +7 +9 +. 
+j +p +в +і +д +п +о +н +с +т +ю +4 +5 +3 +а +и +м +е +р +ч +у +Б +з +л +к +8 +А +В +г +є +б +ь +х +ґ +ш +ц +ф +я +щ +ж +Г +Х +У +Т +Е +І +Н +П +З +Л +Ю +С +Д +М +К +Р +Ф +О +Ц +И +Я +Ч +Ш +Ж +Є +Ґ +Ь +s +c +e +n +w +A +P +r +E +t +o +h +d +y +M +G +N +F +B +T +D +U +O +W +Z +f +H +Y +b +K +z +x +Q +X +q +J +$ +- +' +# +& +% +? +: +! +, ++ +@ +( +é +É + diff --git a/backend/ppocr/utils/dict/ur_dict.txt b/backend/ppocr/utils/dict/ur_dict.txt new file mode 100644 index 0000000..c06786a --- /dev/null +++ b/backend/ppocr/utils/dict/ur_dict.txt @@ -0,0 +1,137 @@ +u +r +_ +i +m +g +/ +3 +I +L +S +V +R +C +2 +0 +1 +v +a +l +9 +7 +8 +. +j +p + +چ +ٹ +پ +ا +ئ +ی +ے +4 +6 +و +ل +ن +ڈ +ھ +ک +ت +ش +ف +ق +ر +د +5 +ب +ج +خ +ہ +س +ز +غ +ڑ +ں +آ +م +ؤ +ط +ص +ح +ع +گ +ث +ض +ذ +ۓ +ِ +ء +ظ +ً +ي +ُ +ۃ +أ +ٰ +ە +ژ +ۂ +ة +ّ +ك +ه +s +c +e +n +w +o +d +t +D +M +T +U +E +b +P +h +y +W +H +A +x +B +O +N +G +Y +Q +F +k +K +q +J +Z +f +z +X +' +@ +& +! +, +: +$ +- +# +? +% +é ++ +( +É diff --git a/backend/ppocr/utils/dict/xi_dict.txt b/backend/ppocr/utils/dict/xi_dict.txt new file mode 100644 index 0000000..f195f1e --- /dev/null +++ b/backend/ppocr/utils/dict/xi_dict.txt @@ -0,0 +1,110 @@ +x +i +_ +m +g +/ +1 +0 +I +L +S +V +R +C +2 +v +a +l +3 +6 +4 +5 +. +j +p + +Q +u +e +r +o +8 +7 +n +c +9 +t +b +é +q +d +ó +y +F +s +, +O +í +T +f +" +U +M +h +: +P +H +A +E +D +z +N +á +ñ +ú +% +; +è ++ +Y +- +B +G +( +) +¿ +? +w +¡ +! +X +É +K +k +Á +ü +Ú +« +» +J +' +ö +W +Z +º +Ö +­ +[ +] +Ç +ç +à +ä +û +ò +Í +ê +ô +ø +ª diff --git a/backend/ppocr/utils/e2e_metric/Deteval.py b/backend/ppocr/utils/e2e_metric/Deteval.py new file mode 100755 index 0000000..45567a7 --- /dev/null +++ b/backend/ppocr/utils/e2e_metric/Deteval.py @@ -0,0 +1,574 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
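+# DetEval-style end-to-end evaluation helpers (descriptive summary of this module).
+# For every ground-truth / detection pair two overlap ratios are tabulated:
+#   sigma = area(det ∩ gt) / area(gt)   (recall-oriented overlap)
+#   tau   = area(det ∩ gt) / area(det)  (precision-oriented overlap)
+# get_socre_A / get_socre_B build these per-image tables together with the predicted
+# and ground-truth transcriptions, and combine_results matches them (one-to-one,
+# one-to-many, many-to-one) with the thresholds tr=0.7 / tp=0.6 defined below to
+# produce detection precision/recall/f-score and their end-to-end counterparts.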
+ +import numpy as np +import scipy.io as io +from ppocr.utils.e2e_metric.polygon_fast import iod, area_of_intersection, area + + +def get_socre_A(gt_dir, pred_dict): + allInputs = 1 + + def input_reading_mod(pred_dict): + """This helper reads input from txt files""" + det = [] + n = len(pred_dict) + for i in range(n): + points = pred_dict[i]['points'] + text = pred_dict[i]['texts'] + point = ",".join(map(str, points.reshape(-1, ))) + det.append([point, text]) + return det + + def gt_reading_mod(gt_dict): + """This helper reads groundtruths from mat files""" + gt = [] + n = len(gt_dict) + for i in range(n): + points = gt_dict[i]['points'].tolist() + h = len(points) + text = gt_dict[i]['text'] + xx = [ + np.array( + ['x:'], dtype=' 1): + gt_x = list(map(int, np.squeeze(gt[1]))) + gt_y = list(map(int, np.squeeze(gt[3]))) + for det_id, detection in enumerate(detections): + detection_orig = detection + detection = [float(x) for x in detection[0].split(',')] + detection = list(map(int, detection)) + det_x = detection[0::2] + det_y = detection[1::2] + det_gt_iou = iod(det_x, det_y, gt_x, gt_y) + if det_gt_iou > threshold: + detections[det_id] = [] + + detections[:] = [item for item in detections if item != []] + return detections + + def sigma_calculation(det_x, det_y, gt_x, gt_y): + """ + sigma = inter_area / gt_area + """ + return np.round((area_of_intersection(det_x, det_y, gt_x, gt_y) / + area(gt_x, gt_y)), 2) + + def tau_calculation(det_x, det_y, gt_x, gt_y): + if area(det_x, det_y) == 0.0: + return 0 + return np.round((area_of_intersection(det_x, det_y, gt_x, gt_y) / + area(det_x, det_y)), 2) + + ##############################Initialization################################### + # global_sigma = [] + # global_tau = [] + # global_pred_str = [] + # global_gt_str = [] + ############################################################################### + + for input_id in range(allInputs): + if (input_id != '.DS_Store') and (input_id != 'Pascal_result.txt') and ( + input_id != 'Pascal_result_curved.txt') and (input_id != 'Pascal_result_non_curved.txt') and ( + input_id != 'Deteval_result.txt') and (input_id != 'Deteval_result_curved.txt') \ + and (input_id != 'Deteval_result_non_curved.txt'): + detections = input_reading_mod(pred_dict) + groundtruths = gt_reading_mod(gt_dir) + detections = detection_filtering( + detections, + groundtruths) # filters detections overlapping with DC area + dc_id = [] + for i in range(len(groundtruths)): + if groundtruths[i][5] == '#': + dc_id.append(i) + cnt = 0 + for a in dc_id: + num = a - cnt + del groundtruths[num] + cnt += 1 + + local_sigma_table = np.zeros((len(groundtruths), len(detections))) + local_tau_table = np.zeros((len(groundtruths), len(detections))) + local_pred_str = {} + local_gt_str = {} + + for gt_id, gt in enumerate(groundtruths): + if len(detections) > 0: + for det_id, detection in enumerate(detections): + detection_orig = detection + detection = [float(x) for x in detection[0].split(',')] + detection = list(map(int, detection)) + pred_seq_str = detection_orig[1].strip() + det_x = detection[0::2] + det_y = detection[1::2] + gt_x = list(map(int, np.squeeze(gt[1]))) + gt_y = list(map(int, np.squeeze(gt[3]))) + gt_seq_str = str(gt[4].tolist()[0]) + + local_sigma_table[gt_id, det_id] = sigma_calculation( + det_x, det_y, gt_x, gt_y) + local_tau_table[gt_id, det_id] = tau_calculation( + det_x, det_y, gt_x, gt_y) + local_pred_str[det_id] = pred_seq_str + local_gt_str[gt_id] = gt_seq_str + + global_sigma = local_sigma_table + global_tau = local_tau_table 
+ global_pred_str = local_pred_str + global_gt_str = local_gt_str + + single_data = {} + single_data['sigma'] = global_sigma + single_data['global_tau'] = global_tau + single_data['global_pred_str'] = global_pred_str + single_data['global_gt_str'] = global_gt_str + return single_data + + +def get_socre_B(gt_dir, img_id, pred_dict): + allInputs = 1 + + def input_reading_mod(pred_dict): + """This helper reads input from txt files""" + det = [] + n = len(pred_dict) + for i in range(n): + points = pred_dict[i]['points'] + text = pred_dict[i]['texts'] + point = ",".join(map(str, points.reshape(-1, ))) + det.append([point, text]) + return det + + def gt_reading_mod(gt_dir, gt_id): + gt = io.loadmat('%s/poly_gt_img%s.mat' % (gt_dir, gt_id)) + gt = gt['polygt'] + return gt + + def detection_filtering(detections, groundtruths, threshold=0.5): + for gt_id, gt in enumerate(groundtruths): + if (gt[5] == '#') and (gt[1].shape[1] > 1): + gt_x = list(map(int, np.squeeze(gt[1]))) + gt_y = list(map(int, np.squeeze(gt[3]))) + for det_id, detection in enumerate(detections): + detection_orig = detection + detection = [float(x) for x in detection[0].split(',')] + detection = list(map(int, detection)) + det_x = detection[0::2] + det_y = detection[1::2] + det_gt_iou = iod(det_x, det_y, gt_x, gt_y) + if det_gt_iou > threshold: + detections[det_id] = [] + + detections[:] = [item for item in detections if item != []] + return detections + + def sigma_calculation(det_x, det_y, gt_x, gt_y): + """ + sigma = inter_area / gt_area + """ + return np.round((area_of_intersection(det_x, det_y, gt_x, gt_y) / + area(gt_x, gt_y)), 2) + + def tau_calculation(det_x, det_y, gt_x, gt_y): + if area(det_x, det_y) == 0.0: + return 0 + return np.round((area_of_intersection(det_x, det_y, gt_x, gt_y) / + area(det_x, det_y)), 2) + + ##############################Initialization################################### + # global_sigma = [] + # global_tau = [] + # global_pred_str = [] + # global_gt_str = [] + ############################################################################### + + for input_id in range(allInputs): + if (input_id != '.DS_Store') and (input_id != 'Pascal_result.txt') and ( + input_id != 'Pascal_result_curved.txt') and (input_id != 'Pascal_result_non_curved.txt') and ( + input_id != 'Deteval_result.txt') and (input_id != 'Deteval_result_curved.txt') \ + and (input_id != 'Deteval_result_non_curved.txt'): + detections = input_reading_mod(pred_dict) + groundtruths = gt_reading_mod(gt_dir, img_id).tolist() + detections = detection_filtering( + detections, + groundtruths) # filters detections overlapping with DC area + dc_id = [] + for i in range(len(groundtruths)): + if groundtruths[i][5] == '#': + dc_id.append(i) + cnt = 0 + for a in dc_id: + num = a - cnt + del groundtruths[num] + cnt += 1 + + local_sigma_table = np.zeros((len(groundtruths), len(detections))) + local_tau_table = np.zeros((len(groundtruths), len(detections))) + local_pred_str = {} + local_gt_str = {} + + for gt_id, gt in enumerate(groundtruths): + if len(detections) > 0: + for det_id, detection in enumerate(detections): + detection_orig = detection + detection = [float(x) for x in detection[0].split(',')] + detection = list(map(int, detection)) + pred_seq_str = detection_orig[1].strip() + det_x = detection[0::2] + det_y = detection[1::2] + gt_x = list(map(int, np.squeeze(gt[1]))) + gt_y = list(map(int, np.squeeze(gt[3]))) + gt_seq_str = str(gt[4].tolist()[0]) + + local_sigma_table[gt_id, det_id] = sigma_calculation( + det_x, det_y, gt_x, gt_y) + 
local_tau_table[gt_id, det_id] = tau_calculation( + det_x, det_y, gt_x, gt_y) + local_pred_str[det_id] = pred_seq_str + local_gt_str[gt_id] = gt_seq_str + + global_sigma = local_sigma_table + global_tau = local_tau_table + global_pred_str = local_pred_str + global_gt_str = local_gt_str + + single_data = {} + single_data['sigma'] = global_sigma + single_data['global_tau'] = global_tau + single_data['global_pred_str'] = global_pred_str + single_data['global_gt_str'] = global_gt_str + return single_data + + +def combine_results(all_data): + tr = 0.7 + tp = 0.6 + fsc_k = 0.8 + k = 2 + global_sigma = [] + global_tau = [] + global_pred_str = [] + global_gt_str = [] + for data in all_data: + global_sigma.append(data['sigma']) + global_tau.append(data['global_tau']) + global_pred_str.append(data['global_pred_str']) + global_gt_str.append(data['global_gt_str']) + + global_accumulative_recall = 0 + global_accumulative_precision = 0 + total_num_gt = 0 + total_num_det = 0 + hit_str_count = 0 + hit_count = 0 + + def one_to_one(local_sigma_table, local_tau_table, + local_accumulative_recall, local_accumulative_precision, + global_accumulative_recall, global_accumulative_precision, + gt_flag, det_flag, idy): + hit_str_num = 0 + for gt_id in range(num_gt): + gt_matching_qualified_sigma_candidates = np.where( + local_sigma_table[gt_id, :] > tr) + gt_matching_num_qualified_sigma_candidates = gt_matching_qualified_sigma_candidates[ + 0].shape[0] + gt_matching_qualified_tau_candidates = np.where( + local_tau_table[gt_id, :] > tp) + gt_matching_num_qualified_tau_candidates = gt_matching_qualified_tau_candidates[ + 0].shape[0] + + det_matching_qualified_sigma_candidates = np.where( + local_sigma_table[:, gt_matching_qualified_sigma_candidates[0]] + > tr) + det_matching_num_qualified_sigma_candidates = det_matching_qualified_sigma_candidates[ + 0].shape[0] + det_matching_qualified_tau_candidates = np.where( + local_tau_table[:, gt_matching_qualified_tau_candidates[0]] > + tp) + det_matching_num_qualified_tau_candidates = det_matching_qualified_tau_candidates[ + 0].shape[0] + + if (gt_matching_num_qualified_sigma_candidates == 1) and (gt_matching_num_qualified_tau_candidates == 1) and \ + (det_matching_num_qualified_sigma_candidates == 1) and ( + det_matching_num_qualified_tau_candidates == 1): + global_accumulative_recall = global_accumulative_recall + 1.0 + global_accumulative_precision = global_accumulative_precision + 1.0 + local_accumulative_recall = local_accumulative_recall + 1.0 + local_accumulative_precision = local_accumulative_precision + 1.0 + + gt_flag[0, gt_id] = 1 + matched_det_id = np.where(local_sigma_table[gt_id, :] > tr) + # recg start + gt_str_cur = global_gt_str[idy][gt_id] + pred_str_cur = global_pred_str[idy][matched_det_id[0].tolist()[ + 0]] + if pred_str_cur == gt_str_cur: + hit_str_num += 1 + else: + if pred_str_cur.lower() == gt_str_cur.lower(): + hit_str_num += 1 + # recg end + det_flag[0, matched_det_id] = 1 + return local_accumulative_recall, local_accumulative_precision, global_accumulative_recall, global_accumulative_precision, gt_flag, det_flag, hit_str_num + + def one_to_many(local_sigma_table, local_tau_table, + local_accumulative_recall, local_accumulative_precision, + global_accumulative_recall, global_accumulative_precision, + gt_flag, det_flag, idy): + hit_str_num = 0 + for gt_id in range(num_gt): + # skip the following if the groundtruth was matched + if gt_flag[0, gt_id] > 0: + continue + + non_zero_in_sigma = np.where(local_sigma_table[gt_id, :] > 0) + 
num_non_zero_in_sigma = non_zero_in_sigma[0].shape[0] + + if num_non_zero_in_sigma >= k: + ####search for all detections that overlaps with this groundtruth + qualified_tau_candidates = np.where((local_tau_table[ + gt_id, :] >= tp) & (det_flag[0, :] == 0)) + num_qualified_tau_candidates = qualified_tau_candidates[ + 0].shape[0] + + if num_qualified_tau_candidates == 1: + if ((local_tau_table[gt_id, qualified_tau_candidates] >= tp) + and + (local_sigma_table[gt_id, qualified_tau_candidates] >= + tr)): + # became an one-to-one case + global_accumulative_recall = global_accumulative_recall + 1.0 + global_accumulative_precision = global_accumulative_precision + 1.0 + local_accumulative_recall = local_accumulative_recall + 1.0 + local_accumulative_precision = local_accumulative_precision + 1.0 + + gt_flag[0, gt_id] = 1 + det_flag[0, qualified_tau_candidates] = 1 + # recg start + gt_str_cur = global_gt_str[idy][gt_id] + pred_str_cur = global_pred_str[idy][ + qualified_tau_candidates[0].tolist()[0]] + if pred_str_cur == gt_str_cur: + hit_str_num += 1 + else: + if pred_str_cur.lower() == gt_str_cur.lower(): + hit_str_num += 1 + # recg end + elif (np.sum(local_sigma_table[gt_id, qualified_tau_candidates]) + >= tr): + gt_flag[0, gt_id] = 1 + det_flag[0, qualified_tau_candidates] = 1 + # recg start + gt_str_cur = global_gt_str[idy][gt_id] + pred_str_cur = global_pred_str[idy][ + qualified_tau_candidates[0].tolist()[0]] + if pred_str_cur == gt_str_cur: + hit_str_num += 1 + else: + if pred_str_cur.lower() == gt_str_cur.lower(): + hit_str_num += 1 + # recg end + + global_accumulative_recall = global_accumulative_recall + fsc_k + global_accumulative_precision = global_accumulative_precision + num_qualified_tau_candidates * fsc_k + + local_accumulative_recall = local_accumulative_recall + fsc_k + local_accumulative_precision = local_accumulative_precision + num_qualified_tau_candidates * fsc_k + + return local_accumulative_recall, local_accumulative_precision, global_accumulative_recall, global_accumulative_precision, gt_flag, det_flag, hit_str_num + + def many_to_one(local_sigma_table, local_tau_table, + local_accumulative_recall, local_accumulative_precision, + global_accumulative_recall, global_accumulative_precision, + gt_flag, det_flag, idy): + hit_str_num = 0 + for det_id in range(num_det): + # skip the following if the detection was matched + if det_flag[0, det_id] > 0: + continue + + non_zero_in_tau = np.where(local_tau_table[:, det_id] > 0) + num_non_zero_in_tau = non_zero_in_tau[0].shape[0] + + if num_non_zero_in_tau >= k: + ####search for all detections that overlaps with this groundtruth + qualified_sigma_candidates = np.where(( + local_sigma_table[:, det_id] >= tp) & (gt_flag[0, :] == 0)) + num_qualified_sigma_candidates = qualified_sigma_candidates[ + 0].shape[0] + + if num_qualified_sigma_candidates == 1: + if ((local_tau_table[qualified_sigma_candidates, det_id] >= + tp) and + (local_sigma_table[qualified_sigma_candidates, det_id] + >= tr)): + # became an one-to-one case + global_accumulative_recall = global_accumulative_recall + 1.0 + global_accumulative_precision = global_accumulative_precision + 1.0 + local_accumulative_recall = local_accumulative_recall + 1.0 + local_accumulative_precision = local_accumulative_precision + 1.0 + + gt_flag[0, qualified_sigma_candidates] = 1 + det_flag[0, det_id] = 1 + # recg start + pred_str_cur = global_pred_str[idy][det_id] + gt_len = len(qualified_sigma_candidates[0]) + for idx in range(gt_len): + ele_gt_id = qualified_sigma_candidates[0].tolist()[ + 
idx] + if ele_gt_id not in global_gt_str[idy]: + continue + gt_str_cur = global_gt_str[idy][ele_gt_id] + if pred_str_cur == gt_str_cur: + hit_str_num += 1 + break + else: + if pred_str_cur.lower() == gt_str_cur.lower(): + hit_str_num += 1 + break + # recg end + elif (np.sum(local_tau_table[qualified_sigma_candidates, + det_id]) >= tp): + det_flag[0, det_id] = 1 + gt_flag[0, qualified_sigma_candidates] = 1 + # recg start + pred_str_cur = global_pred_str[idy][det_id] + gt_len = len(qualified_sigma_candidates[0]) + for idx in range(gt_len): + ele_gt_id = qualified_sigma_candidates[0].tolist()[idx] + if ele_gt_id not in global_gt_str[idy]: + continue + gt_str_cur = global_gt_str[idy][ele_gt_id] + if pred_str_cur == gt_str_cur: + hit_str_num += 1 + break + else: + if pred_str_cur.lower() == gt_str_cur.lower(): + hit_str_num += 1 + break + # recg end + + global_accumulative_recall = global_accumulative_recall + num_qualified_sigma_candidates * fsc_k + global_accumulative_precision = global_accumulative_precision + fsc_k + + local_accumulative_recall = local_accumulative_recall + num_qualified_sigma_candidates * fsc_k + local_accumulative_precision = local_accumulative_precision + fsc_k + return local_accumulative_recall, local_accumulative_precision, global_accumulative_recall, global_accumulative_precision, gt_flag, det_flag, hit_str_num + + for idx in range(len(global_sigma)): + local_sigma_table = np.array(global_sigma[idx]) + local_tau_table = global_tau[idx] + + num_gt = local_sigma_table.shape[0] + num_det = local_sigma_table.shape[1] + + total_num_gt = total_num_gt + num_gt + total_num_det = total_num_det + num_det + + local_accumulative_recall = 0 + local_accumulative_precision = 0 + gt_flag = np.zeros((1, num_gt)) + det_flag = np.zeros((1, num_det)) + + #######first check for one-to-one case########## + local_accumulative_recall, local_accumulative_precision, global_accumulative_recall, global_accumulative_precision, \ + gt_flag, det_flag, hit_str_num = one_to_one(local_sigma_table, local_tau_table, + local_accumulative_recall, local_accumulative_precision, + global_accumulative_recall, global_accumulative_precision, + gt_flag, det_flag, idx) + + hit_str_count += hit_str_num + #######then check for one-to-many case########## + local_accumulative_recall, local_accumulative_precision, global_accumulative_recall, global_accumulative_precision, \ + gt_flag, det_flag, hit_str_num = one_to_many(local_sigma_table, local_tau_table, + local_accumulative_recall, local_accumulative_precision, + global_accumulative_recall, global_accumulative_precision, + gt_flag, det_flag, idx) + hit_str_count += hit_str_num + #######then check for many-to-one case########## + local_accumulative_recall, local_accumulative_precision, global_accumulative_recall, global_accumulative_precision, \ + gt_flag, det_flag, hit_str_num = many_to_one(local_sigma_table, local_tau_table, + local_accumulative_recall, local_accumulative_precision, + global_accumulative_recall, global_accumulative_precision, + gt_flag, det_flag, idx) + hit_str_count += hit_str_num + + try: + recall = global_accumulative_recall / total_num_gt + except ZeroDivisionError: + recall = 0 + + try: + precision = global_accumulative_precision / total_num_det + except ZeroDivisionError: + precision = 0 + + try: + f_score = 2 * precision * recall / (precision + recall) + except ZeroDivisionError: + f_score = 0 + + try: + seqerr = 1 - float(hit_str_count) / global_accumulative_recall + except ZeroDivisionError: + seqerr = 1 + + try: + recall_e2e = 
float(hit_str_count) / total_num_gt + except ZeroDivisionError: + recall_e2e = 0 + + try: + precision_e2e = float(hit_str_count) / total_num_det + except ZeroDivisionError: + precision_e2e = 0 + + try: + f_score_e2e = 2 * precision_e2e * recall_e2e / ( + precision_e2e + recall_e2e) + except ZeroDivisionError: + f_score_e2e = 0 + + final = { + 'total_num_gt': total_num_gt, + 'total_num_det': total_num_det, + 'global_accumulative_recall': global_accumulative_recall, + 'hit_str_count': hit_str_count, + 'recall': recall, + 'precision': precision, + 'f_score': f_score, + 'seqerr': seqerr, + 'recall_e2e': recall_e2e, + 'precision_e2e': precision_e2e, + 'f_score_e2e': f_score_e2e + } + return final diff --git a/backend/ppocr/utils/e2e_metric/polygon_fast.py b/backend/ppocr/utils/e2e_metric/polygon_fast.py new file mode 100755 index 0000000..81c9ad7 --- /dev/null +++ b/backend/ppocr/utils/e2e_metric/polygon_fast.py @@ -0,0 +1,83 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +from shapely.geometry import Polygon +""" +:param det_x: [1, N] Xs of detection's vertices +:param det_y: [1, N] Ys of detection's vertices +:param gt_x: [1, N] Xs of groundtruth's vertices +:param gt_y: [1, N] Ys of groundtruth's vertices + +############## +All the calculation of 'AREA' in this script is handled by: +1) First generating a binary mask with the polygon area filled up with 1's +2) Summing up all the 1's +""" + + +def area(x, y): + polygon = Polygon(np.stack([x, y], axis=1)) + return float(polygon.area) + + +def approx_area_of_intersection(det_x, det_y, gt_x, gt_y): + """ + This helper determine if both polygons are intersecting with each others with an approximation method. 
+ Area of intersection represented by the minimum bounding rectangular [xmin, ymin, xmax, ymax] + """ + det_ymax = np.max(det_y) + det_xmax = np.max(det_x) + det_ymin = np.min(det_y) + det_xmin = np.min(det_x) + + gt_ymax = np.max(gt_y) + gt_xmax = np.max(gt_x) + gt_ymin = np.min(gt_y) + gt_xmin = np.min(gt_x) + + all_min_ymax = np.minimum(det_ymax, gt_ymax) + all_max_ymin = np.maximum(det_ymin, gt_ymin) + + intersect_heights = np.maximum(0.0, (all_min_ymax - all_max_ymin)) + + all_min_xmax = np.minimum(det_xmax, gt_xmax) + all_max_xmin = np.maximum(det_xmin, gt_xmin) + intersect_widths = np.maximum(0.0, (all_min_xmax - all_max_xmin)) + + return intersect_heights * intersect_widths + + +def area_of_intersection(det_x, det_y, gt_x, gt_y): + p1 = Polygon(np.stack([det_x, det_y], axis=1)).buffer(0) + p2 = Polygon(np.stack([gt_x, gt_y], axis=1)).buffer(0) + return float(p1.intersection(p2).area) + + +def area_of_union(det_x, det_y, gt_x, gt_y): + p1 = Polygon(np.stack([det_x, det_y], axis=1)).buffer(0) + p2 = Polygon(np.stack([gt_x, gt_y], axis=1)).buffer(0) + return float(p1.union(p2).area) + + +def iou(det_x, det_y, gt_x, gt_y): + return area_of_intersection(det_x, det_y, gt_x, gt_y) / ( + area_of_union(det_x, det_y, gt_x, gt_y) + 1.0) + + +def iod(det_x, det_y, gt_x, gt_y): + """ + This helper determine the fraction of intersection area over detection area + """ + return area_of_intersection(det_x, det_y, gt_x, gt_y) / ( + area(det_x, det_y) + 1.0) diff --git a/backend/ppocr/utils/e2e_utils/extract_batchsize.py b/backend/ppocr/utils/e2e_utils/extract_batchsize.py new file mode 100644 index 0000000..e99a833 --- /dev/null +++ b/backend/ppocr/utils/e2e_utils/extract_batchsize.py @@ -0,0 +1,87 @@ +import paddle +import numpy as np +import copy + + +def org_tcl_rois(batch_size, pos_lists, pos_masks, label_lists, tcl_bs): + """ + """ + pos_lists_, pos_masks_, label_lists_ = [], [], [] + img_bs = batch_size + ngpu = int(batch_size / img_bs) + img_ids = np.array(pos_lists, dtype=np.int32)[:, 0, 0].copy() + pos_lists_split, pos_masks_split, label_lists_split = [], [], [] + for i in range(ngpu): + pos_lists_split.append([]) + pos_masks_split.append([]) + label_lists_split.append([]) + + for i in range(img_ids.shape[0]): + img_id = img_ids[i] + gpu_id = int(img_id / img_bs) + img_id = img_id % img_bs + pos_list = pos_lists[i].copy() + pos_list[:, 0] = img_id + pos_lists_split[gpu_id].append(pos_list) + pos_masks_split[gpu_id].append(pos_masks[i].copy()) + label_lists_split[gpu_id].append(copy.deepcopy(label_lists[i])) + # repeat or delete + for i in range(ngpu): + vp_len = len(pos_lists_split[i]) + if vp_len <= tcl_bs: + for j in range(0, tcl_bs - vp_len): + pos_list = pos_lists_split[i][j].copy() + pos_lists_split[i].append(pos_list) + pos_mask = pos_masks_split[i][j].copy() + pos_masks_split[i].append(pos_mask) + label_list = copy.deepcopy(label_lists_split[i][j]) + label_lists_split[i].append(label_list) + else: + for j in range(0, vp_len - tcl_bs): + c_len = len(pos_lists_split[i]) + pop_id = np.random.permutation(c_len)[0] + pos_lists_split[i].pop(pop_id) + pos_masks_split[i].pop(pop_id) + label_lists_split[i].pop(pop_id) + # merge + for i in range(ngpu): + pos_lists_.extend(pos_lists_split[i]) + pos_masks_.extend(pos_masks_split[i]) + label_lists_.extend(label_lists_split[i]) + return pos_lists_, pos_masks_, label_lists_ + + +def pre_process(label_list, pos_list, pos_mask, max_text_length, max_text_nums, + pad_num, tcl_bs): + label_list = label_list.numpy() + batch, _, _, _ = label_list.shape + 
pos_list = pos_list.numpy() + pos_mask = pos_mask.numpy() + pos_list_t = [] + pos_mask_t = [] + label_list_t = [] + for i in range(batch): + for j in range(max_text_nums): + if pos_mask[i, j].any(): + pos_list_t.append(pos_list[i][j]) + pos_mask_t.append(pos_mask[i][j]) + label_list_t.append(label_list[i][j]) + pos_list, pos_mask, label_list = org_tcl_rois(batch, pos_list_t, pos_mask_t, + label_list_t, tcl_bs) + label = [] + tt = [l.tolist() for l in label_list] + for i in range(tcl_bs): + k = 0 + for j in range(max_text_length): + if tt[i][j][0] != pad_num: + k += 1 + else: + break + label.append(k) + label = paddle.to_tensor(label) + label = paddle.cast(label, dtype='int64') + pos_list = paddle.to_tensor(pos_list) + pos_mask = paddle.to_tensor(pos_mask) + label_list = paddle.squeeze(paddle.to_tensor(label_list), axis=2) + label_list = paddle.cast(label_list, dtype='int32') + return pos_list, pos_mask, label_list, label diff --git a/backend/ppocr/utils/e2e_utils/extract_textpoint_fast.py b/backend/ppocr/utils/e2e_utils/extract_textpoint_fast.py new file mode 100644 index 0000000..787cd30 --- /dev/null +++ b/backend/ppocr/utils/e2e_utils/extract_textpoint_fast.py @@ -0,0 +1,457 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains various CTC decoders.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import cv2 +import math + +import numpy as np +from itertools import groupby +from skimage.morphology._skeletonize import thin + + +def get_dict(character_dict_path): + character_str = "" + with open(character_dict_path, "rb") as fin: + lines = fin.readlines() + for line in lines: + line = line.decode('utf-8').strip("\n").strip("\r\n") + character_str += line + dict_character = list(character_str) + return dict_character + + +def softmax(logits): + """ + logits: N x d + """ + max_value = np.max(logits, axis=1, keepdims=True) + exp = np.exp(logits - max_value) + exp_sum = np.sum(exp, axis=1, keepdims=True) + dist = exp / exp_sum + return dist + + +def get_keep_pos_idxs(labels, remove_blank=None): + """ + Remove duplicate and get pos idxs of keep items. + The value of keep_blank should be [None, 95]. + """ + duplicate_len_list = [] + keep_pos_idx_list = [] + keep_char_idx_list = [] + for k, v_ in groupby(labels): + current_len = len(list(v_)) + if k != remove_blank: + current_idx = int(sum(duplicate_len_list) + current_len // 2) + keep_pos_idx_list.append(current_idx) + keep_char_idx_list.append(k) + duplicate_len_list.append(current_len) + return keep_char_idx_list, keep_pos_idx_list + + +def remove_blank(labels, blank=0): + new_labels = [x for x in labels if x != blank] + return new_labels + + +def insert_blank(labels, blank=0): + new_labels = [blank] + for l in labels: + new_labels += [l, blank] + return new_labels + + +def ctc_greedy_decoder(probs_seq, blank=95, keep_blank_in_idxs=True): + """ + CTC greedy (best path) decoder. 
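+
+    Illustrative example (toy three-class distribution, blank index 2):
+        probs_seq = [[0.1, 0.2, 0.7], [0.2, 0.1, 0.7], [0.8, 0.1, 0.1]]
+        ctc_greedy_decoder(probs_seq, blank=2)  # -> ([0], [1, 2])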
+ """ + raw_str = np.argmax(np.array(probs_seq), axis=1) + remove_blank_in_pos = None if keep_blank_in_idxs else blank + dedup_str, keep_idx_list = get_keep_pos_idxs( + raw_str, remove_blank=remove_blank_in_pos) + dst_str = remove_blank(dedup_str, blank=blank) + return dst_str, keep_idx_list + + +def instance_ctc_greedy_decoder(gather_info, logits_map, pts_num=4): + _, _, C = logits_map.shape + ys, xs = zip(*gather_info) + logits_seq = logits_map[list(ys), list(xs)] + probs_seq = logits_seq + labels = np.argmax(probs_seq, axis=1) + dst_str = [k for k, v_ in groupby(labels) if k != C - 1] + detal = len(gather_info) // (pts_num - 1) + keep_idx_list = [0] + [detal * (i + 1) for i in range(pts_num - 2)] + [-1] + keep_gather_list = [gather_info[idx] for idx in keep_idx_list] + return dst_str, keep_gather_list + + +def ctc_decoder_for_image(gather_info_list, + logits_map, + Lexicon_Table, + pts_num=6): + """ + CTC decoder using multiple processes. + """ + decoder_str = [] + decoder_xys = [] + for gather_info in gather_info_list: + if len(gather_info) < pts_num: + continue + dst_str, xys_list = instance_ctc_greedy_decoder( + gather_info, logits_map, pts_num=pts_num) + dst_str_readable = ''.join([Lexicon_Table[idx] for idx in dst_str]) + if len(dst_str_readable) < 2: + continue + decoder_str.append(dst_str_readable) + decoder_xys.append(xys_list) + return decoder_str, decoder_xys + + +def sort_with_direction(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] + """ + + def sort_part_with_direction(pos_list, point_direction): + pos_list = np.array(pos_list).reshape(-1, 2) + point_direction = np.array(point_direction).reshape(-1, 2) + average_direction = np.mean(point_direction, axis=0, keepdims=True) + pos_proj_leng = np.sum(pos_list * average_direction, axis=1) + sorted_list = pos_list[np.argsort(pos_proj_leng)].tolist() + sorted_direction = point_direction[np.argsort(pos_proj_leng)].tolist() + return sorted_list, sorted_direction + + pos_list = np.array(pos_list).reshape(-1, 2) + point_direction = f_direction[pos_list[:, 0], pos_list[:, 1]] # x, y + point_direction = point_direction[:, ::-1] # x, y -> y, x + sorted_point, sorted_direction = sort_part_with_direction(pos_list, + point_direction) + + point_num = len(sorted_point) + if point_num >= 16: + middle_num = point_num // 2 + first_part_point = sorted_point[:middle_num] + first_point_direction = sorted_direction[:middle_num] + sorted_fist_part_point, sorted_fist_part_direction = sort_part_with_direction( + first_part_point, first_point_direction) + + last_part_point = sorted_point[middle_num:] + last_point_direction = sorted_direction[middle_num:] + sorted_last_part_point, sorted_last_part_direction = sort_part_with_direction( + last_part_point, last_point_direction) + sorted_point = sorted_fist_part_point + sorted_last_part_point + sorted_direction = sorted_fist_part_direction + sorted_last_part_direction + + return sorted_point, np.array(sorted_direction) + + +def add_id(pos_list, image_id=0): + """ + Add id for gather feature, for inference. + """ + new_list = [] + for item in pos_list: + new_list.append((image_id, item[0], item[1])) + return new_list + + +def sort_and_expand_with_direction(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] 
+ """ + h, w, _ = f_direction.shape + sorted_list, point_direction = sort_with_direction(pos_list, f_direction) + + point_num = len(sorted_list) + sub_direction_len = max(point_num // 3, 2) + left_direction = point_direction[:sub_direction_len, :] + right_dirction = point_direction[point_num - sub_direction_len:, :] + + left_average_direction = -np.mean(left_direction, axis=0, keepdims=True) + left_average_len = np.linalg.norm(left_average_direction) + left_start = np.array(sorted_list[0]) + left_step = left_average_direction / (left_average_len + 1e-6) + + right_average_direction = np.mean(right_dirction, axis=0, keepdims=True) + right_average_len = np.linalg.norm(right_average_direction) + right_step = right_average_direction / (right_average_len + 1e-6) + right_start = np.array(sorted_list[-1]) + + append_num = max( + int((left_average_len + right_average_len) / 2.0 * 0.15), 1) + left_list = [] + right_list = [] + for i in range(append_num): + ly, lx = np.round(left_start + left_step * (i + 1)).flatten().astype( + 'int32').tolist() + if ly < h and lx < w and (ly, lx) not in left_list: + left_list.append((ly, lx)) + ry, rx = np.round(right_start + right_step * (i + 1)).flatten().astype( + 'int32').tolist() + if ry < h and rx < w and (ry, rx) not in right_list: + right_list.append((ry, rx)) + + all_list = left_list[::-1] + sorted_list + right_list + return all_list + + +def sort_and_expand_with_direction_v2(pos_list, f_direction, binary_tcl_map): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] + binary_tcl_map: h x w + """ + h, w, _ = f_direction.shape + sorted_list, point_direction = sort_with_direction(pos_list, f_direction) + + point_num = len(sorted_list) + sub_direction_len = max(point_num // 3, 2) + left_direction = point_direction[:sub_direction_len, :] + right_dirction = point_direction[point_num - sub_direction_len:, :] + + left_average_direction = -np.mean(left_direction, axis=0, keepdims=True) + left_average_len = np.linalg.norm(left_average_direction) + left_start = np.array(sorted_list[0]) + left_step = left_average_direction / (left_average_len + 1e-6) + + right_average_direction = np.mean(right_dirction, axis=0, keepdims=True) + right_average_len = np.linalg.norm(right_average_direction) + right_step = right_average_direction / (right_average_len + 1e-6) + right_start = np.array(sorted_list[-1]) + + append_num = max( + int((left_average_len + right_average_len) / 2.0 * 0.15), 1) + max_append_num = 2 * append_num + + left_list = [] + right_list = [] + for i in range(max_append_num): + ly, lx = np.round(left_start + left_step * (i + 1)).flatten().astype( + 'int32').tolist() + if ly < h and lx < w and (ly, lx) not in left_list: + if binary_tcl_map[ly, lx] > 0.5: + left_list.append((ly, lx)) + else: + break + + for i in range(max_append_num): + ry, rx = np.round(right_start + right_step * (i + 1)).flatten().astype( + 'int32').tolist() + if ry < h and rx < w and (ry, rx) not in right_list: + if binary_tcl_map[ry, rx] > 0.5: + right_list.append((ry, rx)) + else: + break + + all_list = left_list[::-1] + sorted_list + right_list + return all_list + + +def point_pair2poly(point_pair_list): + """ + Transfer vertical point_pairs into poly point in clockwise. 
+ """ + point_num = len(point_pair_list) * 2 + point_list = [0] * point_num + for idx, point_pair in enumerate(point_pair_list): + point_list[idx] = point_pair[0] + point_list[point_num - 1 - idx] = point_pair[1] + return np.array(point_list).reshape(-1, 2) + + +def shrink_quad_along_width(quad, begin_width_ratio=0., end_width_ratio=1.): + ratio_pair = np.array( + [[begin_width_ratio], [end_width_ratio]], dtype=np.float32) + p0_1 = quad[0] + (quad[1] - quad[0]) * ratio_pair + p3_2 = quad[3] + (quad[2] - quad[3]) * ratio_pair + return np.array([p0_1[0], p0_1[1], p3_2[1], p3_2[0]]) + + +def expand_poly_along_width(poly, shrink_ratio_of_width=0.3): + """ + expand poly along width. + """ + point_num = poly.shape[0] + left_quad = np.array( + [poly[0], poly[1], poly[-2], poly[-1]], dtype=np.float32) + left_ratio = -shrink_ratio_of_width * np.linalg.norm(left_quad[0] - left_quad[3]) / \ + (np.linalg.norm(left_quad[0] - left_quad[1]) + 1e-6) + left_quad_expand = shrink_quad_along_width(left_quad, left_ratio, 1.0) + right_quad = np.array( + [ + poly[point_num // 2 - 2], poly[point_num // 2 - 1], + poly[point_num // 2], poly[point_num // 2 + 1] + ], + dtype=np.float32) + right_ratio = 1.0 + shrink_ratio_of_width * np.linalg.norm(right_quad[0] - right_quad[3]) / \ + (np.linalg.norm(right_quad[0] - right_quad[1]) + 1e-6) + right_quad_expand = shrink_quad_along_width(right_quad, 0.0, right_ratio) + poly[0] = left_quad_expand[0] + poly[-1] = left_quad_expand[-1] + poly[point_num // 2 - 1] = right_quad_expand[1] + poly[point_num // 2] = right_quad_expand[2] + return poly + + +def restore_poly(instance_yxs_list, seq_strs, p_border, ratio_w, ratio_h, src_w, + src_h, valid_set): + poly_list = [] + keep_str_list = [] + for yx_center_line, keep_str in zip(instance_yxs_list, seq_strs): + if len(keep_str) < 2: + print('--> too short, {}'.format(keep_str)) + continue + + offset_expand = 1.0 + if valid_set == 'totaltext': + offset_expand = 1.2 + + point_pair_list = [] + for y, x in yx_center_line: + offset = p_border[:, y, x].reshape(2, 2) * offset_expand + ori_yx = np.array([y, x], dtype=np.float32) + point_pair = (ori_yx + offset)[:, ::-1] * 4.0 / np.array( + [ratio_w, ratio_h]).reshape(-1, 2) + point_pair_list.append(point_pair) + + detected_poly = point_pair2poly(point_pair_list) + detected_poly = expand_poly_along_width( + detected_poly, shrink_ratio_of_width=0.2) + detected_poly[:, 0] = np.clip(detected_poly[:, 0], a_min=0, a_max=src_w) + detected_poly[:, 1] = np.clip(detected_poly[:, 1], a_min=0, a_max=src_h) + + keep_str_list.append(keep_str) + if valid_set == 'partvgg': + middle_point = len(detected_poly) // 2 + detected_poly = detected_poly[ + [0, middle_point - 1, middle_point, -1], :] + poly_list.append(detected_poly) + elif valid_set == 'totaltext': + poly_list.append(detected_poly) + else: + print('--> Not supported format.') + exit(-1) + return poly_list, keep_str_list + + +def generate_pivot_list_fast(p_score, + p_char_maps, + f_direction, + Lexicon_Table, + score_thresh=0.5): + """ + return center point and end point of TCL instance; filter with the char maps; + """ + p_score = p_score[0] + f_direction = f_direction.transpose(1, 2, 0) + p_tcl_map = (p_score > score_thresh) * 1.0 + skeleton_map = thin(p_tcl_map.astype(np.uint8)) + instance_count, instance_label_map = cv2.connectedComponents( + skeleton_map.astype(np.uint8), connectivity=8) + + # get TCL Instance + all_pos_yxs = [] + if instance_count > 0: + for instance_id in range(1, instance_count): + pos_list = [] + ys, xs = 
np.where(instance_label_map == instance_id) + pos_list = list(zip(ys, xs)) + + if len(pos_list) < 3: + continue + + pos_list_sorted = sort_and_expand_with_direction_v2( + pos_list, f_direction, p_tcl_map) + all_pos_yxs.append(pos_list_sorted) + + p_char_maps = p_char_maps.transpose([1, 2, 0]) + decoded_str, keep_yxs_list = ctc_decoder_for_image( + all_pos_yxs, logits_map=p_char_maps, Lexicon_Table=Lexicon_Table) + return keep_yxs_list, decoded_str + + +def extract_main_direction(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] + """ + pos_list = np.array(pos_list) + point_direction = f_direction[pos_list[:, 0], pos_list[:, 1]] + point_direction = point_direction[:, ::-1] # x, y -> y, x + average_direction = np.mean(point_direction, axis=0, keepdims=True) + average_direction = average_direction / ( + np.linalg.norm(average_direction) + 1e-6) + return average_direction + + +def sort_by_direction_with_image_id_deprecated(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[id, y, x], [id, y, x], [id, y, x] ...] + """ + pos_list_full = np.array(pos_list).reshape(-1, 3) + pos_list = pos_list_full[:, 1:] + point_direction = f_direction[pos_list[:, 0], pos_list[:, 1]] # x, y + point_direction = point_direction[:, ::-1] # x, y -> y, x + average_direction = np.mean(point_direction, axis=0, keepdims=True) + pos_proj_leng = np.sum(pos_list * average_direction, axis=1) + sorted_list = pos_list_full[np.argsort(pos_proj_leng)].tolist() + return sorted_list + + +def sort_by_direction_with_image_id(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] + """ + + def sort_part_with_direction(pos_list_full, point_direction): + pos_list_full = np.array(pos_list_full).reshape(-1, 3) + pos_list = pos_list_full[:, 1:] + point_direction = np.array(point_direction).reshape(-1, 2) + average_direction = np.mean(point_direction, axis=0, keepdims=True) + pos_proj_leng = np.sum(pos_list * average_direction, axis=1) + sorted_list = pos_list_full[np.argsort(pos_proj_leng)].tolist() + sorted_direction = point_direction[np.argsort(pos_proj_leng)].tolist() + return sorted_list, sorted_direction + + pos_list = np.array(pos_list).reshape(-1, 3) + point_direction = f_direction[pos_list[:, 1], pos_list[:, 2]] # x, y + point_direction = point_direction[:, ::-1] # x, y -> y, x + sorted_point, sorted_direction = sort_part_with_direction(pos_list, + point_direction) + + point_num = len(sorted_point) + if point_num >= 16: + middle_num = point_num // 2 + first_part_point = sorted_point[:middle_num] + first_point_direction = sorted_direction[:middle_num] + sorted_fist_part_point, sorted_fist_part_direction = sort_part_with_direction( + first_part_point, first_point_direction) + + last_part_point = sorted_point[middle_num:] + last_point_direction = sorted_direction[middle_num:] + sorted_last_part_point, sorted_last_part_direction = sort_part_with_direction( + last_part_point, last_point_direction) + sorted_point = sorted_fist_part_point + sorted_last_part_point + sorted_direction = sorted_fist_part_direction + sorted_last_part_direction + + return sorted_point diff --git a/backend/ppocr/utils/e2e_utils/extract_textpoint_slow.py b/backend/ppocr/utils/e2e_utils/extract_textpoint_slow.py new file mode 100644 index 0000000..ace46fb --- /dev/null +++ b/backend/ppocr/utils/e2e_utils/extract_textpoint_slow.py @@ -0,0 +1,592 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains various CTC decoders.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import cv2 +import math + +import numpy as np +from itertools import groupby +from skimage.morphology._skeletonize import thin + + +def get_dict(character_dict_path): + character_str = "" + with open(character_dict_path, "rb") as fin: + lines = fin.readlines() + for line in lines: + line = line.decode('utf-8').strip("\n").strip("\r\n") + character_str += line + dict_character = list(character_str) + return dict_character + + +def point_pair2poly(point_pair_list): + """ + Transfer vertical point_pairs into poly point in clockwise. + """ + pair_length_list = [] + for point_pair in point_pair_list: + pair_length = np.linalg.norm(point_pair[0] - point_pair[1]) + pair_length_list.append(pair_length) + pair_length_list = np.array(pair_length_list) + pair_info = (pair_length_list.max(), pair_length_list.min(), + pair_length_list.mean()) + + point_num = len(point_pair_list) * 2 + point_list = [0] * point_num + for idx, point_pair in enumerate(point_pair_list): + point_list[idx] = point_pair[0] + point_list[point_num - 1 - idx] = point_pair[1] + return np.array(point_list).reshape(-1, 2), pair_info + + +def shrink_quad_along_width(quad, begin_width_ratio=0., end_width_ratio=1.): + """ + Generate shrink_quad_along_width. + """ + ratio_pair = np.array( + [[begin_width_ratio], [end_width_ratio]], dtype=np.float32) + p0_1 = quad[0] + (quad[1] - quad[0]) * ratio_pair + p3_2 = quad[3] + (quad[2] - quad[3]) * ratio_pair + return np.array([p0_1[0], p0_1[1], p3_2[1], p3_2[0]]) + + +def expand_poly_along_width(poly, shrink_ratio_of_width=0.3): + """ + expand poly along width. + """ + point_num = poly.shape[0] + left_quad = np.array( + [poly[0], poly[1], poly[-2], poly[-1]], dtype=np.float32) + left_ratio = -shrink_ratio_of_width * np.linalg.norm(left_quad[0] - left_quad[3]) / \ + (np.linalg.norm(left_quad[0] - left_quad[1]) + 1e-6) + left_quad_expand = shrink_quad_along_width(left_quad, left_ratio, 1.0) + right_quad = np.array( + [ + poly[point_num // 2 - 2], poly[point_num // 2 - 1], + poly[point_num // 2], poly[point_num // 2 + 1] + ], + dtype=np.float32) + right_ratio = 1.0 + \ + shrink_ratio_of_width * np.linalg.norm(right_quad[0] - right_quad[3]) / \ + (np.linalg.norm(right_quad[0] - right_quad[1]) + 1e-6) + right_quad_expand = shrink_quad_along_width(right_quad, 0.0, right_ratio) + poly[0] = left_quad_expand[0] + poly[-1] = left_quad_expand[-1] + poly[point_num // 2 - 1] = right_quad_expand[1] + poly[point_num // 2] = right_quad_expand[2] + return poly + + +def softmax(logits): + """ + logits: N x d + """ + max_value = np.max(logits, axis=1, keepdims=True) + exp = np.exp(logits - max_value) + exp_sum = np.sum(exp, axis=1, keepdims=True) + dist = exp / exp_sum + return dist + + +def get_keep_pos_idxs(labels, remove_blank=None): + """ + Remove duplicate and get pos idxs of keep items. 
+ The value of keep_blank should be [None, 95]. + """ + duplicate_len_list = [] + keep_pos_idx_list = [] + keep_char_idx_list = [] + for k, v_ in groupby(labels): + current_len = len(list(v_)) + if k != remove_blank: + current_idx = int(sum(duplicate_len_list) + current_len // 2) + keep_pos_idx_list.append(current_idx) + keep_char_idx_list.append(k) + duplicate_len_list.append(current_len) + return keep_char_idx_list, keep_pos_idx_list + + +def remove_blank(labels, blank=0): + new_labels = [x for x in labels if x != blank] + return new_labels + + +def insert_blank(labels, blank=0): + new_labels = [blank] + for l in labels: + new_labels += [l, blank] + return new_labels + + +def ctc_greedy_decoder(probs_seq, blank=95, keep_blank_in_idxs=True): + """ + CTC greedy (best path) decoder. + """ + raw_str = np.argmax(np.array(probs_seq), axis=1) + remove_blank_in_pos = None if keep_blank_in_idxs else blank + dedup_str, keep_idx_list = get_keep_pos_idxs( + raw_str, remove_blank=remove_blank_in_pos) + dst_str = remove_blank(dedup_str, blank=blank) + return dst_str, keep_idx_list + + +def instance_ctc_greedy_decoder(gather_info, + logits_map, + keep_blank_in_idxs=True): + """ + gather_info: [[x, y], [x, y] ...] + logits_map: H x W X (n_chars + 1) + """ + _, _, C = logits_map.shape + ys, xs = zip(*gather_info) + logits_seq = logits_map[list(ys), list(xs)] # n x 96 + probs_seq = softmax(logits_seq) + dst_str, keep_idx_list = ctc_greedy_decoder( + probs_seq, blank=C - 1, keep_blank_in_idxs=keep_blank_in_idxs) + keep_gather_list = [gather_info[idx] for idx in keep_idx_list] + return dst_str, keep_gather_list + + +def ctc_decoder_for_image(gather_info_list, logits_map, + keep_blank_in_idxs=True): + """ + CTC decoder using multiple processes. + """ + decoder_results = [] + for gather_info in gather_info_list: + res = instance_ctc_greedy_decoder( + gather_info, logits_map, keep_blank_in_idxs=keep_blank_in_idxs) + decoder_results.append(res) + return decoder_results + + +def sort_with_direction(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] 
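+
+    Points are ordered by their projection onto the mean direction of the
+    instance; instances with 16 or more points are re-sorted in two halves so
+    that each half is ordered by its own locally averaged direction.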
+ """ + + def sort_part_with_direction(pos_list, point_direction): + pos_list = np.array(pos_list).reshape(-1, 2) + point_direction = np.array(point_direction).reshape(-1, 2) + average_direction = np.mean(point_direction, axis=0, keepdims=True) + pos_proj_leng = np.sum(pos_list * average_direction, axis=1) + sorted_list = pos_list[np.argsort(pos_proj_leng)].tolist() + sorted_direction = point_direction[np.argsort(pos_proj_leng)].tolist() + return sorted_list, sorted_direction + + pos_list = np.array(pos_list).reshape(-1, 2) + point_direction = f_direction[pos_list[:, 0], pos_list[:, 1]] # x, y + point_direction = point_direction[:, ::-1] # x, y -> y, x + sorted_point, sorted_direction = sort_part_with_direction(pos_list, + point_direction) + + point_num = len(sorted_point) + if point_num >= 16: + middle_num = point_num // 2 + first_part_point = sorted_point[:middle_num] + first_point_direction = sorted_direction[:middle_num] + sorted_fist_part_point, sorted_fist_part_direction = sort_part_with_direction( + first_part_point, first_point_direction) + + last_part_point = sorted_point[middle_num:] + last_point_direction = sorted_direction[middle_num:] + sorted_last_part_point, sorted_last_part_direction = sort_part_with_direction( + last_part_point, last_point_direction) + sorted_point = sorted_fist_part_point + sorted_last_part_point + sorted_direction = sorted_fist_part_direction + sorted_last_part_direction + + return sorted_point, np.array(sorted_direction) + + +def add_id(pos_list, image_id=0): + """ + Add id for gather feature, for inference. + """ + new_list = [] + for item in pos_list: + new_list.append((image_id, item[0], item[1])) + return new_list + + +def sort_and_expand_with_direction(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] + """ + h, w, _ = f_direction.shape + sorted_list, point_direction = sort_with_direction(pos_list, f_direction) + + # expand along + point_num = len(sorted_list) + sub_direction_len = max(point_num // 3, 2) + left_direction = point_direction[:sub_direction_len, :] + right_dirction = point_direction[point_num - sub_direction_len:, :] + + left_average_direction = -np.mean(left_direction, axis=0, keepdims=True) + left_average_len = np.linalg.norm(left_average_direction) + left_start = np.array(sorted_list[0]) + left_step = left_average_direction / (left_average_len + 1e-6) + + right_average_direction = np.mean(right_dirction, axis=0, keepdims=True) + right_average_len = np.linalg.norm(right_average_direction) + right_step = right_average_direction / (right_average_len + 1e-6) + right_start = np.array(sorted_list[-1]) + + append_num = max( + int((left_average_len + right_average_len) / 2.0 * 0.15), 1) + left_list = [] + right_list = [] + for i in range(append_num): + ly, lx = np.round(left_start + left_step * (i + 1)).flatten().astype( + 'int32').tolist() + if ly < h and lx < w and (ly, lx) not in left_list: + left_list.append((ly, lx)) + ry, rx = np.round(right_start + right_step * (i + 1)).flatten().astype( + 'int32').tolist() + if ry < h and rx < w and (ry, rx) not in right_list: + right_list.append((ry, rx)) + + all_list = left_list[::-1] + sorted_list + right_list + return all_list + + +def sort_and_expand_with_direction_v2(pos_list, f_direction, binary_tcl_map): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] 
+ binary_tcl_map: h x w + """ + h, w, _ = f_direction.shape + sorted_list, point_direction = sort_with_direction(pos_list, f_direction) + + # expand along + point_num = len(sorted_list) + sub_direction_len = max(point_num // 3, 2) + left_direction = point_direction[:sub_direction_len, :] + right_dirction = point_direction[point_num - sub_direction_len:, :] + + left_average_direction = -np.mean(left_direction, axis=0, keepdims=True) + left_average_len = np.linalg.norm(left_average_direction) + left_start = np.array(sorted_list[0]) + left_step = left_average_direction / (left_average_len + 1e-6) + + right_average_direction = np.mean(right_dirction, axis=0, keepdims=True) + right_average_len = np.linalg.norm(right_average_direction) + right_step = right_average_direction / (right_average_len + 1e-6) + right_start = np.array(sorted_list[-1]) + + append_num = max( + int((left_average_len + right_average_len) / 2.0 * 0.15), 1) + max_append_num = 2 * append_num + + left_list = [] + right_list = [] + for i in range(max_append_num): + ly, lx = np.round(left_start + left_step * (i + 1)).flatten().astype( + 'int32').tolist() + if ly < h and lx < w and (ly, lx) not in left_list: + if binary_tcl_map[ly, lx] > 0.5: + left_list.append((ly, lx)) + else: + break + + for i in range(max_append_num): + ry, rx = np.round(right_start + right_step * (i + 1)).flatten().astype( + 'int32').tolist() + if ry < h and rx < w and (ry, rx) not in right_list: + if binary_tcl_map[ry, rx] > 0.5: + right_list.append((ry, rx)) + else: + break + + all_list = left_list[::-1] + sorted_list + right_list + return all_list + + +def generate_pivot_list_curved(p_score, + p_char_maps, + f_direction, + score_thresh=0.5, + is_expand=True, + is_backbone=False, + image_id=0): + """ + return center point and end point of TCL instance; filter with the char maps; + """ + p_score = p_score[0] + f_direction = f_direction.transpose(1, 2, 0) + p_tcl_map = (p_score > score_thresh) * 1.0 + skeleton_map = thin(p_tcl_map) + instance_count, instance_label_map = cv2.connectedComponents( + skeleton_map.astype(np.uint8), connectivity=8) + + # get TCL Instance + all_pos_yxs = [] + center_pos_yxs = [] + end_points_yxs = [] + instance_center_pos_yxs = [] + pred_strs = [] + if instance_count > 0: + for instance_id in range(1, instance_count): + pos_list = [] + ys, xs = np.where(instance_label_map == instance_id) + pos_list = list(zip(ys, xs)) + + ### FIX-ME, eliminate outlier + if len(pos_list) < 3: + continue + + if is_expand: + pos_list_sorted = sort_and_expand_with_direction_v2( + pos_list, f_direction, p_tcl_map) + else: + pos_list_sorted, _ = sort_with_direction(pos_list, f_direction) + all_pos_yxs.append(pos_list_sorted) + + # use decoder to filter backgroud points. 
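+    # Greedy CTC decoding keeps one representative point per run of identical
+    # labels along each centre line, thinning the gathered points while the
+    # decoded character sequence is recovered at the same time.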
+ p_char_maps = p_char_maps.transpose([1, 2, 0]) + decode_res = ctc_decoder_for_image( + all_pos_yxs, logits_map=p_char_maps, keep_blank_in_idxs=True) + for decoded_str, keep_yxs_list in decode_res: + if is_backbone: + keep_yxs_list_with_id = add_id(keep_yxs_list, image_id=image_id) + instance_center_pos_yxs.append(keep_yxs_list_with_id) + pred_strs.append(decoded_str) + else: + end_points_yxs.extend((keep_yxs_list[0], keep_yxs_list[-1])) + center_pos_yxs.extend(keep_yxs_list) + + if is_backbone: + return pred_strs, instance_center_pos_yxs + else: + return center_pos_yxs, end_points_yxs + + +def generate_pivot_list_horizontal(p_score, + p_char_maps, + f_direction, + score_thresh=0.5, + is_backbone=False, + image_id=0): + """ + return center point and end point of TCL instance; filter with the char maps; + """ + p_score = p_score[0] + f_direction = f_direction.transpose(1, 2, 0) + p_tcl_map_bi = (p_score > score_thresh) * 1.0 + instance_count, instance_label_map = cv2.connectedComponents( + p_tcl_map_bi.astype(np.uint8), connectivity=8) + + # get TCL Instance + all_pos_yxs = [] + center_pos_yxs = [] + end_points_yxs = [] + instance_center_pos_yxs = [] + + if instance_count > 0: + for instance_id in range(1, instance_count): + pos_list = [] + ys, xs = np.where(instance_label_map == instance_id) + pos_list = list(zip(ys, xs)) + + ### FIX-ME, eliminate outlier + if len(pos_list) < 5: + continue + + # add rule here + main_direction = extract_main_direction(pos_list, + f_direction) # y x + reference_directin = np.array([0, 1]).reshape([-1, 2]) # y x + is_h_angle = abs(np.sum( + main_direction * reference_directin)) < math.cos(math.pi / 180 * + 70) + + point_yxs = np.array(pos_list) + max_y, max_x = np.max(point_yxs, axis=0) + min_y, min_x = np.min(point_yxs, axis=0) + is_h_len = (max_y - min_y) < 1.5 * (max_x - min_x) + + pos_list_final = [] + if is_h_len: + xs = np.unique(xs) + for x in xs: + ys = instance_label_map[:, x].copy().reshape((-1, )) + y = int(np.where(ys == instance_id)[0].mean()) + pos_list_final.append((y, x)) + else: + ys = np.unique(ys) + for y in ys: + xs = instance_label_map[y, :].copy().reshape((-1, )) + x = int(np.where(xs == instance_id)[0].mean()) + pos_list_final.append((y, x)) + + pos_list_sorted, _ = sort_with_direction(pos_list_final, + f_direction) + all_pos_yxs.append(pos_list_sorted) + + # use decoder to filter backgroud points. + p_char_maps = p_char_maps.transpose([1, 2, 0]) + decode_res = ctc_decoder_for_image( + all_pos_yxs, logits_map=p_char_maps, keep_blank_in_idxs=True) + for decoded_str, keep_yxs_list in decode_res: + if is_backbone: + keep_yxs_list_with_id = add_id(keep_yxs_list, image_id=image_id) + instance_center_pos_yxs.append(keep_yxs_list_with_id) + else: + end_points_yxs.extend((keep_yxs_list[0], keep_yxs_list[-1])) + center_pos_yxs.extend(keep_yxs_list) + + if is_backbone: + return instance_center_pos_yxs + else: + return center_pos_yxs, end_points_yxs + + +def generate_pivot_list_slow(p_score, + p_char_maps, + f_direction, + score_thresh=0.5, + is_backbone=False, + is_curved=True, + image_id=0): + """ + Warp all the function together. 
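+    Dispatches to generate_pivot_list_curved when is_curved is True, and to
+    generate_pivot_list_horizontal otherwise.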
+ """ + if is_curved: + return generate_pivot_list_curved( + p_score, + p_char_maps, + f_direction, + score_thresh=score_thresh, + is_expand=True, + is_backbone=is_backbone, + image_id=image_id) + else: + return generate_pivot_list_horizontal( + p_score, + p_char_maps, + f_direction, + score_thresh=score_thresh, + is_backbone=is_backbone, + image_id=image_id) + + +# for refine module +def extract_main_direction(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] + """ + pos_list = np.array(pos_list) + point_direction = f_direction[pos_list[:, 0], pos_list[:, 1]] + point_direction = point_direction[:, ::-1] # x, y -> y, x + average_direction = np.mean(point_direction, axis=0, keepdims=True) + average_direction = average_direction / ( + np.linalg.norm(average_direction) + 1e-6) + return average_direction + + +def sort_by_direction_with_image_id_deprecated(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[id, y, x], [id, y, x], [id, y, x] ...] + """ + pos_list_full = np.array(pos_list).reshape(-1, 3) + pos_list = pos_list_full[:, 1:] + point_direction = f_direction[pos_list[:, 0], pos_list[:, 1]] # x, y + point_direction = point_direction[:, ::-1] # x, y -> y, x + average_direction = np.mean(point_direction, axis=0, keepdims=True) + pos_proj_leng = np.sum(pos_list * average_direction, axis=1) + sorted_list = pos_list_full[np.argsort(pos_proj_leng)].tolist() + return sorted_list + + +def sort_by_direction_with_image_id(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] + """ + + def sort_part_with_direction(pos_list_full, point_direction): + pos_list_full = np.array(pos_list_full).reshape(-1, 3) + pos_list = pos_list_full[:, 1:] + point_direction = np.array(point_direction).reshape(-1, 2) + average_direction = np.mean(point_direction, axis=0, keepdims=True) + pos_proj_leng = np.sum(pos_list * average_direction, axis=1) + sorted_list = pos_list_full[np.argsort(pos_proj_leng)].tolist() + sorted_direction = point_direction[np.argsort(pos_proj_leng)].tolist() + return sorted_list, sorted_direction + + pos_list = np.array(pos_list).reshape(-1, 3) + point_direction = f_direction[pos_list[:, 1], pos_list[:, 2]] # x, y + point_direction = point_direction[:, ::-1] # x, y -> y, x + sorted_point, sorted_direction = sort_part_with_direction(pos_list, + point_direction) + + point_num = len(sorted_point) + if point_num >= 16: + middle_num = point_num // 2 + first_part_point = sorted_point[:middle_num] + first_point_direction = sorted_direction[:middle_num] + sorted_fist_part_point, sorted_fist_part_direction = sort_part_with_direction( + first_part_point, first_point_direction) + + last_part_point = sorted_point[middle_num:] + last_point_direction = sorted_direction[middle_num:] + sorted_last_part_point, sorted_last_part_direction = sort_part_with_direction( + last_part_point, last_point_direction) + sorted_point = sorted_fist_part_point + sorted_last_part_point + sorted_direction = sorted_fist_part_direction + sorted_last_part_direction + + return sorted_point + + +def generate_pivot_list_tt_inference(p_score, + p_char_maps, + f_direction, + score_thresh=0.5, + is_backbone=False, + is_curved=True, + image_id=0): + """ + return center point and end point of TCL instance; filter with the char maps; + """ + p_score = p_score[0] + f_direction = f_direction.transpose(1, 2, 0) + p_tcl_map = (p_score > score_thresh) * 1.0 + skeleton_map = thin(p_tcl_map) + instance_count, instance_label_map = 
cv2.connectedComponents( + skeleton_map.astype(np.uint8), connectivity=8) + + # get TCL Instance + all_pos_yxs = [] + if instance_count > 0: + for instance_id in range(1, instance_count): + pos_list = [] + ys, xs = np.where(instance_label_map == instance_id) + pos_list = list(zip(ys, xs)) + ### FIX-ME, eliminate outlier + if len(pos_list) < 3: + continue + pos_list_sorted = sort_and_expand_with_direction_v2( + pos_list, f_direction, p_tcl_map) + pos_list_sorted_with_id = add_id(pos_list_sorted, image_id=image_id) + all_pos_yxs.append(pos_list_sorted_with_id) + return all_pos_yxs diff --git a/backend/ppocr/utils/e2e_utils/pgnet_pp_utils.py b/backend/ppocr/utils/e2e_utils/pgnet_pp_utils.py new file mode 100644 index 0000000..a15503c --- /dev/null +++ b/backend/ppocr/utils/e2e_utils/pgnet_pp_utils.py @@ -0,0 +1,162 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import paddle +import os +import sys + +__dir__ = os.path.dirname(__file__) +sys.path.append(__dir__) +sys.path.append(os.path.join(__dir__, '..')) +from extract_textpoint_slow import * +from extract_textpoint_fast import generate_pivot_list_fast, restore_poly + + +class PGNet_PostProcess(object): + # two different post-process + def __init__(self, character_dict_path, valid_set, score_thresh, outs_dict, + shape_list): + self.Lexicon_Table = get_dict(character_dict_path) + self.valid_set = valid_set + self.score_thresh = score_thresh + self.outs_dict = outs_dict + self.shape_list = shape_list + + def pg_postprocess_fast(self): + p_score = self.outs_dict['f_score'] + p_border = self.outs_dict['f_border'] + p_char = self.outs_dict['f_char'] + p_direction = self.outs_dict['f_direction'] + if isinstance(p_score, paddle.Tensor): + p_score = p_score[0].numpy() + p_border = p_border[0].numpy() + p_direction = p_direction[0].numpy() + p_char = p_char[0].numpy() + else: + p_score = p_score[0] + p_border = p_border[0] + p_direction = p_direction[0] + p_char = p_char[0] + + src_h, src_w, ratio_h, ratio_w = self.shape_list[0] + instance_yxs_list, seq_strs = generate_pivot_list_fast( + p_score, + p_char, + p_direction, + self.Lexicon_Table, + score_thresh=self.score_thresh) + poly_list, keep_str_list = restore_poly(instance_yxs_list, seq_strs, + p_border, ratio_w, ratio_h, + src_w, src_h, self.valid_set) + data = { + 'points': poly_list, + 'texts': keep_str_list, + } + return data + + def pg_postprocess_slow(self): + p_score = self.outs_dict['f_score'] + p_border = self.outs_dict['f_border'] + p_char = self.outs_dict['f_char'] + p_direction = self.outs_dict['f_direction'] + if isinstance(p_score, paddle.Tensor): + p_score = p_score[0].numpy() + p_border = p_border[0].numpy() + p_direction = p_direction[0].numpy() + p_char = p_char[0].numpy() + else: + p_score = p_score[0] + p_border = p_border[0] + p_direction = p_direction[0] + p_char = p_char[0] + src_h, src_w, ratio_h, 
ratio_w = self.shape_list[0] + is_curved = self.valid_set == "totaltext" + char_seq_idx_set, instance_yxs_list = generate_pivot_list_slow( + p_score, + p_char, + p_direction, + score_thresh=self.score_thresh, + is_backbone=True, + is_curved=is_curved) + seq_strs = [] + for char_idx_set in char_seq_idx_set: + pr_str = ''.join([self.Lexicon_Table[pos] for pos in char_idx_set]) + seq_strs.append(pr_str) + poly_list = [] + keep_str_list = [] + all_point_list = [] + all_point_pair_list = [] + for yx_center_line, keep_str in zip(instance_yxs_list, seq_strs): + if len(yx_center_line) == 1: + yx_center_line.append(yx_center_line[-1]) + + offset_expand = 1.0 + if self.valid_set == 'totaltext': + offset_expand = 1.2 + + point_pair_list = [] + for batch_id, y, x in yx_center_line: + offset = p_border[:, y, x].reshape(2, 2) + if offset_expand != 1.0: + offset_length = np.linalg.norm( + offset, axis=1, keepdims=True) + expand_length = np.clip( + offset_length * (offset_expand - 1), + a_min=0.5, + a_max=3.0) + offset_detal = offset / offset_length * expand_length + offset = offset + offset_detal + ori_yx = np.array([y, x], dtype=np.float32) + point_pair = (ori_yx + offset)[:, ::-1] * 4.0 / np.array( + [ratio_w, ratio_h]).reshape(-1, 2) + point_pair_list.append(point_pair) + + all_point_list.append([ + int(round(x * 4.0 / ratio_w)), + int(round(y * 4.0 / ratio_h)) + ]) + all_point_pair_list.append(point_pair.round().astype(np.int32) + .tolist()) + + detected_poly, pair_length_info = point_pair2poly(point_pair_list) + detected_poly = expand_poly_along_width( + detected_poly, shrink_ratio_of_width=0.2) + detected_poly[:, 0] = np.clip( + detected_poly[:, 0], a_min=0, a_max=src_w) + detected_poly[:, 1] = np.clip( + detected_poly[:, 1], a_min=0, a_max=src_h) + + if len(keep_str) < 2: + continue + + keep_str_list.append(keep_str) + detected_poly = np.round(detected_poly).astype('int32') + if self.valid_set == 'partvgg': + middle_point = len(detected_poly) // 2 + detected_poly = detected_poly[ + [0, middle_point - 1, middle_point, -1], :] + poly_list.append(detected_poly) + elif self.valid_set == 'totaltext': + poly_list.append(detected_poly) + else: + print('--> Not supported format.') + exit(-1) + data = { + 'points': poly_list, + 'texts': keep_str_list, + } + return data diff --git a/backend/ppocr/utils/e2e_utils/visual.py b/backend/ppocr/utils/e2e_utils/visual.py new file mode 100644 index 0000000..e6e4fd0 --- /dev/null +++ b/backend/ppocr/utils/e2e_utils/visual.py @@ -0,0 +1,162 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import numpy as np +import cv2 +import time + + +def resize_image(im, max_side_len=512): + """ + resize image to a size multiple of max_stride which is required by the network + :param im: the resized image + :param max_side_len: limit of max image size to avoid out of memory in gpu + :return: the resized image and the resize ratio + """ + h, w, _ = im.shape + + resize_w = w + resize_h = h + + if resize_h > resize_w: + ratio = float(max_side_len) / resize_h + else: + ratio = float(max_side_len) / resize_w + + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(im, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + + return im, (ratio_h, ratio_w) + + +def resize_image_min(im, max_side_len=512): + """ + """ + h, w, _ = im.shape + + resize_w = w + resize_h = h + + if resize_h < resize_w: + ratio = float(max_side_len) / resize_h + else: + ratio = float(max_side_len) / resize_w + + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(im, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + return im, (ratio_h, ratio_w) + + +def resize_image_for_totaltext(im, max_side_len=512): + """ + """ + h, w, _ = im.shape + + resize_w = w + resize_h = h + ratio = 1.25 + if h * ratio > max_side_len: + ratio = float(max_side_len) / resize_h + + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(im, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + return im, (ratio_h, ratio_w) + + +def point_pair2poly(point_pair_list): + """ + Transfer vertical point_pairs into poly point in clockwise. + """ + pair_length_list = [] + for point_pair in point_pair_list: + pair_length = np.linalg.norm(point_pair[0] - point_pair[1]) + pair_length_list.append(pair_length) + pair_length_list = np.array(pair_length_list) + pair_info = (pair_length_list.max(), pair_length_list.min(), + pair_length_list.mean()) + + point_num = len(point_pair_list) * 2 + point_list = [0] * point_num + for idx, point_pair in enumerate(point_pair_list): + point_list[idx] = point_pair[0] + point_list[point_num - 1 - idx] = point_pair[1] + return np.array(point_list).reshape(-1, 2), pair_info + + +def shrink_quad_along_width(quad, begin_width_ratio=0., end_width_ratio=1.): + """ + Generate shrink_quad_along_width. + """ + ratio_pair = np.array( + [[begin_width_ratio], [end_width_ratio]], dtype=np.float32) + p0_1 = quad[0] + (quad[1] - quad[0]) * ratio_pair + p3_2 = quad[3] + (quad[2] - quad[3]) * ratio_pair + return np.array([p0_1[0], p0_1[1], p3_2[1], p3_2[0]]) + + +def expand_poly_along_width(poly, shrink_ratio_of_width=0.3): + """ + expand poly along width. 
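+    The end point pairs are moved outwards along the top/bottom edge direction
+    by shrink_ratio_of_width times the local text height, so the returned
+    polygon extends slightly beyond both ends of the text region.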
+ """ + point_num = poly.shape[0] + left_quad = np.array( + [poly[0], poly[1], poly[-2], poly[-1]], dtype=np.float32) + left_ratio = -shrink_ratio_of_width * np.linalg.norm(left_quad[0] - left_quad[3]) / \ + (np.linalg.norm(left_quad[0] - left_quad[1]) + 1e-6) + left_quad_expand = shrink_quad_along_width(left_quad, left_ratio, 1.0) + right_quad = np.array( + [ + poly[point_num // 2 - 2], poly[point_num // 2 - 1], + poly[point_num // 2], poly[point_num // 2 + 1] + ], + dtype=np.float32) + right_ratio = 1.0 + \ + shrink_ratio_of_width * np.linalg.norm(right_quad[0] - right_quad[3]) / \ + (np.linalg.norm(right_quad[0] - right_quad[1]) + 1e-6) + right_quad_expand = shrink_quad_along_width(right_quad, 0.0, right_ratio) + poly[0] = left_quad_expand[0] + poly[-1] = left_quad_expand[-1] + poly[point_num // 2 - 1] = right_quad_expand[1] + poly[point_num // 2] = right_quad_expand[2] + return poly + + +def norm2(x, axis=None): + if axis: + return np.sqrt(np.sum(x**2, axis=axis)) + return np.sqrt(np.sum(x**2)) + + +def cos(p1, p2): + return (p1 * p2).sum() / (norm2(p1) * norm2(p2)) diff --git a/backend/ppocr/utils/iou.py b/backend/ppocr/utils/iou.py new file mode 100644 index 0000000..35459f5 --- /dev/null +++ b/backend/ppocr/utils/iou.py @@ -0,0 +1,54 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +This code is refer from: +https://github.com/whai362/PSENet/blob/python3/models/loss/iou.py +""" + +import paddle + +EPS = 1e-6 + + +def iou_single(a, b, mask, n_class): + valid = mask == 1 + a = a.masked_select(valid) + b = b.masked_select(valid) + miou = [] + for i in range(n_class): + if a.shape == [0] and a.shape == b.shape: + inter = paddle.to_tensor(0.0) + union = paddle.to_tensor(0.0) + else: + inter = ((a == i).logical_and(b == i)).astype('float32') + union = ((a == i).logical_or(b == i)).astype('float32') + miou.append(paddle.sum(inter) / (paddle.sum(union) + EPS)) + miou = sum(miou) / len(miou) + return miou + + +def iou(a, b, mask, n_class=2, reduce=True): + batch_size = a.shape[0] + + a = a.reshape([batch_size, -1]) + b = b.reshape([batch_size, -1]) + mask = mask.reshape([batch_size, -1]) + + iou = paddle.zeros((batch_size, ), dtype='float32') + for i in range(batch_size): + iou[i] = iou_single(a[i], b[i], mask[i], n_class) + + if reduce: + iou = paddle.mean(iou) + return iou diff --git a/backend/ppocr/utils/loggers/__init__.py b/backend/ppocr/utils/loggers/__init__.py new file mode 100644 index 0000000..b1e92f7 --- /dev/null +++ b/backend/ppocr/utils/loggers/__init__.py @@ -0,0 +1,3 @@ +from .vdl_logger import VDLLogger +from .wandb_logger import WandbLogger +from .loggers import Loggers diff --git a/backend/ppocr/utils/loggers/base_logger.py b/backend/ppocr/utils/loggers/base_logger.py new file mode 100644 index 0000000..3a7fc35 --- /dev/null +++ b/backend/ppocr/utils/loggers/base_logger.py @@ -0,0 +1,15 @@ +import os +from abc import ABC, abstractmethod + +class BaseLogger(ABC): + def __init__(self, save_dir): + self.save_dir = save_dir + os.makedirs(self.save_dir, exist_ok=True) + + @abstractmethod + def log_metrics(self, metrics, prefix=None): + pass + + @abstractmethod + def close(self): + pass \ No newline at end of file diff --git a/backend/ppocr/utils/loggers/loggers.py b/backend/ppocr/utils/loggers/loggers.py new file mode 100644 index 0000000..2601466 --- /dev/null +++ b/backend/ppocr/utils/loggers/loggers.py @@ -0,0 +1,18 @@ +from .wandb_logger import WandbLogger + +class Loggers(object): + def __init__(self, loggers): + super().__init__() + self.loggers = loggers + + def log_metrics(self, metrics, prefix=None, step=None): + for logger in self.loggers: + logger.log_metrics(metrics, prefix=prefix, step=step) + + def log_model(self, is_best, prefix, metadata=None): + for logger in self.loggers: + logger.log_model(is_best=is_best, prefix=prefix, metadata=metadata) + + def close(self): + for logger in self.loggers: + logger.close() \ No newline at end of file diff --git a/backend/ppocr/utils/loggers/vdl_logger.py b/backend/ppocr/utils/loggers/vdl_logger.py new file mode 100644 index 0000000..c345f93 --- /dev/null +++ b/backend/ppocr/utils/loggers/vdl_logger.py @@ -0,0 +1,21 @@ +from .base_logger import BaseLogger +from visualdl import LogWriter + +class VDLLogger(BaseLogger): + def __init__(self, save_dir): + super().__init__(save_dir) + self.vdl_writer = LogWriter(logdir=save_dir) + + def log_metrics(self, metrics, prefix=None, step=None): + if not prefix: + prefix = "" + updated_metrics = {prefix + "/" + k: v for k, v in metrics.items()} + + for k, v in updated_metrics.items(): + self.vdl_writer.add_scalar(k, v, step) + + def log_model(self, is_best, prefix, metadata=None): + pass + + def close(self): + self.vdl_writer.close() \ No newline at end of file diff --git a/backend/ppocr/utils/loggers/wandb_logger.py b/backend/ppocr/utils/loggers/wandb_logger.py new 
file mode 100644 index 0000000..5c805f4 --- /dev/null +++ b/backend/ppocr/utils/loggers/wandb_logger.py @@ -0,0 +1,78 @@ +import os +from .base_logger import BaseLogger + +class WandbLogger(BaseLogger): + def __init__(self, + project=None, + name=None, + id=None, + entity=None, + save_dir=None, + config=None, + **kwargs): + try: + import wandb + self.wandb = wandb + except ModuleNotFoundError: + raise ModuleNotFoundError( + "Please install wandb using `pip install wandb`" + ) + + self.project = project + self.name = name + self.id = id + self.save_dir = save_dir + self.config = config + self.kwargs = kwargs + self.entity = entity + self._run = None + self._wandb_init = dict( + project=self.project, + name=self.name, + id=self.id, + entity=self.entity, + dir=self.save_dir, + resume="allow" + ) + self._wandb_init.update(**kwargs) + + _ = self.run + + if self.config: + self.run.settings_config.update(self.config) + + @property + def run(self): + if self._run is None: + if self.wandb.run is not None: + logger.info( + "There is a wandb run already in progress " + "and newly created instances of `WandbLogger` will reuse" + " this run. If this is not desired, call `wandb.finish()`" + "before instantiating `WandbLogger`." + ) + self._run = self.wandb.run + else: + self._run = self.wandb.init(**self._wandb_init) + return self._run + + def log_metrics(self, metrics, prefix=None, step=None): + if not prefix: + prefix = "" + updated_metrics = {prefix.lower() + "/" + k: v for k, v in metrics.items()} + + self.run.log(updated_metrics, step=step) + + def log_model(self, is_best, prefix, metadata=None): + model_path = os.path.join(self.save_dir, prefix + '.pdparams') + artifact = self.wandb.Artifact('model-{}'.format(self.run.id), type='model', metadata=metadata) + artifact.add_file(model_path, name="model_ckpt.pdparams") + + aliases = [prefix] + if is_best: + aliases.append("best") + + self.run.log_artifact(artifact, aliases=aliases) + + def close(self): + self.run.finish() \ No newline at end of file diff --git a/backend/ppocr/utils/logging.py b/backend/ppocr/utils/logging.py new file mode 100644 index 0000000..1eac8f3 --- /dev/null +++ b/backend/ppocr/utils/logging.py @@ -0,0 +1,71 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/WenmuZhou/PytorchOCR/blob/master/torchocr/utils/logging.py +""" + +import os +import sys +import logging +import functools +import paddle.distributed as dist + +logger_initialized = {} + + +@functools.lru_cache() +def get_logger(name='ppocr', log_file=None, log_level=logging.DEBUG): + """Initialize and get a logger by name. + If the logger has not been initialized, this method will initialize the + logger by adding one or two handlers, otherwise the initialized logger will + be directly returned. During initialization, a StreamHandler will always be + added. If `log_file` is specified a FileHandler will also be added. + Args: + name (str): Logger name. 
+ log_file (str | None): The log filename. If specified, a FileHandler + will be added to the logger. + log_level (int): The logger level. Note that only the process of + rank 0 is affected, and other processes will set the level to + "Error" thus be silent most of the time. + Returns: + logging.Logger: The expected logger. + """ + logger = logging.getLogger(name) + if name in logger_initialized: + return logger + for logger_name in logger_initialized: + if name.startswith(logger_name): + return logger + + formatter = logging.Formatter( + '[%(asctime)s] %(name)s %(levelname)s: %(message)s', + datefmt="%Y/%m/%d %H:%M:%S") + + stream_handler = logging.StreamHandler(stream=sys.stdout) + stream_handler.setFormatter(formatter) + logger.addHandler(stream_handler) + if log_file is not None and dist.get_rank() == 0: + log_file_folder = os.path.split(log_file)[0] + os.makedirs(log_file_folder, exist_ok=True) + file_handler = logging.FileHandler(log_file, 'a') + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + if dist.get_rank() == 0: + logger.setLevel(log_level) + else: + logger.setLevel(logging.ERROR) + logger_initialized[name] = True + logger.propagate = False + return logger diff --git a/backend/ppocr/utils/network.py b/backend/ppocr/utils/network.py new file mode 100644 index 0000000..118d1be --- /dev/null +++ b/backend/ppocr/utils/network.py @@ -0,0 +1,84 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import sys +import tarfile +import requests +from tqdm import tqdm + +from ppocr.utils.logging import get_logger + + +def download_with_progressbar(url, save_path): + logger = get_logger() + response = requests.get(url, stream=True) + if response.status_code == 200: + total_size_in_bytes = int(response.headers.get('content-length', 1)) + block_size = 1024 # 1 Kibibyte + progress_bar = tqdm( + total=total_size_in_bytes, unit='iB', unit_scale=True) + with open(save_path, 'wb') as file: + for data in response.iter_content(block_size): + progress_bar.update(len(data)) + file.write(data) + progress_bar.close() + else: + logger.error("Something went wrong while downloading models") + sys.exit(0) + + +def maybe_download(model_storage_directory, url): + # using custom model + tar_file_name_list = [ + 'inference.pdiparams', 'inference.pdiparams.info', 'inference.pdmodel' + ] + if not os.path.exists( + os.path.join(model_storage_directory, 'inference.pdiparams') + ) or not os.path.exists( + os.path.join(model_storage_directory, 'inference.pdmodel')): + assert url.endswith('.tar'), 'Only supports tar compressed package' + tmp_path = os.path.join(model_storage_directory, url.split('/')[-1]) + print('download {} to {}'.format(url, tmp_path)) + os.makedirs(model_storage_directory, exist_ok=True) + download_with_progressbar(url, tmp_path) + with tarfile.open(tmp_path, 'r') as tarObj: + for member in tarObj.getmembers(): + filename = None + for tar_file_name in tar_file_name_list: + if tar_file_name in member.name: + filename = tar_file_name + if filename is None: + continue + file = tarObj.extractfile(member) + with open( + os.path.join(model_storage_directory, filename), + 'wb') as f: + f.write(file.read()) + os.remove(tmp_path) + + +def is_link(s): + return s is not None and s.startswith('http') + + +def confirm_model_dir_url(model_dir, default_model_dir, default_url): + url = default_url + if model_dir is None or is_link(model_dir): + if is_link(model_dir): + url = model_dir + file_name = url.split('/')[-1][:-4] + model_dir = default_model_dir + model_dir = os.path.join(model_dir, file_name) + return model_dir, url diff --git a/backend/ppocr/utils/poly_nms.py b/backend/ppocr/utils/poly_nms.py new file mode 100644 index 0000000..9dcb3d2 --- /dev/null +++ b/backend/ppocr/utils/poly_nms.py @@ -0,0 +1,146 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from shapely.geometry import Polygon + + +def points2polygon(points): + """Convert k points to 1 polygon. + + Args: + points (ndarray or list): A ndarray or a list of shape (2k) + that indicates k points. + + Returns: + polygon (Polygon): A polygon object. 
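+
+    Example (illustrative):
+        points2polygon([0, 0, 1, 0, 1, 1, 0, 1]) returns the unit square as a
+        shapely Polygon.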
+ """ + if isinstance(points, list): + points = np.array(points) + + assert isinstance(points, np.ndarray) + assert (points.size % 2 == 0) and (points.size >= 8) + + point_mat = points.reshape([-1, 2]) + return Polygon(point_mat) + + +def poly_intersection(poly_det, poly_gt, buffer=0.0001): + """Calculate the intersection area between two polygon. + + Args: + poly_det (Polygon): A polygon predicted by detector. + poly_gt (Polygon): A gt polygon. + + Returns: + intersection_area (float): The intersection area between two polygons. + """ + assert isinstance(poly_det, Polygon) + assert isinstance(poly_gt, Polygon) + + if buffer == 0: + poly_inter = poly_det & poly_gt + else: + poly_inter = poly_det.buffer(buffer) & poly_gt.buffer(buffer) + return poly_inter.area, poly_inter + + +def poly_union(poly_det, poly_gt): + """Calculate the union area between two polygon. + + Args: + poly_det (Polygon): A polygon predicted by detector. + poly_gt (Polygon): A gt polygon. + + Returns: + union_area (float): The union area between two polygons. + """ + assert isinstance(poly_det, Polygon) + assert isinstance(poly_gt, Polygon) + + area_det = poly_det.area + area_gt = poly_gt.area + area_inters, _ = poly_intersection(poly_det, poly_gt) + return area_det + area_gt - area_inters + + +def valid_boundary(x, with_score=True): + num = len(x) + if num < 8: + return False + if num % 2 == 0 and (not with_score): + return True + if num % 2 == 1 and with_score: + return True + + return False + + +def boundary_iou(src, target): + """Calculate the IOU between two boundaries. + + Args: + src (list): Source boundary. + target (list): Target boundary. + + Returns: + iou (float): The iou between two boundaries. + """ + assert valid_boundary(src, False) + assert valid_boundary(target, False) + src_poly = points2polygon(src) + target_poly = points2polygon(target) + + return poly_iou(src_poly, target_poly) + + +def poly_iou(poly_det, poly_gt): + """Calculate the IOU between two polygons. + + Args: + poly_det (Polygon): A polygon predicted by detector. + poly_gt (Polygon): A gt polygon. + + Returns: + iou (float): The IOU between two polygons. + """ + assert isinstance(poly_det, Polygon) + assert isinstance(poly_gt, Polygon) + area_inters, _ = poly_intersection(poly_det, poly_gt) + area_union = poly_union(poly_det, poly_gt) + if area_union == 0: + return 0.0 + return area_inters / area_union + + +def poly_nms(polygons, threshold): + assert isinstance(polygons, list) + + polygons = np.array(sorted(polygons, key=lambda x: x[-1])) + + keep_poly = [] + index = [i for i in range(polygons.shape[0])] + + while len(index) > 0: + keep_poly.append(polygons[index[-1]].tolist()) + A = polygons[index[-1]][:-1] + index = np.delete(index, -1) + iou_list = np.zeros((len(index), )) + for i in range(len(index)): + B = polygons[index[i]][:-1] + iou_list[i] = boundary_iou(A, B) + remove_index = np.where(iou_list > threshold) + index = np.delete(index, remove_index) + + return keep_poly diff --git a/backend/ppocr/utils/profiler.py b/backend/ppocr/utils/profiler.py new file mode 100644 index 0000000..c4e28bc --- /dev/null +++ b/backend/ppocr/utils/profiler.py @@ -0,0 +1,110 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import paddle + +# A global variable to record the number of calling times for profiler +# functions. It is used to specify the tracing range of training steps. +_profiler_step_id = 0 + +# A global variable to avoid parsing from string every time. +_profiler_options = None + + +class ProfilerOptions(object): + ''' + Use a string to initialize a ProfilerOptions. + The string should be in the format: "key1=value1;key2=value;key3=value3". + For example: + "profile_path=model.profile" + "batch_range=[50, 60]; profile_path=model.profile" + "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile" + ProfilerOptions supports following key-value pair: + batch_range - a integer list, e.g. [100, 110]. + state - a string, the optional values are 'CPU', 'GPU' or 'All'. + sorted_key - a string, the optional values are 'calls', 'total', + 'max', 'min' or 'ave. + tracer_option - a string, the optional values are 'Default', 'OpDetail', + 'AllOpDetail'. + profile_path - a string, the path to save the serialized profile data, + which can be used to generate a timeline. + exit_on_finished - a boolean. + ''' + + def __init__(self, options_str): + assert isinstance(options_str, str) + + self._options = { + 'batch_range': [10, 20], + 'state': 'All', + 'sorted_key': 'total', + 'tracer_option': 'Default', + 'profile_path': '/tmp/profile', + 'exit_on_finished': True + } + self._parse_from_string(options_str) + + def _parse_from_string(self, options_str): + for kv in options_str.replace(' ', '').split(';'): + key, value = kv.split('=') + if key == 'batch_range': + value_list = value.replace('[', '').replace(']', '').split(',') + value_list = list(map(int, value_list)) + if len(value_list) >= 2 and value_list[0] >= 0 and value_list[ + 1] > value_list[0]: + self._options[key] = value_list + elif key == 'exit_on_finished': + self._options[key] = value.lower() in ("yes", "true", "t", "1") + elif key in [ + 'state', 'sorted_key', 'tracer_option', 'profile_path' + ]: + self._options[key] = value + + def __getitem__(self, name): + if self._options.get(name, None) is None: + raise ValueError( + "ProfilerOptions does not have an option named %s." % name) + return self._options[name] + + +def add_profiler_step(options_str=None): + ''' + Enable the operator-level timing using PaddlePaddle's profiler. + The profiler uses a independent variable to count the profiler steps. + One call of this function is treated as a profiler step. + + Args: + profiler_options - a string to initialize the ProfilerOptions. + Default is None, and the profiler is disabled. 
+ ''' + if options_str is None: + return + + global _profiler_step_id + global _profiler_options + + if _profiler_options is None: + _profiler_options = ProfilerOptions(options_str) + + if _profiler_step_id == _profiler_options['batch_range'][0]: + paddle.utils.profiler.start_profiler( + _profiler_options['state'], _profiler_options['tracer_option']) + elif _profiler_step_id == _profiler_options['batch_range'][1]: + paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'], + _profiler_options['profile_path']) + if _profiler_options['exit_on_finished']: + sys.exit(0) + + _profiler_step_id += 1 diff --git a/backend/ppocr/utils/save_load.py b/backend/ppocr/utils/save_load.py new file mode 100644 index 0000000..b09f1db --- /dev/null +++ b/backend/ppocr/utils/save_load.py @@ -0,0 +1,185 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import errno +import os +import pickle +import six + +import paddle + +from ppocr.utils.logging import get_logger + +__all__ = ['load_model'] + + +def _mkdir_if_not_exist(path, logger): + """ + mkdir if not exists, ignore the exception when multiprocess mkdir together + """ + if not os.path.exists(path): + try: + os.makedirs(path) + except OSError as e: + if e.errno == errno.EEXIST and os.path.isdir(path): + logger.warning( + 'be happy if some process has already created {}'.format( + path)) + else: + raise OSError('Failed to mkdir {}'.format(path)) + + +def load_model(config, model, optimizer=None, model_type='det'): + """ + load model from checkpoint or pretrained_model + """ + logger = get_logger() + global_config = config['Global'] + checkpoints = global_config.get('checkpoints') + pretrained_model = global_config.get('pretrained_model') + best_model_dict = {} + + if model_type == 'vqa': + checkpoints = config['Architecture']['Backbone']['checkpoints'] + # load vqa method metric + if checkpoints: + if os.path.exists(os.path.join(checkpoints, 'metric.states')): + with open(os.path.join(checkpoints, 'metric.states'), + 'rb') as f: + states_dict = pickle.load(f) if six.PY2 else pickle.load( + f, encoding='latin1') + best_model_dict = states_dict.get('best_model_dict', {}) + if 'epoch' in states_dict: + best_model_dict['start_epoch'] = states_dict['epoch'] + 1 + logger.info("resume from {}".format(checkpoints)) + + if optimizer is not None: + if checkpoints[-1] in ['/', '\\']: + checkpoints = checkpoints[:-1] + if os.path.exists(checkpoints + '.pdopt'): + optim_dict = paddle.load(checkpoints + '.pdopt') + optimizer.set_state_dict(optim_dict) + else: + logger.warning( + "{}.pdopt is not exists, params of optimizer is not loaded". 
+ format(checkpoints)) + return best_model_dict + + if checkpoints: + if checkpoints.endswith('.pdparams'): + checkpoints = checkpoints.replace('.pdparams', '') + assert os.path.exists(checkpoints + ".pdparams"), \ + "The {}.pdparams does not exists!".format(checkpoints) + + # load params from trained model + params = paddle.load(checkpoints + '.pdparams') + state_dict = model.state_dict() + new_state_dict = {} + for key, value in state_dict.items(): + if key not in params: + logger.warning("{} not in loaded params {} !".format( + key, params.keys())) + continue + pre_value = params[key] + if list(value.shape) == list(pre_value.shape): + new_state_dict[key] = pre_value + else: + logger.warning( + "The shape of model params {} {} not matched with loaded params shape {} !". + format(key, value.shape, pre_value.shape)) + model.set_state_dict(new_state_dict) + + if optimizer is not None: + if os.path.exists(checkpoints + '.pdopt'): + optim_dict = paddle.load(checkpoints + '.pdopt') + optimizer.set_state_dict(optim_dict) + else: + logger.warning( + "{}.pdopt is not exists, params of optimizer is not loaded". + format(checkpoints)) + + if os.path.exists(checkpoints + '.states'): + with open(checkpoints + '.states', 'rb') as f: + states_dict = pickle.load(f) if six.PY2 else pickle.load( + f, encoding='latin1') + best_model_dict = states_dict.get('best_model_dict', {}) + if 'epoch' in states_dict: + best_model_dict['start_epoch'] = states_dict['epoch'] + 1 + logger.info("resume from {}".format(checkpoints)) + elif pretrained_model: + load_pretrained_params(model, pretrained_model) + else: + logger.info('train from scratch') + return best_model_dict + + +def load_pretrained_params(model, path): + logger = get_logger() + if path.endswith('.pdparams'): + path = path.replace('.pdparams', '') + assert os.path.exists(path + ".pdparams"), \ + "The {}.pdparams does not exists!".format(path) + + params = paddle.load(path + '.pdparams') + state_dict = model.state_dict() + new_state_dict = {} + for k1 in params.keys(): + if k1 not in state_dict.keys(): + logger.warning("The pretrained params {} not in model".format(k1)) + else: + if list(state_dict[k1].shape) == list(params[k1].shape): + new_state_dict[k1] = params[k1] + else: + logger.warning( + "The shape of model params {} {} not matched with loaded params {} {} !". 
+ format(k1, state_dict[k1].shape, k1, params[k1].shape)) + model.set_state_dict(new_state_dict) + logger.info("load pretrain successful from {}".format(path)) + return model + + +def save_model(model, + optimizer, + model_path, + logger, + config, + is_best=False, + prefix='ppocr', + **kwargs): + """ + save model to the target path + """ + _mkdir_if_not_exist(model_path, logger) + model_prefix = os.path.join(model_path, prefix) + paddle.save(optimizer.state_dict(), model_prefix + '.pdopt') + if config['Architecture']["model_type"] != 'vqa': + paddle.save(model.state_dict(), model_prefix + '.pdparams') + metric_prefix = model_prefix + else: + if config['Global']['distributed']: + model._layers.backbone.model.save_pretrained(model_prefix) + else: + model.backbone.model.save_pretrained(model_prefix) + metric_prefix = os.path.join(model_prefix, 'metric') + # save metric and config + if is_best: + with open(metric_prefix + '.states', 'wb') as f: + pickle.dump(kwargs, f, protocol=2) + logger.info('save best model is to {}'.format(model_prefix)) + else: + logger.info("save model in {}".format(model_prefix)) diff --git a/backend/ppocr/utils/stats.py b/backend/ppocr/utils/stats.py new file mode 100755 index 0000000..179b008 --- /dev/null +++ b/backend/ppocr/utils/stats.py @@ -0,0 +1,72 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import numpy as np +import datetime + +__all__ = ['TrainingStats', 'Time'] + + +class SmoothedValue(object): + """Track a series of values and provide access to smoothed values over a + window or the global series average. 
+ """ + + def __init__(self, window_size): + self.deque = collections.deque(maxlen=window_size) + + def add_value(self, value): + self.deque.append(value) + + def get_median_value(self): + return np.median(self.deque) + + +def Time(): + return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + + +class TrainingStats(object): + def __init__(self, window_size, stats_keys): + self.window_size = window_size + self.smoothed_losses_and_metrics = { + key: SmoothedValue(window_size) + for key in stats_keys + } + + def update(self, stats): + for k, v in stats.items(): + if k not in self.smoothed_losses_and_metrics: + self.smoothed_losses_and_metrics[k] = SmoothedValue( + self.window_size) + self.smoothed_losses_and_metrics[k].add_value(v) + + def get(self, extras=None): + stats = collections.OrderedDict() + if extras: + for k, v in extras.items(): + stats[k] = v + for k, v in self.smoothed_losses_and_metrics.items(): + stats[k] = round(v.get_median_value(), 6) + + return stats + + def log(self, extras=None): + d = self.get(extras) + strs = [] + for k, v in d.items(): + strs.append('{}: {:x<6f}'.format(k, v)) + strs = ', '.join(strs) + return strs diff --git a/backend/ppocr/utils/utility.py b/backend/ppocr/utils/utility.py new file mode 100755 index 0000000..4a25ff8 --- /dev/null +++ b/backend/ppocr/utils/utility.py @@ -0,0 +1,131 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import imghdr +import cv2 +import random +import numpy as np +import paddle + + +def print_dict(d, logger, delimiter=0): + """ + Recursively visualize a dict and + indenting acrrording by the relationship of keys. 
+ """ + for k, v in sorted(d.items()): + if isinstance(v, dict): + logger.info("{}{} : ".format(delimiter * " ", str(k))) + print_dict(v, logger, delimiter + 4) + elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict): + logger.info("{}{} : ".format(delimiter * " ", str(k))) + for value in v: + print_dict(value, logger, delimiter + 4) + else: + logger.info("{}{} : {}".format(delimiter * " ", k, v)) + + +def get_check_global_params(mode): + check_params = ['use_gpu', 'max_text_length', 'image_shape', \ + 'image_shape', 'character_type', 'loss_type'] + if mode == "train_eval": + check_params = check_params + [ \ + 'train_batch_size_per_card', 'test_batch_size_per_card'] + elif mode == "test": + check_params = check_params + ['test_batch_size_per_card'] + return check_params + + +def _check_image_file(path): + img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif'} + return any([path.lower().endswith(e) for e in img_end]) + + +def get_image_file_list(img_file): + imgs_lists = [] + if img_file is None or not os.path.exists(img_file): + raise Exception("not found any img file in {}".format(img_file)) + + img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif'} + if os.path.isfile(img_file) and _check_image_file(img_file): + imgs_lists.append(img_file) + elif os.path.isdir(img_file): + for single_file in os.listdir(img_file): + file_path = os.path.join(img_file, single_file) + if os.path.isfile(file_path) and _check_image_file(file_path): + imgs_lists.append(file_path) + if len(imgs_lists) == 0: + raise Exception("not found any img file in {}".format(img_file)) + imgs_lists = sorted(imgs_lists) + return imgs_lists + + +def check_and_read_gif(img_path): + if os.path.basename(img_path)[-3:] in ['gif', 'GIF']: + gif = cv2.VideoCapture(img_path) + ret, frame = gif.read() + if not ret: + logger = logging.getLogger('ppocr') + logger.info("Cannot read {}. This gif image maybe corrupted.") + return None, False + if len(frame.shape) == 2 or frame.shape[-1] == 1: + frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB) + imgvalue = frame[:, :, ::-1] + return imgvalue, True + return None, False + + +def load_vqa_bio_label_maps(label_map_path): + with open(label_map_path, "r", encoding='utf-8') as fin: + lines = fin.readlines() + lines = [line.strip() for line in lines] + if "O" not in lines: + lines.insert(0, "O") + labels = [] + for line in lines: + if line == "O": + labels.append("O") + else: + labels.append("B-" + line) + labels.append("I-" + line) + label2id_map = {label: idx for idx, label in enumerate(labels)} + id2label_map = {idx: label for idx, label in enumerate(labels)} + return label2id_map, id2label_map + + +def set_seed(seed=1024): + random.seed(seed) + np.random.seed(seed) + paddle.seed(seed) + + +class AverageMeter: + def __init__(self): + self.reset() + + def reset(self): + """reset""" + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + """update""" + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count diff --git a/backend/ppocr/utils/visual.py b/backend/ppocr/utils/visual.py new file mode 100644 index 0000000..7a8c167 --- /dev/null +++ b/backend/ppocr/utils/visual.py @@ -0,0 +1,98 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import numpy as np +from PIL import Image, ImageDraw, ImageFont + + +def draw_ser_results(image, + ocr_results, + font_path="doc/fonts/simfang.ttf", + font_size=18): + np.random.seed(2021) + color = (np.random.permutation(range(255)), + np.random.permutation(range(255)), + np.random.permutation(range(255))) + color_map = { + idx: (color[0][idx], color[1][idx], color[2][idx]) + for idx in range(1, 255) + } + if isinstance(image, np.ndarray): + image = Image.fromarray(image) + elif isinstance(image, str) and os.path.isfile(image): + image = Image.open(image).convert('RGB') + img_new = image.copy() + draw = ImageDraw.Draw(img_new) + + font = ImageFont.truetype(font_path, font_size, encoding="utf-8") + for ocr_info in ocr_results: + if ocr_info["pred_id"] not in color_map: + continue + color = color_map[ocr_info["pred_id"]] + text = "{}: {}".format(ocr_info["pred"], ocr_info["text"]) + + draw_box_txt(ocr_info["bbox"], text, draw, font, font_size, color) + + img_new = Image.blend(image, img_new, 0.5) + return np.array(img_new) + + +def draw_box_txt(bbox, text, draw, font, font_size, color): + # draw ocr results outline + bbox = ((bbox[0], bbox[1]), (bbox[2], bbox[3])) + draw.rectangle(bbox, fill=color) + + # draw ocr results + start_y = max(0, bbox[0][1] - font_size) + tw = font.getsize(text)[0] + draw.rectangle( + [(bbox[0][0] + 1, start_y), (bbox[0][0] + tw + 1, start_y + font_size)], + fill=(0, 0, 255)) + draw.text((bbox[0][0] + 1, start_y), text, fill=(255, 255, 255), font=font) + + +def draw_re_results(image, + result, + font_path="doc/fonts/simfang.ttf", + font_size=18): + np.random.seed(0) + if isinstance(image, np.ndarray): + image = Image.fromarray(image) + elif isinstance(image, str) and os.path.isfile(image): + image = Image.open(image).convert('RGB') + img_new = image.copy() + draw = ImageDraw.Draw(img_new) + + font = ImageFont.truetype(font_path, font_size, encoding="utf-8") + color_head = (0, 0, 255) + color_tail = (255, 0, 0) + color_line = (0, 255, 0) + + for ocr_info_head, ocr_info_tail in result: + draw_box_txt(ocr_info_head["bbox"], ocr_info_head["text"], draw, font, + font_size, color_head) + draw_box_txt(ocr_info_tail["bbox"], ocr_info_tail["text"], draw, font, + font_size, color_tail) + + center_head = ( + (ocr_info_head['bbox'][0] + ocr_info_head['bbox'][2]) // 2, + (ocr_info_head['bbox'][1] + ocr_info_head['bbox'][3]) // 2) + center_tail = ( + (ocr_info_tail['bbox'][0] + ocr_info_tail['bbox'][2]) // 2, + (ocr_info_tail['bbox'][1] + ocr_info_tail['bbox'][3]) // 2) + + draw.line([center_head, center_tail], fill=color_line, width=5) + + img_new = Image.blend(image, img_new, 0.5) + return np.array(img_new) diff --git a/backend/tools/infer/predict_cls.py b/backend/tools/infer/predict_cls.py new file mode 100755 index 0000000..ed2f47c --- /dev/null +++ b/backend/tools/infer/predict_cls.py @@ -0,0 +1,151 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
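A hedged example of calling draw_ser_results from the visualization module above; the image path, labels and boxes are invented, and the dict keys (pred_id, pred, text, bbox) mirror the ones the function reads:

    import cv2
    from ppocr.utils.visual import draw_ser_results

    ser_results = [
        {"pred_id": 1, "pred": "HEADER", "text": "Invoice", "bbox": [40, 30, 220, 60]},
        {"pred_id": 2, "pred": "ANSWER", "text": "2023-12-08", "bbox": [40, 80, 260, 110]},
    ]
    vis = draw_ser_results("form.jpg", ser_results)   # returns an RGB ndarray
    cv2.imwrite("form_ser.jpg", vis[:, :, ::-1])      # convert RGB -> BGR for OpenCV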
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + +import cv2 +import copy +import numpy as np +import math +import time +import traceback + +import tools.infer.utility as utility +from ppocr.postprocess import build_post_process +from ppocr.utils.logging import get_logger +from ppocr.utils.utility import get_image_file_list, check_and_read_gif + +logger = get_logger() + + +class TextClassifier(object): + def __init__(self, args): + self.cls_image_shape = [int(v) for v in args.cls_image_shape.split(",")] + self.cls_batch_num = args.cls_batch_num + self.cls_thresh = args.cls_thresh + postprocess_params = { + 'name': 'ClsPostProcess', + "label_list": args.label_list, + } + self.postprocess_op = build_post_process(postprocess_params) + self.predictor, self.input_tensor, self.output_tensors, _ = \ + utility.create_predictor(args, 'cls', logger) + self.use_onnx = args.use_onnx + + def resize_norm_img(self, img): + imgC, imgH, imgW = self.cls_image_shape + h = img.shape[0] + w = img.shape[1] + ratio = w / float(h) + if math.ceil(imgH * ratio) > imgW: + resized_w = imgW + else: + resized_w = int(math.ceil(imgH * ratio)) + resized_image = cv2.resize(img, (resized_w, imgH)) + resized_image = resized_image.astype('float32') + if self.cls_image_shape[0] == 1: + resized_image = resized_image / 255 + resized_image = resized_image[np.newaxis, :] + else: + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32) + padding_im[:, :, 0:resized_w] = resized_image + return padding_im + + def __call__(self, img_list): + img_list = copy.deepcopy(img_list) + img_num = len(img_list) + # Calculate the aspect ratio of all text bars + width_list = [] + for img in img_list: + width_list.append(img.shape[1] / float(img.shape[0])) + # Sorting can speed up the cls process + indices = np.argsort(np.array(width_list)) + + cls_res = [['', 0.0]] * img_num + batch_num = self.cls_batch_num + elapse = 0 + for beg_img_no in range(0, img_num, batch_num): + + end_img_no = min(img_num, beg_img_no + batch_num) + norm_img_batch = [] + max_wh_ratio = 0 + starttime = time.time() + for ino in range(beg_img_no, end_img_no): + h, w = img_list[indices[ino]].shape[0:2] + wh_ratio = w * 1.0 / h + max_wh_ratio = max(max_wh_ratio, wh_ratio) + for ino in range(beg_img_no, end_img_no): + norm_img = self.resize_norm_img(img_list[indices[ino]]) + norm_img = norm_img[np.newaxis, :] + norm_img_batch.append(norm_img) + norm_img_batch = np.concatenate(norm_img_batch) + norm_img_batch = norm_img_batch.copy() + + if self.use_onnx: + input_dict = {} + input_dict[self.input_tensor.name] = norm_img_batch + outputs = self.predictor.run(self.output_tensors, input_dict) + prob_out = outputs[0] + else: + self.input_tensor.copy_from_cpu(norm_img_batch) + self.predictor.run() + prob_out = self.output_tensors[0].copy_to_cpu() + self.predictor.try_shrink_memory() + 
cls_result = self.postprocess_op(prob_out) + elapse += time.time() - starttime + for rno in range(len(cls_result)): + label, score = cls_result[rno] + cls_res[indices[beg_img_no + rno]] = [label, score] + if '180' in label and score > self.cls_thresh: + img_list[indices[beg_img_no + rno]] = cv2.rotate( + img_list[indices[beg_img_no + rno]], 1) + return img_list, cls_res, elapse + + +def main(args): + image_file_list = get_image_file_list(args.image_dir) + text_classifier = TextClassifier(args) + valid_image_file_list = [] + img_list = [] + for image_file in image_file_list: + img, flag = check_and_read_gif(image_file) + if not flag: + img = cv2.imread(image_file) + if img is None: + logger.info("error in loading image:{}".format(image_file)) + continue + valid_image_file_list.append(image_file) + img_list.append(img) + try: + img_list, cls_res, predict_time = text_classifier(img_list) + except Exception as E: + logger.info(traceback.format_exc()) + logger.info(E) + exit() + for ino in range(len(img_list)): + logger.info("Predicts of {}:{}".format(valid_image_file_list[ino], + cls_res[ino])) + + +if __name__ == "__main__": + main(utility.parse_args()) diff --git a/backend/tools/infer/predict_det.py b/backend/tools/infer/predict_det.py new file mode 100755 index 0000000..5f2675d --- /dev/null +++ b/backend/tools/infer/predict_det.py @@ -0,0 +1,302 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
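A minimal sketch of running the angle classifier from predict_cls.py on its own, mirroring the main() function above; the cls_model_dir value and the image path are assumptions:

    import cv2
    import tools.infer.utility as utility
    import tools.infer.predict_cls as predict_cls

    args = utility.parse_args()
    args.cls_model_dir = './inference/ch_ppocr_mobile_v2.0_cls_infer'  # assumed path
    classifier = predict_cls.TextClassifier(args)

    crops = [cv2.imread('word_1.png')]                 # placeholder text-line crop
    rotated, cls_res, elapse = classifier(crops)       # 180-degree crops come back rotated
    print(cls_res)                                     # e.g. [['0', 0.99]] per crop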
+import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + +import cv2 +import numpy as np +import time +import sys + +import tools.infer.utility as utility +from ppocr.utils.logging import get_logger +from ppocr.utils.utility import get_image_file_list, check_and_read_gif +from ppocr.data import create_operators, transform +from ppocr.postprocess import build_post_process +import json +logger = get_logger() + + +class TextDetector(object): + def __init__(self, args): + self.args = args + self.det_algorithm = args.det_algorithm + self.use_onnx = args.use_onnx + pre_process_list = [{ + 'DetResizeForTest': { + 'limit_side_len': args.det_limit_side_len, + 'limit_type': args.det_limit_type, + } + }, { + 'NormalizeImage': { + 'std': [0.229, 0.224, 0.225], + 'mean': [0.485, 0.456, 0.406], + 'scale': '1./255.', + 'order': 'hwc' + } + }, { + 'ToCHWImage': None + }, { + 'KeepKeys': { + 'keep_keys': ['image', 'shape'] + } + }] + postprocess_params = {} + if self.det_algorithm == "DB": + postprocess_params['name'] = 'DBPostProcess' + postprocess_params["thresh"] = args.det_db_thresh + postprocess_params["box_thresh"] = args.det_db_box_thresh + postprocess_params["max_candidates"] = 1000 + postprocess_params["unclip_ratio"] = args.det_db_unclip_ratio + postprocess_params["use_dilation"] = args.use_dilation + postprocess_params["score_mode"] = args.det_db_score_mode + elif self.det_algorithm == "EAST": + postprocess_params['name'] = 'EASTPostProcess' + postprocess_params["score_thresh"] = args.det_east_score_thresh + postprocess_params["cover_thresh"] = args.det_east_cover_thresh + postprocess_params["nms_thresh"] = args.det_east_nms_thresh + elif self.det_algorithm == "SAST": + pre_process_list[0] = { + 'DetResizeForTest': { + 'resize_long': args.det_limit_side_len + } + } + postprocess_params['name'] = 'SASTPostProcess' + postprocess_params["score_thresh"] = args.det_sast_score_thresh + postprocess_params["nms_thresh"] = args.det_sast_nms_thresh + self.det_sast_polygon = args.det_sast_polygon + if self.det_sast_polygon: + postprocess_params["sample_pts_num"] = 6 + postprocess_params["expand_scale"] = 1.2 + postprocess_params["shrink_ratio_of_width"] = 0.2 + else: + postprocess_params["sample_pts_num"] = 2 + postprocess_params["expand_scale"] = 1.0 + postprocess_params["shrink_ratio_of_width"] = 0.3 + elif self.det_algorithm == "PSE": + postprocess_params['name'] = 'PSEPostProcess' + postprocess_params["thresh"] = args.det_pse_thresh + postprocess_params["box_thresh"] = args.det_pse_box_thresh + postprocess_params["min_area"] = args.det_pse_min_area + postprocess_params["box_type"] = args.det_pse_box_type + postprocess_params["scale"] = args.det_pse_scale + self.det_pse_box_type = args.det_pse_box_type + elif self.det_algorithm == "FCE": + pre_process_list[0] = { + 'DetResizeForTest': { + 'rescale_img': [1080, 736] + } + } + postprocess_params['name'] = 'FCEPostProcess' + postprocess_params["scales"] = args.scales + postprocess_params["alpha"] = args.alpha + postprocess_params["beta"] = args.beta + postprocess_params["fourier_degree"] = args.fourier_degree + postprocess_params["box_type"] = args.det_fce_box_type + else: + logger.info("unknown det_algorithm:{}".format(self.det_algorithm)) + sys.exit(0) + + self.preprocess_op = create_operators(pre_process_list) + self.postprocess_op = build_post_process(postprocess_params) + 
self.predictor, self.input_tensor, self.output_tensors, self.config = utility.create_predictor( + args, 'det', logger) + + if self.use_onnx: + img_h, img_w = self.input_tensor.shape[2:] + if img_h is not None and img_w is not None and img_h > 0 and img_w > 0: + pre_process_list[0] = { + 'DetResizeForTest': { + 'image_shape': [img_h, img_w] + } + } + self.preprocess_op = create_operators(pre_process_list) + + if args.benchmark: + import auto_log + pid = os.getpid() + gpu_id = utility.get_infer_gpuid() + self.autolog = auto_log.AutoLogger( + model_name="det", + model_precision=args.precision, + batch_size=1, + data_shape="dynamic", + save_path=None, + inference_config=self.config, + pids=pid, + process_name=None, + gpu_ids=gpu_id if args.use_gpu else None, + time_keys=[ + 'preprocess_time', 'inference_time', 'postprocess_time' + ], + warmup=2, + logger=logger) + + def order_points_clockwise(self, pts): + rect = np.zeros((4, 2), dtype="float32") + s = pts.sum(axis=1) + rect[0] = pts[np.argmin(s)] + rect[2] = pts[np.argmax(s)] + diff = np.diff(pts, axis=1) + rect[1] = pts[np.argmin(diff)] + rect[3] = pts[np.argmax(diff)] + return rect + + def clip_det_res(self, points, img_height, img_width): + for pno in range(points.shape[0]): + points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1)) + points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1)) + return points + + def filter_tag_det_res(self, dt_boxes, image_shape): + img_height, img_width = image_shape[0:2] + dt_boxes_new = [] + for box in dt_boxes: + box = self.order_points_clockwise(box) + box = self.clip_det_res(box, img_height, img_width) + rect_width = int(np.linalg.norm(box[0] - box[1])) + rect_height = int(np.linalg.norm(box[0] - box[3])) + if rect_width <= 3 or rect_height <= 3: + continue + dt_boxes_new.append(box) + dt_boxes = np.array(dt_boxes_new) + return dt_boxes + + def filter_tag_det_res_only_clip(self, dt_boxes, image_shape): + img_height, img_width = image_shape[0:2] + dt_boxes_new = [] + for box in dt_boxes: + box = self.clip_det_res(box, img_height, img_width) + dt_boxes_new.append(box) + dt_boxes = np.array(dt_boxes_new) + return dt_boxes + + def __call__(self, img): + ori_im = img.copy() + data = {'image': img} + + st = time.time() + + if self.args.benchmark: + self.autolog.times.start() + + data = transform(data, self.preprocess_op) + img, shape_list = data + if img is None: + return None, 0 + img = np.expand_dims(img, axis=0) + shape_list = np.expand_dims(shape_list, axis=0) + img = img.copy() + + if self.args.benchmark: + self.autolog.times.stamp() + if self.use_onnx: + input_dict = {} + input_dict[self.input_tensor.name] = img + outputs = self.predictor.run(self.output_tensors, input_dict) + else: + self.input_tensor.copy_from_cpu(img) + self.predictor.run() + outputs = [] + for output_tensor in self.output_tensors: + output = output_tensor.copy_to_cpu() + outputs.append(output) + if self.args.benchmark: + self.autolog.times.stamp() + + preds = {} + if self.det_algorithm == "EAST": + preds['f_geo'] = outputs[0] + preds['f_score'] = outputs[1] + elif self.det_algorithm == 'SAST': + preds['f_border'] = outputs[0] + preds['f_score'] = outputs[1] + preds['f_tco'] = outputs[2] + preds['f_tvo'] = outputs[3] + elif self.det_algorithm in ['DB', 'PSE']: + preds['maps'] = outputs[0] + elif self.det_algorithm == 'FCE': + for i, output in enumerate(outputs): + preds['level_{}'.format(i)] = output + else: + raise NotImplementedError + + #self.predictor.try_shrink_memory() + post_result = 
self.postprocess_op(preds, shape_list) + dt_boxes = post_result[0]['points'] + if (self.det_algorithm == "SAST" and self.det_sast_polygon) or ( + self.det_algorithm in ["PSE", "FCE"] and + self.postprocess_op.box_type == 'poly'): + dt_boxes = self.filter_tag_det_res_only_clip(dt_boxes, ori_im.shape) + else: + dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape) + + if self.args.benchmark: + self.autolog.times.end(stamp=True) + et = time.time() + return dt_boxes, et - st + + +if __name__ == "__main__": + args = utility.parse_args() + image_file_list = get_image_file_list(args.image_dir) + text_detector = TextDetector(args) + count = 0 + total_time = 0 + draw_img_save = "./inference_results" + + if args.warmup: + img = np.random.uniform(0, 255, [640, 640, 3]).astype(np.uint8) + for i in range(2): + res = text_detector(img) + + if not os.path.exists(draw_img_save): + os.makedirs(draw_img_save) + save_results = [] + for image_file in image_file_list: + img, flag = check_and_read_gif(image_file) + if not flag: + img = cv2.imread(image_file) + if img is None: + logger.info("error in loading image:{}".format(image_file)) + continue + st = time.time() + dt_boxes, _ = text_detector(img) + elapse = time.time() - st + if count > 0: + total_time += elapse + count += 1 + save_pred = os.path.basename(image_file) + "\t" + str( + json.dumps([x.tolist() for x in dt_boxes])) + "\n" + save_results.append(save_pred) + logger.info(save_pred) + logger.info("The predict time of {}: {}".format(image_file, elapse)) + src_im = utility.draw_text_det_res(dt_boxes, image_file) + img_name_pure = os.path.split(image_file)[-1] + img_path = os.path.join(draw_img_save, + "det_res_{}".format(img_name_pure)) + cv2.imwrite(img_path, src_im) + logger.info("The visualized image saved in {}".format(img_path)) + + with open(os.path.join(draw_img_save, "det_results.txt"), 'w') as f: + f.writelines(save_results) + f.close() + if args.benchmark: + text_detector.autolog.report() diff --git a/backend/tools/infer/predict_e2e.py b/backend/tools/infer/predict_e2e.py new file mode 100755 index 0000000..fb2859f --- /dev/null +++ b/backend/tools/infer/predict_e2e.py @@ -0,0 +1,169 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
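A minimal sketch of driving TextDetector directly, patterned on the __main__ block above; the model directory and image path are placeholders:

    import cv2
    import tools.infer.utility as utility
    import tools.infer.predict_det as predict_det

    args = utility.parse_args()
    args.det_algorithm = 'DB'
    args.det_model_dir = './inference/ch_PP-OCRv3_det_infer'   # assumed directory
    detector = predict_det.TextDetector(args)

    img = cv2.imread('doc/imgs/1.jpg')                         # placeholder image
    dt_boxes, elapse = detector(img)                           # boxes are 4x2 point arrays
    print(len(dt_boxes), 'text regions,', round(elapse, 3), 's')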
+import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + +import cv2 +import numpy as np +import time +import sys + +import tools.infer.utility as utility +from ppocr.utils.logging import get_logger +from ppocr.utils.utility import get_image_file_list, check_and_read_gif +from ppocr.data import create_operators, transform +from ppocr.postprocess import build_post_process + +logger = get_logger() + + +class TextE2E(object): + def __init__(self, args): + self.args = args + self.e2e_algorithm = args.e2e_algorithm + self.use_onnx = args.use_onnx + pre_process_list = [{ + 'E2EResizeForTest': {} + }, { + 'NormalizeImage': { + 'std': [0.229, 0.224, 0.225], + 'mean': [0.485, 0.456, 0.406], + 'scale': '1./255.', + 'order': 'hwc' + } + }, { + 'ToCHWImage': None + }, { + 'KeepKeys': { + 'keep_keys': ['image', 'shape'] + } + }] + postprocess_params = {} + if self.e2e_algorithm == "PGNet": + pre_process_list[0] = { + 'E2EResizeForTest': { + 'max_side_len': args.e2e_limit_side_len, + 'valid_set': 'totaltext' + } + } + postprocess_params['name'] = 'PGPostProcess' + postprocess_params["score_thresh"] = args.e2e_pgnet_score_thresh + postprocess_params["character_dict_path"] = args.e2e_char_dict_path + postprocess_params["valid_set"] = args.e2e_pgnet_valid_set + postprocess_params["mode"] = args.e2e_pgnet_mode + else: + logger.info("unknown e2e_algorithm:{}".format(self.e2e_algorithm)) + sys.exit(0) + + self.preprocess_op = create_operators(pre_process_list) + self.postprocess_op = build_post_process(postprocess_params) + self.predictor, self.input_tensor, self.output_tensors, _ = utility.create_predictor( + args, 'e2e', logger) # paddle.jit.load(args.det_model_dir) + # self.predictor.eval() + + def clip_det_res(self, points, img_height, img_width): + for pno in range(points.shape[0]): + points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1)) + points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1)) + return points + + def filter_tag_det_res_only_clip(self, dt_boxes, image_shape): + img_height, img_width = image_shape[0:2] + dt_boxes_new = [] + for box in dt_boxes: + box = self.clip_det_res(box, img_height, img_width) + dt_boxes_new.append(box) + dt_boxes = np.array(dt_boxes_new) + return dt_boxes + + def __call__(self, img): + + ori_im = img.copy() + data = {'image': img} + data = transform(data, self.preprocess_op) + img, shape_list = data + if img is None: + return None, 0 + img = np.expand_dims(img, axis=0) + shape_list = np.expand_dims(shape_list, axis=0) + img = img.copy() + starttime = time.time() + + if self.use_onnx: + input_dict = {} + input_dict[self.input_tensor.name] = img + outputs = self.predictor.run(self.output_tensors, input_dict) + preds = {} + preds['f_border'] = outputs[0] + preds['f_char'] = outputs[1] + preds['f_direction'] = outputs[2] + preds['f_score'] = outputs[3] + else: + self.input_tensor.copy_from_cpu(img) + self.predictor.run() + outputs = [] + for output_tensor in self.output_tensors: + output = output_tensor.copy_to_cpu() + outputs.append(output) + + preds = {} + if self.e2e_algorithm == 'PGNet': + preds['f_border'] = outputs[0] + preds['f_char'] = outputs[1] + preds['f_direction'] = outputs[2] + preds['f_score'] = outputs[3] + else: + raise NotImplementedError + post_result = self.postprocess_op(preds, shape_list) + points, strs = post_result['points'], post_result['texts'] + 
dt_boxes = self.filter_tag_det_res_only_clip(points, ori_im.shape) + elapse = time.time() - starttime + return dt_boxes, strs, elapse + + +if __name__ == "__main__": + args = utility.parse_args() + image_file_list = get_image_file_list(args.image_dir) + text_detector = TextE2E(args) + count = 0 + total_time = 0 + draw_img_save = "./inference_results" + if not os.path.exists(draw_img_save): + os.makedirs(draw_img_save) + for image_file in image_file_list: + img, flag = check_and_read_gif(image_file) + if not flag: + img = cv2.imread(image_file) + if img is None: + logger.info("error in loading image:{}".format(image_file)) + continue + points, strs, elapse = text_detector(img) + if count > 0: + total_time += elapse + count += 1 + logger.info("Predict time of {}: {}".format(image_file, elapse)) + src_im = utility.draw_e2e_res(points, strs, image_file) + img_name_pure = os.path.split(image_file)[-1] + img_path = os.path.join(draw_img_save, + "e2e_res_{}".format(img_name_pure)) + cv2.imwrite(img_path, src_im) + logger.info("The visualized image saved in {}".format(img_path)) + if count > 1: + logger.info("Avg Time: {}".format(total_time / (count - 1))) diff --git a/backend/tools/infer/predict_rec.py b/backend/tools/infer/predict_rec.py new file mode 100755 index 0000000..3664ef2 --- /dev/null +++ b/backend/tools/infer/predict_rec.py @@ -0,0 +1,442 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
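An illustrative end-to-end (PGNet) call following the __main__ block above; the e2e_model_dir value and the sample image are assumptions:

    import cv2
    import tools.infer.utility as utility
    import tools.infer.predict_e2e as predict_e2e

    args = utility.parse_args()
    args.e2e_algorithm = 'PGNet'
    args.e2e_model_dir = './inference/e2e_server_pgnetA_infer'  # assumed path
    text_e2e = predict_e2e.TextE2E(args)

    img = cv2.imread('doc/imgs_en/img623.jpg')                  # placeholder image
    points, strs, elapse = text_e2e(img)
    for txt in strs:
        print(txt)                                              # recognized text per polygon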
+import os +import sys +from PIL import Image +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + +import cv2 +import numpy as np +import math +import time +import traceback +import paddle + +import tools.infer.utility as utility +from ppocr.postprocess import build_post_process +from ppocr.utils.logging import get_logger +from ppocr.utils.utility import get_image_file_list, check_and_read_gif + +logger = get_logger() + + +class TextRecognizer(object): + def __init__(self, args): + self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")] + self.rec_batch_num = args.rec_batch_num + self.rec_algorithm = args.rec_algorithm + postprocess_params = { + 'name': 'CTCLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + if self.rec_algorithm == "SRN": + postprocess_params = { + 'name': 'SRNLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + elif self.rec_algorithm == "RARE": + postprocess_params = { + 'name': 'AttnLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + elif self.rec_algorithm == 'NRTR': + postprocess_params = { + 'name': 'NRTRLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + elif self.rec_algorithm == "SAR": + postprocess_params = { + 'name': 'SARLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + self.postprocess_op = build_post_process(postprocess_params) + self.predictor, self.input_tensor, self.output_tensors, self.config = \ + utility.create_predictor(args, 'rec', logger) + self.benchmark = args.benchmark + self.use_onnx = args.use_onnx + if args.benchmark: + import auto_log + pid = os.getpid() + gpu_id = utility.get_infer_gpuid() + self.autolog = auto_log.AutoLogger( + model_name="rec", + model_precision=args.precision, + batch_size=args.rec_batch_num, + data_shape="dynamic", + save_path=None, #args.save_log_path, + inference_config=self.config, + pids=pid, + process_name=None, + gpu_ids=gpu_id if args.use_gpu else None, + time_keys=[ + 'preprocess_time', 'inference_time', 'postprocess_time' + ], + warmup=0, + logger=logger) + + def resize_norm_img(self, img, max_wh_ratio): + imgC, imgH, imgW = self.rec_image_shape + if self.rec_algorithm == 'NRTR': + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + # return padding_im + image_pil = Image.fromarray(np.uint8(img)) + img = image_pil.resize([100, 32], Image.ANTIALIAS) + img = np.array(img) + norm_img = np.expand_dims(img, -1) + norm_img = norm_img.transpose((2, 0, 1)) + return norm_img.astype(np.float32) / 128. - 1. 
+ + assert imgC == img.shape[2] + imgW = int((imgH * max_wh_ratio)) + if self.use_onnx: + w = self.input_tensor.shape[3:][0] + if w is not None and w > 0: + imgW = w + + h, w = img.shape[:2] + ratio = w / float(h) + if math.ceil(imgH * ratio) > imgW: + resized_w = imgW + else: + resized_w = int(math.ceil(imgH * ratio)) + if self.rec_algorithm == 'RARE': + if resized_w > self.rec_image_shape[2]: + resized_w = self.rec_image_shape[2] + imgW = self.rec_image_shape[2] + resized_image = cv2.resize(img, (resized_w, imgH)) + resized_image = resized_image.astype('float32') + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32) + padding_im[:, :, 0:resized_w] = resized_image + return padding_im + + def resize_norm_img_svtr(self, img, image_shape): + + imgC, imgH, imgW = image_shape + resized_image = cv2.resize( + img, (imgW, imgH), interpolation=cv2.INTER_LINEAR) + resized_image = resized_image.astype('float32') + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + return resized_image + + def resize_norm_img_srn(self, img, image_shape): + imgC, imgH, imgW = image_shape + + img_black = np.zeros((imgH, imgW)) + im_hei = img.shape[0] + im_wid = img.shape[1] + + if im_wid <= im_hei * 1: + img_new = cv2.resize(img, (imgH * 1, imgH)) + elif im_wid <= im_hei * 2: + img_new = cv2.resize(img, (imgH * 2, imgH)) + elif im_wid <= im_hei * 3: + img_new = cv2.resize(img, (imgH * 3, imgH)) + else: + img_new = cv2.resize(img, (imgW, imgH)) + + img_np = np.asarray(img_new) + img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY) + img_black[:, 0:img_np.shape[1]] = img_np + img_black = img_black[:, :, np.newaxis] + + row, col, c = img_black.shape + c = 1 + + return np.reshape(img_black, (c, row, col)).astype(np.float32) + + def srn_other_inputs(self, image_shape, num_heads, max_text_length): + + imgC, imgH, imgW = image_shape + feature_dim = int((imgH / 8) * (imgW / 8)) + + encoder_word_pos = np.array(range(0, feature_dim)).reshape( + (feature_dim, 1)).astype('int64') + gsrm_word_pos = np.array(range(0, max_text_length)).reshape( + (max_text_length, 1)).astype('int64') + + gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length)) + gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape( + [-1, 1, max_text_length, max_text_length]) + gsrm_slf_attn_bias1 = np.tile( + gsrm_slf_attn_bias1, + [1, num_heads, 1, 1]).astype('float32') * [-1e9] + + gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data, -1).reshape( + [-1, 1, max_text_length, max_text_length]) + gsrm_slf_attn_bias2 = np.tile( + gsrm_slf_attn_bias2, + [1, num_heads, 1, 1]).astype('float32') * [-1e9] + + encoder_word_pos = encoder_word_pos[np.newaxis, :] + gsrm_word_pos = gsrm_word_pos[np.newaxis, :] + + return [ + encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, + gsrm_slf_attn_bias2 + ] + + def process_image_srn(self, img, image_shape, num_heads, max_text_length): + norm_img = self.resize_norm_img_srn(img, image_shape) + norm_img = norm_img[np.newaxis, :] + + [encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] = \ + self.srn_other_inputs(image_shape, num_heads, max_text_length) + + gsrm_slf_attn_bias1 = gsrm_slf_attn_bias1.astype(np.float32) + gsrm_slf_attn_bias2 = gsrm_slf_attn_bias2.astype(np.float32) + encoder_word_pos = encoder_word_pos.astype(np.int64) + gsrm_word_pos = gsrm_word_pos.astype(np.int64) + + return (norm_img, encoder_word_pos, gsrm_word_pos, 
gsrm_slf_attn_bias1, + gsrm_slf_attn_bias2) + + def resize_norm_img_sar(self, img, image_shape, + width_downsample_ratio=0.25): + imgC, imgH, imgW_min, imgW_max = image_shape + h = img.shape[0] + w = img.shape[1] + valid_ratio = 1.0 + # make sure new_width is an integral multiple of width_divisor. + width_divisor = int(1 / width_downsample_ratio) + # resize + ratio = w / float(h) + resize_w = math.ceil(imgH * ratio) + if resize_w % width_divisor != 0: + resize_w = round(resize_w / width_divisor) * width_divisor + if imgW_min is not None: + resize_w = max(imgW_min, resize_w) + if imgW_max is not None: + valid_ratio = min(1.0, 1.0 * resize_w / imgW_max) + resize_w = min(imgW_max, resize_w) + resized_image = cv2.resize(img, (resize_w, imgH)) + resized_image = resized_image.astype('float32') + # norm + if image_shape[0] == 1: + resized_image = resized_image / 255 + resized_image = resized_image[np.newaxis, :] + else: + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + resize_shape = resized_image.shape + padding_im = -1.0 * np.ones((imgC, imgH, imgW_max), dtype=np.float32) + padding_im[:, :, 0:resize_w] = resized_image + pad_shape = padding_im.shape + + return padding_im, resize_shape, pad_shape, valid_ratio + + def __call__(self, img_list): + img_num = len(img_list) + # Calculate the aspect ratio of all text bars + width_list = [] + for img in img_list: + width_list.append(img.shape[1] / float(img.shape[0])) + # Sorting can speed up the recognition process + indices = np.argsort(np.array(width_list)) + rec_res = [['', 0.0]] * img_num + batch_num = self.rec_batch_num + st = time.time() + if self.benchmark: + self.autolog.times.start() + for beg_img_no in range(0, img_num, batch_num): + end_img_no = min(img_num, beg_img_no + batch_num) + norm_img_batch = [] + imgC, imgH, imgW = self.rec_image_shape + max_wh_ratio = imgW / imgH + # max_wh_ratio = 0 + for ino in range(beg_img_no, end_img_no): + h, w = img_list[indices[ino]].shape[0:2] + wh_ratio = w * 1.0 / h + max_wh_ratio = max(max_wh_ratio, wh_ratio) + for ino in range(beg_img_no, end_img_no): + + if self.rec_algorithm == "SAR": + norm_img, _, _, valid_ratio = self.resize_norm_img_sar( + img_list[indices[ino]], self.rec_image_shape) + norm_img = norm_img[np.newaxis, :] + valid_ratio = np.expand_dims(valid_ratio, axis=0) + valid_ratios = [] + valid_ratios.append(valid_ratio) + norm_img_batch.append(norm_img) + elif self.rec_algorithm == "SRN": + norm_img = self.process_image_srn( + img_list[indices[ino]], self.rec_image_shape, 8, 25) + encoder_word_pos_list = [] + gsrm_word_pos_list = [] + gsrm_slf_attn_bias1_list = [] + gsrm_slf_attn_bias2_list = [] + encoder_word_pos_list.append(norm_img[1]) + gsrm_word_pos_list.append(norm_img[2]) + gsrm_slf_attn_bias1_list.append(norm_img[3]) + gsrm_slf_attn_bias2_list.append(norm_img[4]) + norm_img_batch.append(norm_img[0]) + elif self.rec_algorithm == "SVTR": + norm_img = self.resize_norm_img_svtr(img_list[indices[ino]], + self.rec_image_shape) + norm_img = norm_img[np.newaxis, :] + norm_img_batch.append(norm_img) + else: + norm_img = self.resize_norm_img(img_list[indices[ino]], + max_wh_ratio) + norm_img = norm_img[np.newaxis, :] + norm_img_batch.append(norm_img) + norm_img_batch = np.concatenate(norm_img_batch) + norm_img_batch = norm_img_batch.copy() + if self.benchmark: + self.autolog.times.stamp() + + if self.rec_algorithm == "SRN": + encoder_word_pos_list = np.concatenate(encoder_word_pos_list) + gsrm_word_pos_list = 
np.concatenate(gsrm_word_pos_list) + gsrm_slf_attn_bias1_list = np.concatenate( + gsrm_slf_attn_bias1_list) + gsrm_slf_attn_bias2_list = np.concatenate( + gsrm_slf_attn_bias2_list) + + inputs = [ + norm_img_batch, + encoder_word_pos_list, + gsrm_word_pos_list, + gsrm_slf_attn_bias1_list, + gsrm_slf_attn_bias2_list, + ] + if self.use_onnx: + input_dict = {} + input_dict[self.input_tensor.name] = norm_img_batch + outputs = self.predictor.run(self.output_tensors, + input_dict) + preds = {"predict": outputs[2]} + else: + input_names = self.predictor.get_input_names() + for i in range(len(input_names)): + input_tensor = self.predictor.get_input_handle( + input_names[i]) + input_tensor.copy_from_cpu(inputs[i]) + self.predictor.run() + outputs = [] + for output_tensor in self.output_tensors: + output = output_tensor.copy_to_cpu() + outputs.append(output) + if self.benchmark: + self.autolog.times.stamp() + preds = {"predict": outputs[2]} + elif self.rec_algorithm == "SAR": + valid_ratios = np.concatenate(valid_ratios) + inputs = [ + norm_img_batch, + valid_ratios, + ] + if self.use_onnx: + input_dict = {} + input_dict[self.input_tensor.name] = norm_img_batch + outputs = self.predictor.run(self.output_tensors, + input_dict) + preds = outputs[0] + else: + input_names = self.predictor.get_input_names() + for i in range(len(input_names)): + input_tensor = self.predictor.get_input_handle( + input_names[i]) + input_tensor.copy_from_cpu(inputs[i]) + self.predictor.run() + outputs = [] + for output_tensor in self.output_tensors: + output = output_tensor.copy_to_cpu() + outputs.append(output) + if self.benchmark: + self.autolog.times.stamp() + preds = outputs[0] + else: + if self.use_onnx: + input_dict = {} + input_dict[self.input_tensor.name] = norm_img_batch + outputs = self.predictor.run(self.output_tensors, + input_dict) + preds = outputs[0] + else: + self.input_tensor.copy_from_cpu(norm_img_batch) + self.predictor.run() + outputs = [] + for output_tensor in self.output_tensors: + output = output_tensor.copy_to_cpu() + outputs.append(output) + if self.benchmark: + self.autolog.times.stamp() + if len(outputs) != 1: + preds = outputs + else: + preds = outputs[0] + rec_result = self.postprocess_op(preds) + for rno in range(len(rec_result)): + rec_res[indices[beg_img_no + rno]] = rec_result[rno] + if self.benchmark: + self.autolog.times.end(stamp=True) + return rec_res, time.time() - st + + +def main(args): + image_file_list = get_image_file_list(args.image_dir) + text_recognizer = TextRecognizer(args) + valid_image_file_list = [] + img_list = [] + + logger.info( + "In PP-OCRv3, rec_image_shape parameter defaults to '3, 48, 320', " + "if you are using recognition model with PP-OCRv2 or an older version, please set --rec_image_shape='3,32,320" + ) + # warmup 2 times + if args.warmup: + img = np.random.uniform(0, 255, [48, 320, 3]).astype(np.uint8) + for i in range(2): + res = text_recognizer([img] * int(args.rec_batch_num)) + + for image_file in image_file_list: + img, flag = check_and_read_gif(image_file) + if not flag: + img = cv2.imread(image_file) + if img is None: + logger.info("error in loading image:{}".format(image_file)) + continue + valid_image_file_list.append(image_file) + img_list.append(img) + try: + rec_res, _ = text_recognizer(img_list) + + except Exception as E: + logger.info(traceback.format_exc()) + logger.info(E) + exit() + for ino in range(len(img_list)): + logger.info("Predicts of {}:{}".format(valid_image_file_list[ino], + rec_res[ino])) + if args.benchmark: + 
text_recognizer.autolog.report() + + +if __name__ == "__main__": + main(utility.parse_args()) diff --git a/backend/tools/infer/predict_system.py b/backend/tools/infer/predict_system.py new file mode 100755 index 0000000..4af3da7 --- /dev/null +++ b/backend/tools/infer/predict_system.py @@ -0,0 +1,210 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import sys +import subprocess + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + +import cv2 +import copy +import numpy as np +import json +import time +import logging +from PIL import Image +import tools.infer.utility as utility +import tools.infer.predict_rec as predict_rec +import tools.infer.predict_det as predict_det +import tools.infer.predict_cls as predict_cls +from ppocr.utils.utility import get_image_file_list, check_and_read_gif +from ppocr.utils.logging import get_logger +from tools.infer.utility import draw_ocr_box_txt, get_rotate_crop_image +logger = get_logger() + + +class TextSystem(object): + def __init__(self, args): + if not args.show_log: + logger.setLevel(logging.INFO) + + self.text_detector = predict_det.TextDetector(args) + self.text_recognizer = predict_rec.TextRecognizer(args) + self.use_angle_cls = args.use_angle_cls + self.drop_score = args.drop_score + if self.use_angle_cls: + self.text_classifier = predict_cls.TextClassifier(args) + + self.args = args + self.crop_image_res_index = 0 + + def draw_crop_rec_res(self, output_dir, img_crop_list, rec_res): + os.makedirs(output_dir, exist_ok=True) + bbox_num = len(img_crop_list) + for bno in range(bbox_num): + cv2.imwrite( + os.path.join(output_dir, + f"mg_crop_{bno+self.crop_image_res_index}.jpg"), + img_crop_list[bno]) + logger.debug(f"{bno}, {rec_res[bno]}") + self.crop_image_res_index += bbox_num + + def __call__(self, img, cls=True): + ori_im = img.copy() + dt_boxes, elapse = self.text_detector(img) + + if dt_boxes is None: + return None, None + img_crop_list = [] + + dt_boxes = sorted_boxes(dt_boxes) + + for bno in range(len(dt_boxes)): + tmp_box = copy.deepcopy(dt_boxes[bno]) + img_crop = get_rotate_crop_image(ori_im, tmp_box) + img_crop_list.append(img_crop) + if self.use_angle_cls and cls: + img_crop_list, angle_list, elapse = self.text_classifier( + img_crop_list) + + + rec_res, elapse = self.text_recognizer(img_crop_list) + if self.args.save_crop_res: + self.draw_crop_rec_res(self.args.crop_res_save_dir, img_crop_list, + rec_res) + filter_boxes, filter_rec_res = [], [] + for box, rec_result in zip(dt_boxes, rec_res): + text, score = rec_result + if score >= self.drop_score: + filter_boxes.append(box) + filter_rec_res.append(rec_result) + return filter_boxes, filter_rec_res + + +def sorted_boxes(dt_boxes): + """ + Sort text boxes in order from top to bottom, left to right + args: + dt_boxes(array):detected text boxes with shape [4, 
2] + return: + sorted boxes(array) with shape [4, 2] + """ + num_boxes = dt_boxes.shape[0] + sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0])) + _boxes = list(sorted_boxes) + + for i in range(num_boxes - 1): + if abs(_boxes[i + 1][0][1] - _boxes[i][0][1]) < 10 and \ + (_boxes[i + 1][0][0] < _boxes[i][0][0]): + tmp = _boxes[i] + _boxes[i] = _boxes[i + 1] + _boxes[i + 1] = tmp + return _boxes + + +def main(args): + image_file_list = get_image_file_list(args.image_dir) + image_file_list = image_file_list[args.process_id::args.total_process_num] + text_sys = TextSystem(args) + is_visualize = True + font_path = args.vis_font_path + drop_score = args.drop_score + draw_img_save_dir = args.draw_img_save_dir + os.makedirs(draw_img_save_dir, exist_ok=True) + save_results = [] + + logger.info("In PP-OCRv3, rec_image_shape parameter defaults to '3, 48, 320', " + "if you are using recognition model with PP-OCRv2 or an older version, please set --rec_image_shape='3,32,320") + + # warm up 10 times + if args.warmup: + img = np.random.uniform(0, 255, [640, 640, 3]).astype(np.uint8) + for i in range(10): + res = text_sys(img) + + total_time = 0 + cpu_mem, gpu_mem, gpu_util = 0, 0, 0 + _st = time.time() + count = 0 + for idx, image_file in enumerate(image_file_list): + + img, flag = check_and_read_gif(image_file) + if not flag: + img = cv2.imread(image_file) + if img is None: + logger.debug("error in loading image:{}".format(image_file)) + continue + starttime = time.time() + dt_boxes, rec_res = text_sys(img) + elapse = time.time() - starttime + total_time += elapse + + + res = [{ + "transcription": rec_res[idx][0], + "points": np.array(dt_boxes[idx]).astype(np.int32).tolist(), + } for idx in range(len(dt_boxes))] + save_pred = os.path.basename(image_file) + "\t" + json.dumps( + res, ensure_ascii=False) + "\n" + save_results.append(save_pred) + + if is_visualize: + image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)) + boxes = dt_boxes + txts = [rec_res[i][0] for i in range(len(rec_res))] + scores = [rec_res[i][1] for i in range(len(rec_res))] + + draw_img = draw_ocr_box_txt( + image, + boxes, + txts, + scores, + drop_score=drop_score, + font_path=font_path) + if flag: + image_file = image_file[:-3] + "png" + cv2.imwrite( + os.path.join(draw_img_save_dir, os.path.basename(image_file)), + draw_img[:, :, ::-1]) + + + logger.info("The predict total time is {}".format(time.time() - _st)) + if args.benchmark: + text_sys.text_detector.autolog.report() + text_sys.text_recognizer.autolog.report() + + with open(os.path.join(draw_img_save_dir, "system_results.txt"), 'w', encoding='utf-8') as f: + f.writelines(save_results) + + +if __name__ == "__main__": + args = utility.parse_args() + if args.use_mp: + p_list = [] + total_process_num = args.total_process_num + for process_id in range(total_process_num): + cmd = [sys.executable, "-u"] + sys.argv + [ + "--process_id={}".format(process_id), + "--use_mp={}".format(False) + ] + p = subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stdout) + p_list.append(p) + for p in p_list: + p.wait() + else: + main(args) diff --git a/backend/tools/infer/utility.py b/backend/tools/infer/utility.py new file mode 100644 index 0000000..33f0a48 --- /dev/null +++ b/backend/tools/infer/utility.py @@ -0,0 +1,645 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import sys +import platform +import cv2 +import numpy as np +import paddle +from PIL import Image, ImageDraw, ImageFont +import math +from paddle import inference +import time +from ppocr.utils.logging import get_logger + + +def str2bool(v): + return v.lower() in ("true", "t", "1") + + +def init_args(): + parser = argparse.ArgumentParser() + # params for prediction engine + parser.add_argument("--use_gpu", type=str2bool, default=True) + parser.add_argument("--ir_optim", type=str2bool, default=True) + parser.add_argument("--use_tensorrt", type=str2bool, default=False) + parser.add_argument("--min_subgraph_size", type=int, default=15) + parser.add_argument("--precision", type=str, default="fp32") + parser.add_argument("--gpu_mem", type=int, default=500) + + # params for text detector + parser.add_argument("--image_dir", type=str) + parser.add_argument("--det_algorithm", type=str, default='DB') + parser.add_argument("--det_model_dir", type=str) + parser.add_argument("--det_limit_side_len", type=float, default=960) + parser.add_argument("--det_limit_type", type=str, default='max') + + # DB parmas + parser.add_argument("--det_db_thresh", type=float, default=0.3) + parser.add_argument("--det_db_box_thresh", type=float, default=0.6) + parser.add_argument("--det_db_unclip_ratio", type=float, default=1.5) + parser.add_argument("--max_batch_size", type=int, default=10) + parser.add_argument("--use_dilation", type=str2bool, default=False) + parser.add_argument("--det_db_score_mode", type=str, default="fast") + # EAST parmas + parser.add_argument("--det_east_score_thresh", type=float, default=0.8) + parser.add_argument("--det_east_cover_thresh", type=float, default=0.1) + parser.add_argument("--det_east_nms_thresh", type=float, default=0.2) + + # SAST parmas + parser.add_argument("--det_sast_score_thresh", type=float, default=0.5) + parser.add_argument("--det_sast_nms_thresh", type=float, default=0.2) + parser.add_argument("--det_sast_polygon", type=str2bool, default=False) + + # PSE parmas + parser.add_argument("--det_pse_thresh", type=float, default=0) + parser.add_argument("--det_pse_box_thresh", type=float, default=0.85) + parser.add_argument("--det_pse_min_area", type=float, default=16) + parser.add_argument("--det_pse_box_type", type=str, default='quad') + parser.add_argument("--det_pse_scale", type=int, default=1) + + # FCE parmas + parser.add_argument("--scales", type=list, default=[8, 16, 32]) + parser.add_argument("--alpha", type=float, default=1.0) + parser.add_argument("--beta", type=float, default=1.0) + parser.add_argument("--fourier_degree", type=int, default=5) + parser.add_argument("--det_fce_box_type", type=str, default='poly') + + # params for text recognizer + parser.add_argument("--rec_algorithm", type=str, default='CRNN') + parser.add_argument("--rec_model_dir", type=str) + parser.add_argument("--rec_image_shape", type=str, default="3, 48, 320") + parser.add_argument("--rec_batch_num", type=int, default=6) + parser.add_argument("--max_text_length", type=int, default=25) + parser.add_argument( + "--rec_char_dict_path", + type=str, + 
default="./ppocr/utils/ppocr_keys_v1.txt") + parser.add_argument("--use_space_char", type=str2bool, default=True) + parser.add_argument( + "--vis_font_path", type=str, default="./doc/fonts/simfang.ttf") + parser.add_argument("--drop_score", type=float, default=0.5) + + # params for e2e + parser.add_argument("--e2e_algorithm", type=str, default='PGNet') + parser.add_argument("--e2e_model_dir", type=str) + parser.add_argument("--e2e_limit_side_len", type=float, default=768) + parser.add_argument("--e2e_limit_type", type=str, default='max') + + # PGNet parmas + parser.add_argument("--e2e_pgnet_score_thresh", type=float, default=0.5) + parser.add_argument( + "--e2e_char_dict_path", type=str, default="./ppocr/utils/ic15_dict.txt") + parser.add_argument("--e2e_pgnet_valid_set", type=str, default='totaltext') + parser.add_argument("--e2e_pgnet_mode", type=str, default='fast') + + # params for text classifier + parser.add_argument("--use_angle_cls", type=str2bool, default=False) + parser.add_argument("--cls_model_dir", type=str) + parser.add_argument("--cls_image_shape", type=str, default="3, 48, 192") + parser.add_argument("--label_list", type=list, default=['0', '180']) + parser.add_argument("--cls_batch_num", type=int, default=6) + parser.add_argument("--cls_thresh", type=float, default=0.9) + + parser.add_argument("--enable_mkldnn", type=str2bool, default=False) + parser.add_argument("--cpu_threads", type=int, default=10) + parser.add_argument("--use_pdserving", type=str2bool, default=False) + parser.add_argument("--warmup", type=str2bool, default=False) + + # + parser.add_argument( + "--draw_img_save_dir", type=str, default="./inference_results") + parser.add_argument("--save_crop_res", type=str2bool, default=False) + parser.add_argument("--crop_res_save_dir", type=str, default="./output") + + # multi-process + parser.add_argument("--use_mp", type=str2bool, default=False) + parser.add_argument("--total_process_num", type=int, default=1) + parser.add_argument("--process_id", type=int, default=0) + + parser.add_argument("--benchmark", type=str2bool, default=False) + parser.add_argument("--save_log_path", type=str, default="./log_output/") + + parser.add_argument("--show_log", type=str2bool, default=True) + parser.add_argument("--use_onnx", type=str2bool, default=False) + return parser + + +def parse_args(): + parser = init_args() + return parser.parse_args() + + +def create_predictor(args, mode, logger): + if mode == "det": + model_dir = args.det_model_dir + elif mode == 'cls': + model_dir = args.cls_model_dir + elif mode == 'rec': + model_dir = args.rec_model_dir + elif mode == 'table': + model_dir = args.table_model_dir + else: + model_dir = args.e2e_model_dir + + if model_dir is None: + logger.info("not find {} model file path {}".format(mode, model_dir)) + sys.exit(0) + if args.use_onnx: + import onnxruntime as ort + model_file_path = model_dir + if not os.path.exists(model_file_path): + raise ValueError("not find model file path {}".format( + model_file_path)) + sess = ort.InferenceSession(model_file_path) + return sess, sess.get_inputs()[0], None, None + + else: + model_file_path = model_dir + "/inference.pdmodel" + params_file_path = model_dir + "/inference.pdiparams" + if not os.path.exists(model_file_path): + raise ValueError("not find model file path {}".format( + model_file_path)) + if not os.path.exists(params_file_path): + raise ValueError("not find params file path {}".format( + params_file_path)) + + config = inference.Config(model_file_path, params_file_path) + + if 
hasattr(args, 'precision'): + if args.precision == "fp16" and args.use_tensorrt: + precision = inference.PrecisionType.Half + elif args.precision == "int8": + precision = inference.PrecisionType.Int8 + else: + precision = inference.PrecisionType.Float32 + else: + precision = inference.PrecisionType.Float32 + + if args.use_gpu: + gpu_id = get_infer_gpuid() + if gpu_id is None: + logger.warning( + "GPU is not found in current device by nvidia-smi. Please check your device or ignore it if run on jetson." + ) + config.enable_use_gpu(args.gpu_mem, 0) + if args.use_tensorrt: + config.enable_tensorrt_engine( + workspace_size=1 << 30, + precision_mode=precision, + max_batch_size=args.max_batch_size, + min_subgraph_size=args.min_subgraph_size) + # skip the minmum trt subgraph + use_dynamic_shape = True + if mode == "det": + min_input_shape = { + "x": [1, 3, 50, 50], + "conv2d_92.tmp_0": [1, 120, 20, 20], + "conv2d_91.tmp_0": [1, 24, 10, 10], + "conv2d_59.tmp_0": [1, 96, 20, 20], + "nearest_interp_v2_1.tmp_0": [1, 256, 10, 10], + "nearest_interp_v2_2.tmp_0": [1, 256, 20, 20], + "conv2d_124.tmp_0": [1, 256, 20, 20], + "nearest_interp_v2_3.tmp_0": [1, 64, 20, 20], + "nearest_interp_v2_4.tmp_0": [1, 64, 20, 20], + "nearest_interp_v2_5.tmp_0": [1, 64, 20, 20], + "elementwise_add_7": [1, 56, 2, 2], + "nearest_interp_v2_0.tmp_0": [1, 256, 2, 2] + } + max_input_shape = { + "x": [1, 3, 1536, 1536], + "conv2d_92.tmp_0": [1, 120, 400, 400], + "conv2d_91.tmp_0": [1, 24, 200, 200], + "conv2d_59.tmp_0": [1, 96, 400, 400], + "nearest_interp_v2_1.tmp_0": [1, 256, 200, 200], + "conv2d_124.tmp_0": [1, 256, 400, 400], + "nearest_interp_v2_2.tmp_0": [1, 256, 400, 400], + "nearest_interp_v2_3.tmp_0": [1, 64, 400, 400], + "nearest_interp_v2_4.tmp_0": [1, 64, 400, 400], + "nearest_interp_v2_5.tmp_0": [1, 64, 400, 400], + "elementwise_add_7": [1, 56, 400, 400], + "nearest_interp_v2_0.tmp_0": [1, 256, 400, 400] + } + opt_input_shape = { + "x": [1, 3, 640, 640], + "conv2d_92.tmp_0": [1, 120, 160, 160], + "conv2d_91.tmp_0": [1, 24, 80, 80], + "conv2d_59.tmp_0": [1, 96, 160, 160], + "nearest_interp_v2_1.tmp_0": [1, 256, 80, 80], + "nearest_interp_v2_2.tmp_0": [1, 256, 160, 160], + "conv2d_124.tmp_0": [1, 256, 160, 160], + "nearest_interp_v2_3.tmp_0": [1, 64, 160, 160], + "nearest_interp_v2_4.tmp_0": [1, 64, 160, 160], + "nearest_interp_v2_5.tmp_0": [1, 64, 160, 160], + "elementwise_add_7": [1, 56, 40, 40], + "nearest_interp_v2_0.tmp_0": [1, 256, 40, 40] + } + min_pact_shape = { + "nearest_interp_v2_26.tmp_0": [1, 256, 20, 20], + "nearest_interp_v2_27.tmp_0": [1, 64, 20, 20], + "nearest_interp_v2_28.tmp_0": [1, 64, 20, 20], + "nearest_interp_v2_29.tmp_0": [1, 64, 20, 20] + } + max_pact_shape = { + "nearest_interp_v2_26.tmp_0": [1, 256, 400, 400], + "nearest_interp_v2_27.tmp_0": [1, 64, 400, 400], + "nearest_interp_v2_28.tmp_0": [1, 64, 400, 400], + "nearest_interp_v2_29.tmp_0": [1, 64, 400, 400] + } + opt_pact_shape = { + "nearest_interp_v2_26.tmp_0": [1, 256, 160, 160], + "nearest_interp_v2_27.tmp_0": [1, 64, 160, 160], + "nearest_interp_v2_28.tmp_0": [1, 64, 160, 160], + "nearest_interp_v2_29.tmp_0": [1, 64, 160, 160] + } + min_input_shape.update(min_pact_shape) + max_input_shape.update(max_pact_shape) + opt_input_shape.update(opt_pact_shape) + elif mode == "rec": + if args.rec_algorithm != "CRNN": + use_dynamic_shape = False + imgH = int(args.rec_image_shape.split(',')[-2]) + min_input_shape = {"x": [1, 3, imgH, 10]} + max_input_shape = {"x": [args.rec_batch_num, 3, imgH, 1536]} + opt_input_shape = {"x": 
[args.rec_batch_num, 3, imgH, 320]} + elif mode == "cls": + min_input_shape = {"x": [1, 3, 48, 10]} + max_input_shape = {"x": [args.rec_batch_num, 3, 48, 1024]} + opt_input_shape = {"x": [args.rec_batch_num, 3, 48, 320]} + else: + use_dynamic_shape = False + if use_dynamic_shape: + config.set_trt_dynamic_shape_info( + min_input_shape, max_input_shape, opt_input_shape) + + else: + config.disable_gpu() + if hasattr(args, "cpu_threads"): + config.set_cpu_math_library_num_threads(args.cpu_threads) + else: + # default cpu threads as 10 + config.set_cpu_math_library_num_threads(10) + if args.enable_mkldnn: + # cache 10 different shapes for mkldnn to avoid memory leak + config.set_mkldnn_cache_capacity(10) + config.enable_mkldnn() + if args.precision == "fp16": + config.enable_mkldnn_bfloat16() + # enable memory optim + config.enable_memory_optim() + config.disable_glog_info() + config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass") + config.delete_pass("matmul_transpose_reshape_fuse_pass") + if mode == 'table': + config.delete_pass("fc_fuse_pass") # not supported for table + config.switch_use_feed_fetch_ops(False) + config.switch_ir_optim(True) + + # create predictor + predictor = inference.create_predictor(config) + input_names = predictor.get_input_names() + for name in input_names: + input_tensor = predictor.get_input_handle(name) + output_tensors = get_output_tensors(args, mode, predictor) + return predictor, input_tensor, output_tensors, config + + +def get_output_tensors(args, mode, predictor): + output_names = predictor.get_output_names() + output_tensors = [] + if mode == "rec" and args.rec_algorithm == "CRNN": + output_name = 'softmax_0.tmp_0' + if output_name in output_names: + return [predictor.get_output_handle(output_name)] + else: + for output_name in output_names: + output_tensor = predictor.get_output_handle(output_name) + output_tensors.append(output_tensor) + else: + for output_name in output_names: + output_tensor = predictor.get_output_handle(output_name) + output_tensors.append(output_tensor) + return output_tensors + + +def get_infer_gpuid(): + sysstr = platform.system() + if sysstr == "Windows": + return 0 + + if not paddle.fluid.core.is_compiled_with_rocm(): + cmd = "env | grep CUDA_VISIBLE_DEVICES" + else: + cmd = "env | grep HIP_VISIBLE_DEVICES" + env_cuda = os.popen(cmd).readlines() + if len(env_cuda) == 0: + return 0 + else: + gpu_id = env_cuda[0].strip().split("=")[1] + return int(gpu_id[0]) + + +def draw_e2e_res(dt_boxes, strs, img_path): + src_im = cv2.imread(img_path) + for box, str in zip(dt_boxes, strs): + box = box.astype(np.int32).reshape((-1, 1, 2)) + cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2) + cv2.putText( + src_im, + str, + org=(int(box[0, 0, 0]), int(box[0, 0, 1])), + fontFace=cv2.FONT_HERSHEY_COMPLEX, + fontScale=0.7, + color=(0, 255, 0), + thickness=1) + return src_im + + +def draw_text_det_res(dt_boxes, img_path): + src_im = cv2.imread(img_path) + for box in dt_boxes: + box = np.array(box).astype(np.int32).reshape(-1, 2) + cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2) + return src_im + + +def resize_img(img, input_size=600): + """ + resize img and limit the longest side of the image to input_size + """ + img = np.array(img) + im_shape = img.shape + im_size_max = np.max(im_shape[0:2]) + im_scale = float(input_size) / float(im_size_max) + img = cv2.resize(img, None, None, fx=im_scale, fy=im_scale) + return img + + +def draw_ocr(image, + boxes, + txts=None, + scores=None, + drop_score=0.5, + 
font_path="./doc/fonts/simfang.ttf"): + """ + Visualize the results of OCR detection and recognition + args: + image(Image|array): RGB image + boxes(list): boxes with shape(N, 4, 2) + txts(list): the texts + scores(list): txxs corresponding scores + drop_score(float): only scores greater than drop_threshold will be visualized + font_path: the path of font which is used to draw text + return(array): + the visualized img + """ + if scores is None: + scores = [1] * len(boxes) + box_num = len(boxes) + for i in range(box_num): + if scores is not None and (scores[i] < drop_score or + math.isnan(scores[i])): + continue + box = np.reshape(np.array(boxes[i]), [-1, 1, 2]).astype(np.int64) + image = cv2.polylines(np.array(image), [box], True, (255, 0, 0), 2) + if txts is not None: + img = np.array(resize_img(image, input_size=600)) + txt_img = text_visual( + txts, + scores, + img_h=img.shape[0], + img_w=600, + threshold=drop_score, + font_path=font_path) + img = np.concatenate([np.array(img), np.array(txt_img)], axis=1) + return img + return image + + +def draw_ocr_box_txt(image, + boxes, + txts, + scores=None, + drop_score=0.5, + font_path="./doc/simfang.ttf"): + h, w = image.height, image.width + img_left = image.copy() + img_right = Image.new('RGB', (w, h), (255, 255, 255)) + + import random + + random.seed(0) + draw_left = ImageDraw.Draw(img_left) + draw_right = ImageDraw.Draw(img_right) + for idx, (box, txt) in enumerate(zip(boxes, txts)): + if scores is not None and scores[idx] < drop_score: + continue + color = (random.randint(0, 255), random.randint(0, 255), + random.randint(0, 255)) + draw_left.polygon(box, fill=color) + draw_right.polygon( + [ + box[0][0], box[0][1], box[1][0], box[1][1], box[2][0], + box[2][1], box[3][0], box[3][1] + ], + outline=color) + box_height = math.sqrt((box[0][0] - box[3][0])**2 + (box[0][1] - box[3][ + 1])**2) + box_width = math.sqrt((box[0][0] - box[1][0])**2 + (box[0][1] - box[1][ + 1])**2) + if box_height > 2 * box_width: + font_size = max(int(box_width * 0.9), 10) + font = ImageFont.truetype(font_path, font_size, encoding="utf-8") + cur_y = box[0][1] + for c in txt: + char_size = font.getsize(c) + draw_right.text( + (box[0][0] + 3, cur_y), c, fill=(0, 0, 0), font=font) + cur_y += char_size[1] + else: + font_size = max(int(box_height * 0.8), 10) + font = ImageFont.truetype(font_path, font_size, encoding="utf-8") + draw_right.text( + [box[0][0], box[0][1]], txt, fill=(0, 0, 0), font=font) + img_left = Image.blend(image, img_left, 0.5) + img_show = Image.new('RGB', (w * 2, h), (255, 255, 255)) + img_show.paste(img_left, (0, 0, w, h)) + img_show.paste(img_right, (w, 0, w * 2, h)) + return np.array(img_show) + + +def str_count(s): + """ + Count the number of Chinese characters, + a single English character and a single number + equal to half the length of Chinese characters. 
+ args: + s(string): the input of string + return(int): + the number of Chinese characters + """ + import string + count_zh = count_pu = 0 + s_len = len(s) + en_dg_count = 0 + for c in s: + if c in string.ascii_letters or c.isdigit() or c.isspace(): + en_dg_count += 1 + elif c.isalpha(): + count_zh += 1 + else: + count_pu += 1 + return s_len - math.ceil(en_dg_count / 2) + + +def text_visual(texts, + scores, + img_h=400, + img_w=600, + threshold=0., + font_path="./doc/simfang.ttf"): + """ + create new blank img and draw txt on it + args: + texts(list): the text will be draw + scores(list|None): corresponding score of each txt + img_h(int): the height of blank img + img_w(int): the width of blank img + font_path: the path of font which is used to draw text + return(array): + """ + if scores is not None: + assert len(texts) == len( + scores), "The number of txts and corresponding scores must match" + + def create_blank_img(): + blank_img = np.ones(shape=[img_h, img_w], dtype=np.int8) * 255 + blank_img[:, img_w - 1:] = 0 + blank_img = Image.fromarray(blank_img).convert("RGB") + draw_txt = ImageDraw.Draw(blank_img) + return blank_img, draw_txt + + blank_img, draw_txt = create_blank_img() + + font_size = 20 + txt_color = (0, 0, 0) + font = ImageFont.truetype(font_path, font_size, encoding="utf-8") + + gap = font_size + 5 + txt_img_list = [] + count, index = 1, 0 + for idx, txt in enumerate(texts): + index += 1 + if scores[idx] < threshold or math.isnan(scores[idx]): + index -= 1 + continue + first_line = True + while str_count(txt) >= img_w // font_size - 4: + tmp = txt + txt = tmp[:img_w // font_size - 4] + if first_line: + new_txt = str(index) + ': ' + txt + first_line = False + else: + new_txt = ' ' + txt + draw_txt.text((0, gap * count), new_txt, txt_color, font=font) + txt = tmp[img_w // font_size - 4:] + if count >= img_h // gap - 1: + txt_img_list.append(np.array(blank_img)) + blank_img, draw_txt = create_blank_img() + count = 0 + count += 1 + if first_line: + new_txt = str(index) + ': ' + txt + ' ' + '%.3f' % (scores[idx]) + else: + new_txt = " " + txt + " " + '%.3f' % (scores[idx]) + draw_txt.text((0, gap * count), new_txt, txt_color, font=font) + # whether add new blank img or not + if count >= img_h // gap - 1 and idx + 1 < len(texts): + txt_img_list.append(np.array(blank_img)) + blank_img, draw_txt = create_blank_img() + count = 0 + count += 1 + txt_img_list.append(np.array(blank_img)) + if len(txt_img_list) == 1: + blank_img = np.array(txt_img_list[0]) + else: + blank_img = np.concatenate(txt_img_list, axis=1) + return np.array(blank_img) + + +def base64_to_cv2(b64str): + import base64 + data = base64.b64decode(b64str.encode('utf8')) + data = np.fromstring(data, np.uint8) + data = cv2.imdecode(data, cv2.IMREAD_COLOR) + return data + + +def draw_boxes(image, boxes, scores=None, drop_score=0.5): + if scores is None: + scores = [1] * len(boxes) + for (box, score) in zip(boxes, scores): + if score < drop_score: + continue + box = np.reshape(np.array(box), [-1, 1, 2]).astype(np.int64) + image = cv2.polylines(np.array(image), [box], True, (255, 0, 0), 2) + return image + + +def get_rotate_crop_image(img, points): + ''' + img_height, img_width = img.shape[0:2] + left = int(np.min(points[:, 0])) + right = int(np.max(points[:, 0])) + top = int(np.min(points[:, 1])) + bottom = int(np.max(points[:, 1])) + img_crop = img[top:bottom, left:right, :].copy() + points[:, 0] = points[:, 0] - left + points[:, 1] = points[:, 1] - top + ''' + assert len(points) == 4, "shape of points must be 4*2" + 
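+    # Descriptive note on the crop logic that follows: the crop size is taken from the
+    # longer of each pair of opposite edges of the quadrilateral, the region is warped
+    # onto an axis-aligned rectangle of that size with a perspective transform, and
+    # near-vertical crops (height >= 1.5x width) are rotated 90 degrees so the text reads horizontally.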
img_crop_width = int( + max( + np.linalg.norm(points[0] - points[1]), + np.linalg.norm(points[2] - points[3]))) + img_crop_height = int( + max( + np.linalg.norm(points[0] - points[3]), + np.linalg.norm(points[1] - points[2]))) + pts_std = np.float32([[0, 0], [img_crop_width, 0], + [img_crop_width, img_crop_height], + [0, img_crop_height]]) + M = cv2.getPerspectiveTransform(points, pts_std) + dst_img = cv2.warpPerspective( + img, + M, (img_crop_width, img_crop_height), + borderMode=cv2.BORDER_REPLICATE, + flags=cv2.INTER_CUBIC) + dst_img_height, dst_img_width = dst_img.shape[0:2] + if dst_img_height * 1.0 / dst_img_width >= 1.5: + dst_img = np.rot90(dst_img) + return dst_img + + +def check_gpu(use_gpu): + if use_gpu and not paddle.is_compiled_with_cuda(): + use_gpu = False + return use_gpu + + +if __name__ == '__main__': + pass diff --git a/backend/tools/inpaint_tools.py b/backend/tools/inpaint_tools.py new file mode 100644 index 0000000..527b6c2 --- /dev/null +++ b/backend/tools/inpaint_tools.py @@ -0,0 +1,109 @@ +import multiprocessing +import cv2 +import numpy as np + +from backend import config +from backend.inpaint.lama_inpaint import LamaInpaint + + +def batch_generator(data, max_batch_size): + """ + 根据data大小,生成最大长度不超过max_batch_size的均匀批次数据 + """ + n_samples = len(data) + # 尝试找到一个比MAX_BATCH_SIZE小的batch_size,以使得所有的批次数量尽量接近 + batch_size = max_batch_size + num_batches = n_samples // batch_size + + # 处理最后一批可能不足batch_size的情况 + # 如果最后一批少于其他批次,则减小batch_size尝试平衡每批的数量 + while n_samples % batch_size < batch_size / 2.0 and batch_size > 1: + batch_size -= 1 # 减小批次大小 + num_batches = n_samples // batch_size + + # 生成前num_batches个批次 + for i in range(num_batches): + yield data[i * batch_size:(i + 1) * batch_size] + + # 将剩余的数据作为最后一个批次 + last_batch_start = num_batches * batch_size + if last_batch_start < n_samples: + yield data[last_batch_start:] + + +def inference_task(batch_data): + inpainted_frame_dict = dict() + for data in batch_data: + index, original_frame, coords_list = data + mask_size = original_frame.shape[:2] + mask = create_mask(mask_size, coords_list) + inpaint_frame = inpaint(original_frame, mask) + inpainted_frame_dict[index] = inpaint_frame + return inpainted_frame_dict + + +def parallel_inference(inputs, batch_size=None, pool_size=None): + """ + 并行推理,同时保持结果顺序 + """ + if pool_size is None: + pool_size = multiprocessing.cpu_count() + # 使用上下文管理器自动管理进程池 + with multiprocessing.Pool(processes=pool_size) as pool: + batched_inputs = list(batch_generator(inputs, batch_size)) + # 使用map函数保证输入输出的顺序是一致的 + batch_results = pool.map(inference_task, batched_inputs) + # 将批推理结果展平 + index_inpainted_frames = [item for sublist in batch_results for item in sublist] + return index_inpainted_frames + + +def inpaint(img, mask): + lama_inpaint_instance = LamaInpaint() + img_inpainted = lama_inpaint_instance(img, mask) + return img_inpainted + + +def inpaint_with_multiple_masks(censored_img, mask_list): + inpainted_frame = censored_img + if mask_list: + for mask in mask_list: + inpainted_frame = inpaint(inpainted_frame, mask) + return inpainted_frame + + +def create_mask(size, coords_list): + mask = np.zeros(size, dtype="uint8") + if coords_list: + for coords in coords_list: + xmin, xmax, ymin, ymax = coords + # 为了避免框过小,放大10个像素 + cv2.rectangle(mask, (xmin - config.SUBTITLE_AREA_DEVIATION_PIXEL, ymin - config.SUBTITLE_AREA_DEVIATION_PIXEL), + (xmax + config.SUBTITLE_AREA_DEVIATION_PIXEL, ymax + config.SUBTITLE_AREA_DEVIATION_PIXEL), (255, 255, 255), thickness=-1) + return mask + + +def inpaint_video(video_path, 
sub_list): + index = 0 + frame_to_inpaint_list = [] + video_cap = cv2.VideoCapture(video_path) + while True: + # 读取视频帧 + ret, frame = video_cap.read() + if not ret: + break + index += 1 + if index in sub_list.keys(): + frame_to_inpaint_list.append((index, frame, sub_list[index])) + if len(frame_to_inpaint_list) > config.MAX_LOAD_NUM: + batch_results = parallel_inference(frame_to_inpaint_list) + for index, frame in batch_results: + file_name = f'/home/yao/Documents/Project/video-subtitle-remover/test/temp/{index}.png' + cv2.imwrite(file_name, frame) + print(f"success write: {file_name}") + frame_to_inpaint_list.clear() + print(f'finished') + + +if __name__ == '__main__': + multiprocessing.set_start_method("spawn") diff --git a/backend/tools/merge_video.py b/backend/tools/merge_video.py new file mode 100644 index 0000000..955e631 --- /dev/null +++ b/backend/tools/merge_video.py @@ -0,0 +1,33 @@ +import cv2 + + +def merge_video(video_input_path0, video_input_path1, video_input_path2, video_output_path): + """ + 将两个视频文件安装水平方向合并 + """ + input_video_cap0 = cv2.VideoCapture(video_input_path0) + input_video_cap1 = cv2.VideoCapture(video_input_path1) + input_video_cap2 = cv2.VideoCapture(video_input_path2) + fps = input_video_cap1.get(cv2.CAP_PROP_FPS) + size = (int(input_video_cap1.get(cv2.CAP_PROP_FRAME_WIDTH)), int(input_video_cap2.get(cv2.CAP_PROP_FRAME_HEIGHT)) * 3) + video_writer = cv2.VideoWriter(video_output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, size) + while True: + ret0, frame0 = input_video_cap0.read() + ret1, frame1 = input_video_cap1.read() + ret2, frame2 = input_video_cap2.read() + if not ret1 and not ret2: + break + else: + show = cv2.vconcat([frame0, frame1, frame2]) + video_writer.write(show) + video_writer.release() + + +if __name__ == '__main__': + v0_path = '../../test/test1.mp4' + v1_path = '../../test/test1_no_sub(bak2).mp4' + v2_path = '../../test/test1_no_sub.mp4' + video_out_path = '../../test/demo.mp4' + merge_video(v0_path, v1_path, v2_path, video_out_path) + # ffmpeg 命令 mp4转gif + # ffmpeg -i demo3.mp4 -vf "scale=w=720:h=-1,fps=15,split[s0][s1];[s0]palettegen[p];[s1][p]paletteuse" -loop 0 -r 15 -f gif output.gif diff --git a/design/demo.gif b/design/demo.gif new file mode 100644 index 0000000..28addf6 Binary files /dev/null and b/design/demo.gif differ diff --git a/design/demo.jpg b/design/demo.jpg new file mode 100644 index 0000000..fa627f7 Binary files /dev/null and b/design/demo.jpg differ diff --git a/design/demo.png b/design/demo.png new file mode 100644 index 0000000..5917f1c Binary files /dev/null and b/design/demo.png differ diff --git a/design/demo2.gif b/design/demo2.gif new file mode 100644 index 0000000..5839bc3 Binary files /dev/null and b/design/demo2.gif differ diff --git a/design/vsr.ico b/design/vsr.ico new file mode 100644 index 0000000..84a782b Binary files /dev/null and b/design/vsr.ico differ diff --git a/gui.py b/gui.py new file mode 100644 index 0000000..851c8d2 --- /dev/null +++ b/gui.py @@ -0,0 +1,371 @@ +# -*- coding: utf-8 -*- +""" +@Author : Fang Yao +@Time : 2021/4/1 6:07 下午 +@FileName: gui.py +@desc: 字幕提取器图形化界面 +""" +import backend.main +import os +import configparser +import PySimpleGUI as sg +import cv2 +from threading import Thread +import multiprocessing + + +class SubtitleRemoverGUI: + + def __init__(self): + # 初次运行检查运行环境是否正常 + from paddle import fluid + fluid.install_check.run_check() + self.font = 'Arial 10' + self.theme = 'LightBrown12' + sg.theme(self.theme) + self.icon = os.path.join(os.path.dirname(__file__), 'design', 
'vsr.ico') + self.screen_width, self.screen_height = sg.Window.get_screen_size() + self.subtitle_config_file = os.path.join(os.path.dirname(__file__), 'subtitle.ini') + print(self.screen_width, self.screen_height) + # 设置视频预览区域大小 + self.video_preview_width = 960 + self.video_preview_height = self.video_preview_width * 9 // 16 + # 默认组件大小 + self.horizontal_slider_size = (120, 20) + self.output_size = (100, 10) + self.progressbar_size = (60, 20) + # 分辨率低于1080 + if self.screen_width // 2 < 960: + self.video_preview_width = 640 + self.video_preview_height = self.video_preview_width * 9 // 16 + self.horizontal_slider_size = (60, 20) + self.output_size = (58, 10) + self.progressbar_size = (28, 20) + # 字幕提取器布局 + self.layout = None + # 字幕提取其窗口 + self.window = None + # 视频路径 + self.video_path = None + # 视频cap + self.video_cap = None + # 视频的帧率 + self.fps = None + # 视频的帧数 + self.frame_count = None + # 视频的宽 + self.frame_width = None + # 视频的高 + self.frame_height = None + # 设置字幕区域高宽 + self.xmin = None + self.xmax = None + self.ymin = None + self.ymax = None + # 字幕提取器 + self.sr = None + + def run(self): + # 创建布局 + self._create_layout() + # 创建窗口 + self.window = sg.Window(title='Video Subtitle Remover', layout=self.layout, + icon=self.icon) + while True: + # 循环读取事件 + event, values = self.window.read(timeout=10) + # 处理【打开】事件 + self._file_event_handler(event, values) + # 处理【滑动】事件 + self._slide_event_handler(event, values) + # 处理【运行】事件 + self._run_event_handler(event, values) + # 如果关闭软件,退出 + if event == sg.WIN_CLOSED: + break + # 更新进度条 + if self.sr is not None: + self.window['-PROG-'].update(self.sr.progress_total) + if self.sr.preview_frame is not None: + self.window['-DISPLAY-'].update(data=cv2.imencode('.png', self._img_resize(self.sr.preview_frame))[1].tobytes()) + if self.sr.isFinished: + # 1) 打开修改字幕滑块区域按钮 + self.window['-Y-SLIDER-'].update(disabled=False) + self.window['-X-SLIDER-'].update(disabled=False) + self.window['-Y-SLIDER-H-'].update(disabled=False) + self.window['-X-SLIDER-W-'].update(disabled=False) + # 2) 打开【运行】、【打开】和【识别语言】按钮 + self.window['-RUN-'].update(disabled=False) + self.window['-FILE-'].update(disabled=False) + self.window['-FILE_BTN-'].update(disabled=False) + self.sr = None + if len(self.video_paths) >= 1: + # 1) 关闭修改字幕滑块区域按钮 + self.window['-Y-SLIDER-'].update(disabled=True) + self.window['-X-SLIDER-'].update(disabled=True) + self.window['-Y-SLIDER-H-'].update(disabled=True) + self.window['-X-SLIDER-W-'].update(disabled=True) + # 2) 关闭【运行】、【打开】和【识别语言】按钮 + self.window['-RUN-'].update(disabled=True) + self.window['-FILE-'].update(disabled=True) + self.window['-FILE_BTN-'].update(disabled=True) + + def _create_layout(self): + """ + 创建字幕提取器布局 + """ + garbage = os.path.join(os.path.dirname(__file__), 'output') + if os.path.exists(garbage): + import shutil + shutil.rmtree(garbage, True) + self.layout = [ + # 显示视频预览 + [sg.Image(size=(self.video_preview_width, self.video_preview_height), background_color='black', + key='-DISPLAY-')], + # 打开按钮 + 快进快退条 + [sg.Input(key='-FILE-', visible=False, enable_events=True), + sg.FilesBrowse(button_text='Open', file_types=(( + 'All Files', '*.*'), ('mp4', '*.mp4'), + ('flv', '*.flv'), + ('wmv', '*.wmv'), + ('avi', '*.avi')), + key='-FILE_BTN-', size=(10, 1), font=self.font), + sg.Slider(size=self.horizontal_slider_size, range=(1, 1), key='-SLIDER-', orientation='h', + enable_events=True, font=self.font, + disable_number_display=True), + ], + # 输出区域 + [sg.Output(size=self.output_size, font=self.font), + sg.Frame(title='Vertical', font=self.font, 
key='-FRAME1-', + layout=[[ + sg.Slider(range=(0, 0), orientation='v', size=(10, 20), + disable_number_display=True, + enable_events=True, font=self.font, + pad=((10, 10), (20, 20)), + default_value=0, key='-Y-SLIDER-'), + sg.Slider(range=(0, 0), orientation='v', size=(10, 20), + disable_number_display=True, + enable_events=True, font=self.font, + pad=((10, 10), (20, 20)), + default_value=0, key='-Y-SLIDER-H-'), + ]], pad=((15, 5), (0, 0))), + sg.Frame(title='Horizontal', font=self.font, key='-FRAME2-', + layout=[[ + sg.Slider(range=(0, 0), orientation='v', size=(10, 20), + disable_number_display=True, + pad=((10, 10), (20, 20)), + enable_events=True, font=self.font, + default_value=0, key='-X-SLIDER-'), + sg.Slider(range=(0, 0), orientation='v', size=(10, 20), + disable_number_display=True, + pad=((10, 10), (20, 20)), + enable_events=True, font=self.font, + default_value=0, key='-X-SLIDER-W-'), + ]], pad=((15, 5), (0, 0))) + ], + + # 运行按钮 + 进度条 + [sg.Button(button_text='Run', key='-RUN-', + font=self.font, size=(20, 1)), + sg.ProgressBar(100, orientation='h', size=self.progressbar_size, key='-PROG-', auto_size_text=True) + ], + ] + + def _file_event_handler(self, event, values): + """ + 当点击打开按钮时: + 1)打开视频文件,将画布显示视频帧 + 2)获取视频信息,初始化进度条滑块范围 + """ + if event == '-FILE-': + self.video_paths = values['-FILE-'].split(';') + self.video_path = self.video_paths[0] + if self.video_path != '': + self.video_cap = cv2.VideoCapture(self.video_path) + if self.video_cap is None: + return + if self.video_cap.isOpened(): + ret, frame = self.video_cap.read() + if ret: + for video in self.video_paths: + print(f"Open Video Success:{video}") + # 获取视频的帧数 + self.frame_count = self.video_cap.get(cv2.CAP_PROP_FRAME_COUNT) + # 获取视频的高度 + self.frame_height = self.video_cap.get(cv2.CAP_PROP_FRAME_HEIGHT) + # 获取视频的宽度 + self.frame_width = self.video_cap.get(cv2.CAP_PROP_FRAME_WIDTH) + # 获取视频的帧率 + self.fps = self.video_cap.get(cv2.CAP_PROP_FPS) + # 调整视频帧大小,使播放器能够显示 + resized_frame = self._img_resize(frame) + # resized_frame = cv2.resize(src=frame, dsize=(self.video_preview_width, self.video_preview_height)) + # 显示视频帧 + self.window['-DISPLAY-'].update(data=cv2.imencode('.png', resized_frame)[1].tobytes()) + # 更新视频进度条滑块range + self.window['-SLIDER-'].update(range=(1, self.frame_count)) + self.window['-SLIDER-'].update(1) + # 预设字幕区域位置 + y_p, h_p, x_p, w_p = self.parse_subtitle_config() + y = self.frame_height * y_p + h = self.frame_height * h_p + x = self.frame_width * x_p + w = self.frame_width * w_p + # 更新视频字幕位置滑块range + # 更新Y-SLIDER范围 + self.window['-Y-SLIDER-'].update(range=(0, self.frame_height), disabled=False) + # 更新Y-SLIDER默认值 + self.window['-Y-SLIDER-'].update(y) + # 更新X-SLIDER范围 + self.window['-X-SLIDER-'].update(range=(0, self.frame_width), disabled=False) + # 更新X-SLIDER默认值 + self.window['-X-SLIDER-'].update(x) + # 更新Y-SLIDER-H范围 + self.window['-Y-SLIDER-H-'].update(range=(0, self.frame_height - y)) + # 更新Y-SLIDER-H默认值 + self.window['-Y-SLIDER-H-'].update(h) + # 更新X-SLIDER-W范围 + self.window['-X-SLIDER-W-'].update(range=(0, self.frame_width - x)) + # 更新X-SLIDER-W默认值 + self.window['-X-SLIDER-W-'].update(w) + self._update_preview(frame, (y, h, x, w)) + + def _run_event_handler(self, event, values): + """ + 当点击运行按钮时: + 1) 禁止修改字幕滑块区域 + 2) 禁止再次点击【运行】和【打开】按钮 + 3) 设定字幕区域位置 + """ + if event == '-RUN-': + if self.video_cap is None: + print('Please Open Video First') + else: + # 1) 禁止修改字幕滑块区域 + self.window['-Y-SLIDER-'].update(disabled=True) + self.window['-X-SLIDER-'].update(disabled=True) + 
self.window['-Y-SLIDER-H-'].update(disabled=True) + self.window['-X-SLIDER-W-'].update(disabled=True) + # 2) 禁止再次点击【运行】、【打开】和【识别语言】按钮 + self.window['-RUN-'].update(disabled=True) + self.window['-FILE-'].update(disabled=True) + self.window['-FILE_BTN-'].update(disabled=True) + # 3) 设定字幕区域位置 + self.xmin = int(values['-X-SLIDER-']) + self.xmax = int(values['-X-SLIDER-'] + values['-X-SLIDER-W-']) + self.ymin = int(values['-Y-SLIDER-']) + self.ymax = int(values['-Y-SLIDER-'] + values['-Y-SLIDER-H-']) + if self.ymax > self.frame_height: + self.ymax = self.frame_height + if self.xmax > self.frame_width: + self.xmax = self.frame_width + print(f"{'SubtitleArea'}:({self.ymin},{self.ymax},{self.xmin},{self.xmax})") + subtitle_area = (self.ymin, self.ymax, self.xmin, self.xmax) + y_p = self.ymin / self.frame_height + h_p = (self.ymax - self.ymin) / self.frame_height + x_p = self.xmin / self.frame_width + w_p = (self.xmax - self.xmin) / self.frame_width + self.set_subtitle_config(y_p, h_p, x_p, w_p) + + def task(): + while self.video_paths: + video_path = self.video_paths.pop() + self.sr = backend.main.SubtitleRemover(video_path, subtitle_area) + self.sr.run() + Thread(target=task, daemon=True).start() + self.video_cap.release() + self.video_cap = None + + def _slide_event_handler(self, event, values): + """ + 当滑动视频进度条/滑动字幕选择区域滑块时: + 1) 判断视频是否存在,如果存在则显示对应的视频帧 + 2) 绘制rectangle + """ + if event == '-SLIDER-' or event == '-Y-SLIDER-' or event == '-Y-SLIDER-H-' or event == '-X-SLIDER-' or event \ + == '-X-SLIDER-W-': + if self.video_cap is not None and self.video_cap.isOpened(): + frame_no = int(values['-SLIDER-']) + self.video_cap.set(cv2.CAP_PROP_POS_FRAMES, frame_no) + ret, frame = self.video_cap.read() + if ret: + self.window['-Y-SLIDER-H-'].update(range=(0, self.frame_height-values['-Y-SLIDER-'])) + self.window['-X-SLIDER-W-'].update(range=(0, self.frame_width-values['-X-SLIDER-'])) + # 画字幕框 + y = int(values['-Y-SLIDER-']) + h = int(values['-Y-SLIDER-H-']) + x = int(values['-X-SLIDER-']) + w = int(values['-X-SLIDER-W-']) + self._update_preview(frame, (y, h, x, w)) + + def _update_preview(self, frame, y_h_x_w): + y, h, x, w = y_h_x_w + # 画字幕框 + draw = cv2.rectangle(img=frame, pt1=(int(x), int(y)), pt2=(int(x) + int(w), int(y) + int(h)), + color=(0, 255, 0), thickness=3) + # 调整视频帧大小,使播放器能够显示 + resized_frame = self._img_resize(draw) + # 显示视频帧 + self.window['-DISPLAY-'].update(data=cv2.imencode('.png', resized_frame)[1].tobytes()) + + def _img_resize(self, image): + top, bottom, left, right = (0, 0, 0, 0) + height, width = image.shape[0], image.shape[1] + # 对长短不想等的图片,找到最长的一边 + longest_edge = height + # 计算短边需要增加多少像素宽度使其与长边等长 + if width < longest_edge: + dw = longest_edge - width + left = dw // 2 + right = dw - left + else: + pass + # 给图像增加边界 + constant = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=[0, 0, 0]) + return cv2.resize(constant, (self.video_preview_width, self.video_preview_height)) + + def set_subtitle_config(self, y, h, x, w): + # 写入配置文件 + with open(self.subtitle_config_file, mode='w', encoding='utf-8') as f: + f.write('[AREA]\n') + f.write(f'Y = {y}\n') + f.write(f'H = {h}\n') + f.write(f'X = {x}\n') + f.write(f'W = {w}\n') + + def parse_subtitle_config(self): + y_p, h_p, x_p, w_p = .78, .21, .05, .9 + # 如果配置文件不存在,则写入配置文件 + if not os.path.exists(self.subtitle_config_file): + self.set_subtitle_config(y_p, h_p, x_p, w_p) + return y_p, h_p, x_p, w_p + else: + try: + config = configparser.ConfigParser() + config.read(self.subtitle_config_file, encoding='utf-8') + 
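+                # The stored AREA values are fractions of the frame width/height; if any entry
+                # is missing or malformed, the except branch below rewrites the file with the
+                # defaults defined at the top of this method and returns them instead.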
conf_y_p, conf_h_p, conf_x_p, conf_w_p = float(config['AREA']['Y']), float(config['AREA']['H']), float(config['AREA']['X']), float(config['AREA']['W']) + return conf_y_p, conf_h_p, conf_x_p, conf_w_p + except Exception: + self.set_subtitle_config(y_p, h_p, x_p, w_p) + return y_p, h_p, x_p, w_p + + +if __name__ == '__main__': + try: + multiprocessing.set_start_method("spawn") + # 运行图形化界面 + subtitleRemoverGUI = SubtitleRemoverGUI() + subtitleRemoverGUI.run() + except Exception as e: + print(f'[{type(e)}] {e}') + import traceback + traceback.print_exc() + msg = traceback.format_exc() + err_log_path = os.path.join(os.path.expanduser('~'), 'VSR-Error-Message.log') + with open(err_log_path, 'w', encoding='utf-8') as f: + f.writelines(msg) + import platform + if platform.system() == 'Windows': + os.system('pause') + else: + input() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..56c024c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,21 @@ +albumentations==0.5.2 +filesplit==3.0.2 +opencv-python==4.8.1.78 +scikit-image==0.17.2 +imgaug==0.4.0 +kornia==0.5.0 +pyclipper==1.3.0.post5 +lmdb==1.4.1 +PyYAML==6.0.1 +omegaconf==2.1.2 +tqdm==4.66.1 +PySimpleGUI==4.55.1 +easydict==1.9 +scikit-learn==0.24.2 +pandas==2.0.3 +webdataset==0.2.57 +pytorch-lightning==1.2.9 +numpy==1.23.1 +protobuf==3.20.0 +av==11.0.0 +einops==0.7.0 \ No newline at end of file diff --git a/test/test1.mp4 b/test/test1.mp4 new file mode 100644 index 0000000..f6d4fd7 Binary files /dev/null and b/test/test1.mp4 differ diff --git a/test/test2.mp4 b/test/test2.mp4 new file mode 100644 index 0000000..f58061e Binary files /dev/null and b/test/test2.mp4 differ diff --git a/test/test3.mp4 b/test/test3.mp4 new file mode 100644 index 0000000..5fe56c1 Binary files /dev/null and b/test/test3.mp4 differ