Mirror of https://github.com/YaoFANGUK/video-subtitle-remover.git (synced 2026-02-21 00:44:46 +08:00)

Commit: init
backend/ppocr/modeling/backbones/__init__.py (Executable file, 64 lines)
@@ -0,0 +1,64 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

__all__ = ["build_backbone"]


def build_backbone(config, model_type):
    if model_type == "det" or model_type == "table":
        from .det_mobilenet_v3 import MobileNetV3
        from .det_resnet_vd import ResNet
        from .det_resnet_vd_sast import ResNet_SAST
        support_dict = ["MobileNetV3", "ResNet", "ResNet_SAST"]
    elif model_type == "rec" or model_type == "cls":
        from .rec_mobilenet_v3 import MobileNetV3
        from .rec_resnet_vd import ResNet
        from .rec_resnet_fpn import ResNetFPN
        from .rec_mv1_enhance import MobileNetV1Enhance
        from .rec_nrtr_mtb import MTB
        from .rec_resnet_31 import ResNet31
        from .rec_resnet_aster import ResNet_ASTER
        from .rec_micronet import MicroNet
        from .rec_efficientb3_pren import EfficientNetb3_PREN
        from .rec_svtrnet import SVTRNet
        support_dict = [
            'MobileNetV1Enhance', 'MobileNetV3', 'ResNet', 'ResNetFPN', 'MTB',
            "ResNet31", "ResNet_ASTER", 'MicroNet', 'EfficientNetb3_PREN',
            'SVTRNet'
        ]
    elif model_type == "e2e":
        from .e2e_resnet_vd_pg import ResNet
        support_dict = ['ResNet']
    elif model_type == 'kie':
        from .kie_unet_sdmgr import Kie_backbone
        support_dict = ['Kie_backbone']
    elif model_type == "table":
        # NOTE: unreachable -- "table" is already handled by the first branch.
        from .table_resnet_vd import ResNet
        from .table_mobilenet_v3 import MobileNetV3
        support_dict = ["ResNet", "MobileNetV3"]
    elif model_type == 'vqa':
        from .vqa_layoutlm import LayoutLMForSer, LayoutLMv2ForSer, LayoutLMv2ForRe, LayoutXLMForSer, LayoutXLMForRe
        support_dict = [
            "LayoutLMForSer", "LayoutLMv2ForSer", 'LayoutLMv2ForRe',
            "LayoutXLMForSer", 'LayoutXLMForRe'
        ]
    else:
        raise NotImplementedError

    module_name = config.pop("name")
    assert module_name in support_dict, \
        "when model type is {}, backbone only support {}".format(model_type,
                                                                 support_dict)
    module_class = eval(module_name)(**config)
    return module_class
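
For orientation, a minimal sketch of how this factory is typically driven (the config values here are illustrative, not part of this commit, and the import assumes the repository root is on PYTHONPATH):

    import paddle
    from backend.ppocr.modeling.backbones import build_backbone

    # "name" selects the backbone class; the remaining keys become
    # constructor kwargs after config.pop("name").
    config = {"name": "MobileNetV3", "model_name": "large", "scale": 0.5}
    backbone = build_backbone(config, model_type="det")

    x = paddle.rand([1, 3, 640, 640])   # NCHW image batch
    feats = backbone(x)                 # list of multi-scale feature maps
    print([f.shape for f in feats])
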
backend/ppocr/modeling/backbones/det_mobilenet_v3.py (Executable file, 268 lines)
@@ -0,0 +1,268 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle import ParamAttr

__all__ = ['MobileNetV3']


def make_divisible(v, divisor=8, min_value=None):
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v


class MobileNetV3(nn.Layer):
    def __init__(self,
                 in_channels=3,
                 model_name='large',
                 scale=0.5,
                 disable_se=False,
                 **kwargs):
        """
        The MobileNetV3 backbone network for the detection module.
        Args:
            params(dict): the hyperparameters used to build the network
        """
        super(MobileNetV3, self).__init__()

        self.disable_se = disable_se

        if model_name == "large":
            cfg = [
                # k, exp, c, se, nl, s,
                [3, 16, 16, False, 'relu', 1],
                [3, 64, 24, False, 'relu', 2],
                [3, 72, 24, False, 'relu', 1],
                [5, 72, 40, True, 'relu', 2],
                [5, 120, 40, True, 'relu', 1],
                [5, 120, 40, True, 'relu', 1],
                [3, 240, 80, False, 'hardswish', 2],
                [3, 200, 80, False, 'hardswish', 1],
                [3, 184, 80, False, 'hardswish', 1],
                [3, 184, 80, False, 'hardswish', 1],
                [3, 480, 112, True, 'hardswish', 1],
                [3, 672, 112, True, 'hardswish', 1],
                [5, 672, 160, True, 'hardswish', 2],
                [5, 960, 160, True, 'hardswish', 1],
                [5, 960, 160, True, 'hardswish', 1],
            ]
            cls_ch_squeeze = 960
        elif model_name == "small":
            cfg = [
                # k, exp, c, se, nl, s,
                [3, 16, 16, True, 'relu', 2],
                [3, 72, 24, False, 'relu', 2],
                [3, 88, 24, False, 'relu', 1],
                [5, 96, 40, True, 'hardswish', 2],
                [5, 240, 40, True, 'hardswish', 1],
                [5, 240, 40, True, 'hardswish', 1],
                [5, 120, 48, True, 'hardswish', 1],
                [5, 144, 48, True, 'hardswish', 1],
                [5, 288, 96, True, 'hardswish', 2],
                [5, 576, 96, True, 'hardswish', 1],
                [5, 576, 96, True, 'hardswish', 1],
            ]
            cls_ch_squeeze = 576
        else:
            raise NotImplementedError("mode[" + model_name +
                                      "_model] is not implemented!")

        supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25]
        assert scale in supported_scale, \
            "supported scale are {} but input scale is {}".format(supported_scale, scale)
        inplanes = 16
        # conv1
        self.conv = ConvBNLayer(
            in_channels=in_channels,
            out_channels=make_divisible(inplanes * scale),
            kernel_size=3,
            stride=2,
            padding=1,
            groups=1,
            if_act=True,
            act='hardswish')

        self.stages = []
        self.out_channels = []
        block_list = []
        i = 0
        inplanes = make_divisible(inplanes * scale)
        for (k, exp, c, se, nl, s) in cfg:
            se = se and not self.disable_se
            start_idx = 2 if model_name == 'large' else 0
            if s == 2 and i > start_idx:
                self.out_channels.append(inplanes)
                self.stages.append(nn.Sequential(*block_list))
                block_list = []
            block_list.append(
                ResidualUnit(
                    in_channels=inplanes,
                    mid_channels=make_divisible(scale * exp),
                    out_channels=make_divisible(scale * c),
                    kernel_size=k,
                    stride=s,
                    use_se=se,
                    act=nl))
            inplanes = make_divisible(scale * c)
            i += 1
        block_list.append(
            ConvBNLayer(
                in_channels=inplanes,
                out_channels=make_divisible(scale * cls_ch_squeeze),
                kernel_size=1,
                stride=1,
                padding=0,
                groups=1,
                if_act=True,
                act='hardswish'))
        self.stages.append(nn.Sequential(*block_list))
        self.out_channels.append(make_divisible(scale * cls_ch_squeeze))
        for i, stage in enumerate(self.stages):
            self.add_sublayer(sublayer=stage, name="stage{}".format(i))

    def forward(self, x):
        x = self.conv(x)
        out_list = []
        for stage in self.stages:
            x = stage(x)
            out_list.append(x)
        return out_list


class ConvBNLayer(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 groups=1,
                 if_act=True,
                 act=None):
        super(ConvBNLayer, self).__init__()
        self.if_act = if_act
        self.act = act
        self.conv = nn.Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            bias_attr=False)

        self.bn = nn.BatchNorm(num_channels=out_channels, act=None)

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        if self.if_act:
            if self.act == "relu":
                x = F.relu(x)
            elif self.act == "hardswish":
                x = F.hardswish(x)
            else:
                print("The activation function({}) is not supported.".
                      format(self.act))
                exit()
        return x


class ResidualUnit(nn.Layer):
    def __init__(self,
                 in_channels,
                 mid_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 use_se,
                 act=None):
        super(ResidualUnit, self).__init__()
        self.if_shortcut = stride == 1 and in_channels == out_channels
        self.if_se = use_se

        self.expand_conv = ConvBNLayer(
            in_channels=in_channels,
            out_channels=mid_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            if_act=True,
            act=act)
        self.bottleneck_conv = ConvBNLayer(
            in_channels=mid_channels,
            out_channels=mid_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=int((kernel_size - 1) // 2),
            groups=mid_channels,
            if_act=True,
            act=act)
        if self.if_se:
            self.mid_se = SEModule(mid_channels)
        self.linear_conv = ConvBNLayer(
            in_channels=mid_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            if_act=False,
            act=None)

    def forward(self, inputs):
        x = self.expand_conv(inputs)
        x = self.bottleneck_conv(x)
        if self.if_se:
            x = self.mid_se(x)
        x = self.linear_conv(x)
        if self.if_shortcut:
            x = paddle.add(inputs, x)
        return x


class SEModule(nn.Layer):
    def __init__(self, in_channels, reduction=4):
        super(SEModule, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2D(1)
        self.conv1 = nn.Conv2D(
            in_channels=in_channels,
            out_channels=in_channels // reduction,
            kernel_size=1,
            stride=1,
            padding=0)
        self.conv2 = nn.Conv2D(
            in_channels=in_channels // reduction,
            out_channels=in_channels,
            kernel_size=1,
            stride=1,
            padding=0)

    def forward(self, inputs):
        outputs = self.avg_pool(inputs)
        outputs = self.conv1(outputs)
        outputs = F.relu(outputs)
        outputs = self.conv2(outputs)
        outputs = F.hardsigmoid(outputs, slope=0.2, offset=0.5)
        return inputs * outputs
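
make_divisible above snaps a scaled channel count to a multiple of the divisor while never dropping more than roughly 10% below the requested width. Two illustrative values, computed by hand from the function (not taken from this commit):

    from backend.ppocr.modeling.backbones.det_mobilenet_v3 import make_divisible

    print(make_divisible(960 * 0.5))   # 480: already a multiple of 8
    print(make_divisible(16 * 0.35))   # 8: 5.6 rounds up, floored at the divisor
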
backend/ppocr/modeling/backbones/det_resnet_vd.py (Normal file, 351 lines)
@@ -0,0 +1,351 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
from paddle import ParamAttr
import paddle.nn as nn
import paddle.nn.functional as F

from paddle.vision.ops import DeformConv2D
from paddle.regularizer import L2Decay
from paddle.nn.initializer import Normal, Constant, XavierUniform

__all__ = ["ResNet"]


class DeformableConvV2(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 weight_attr=None,
                 bias_attr=None,
                 lr_scale=1,
                 regularizer=None,
                 skip_quant=False,
                 dcn_bias_regularizer=L2Decay(0.),
                 dcn_bias_lr_scale=2.):
        super(DeformableConvV2, self).__init__()
        self.offset_channel = 2 * kernel_size**2 * groups
        self.mask_channel = kernel_size**2 * groups

        if bias_attr:
            # in FCOS-DCN head, specifically need learning_rate and regularizer
            dcn_bias_attr = ParamAttr(
                initializer=Constant(value=0),
                regularizer=dcn_bias_regularizer,
                learning_rate=dcn_bias_lr_scale)
        else:
            # in ResNet backbone, do not need bias
            dcn_bias_attr = False
        self.conv_dcn = DeformConv2D(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=(kernel_size - 1) // 2 * dilation,
            dilation=dilation,
            deformable_groups=groups,
            weight_attr=weight_attr,
            bias_attr=dcn_bias_attr)

        if lr_scale == 1 and regularizer is None:
            offset_bias_attr = ParamAttr(initializer=Constant(0.))
        else:
            offset_bias_attr = ParamAttr(
                initializer=Constant(0.),
                learning_rate=lr_scale,
                regularizer=regularizer)
        self.conv_offset = nn.Conv2D(
            in_channels,
            groups * 3 * kernel_size**2,
            kernel_size,
            stride=stride,
            padding=(kernel_size - 1) // 2,
            weight_attr=ParamAttr(initializer=Constant(0.0)),
            bias_attr=offset_bias_attr)
        if skip_quant:
            self.conv_offset.skip_quant = True

    def forward(self, x):
        offset_mask = self.conv_offset(x)
        offset, mask = paddle.split(
            offset_mask,
            num_or_sections=[self.offset_channel, self.mask_channel],
            axis=1)
        mask = F.sigmoid(mask)
        y = self.conv_dcn(x, offset, mask=mask)
        return y


class ConvBNLayer(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 groups=1,
                 is_vd_mode=False,
                 act=None,
                 is_dcn=False):
        super(ConvBNLayer, self).__init__()

        self.is_vd_mode = is_vd_mode
        self._pool2d_avg = nn.AvgPool2D(
            kernel_size=2, stride=2, padding=0, ceil_mode=True)
        if not is_dcn:
            self._conv = nn.Conv2D(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=kernel_size,
                stride=stride,
                padding=(kernel_size - 1) // 2,
                groups=groups,
                bias_attr=False)
        else:
            self._conv = DeformableConvV2(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=kernel_size,
                stride=stride,
                padding=(kernel_size - 1) // 2,
                groups=2,  # groups,
                bias_attr=False)
        self._batch_norm = nn.BatchNorm(out_channels, act=act)

    def forward(self, inputs):
        if self.is_vd_mode:
            inputs = self._pool2d_avg(inputs)
        y = self._conv(inputs)
        y = self._batch_norm(y)
        return y


class BottleneckBlock(nn.Layer):
    def __init__(
            self,
            in_channels,
            out_channels,
            stride,
            shortcut=True,
            if_first=False,
            is_dcn=False, ):
        super(BottleneckBlock, self).__init__()

        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            act='relu')
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            act='relu',
            is_dcn=is_dcn)
        self.conv2 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels * 4,
            kernel_size=1,
            act=None)

        if not shortcut:
            self.short = ConvBNLayer(
                in_channels=in_channels,
                out_channels=out_channels * 4,
                kernel_size=1,
                stride=1,
                is_vd_mode=False if if_first else True)

        self.shortcut = shortcut

    def forward(self, inputs):
        y = self.conv0(inputs)
        conv1 = self.conv1(y)
        conv2 = self.conv2(conv1)

        if self.shortcut:
            short = inputs
        else:
            short = self.short(inputs)
        y = paddle.add(x=short, y=conv2)
        y = F.relu(y)
        return y


class BasicBlock(nn.Layer):
    def __init__(
            self,
            in_channels,
            out_channels,
            stride,
            shortcut=True,
            if_first=False, ):
        super(BasicBlock, self).__init__()
        self.stride = stride
        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            act='relu')
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            act=None)

        if not shortcut:
            self.short = ConvBNLayer(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
                stride=1,
                is_vd_mode=False if if_first else True)

        self.shortcut = shortcut

    def forward(self, inputs):
        y = self.conv0(inputs)
        conv1 = self.conv1(y)

        if self.shortcut:
            short = inputs
        else:
            short = self.short(inputs)
        y = paddle.add(x=short, y=conv1)
        y = F.relu(y)
        return y


class ResNet(nn.Layer):
    def __init__(self,
                 in_channels=3,
                 layers=50,
                 dcn_stage=None,
                 out_indices=None,
                 **kwargs):
        super(ResNet, self).__init__()

        self.layers = layers
        supported_layers = [18, 34, 50, 101, 152, 200]
        assert layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(
                supported_layers, layers)

        if layers == 18:
            depth = [2, 2, 2, 2]
        elif layers == 34 or layers == 50:
            depth = [3, 4, 6, 3]
        elif layers == 101:
            depth = [3, 4, 23, 3]
        elif layers == 152:
            depth = [3, 8, 36, 3]
        elif layers == 200:
            depth = [3, 12, 48, 3]
        num_channels = [64, 256, 512,
                        1024] if layers >= 50 else [64, 64, 128, 256]
        num_filters = [64, 128, 256, 512]

        self.dcn_stage = dcn_stage if dcn_stage is not None else [
            False, False, False, False
        ]
        self.out_indices = out_indices if out_indices is not None else [
            0, 1, 2, 3
        ]

        self.conv1_1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=32,
            kernel_size=3,
            stride=2,
            act='relu')
        self.conv1_2 = ConvBNLayer(
            in_channels=32,
            out_channels=32,
            kernel_size=3,
            stride=1,
            act='relu')
        self.conv1_3 = ConvBNLayer(
            in_channels=32,
            out_channels=64,
            kernel_size=3,
            stride=1,
            act='relu')
        self.pool2d_max = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)

        self.stages = []
        self.out_channels = []
        if layers >= 50:
            for block in range(len(depth)):
                block_list = []
                shortcut = False
                is_dcn = self.dcn_stage[block]
                for i in range(depth[block]):
                    bottleneck_block = self.add_sublayer(
                        'bb_%d_%d' % (block, i),
                        BottleneckBlock(
                            in_channels=num_channels[block]
                            if i == 0 else num_filters[block] * 4,
                            out_channels=num_filters[block],
                            stride=2 if i == 0 and block != 0 else 1,
                            shortcut=shortcut,
                            if_first=block == i == 0,
                            is_dcn=is_dcn))
                    shortcut = True
                    block_list.append(bottleneck_block)
                if block in self.out_indices:
                    self.out_channels.append(num_filters[block] * 4)
                self.stages.append(nn.Sequential(*block_list))
        else:
            for block in range(len(depth)):
                block_list = []
                shortcut = False
                # is_dcn = self.dcn_stage[block]
                for i in range(depth[block]):
                    basic_block = self.add_sublayer(
                        'bb_%d_%d' % (block, i),
                        BasicBlock(
                            in_channels=num_channels[block]
                            if i == 0 else num_filters[block],
                            out_channels=num_filters[block],
                            stride=2 if i == 0 and block != 0 else 1,
                            shortcut=shortcut,
                            if_first=block == i == 0))
                    shortcut = True
                    block_list.append(basic_block)
                if block in self.out_indices:
                    self.out_channels.append(num_filters[block])
                self.stages.append(nn.Sequential(*block_list))

    def forward(self, inputs):
        y = self.conv1_1(inputs)
        y = self.conv1_2(y)
        y = self.conv1_3(y)
        y = self.pool2d_max(y)
        out = []
        for i, block in enumerate(self.stages):
            y = block(y)
            if i in self.out_indices:
                out.append(y)
        return out
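
A hypothetical shape check for this detection ResNet: out_indices selects which stage outputs are returned, and out_channels is filled to match.

    import paddle
    from backend.ppocr.modeling.backbones.det_resnet_vd import ResNet

    model = ResNet(in_channels=3, layers=18, out_indices=[2, 3])
    print(model.out_channels)            # [256, 512] for the BasicBlock variant
    feats = model(paddle.rand([1, 3, 224, 224]))
    print([f.shape for f in feats])      # two maps, at strides 16 and 32
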
backend/ppocr/modeling/backbones/det_resnet_vd_sast.py (Normal file, 285 lines)
@@ -0,0 +1,285 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
from paddle import ParamAttr
import paddle.nn as nn
import paddle.nn.functional as F

__all__ = ["ResNet_SAST"]


class ConvBNLayer(nn.Layer):
    def __init__(
            self,
            in_channels,
            out_channels,
            kernel_size,
            stride=1,
            groups=1,
            is_vd_mode=False,
            act=None,
            name=None, ):
        super(ConvBNLayer, self).__init__()

        self.is_vd_mode = is_vd_mode
        self._pool2d_avg = nn.AvgPool2D(
            kernel_size=2, stride=2, padding=0, ceil_mode=True)
        self._conv = nn.Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=(kernel_size - 1) // 2,
            groups=groups,
            weight_attr=ParamAttr(name=name + "_weights"),
            bias_attr=False)
        if name == "conv1":
            bn_name = "bn_" + name
        else:
            bn_name = "bn" + name[3:]
        self._batch_norm = nn.BatchNorm(
            out_channels,
            act=act,
            param_attr=ParamAttr(name=bn_name + '_scale'),
            bias_attr=ParamAttr(bn_name + '_offset'),
            moving_mean_name=bn_name + '_mean',
            moving_variance_name=bn_name + '_variance')

    def forward(self, inputs):
        if self.is_vd_mode:
            inputs = self._pool2d_avg(inputs)
        y = self._conv(inputs)
        y = self._batch_norm(y)
        return y


class BottleneckBlock(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 shortcut=True,
                 if_first=False,
                 name=None):
        super(BottleneckBlock, self).__init__()

        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            act='relu',
            name=name + "_branch2a")
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            act='relu',
            name=name + "_branch2b")
        self.conv2 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels * 4,
            kernel_size=1,
            act=None,
            name=name + "_branch2c")

        if not shortcut:
            self.short = ConvBNLayer(
                in_channels=in_channels,
                out_channels=out_channels * 4,
                kernel_size=1,
                stride=1,
                is_vd_mode=False if if_first else True,
                name=name + "_branch1")

        self.shortcut = shortcut

    def forward(self, inputs):
        y = self.conv0(inputs)
        conv1 = self.conv1(y)
        conv2 = self.conv2(conv1)

        if self.shortcut:
            short = inputs
        else:
            short = self.short(inputs)
        y = paddle.add(x=short, y=conv2)
        y = F.relu(y)
        return y


class BasicBlock(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 shortcut=True,
                 if_first=False,
                 name=None):
        super(BasicBlock, self).__init__()
        self.stride = stride
        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            act='relu',
            name=name + "_branch2a")
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            act=None,
            name=name + "_branch2b")

        if not shortcut:
            self.short = ConvBNLayer(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
                stride=1,
                is_vd_mode=False if if_first else True,
                name=name + "_branch1")

        self.shortcut = shortcut

    def forward(self, inputs):
        y = self.conv0(inputs)
        conv1 = self.conv1(y)

        if self.shortcut:
            short = inputs
        else:
            short = self.short(inputs)
        y = paddle.add(x=short, y=conv1)
        y = F.relu(y)
        return y


class ResNet_SAST(nn.Layer):
    def __init__(self, in_channels=3, layers=50, **kwargs):
        super(ResNet_SAST, self).__init__()

        self.layers = layers
        supported_layers = [18, 34, 50, 101, 152, 200]
        assert layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(
                supported_layers, layers)

        if layers == 18:
            depth = [2, 2, 2, 2]
        elif layers == 34 or layers == 50:
            # depth = [3, 4, 6, 3]
            depth = [3, 4, 6, 3, 3]
        elif layers == 101:
            depth = [3, 4, 23, 3]
        elif layers == 152:
            depth = [3, 8, 36, 3]
        elif layers == 200:
            depth = [3, 12, 48, 3]
        # num_channels = [64, 256, 512,
        #                 1024] if layers >= 50 else [64, 64, 128, 256]
        # num_filters = [64, 128, 256, 512]
        num_channels = [64, 256, 512,
                        1024, 2048] if layers >= 50 else [64, 64, 128, 256]
        num_filters = [64, 128, 256, 512, 512]

        self.conv1_1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=32,
            kernel_size=3,
            stride=2,
            act='relu',
            name="conv1_1")
        self.conv1_2 = ConvBNLayer(
            in_channels=32,
            out_channels=32,
            kernel_size=3,
            stride=1,
            act='relu',
            name="conv1_2")
        self.conv1_3 = ConvBNLayer(
            in_channels=32,
            out_channels=64,
            kernel_size=3,
            stride=1,
            act='relu',
            name="conv1_3")
        self.pool2d_max = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)

        self.stages = []
        self.out_channels = [3, 64]
        if layers >= 50:
            for block in range(len(depth)):
                block_list = []
                shortcut = False
                for i in range(depth[block]):
                    if layers in [101, 152] and block == 2:
                        if i == 0:
                            conv_name = "res" + str(block + 2) + "a"
                        else:
                            conv_name = "res" + str(block + 2) + "b" + str(i)
                    else:
                        conv_name = "res" + str(block + 2) + chr(97 + i)
                    bottleneck_block = self.add_sublayer(
                        'bb_%d_%d' % (block, i),
                        BottleneckBlock(
                            in_channels=num_channels[block]
                            if i == 0 else num_filters[block] * 4,
                            out_channels=num_filters[block],
                            stride=2 if i == 0 and block != 0 else 1,
                            shortcut=shortcut,
                            if_first=block == i == 0,
                            name=conv_name))
                    shortcut = True
                    block_list.append(bottleneck_block)
                self.out_channels.append(num_filters[block] * 4)
                self.stages.append(nn.Sequential(*block_list))
        else:
            for block in range(len(depth)):
                block_list = []
                shortcut = False
                for i in range(depth[block]):
                    conv_name = "res" + str(block + 2) + chr(97 + i)
                    basic_block = self.add_sublayer(
                        'bb_%d_%d' % (block, i),
                        BasicBlock(
                            in_channels=num_channels[block]
                            if i == 0 else num_filters[block],
                            out_channels=num_filters[block],
                            stride=2 if i == 0 and block != 0 else 1,
                            shortcut=shortcut,
                            if_first=block == i == 0,
                            name=conv_name))
                    shortcut = True
                    block_list.append(basic_block)
                self.out_channels.append(num_filters[block])
                self.stages.append(nn.Sequential(*block_list))

    def forward(self, inputs):
        out = [inputs]
        y = self.conv1_1(inputs)
        y = self.conv1_2(y)
        y = self.conv1_3(y)
        out.append(y)
        y = self.pool2d_max(y)
        for block in self.stages:
            y = block(y)
            out.append(y)
        return out
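
Unlike the plain detection ResNet, the SAST variant seeds its output list with the raw input and the stem output and adds a fifth stage, so a layers=50 model exposes seven maps. A hypothetical check:

    import paddle
    from backend.ppocr.modeling.backbones.det_resnet_vd_sast import ResNet_SAST

    model = ResNet_SAST(in_channels=3, layers=50)
    print(model.out_channels)   # [3, 64, 256, 512, 1024, 2048, 2048]
    feats = model(paddle.rand([1, 3, 512, 512]))
    print(len(feats))           # 7 feature maps, finest to coarsest
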
backend/ppocr/modeling/backbones/e2e_resnet_vd_pg.py (Normal file, 265 lines)
@@ -0,0 +1,265 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
from paddle import ParamAttr
import paddle.nn as nn
import paddle.nn.functional as F

__all__ = ["ResNet"]


class ConvBNLayer(nn.Layer):
    def __init__(
            self,
            in_channels,
            out_channels,
            kernel_size,
            stride=1,
            groups=1,
            is_vd_mode=False,
            act=None,
            name=None, ):
        super(ConvBNLayer, self).__init__()

        self.is_vd_mode = is_vd_mode
        self._pool2d_avg = nn.AvgPool2D(
            kernel_size=2, stride=2, padding=0, ceil_mode=True)
        self._conv = nn.Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=(kernel_size - 1) // 2,
            groups=groups,
            weight_attr=ParamAttr(name=name + "_weights"),
            bias_attr=False)
        if name == "conv1":
            bn_name = "bn_" + name
        else:
            bn_name = "bn" + name[3:]
        self._batch_norm = nn.BatchNorm(
            out_channels,
            act=act,
            param_attr=ParamAttr(name=bn_name + '_scale'),
            bias_attr=ParamAttr(bn_name + '_offset'),
            moving_mean_name=bn_name + '_mean',
            moving_variance_name=bn_name + '_variance')

    def forward(self, inputs):
        y = self._conv(inputs)
        y = self._batch_norm(y)
        return y


class BottleneckBlock(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 shortcut=True,
                 if_first=False,
                 name=None):
        super(BottleneckBlock, self).__init__()

        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            act='relu',
            name=name + "_branch2a")
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            act='relu',
            name=name + "_branch2b")
        self.conv2 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels * 4,
            kernel_size=1,
            act=None,
            name=name + "_branch2c")

        if not shortcut:
            self.short = ConvBNLayer(
                in_channels=in_channels,
                out_channels=out_channels * 4,
                kernel_size=1,
                stride=stride,
                is_vd_mode=False if if_first else True,
                name=name + "_branch1")

        self.shortcut = shortcut

    def forward(self, inputs):
        y = self.conv0(inputs)
        conv1 = self.conv1(y)
        conv2 = self.conv2(conv1)

        if self.shortcut:
            short = inputs
        else:
            short = self.short(inputs)
        y = paddle.add(x=short, y=conv2)
        y = F.relu(y)
        return y


class BasicBlock(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 shortcut=True,
                 if_first=False,
                 name=None):
        super(BasicBlock, self).__init__()
        self.stride = stride
        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            act='relu',
            name=name + "_branch2a")
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            act=None,
            name=name + "_branch2b")

        if not shortcut:
            self.short = ConvBNLayer(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
                stride=1,
                is_vd_mode=False if if_first else True,
                name=name + "_branch1")

        self.shortcut = shortcut

    def forward(self, inputs):
        y = self.conv0(inputs)
        conv1 = self.conv1(y)

        if self.shortcut:
            short = inputs
        else:
            short = self.short(inputs)
        y = paddle.add(x=short, y=conv1)
        y = F.relu(y)
        return y


class ResNet(nn.Layer):
    def __init__(self, in_channels=3, layers=50, **kwargs):
        super(ResNet, self).__init__()

        self.layers = layers
        supported_layers = [18, 34, 50, 101, 152, 200]
        assert layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(
                supported_layers, layers)

        if layers == 18:
            depth = [2, 2, 2, 2]
        elif layers == 34 or layers == 50:
            # depth = [3, 4, 6, 3]
            depth = [3, 4, 6, 3, 3]
        elif layers == 101:
            depth = [3, 4, 23, 3]
        elif layers == 152:
            depth = [3, 8, 36, 3]
        elif layers == 200:
            depth = [3, 12, 48, 3]
        num_channels = [64, 256, 512, 1024,
                        2048] if layers >= 50 else [64, 64, 128, 256]
        num_filters = [64, 128, 256, 512, 512]

        self.conv1_1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=64,
            kernel_size=7,
            stride=2,
            act='relu',
            name="conv1_1")
        self.pool2d_max = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)

        self.stages = []
        self.out_channels = [3, 64]
        # num_filters = [64, 128, 256, 512, 512]
        if layers >= 50:
            for block in range(len(depth)):
                block_list = []
                shortcut = False
                for i in range(depth[block]):
                    if layers in [101, 152] and block == 2:
                        if i == 0:
                            conv_name = "res" + str(block + 2) + "a"
                        else:
                            conv_name = "res" + str(block + 2) + "b" + str(i)
                    else:
                        conv_name = "res" + str(block + 2) + chr(97 + i)
                    bottleneck_block = self.add_sublayer(
                        'bb_%d_%d' % (block, i),
                        BottleneckBlock(
                            in_channels=num_channels[block]
                            if i == 0 else num_filters[block] * 4,
                            out_channels=num_filters[block],
                            stride=2 if i == 0 and block != 0 else 1,
                            shortcut=shortcut,
                            if_first=block == i == 0,
                            name=conv_name))
                    shortcut = True
                    block_list.append(bottleneck_block)
                self.out_channels.append(num_filters[block] * 4)
                self.stages.append(nn.Sequential(*block_list))
        else:
            for block in range(len(depth)):
                block_list = []
                shortcut = False
                for i in range(depth[block]):
                    conv_name = "res" + str(block + 2) + chr(97 + i)
                    basic_block = self.add_sublayer(
                        'bb_%d_%d' % (block, i),
                        BasicBlock(
                            in_channels=num_channels[block]
                            if i == 0 else num_filters[block],
                            out_channels=num_filters[block],
                            stride=2 if i == 0 and block != 0 else 1,
                            shortcut=shortcut,
                            if_first=block == i == 0,
                            name=conv_name))
                    shortcut = True
                    block_list.append(basic_block)
                self.out_channels.append(num_filters[block])
                self.stages.append(nn.Sequential(*block_list))

    def forward(self, inputs):
        out = [inputs]
        y = self.conv1_1(inputs)
        out.append(y)
        y = self.pool2d_max(y)
        for block in self.stages:
            y = block(y)
            out.append(y)
        return out
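
The PGNet (e2e) trunk follows the same pattern as the SAST variant but swaps the three 3x3 stem convolutions for a single 7x7/stride-2 convolution; it likewise returns the raw input and the stem output. A hypothetical check:

    import paddle
    from backend.ppocr.modeling.backbones.e2e_resnet_vd_pg import ResNet

    model = ResNet(in_channels=3, layers=50)
    print(model.out_channels)             # [3, 64, 256, 512, 1024, 2048, 2048]
    feats = model(paddle.rand([1, 3, 512, 512]))
    print([f.shape[1] for f in feats])    # channel counts match out_channels
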
backend/ppocr/modeling/backbones/kie_unet_sdmgr.py (Normal file, 186 lines)
@@ -0,0 +1,186 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
from paddle import nn
import numpy as np
import cv2

__all__ = ["Kie_backbone"]


class Encoder(nn.Layer):
    def __init__(self, num_channels, num_filters):
        super(Encoder, self).__init__()
        self.conv1 = nn.Conv2D(
            num_channels,
            num_filters,
            kernel_size=3,
            stride=1,
            padding=1,
            bias_attr=False)
        self.bn1 = nn.BatchNorm(num_filters, act='relu')

        self.conv2 = nn.Conv2D(
            num_filters,
            num_filters,
            kernel_size=3,
            stride=1,
            padding=1,
            bias_attr=False)
        self.bn2 = nn.BatchNorm(num_filters, act='relu')

        self.pool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)

    def forward(self, inputs):
        x = self.conv1(inputs)
        x = self.bn1(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x_pooled = self.pool(x)
        return x, x_pooled


class Decoder(nn.Layer):
    def __init__(self, num_channels, num_filters):
        super(Decoder, self).__init__()

        self.conv1 = nn.Conv2D(
            num_channels,
            num_filters,
            kernel_size=3,
            stride=1,
            padding=1,
            bias_attr=False)
        self.bn1 = nn.BatchNorm(num_filters, act='relu')

        self.conv2 = nn.Conv2D(
            num_filters,
            num_filters,
            kernel_size=3,
            stride=1,
            padding=1,
            bias_attr=False)
        self.bn2 = nn.BatchNorm(num_filters, act='relu')

        self.conv0 = nn.Conv2D(
            num_channels,
            num_filters,
            kernel_size=1,
            stride=1,
            padding=0,
            bias_attr=False)
        self.bn0 = nn.BatchNorm(num_filters, act='relu')

    def forward(self, inputs_prev, inputs):
        x = self.conv0(inputs)
        x = self.bn0(x)
        x = paddle.nn.functional.interpolate(
            x, scale_factor=2, mode='bilinear', align_corners=False)
        x = paddle.concat([inputs_prev, x], axis=1)
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.conv2(x)
        x = self.bn2(x)
        return x


class UNet(nn.Layer):
    def __init__(self):
        super(UNet, self).__init__()
        self.down1 = Encoder(num_channels=3, num_filters=16)
        self.down2 = Encoder(num_channels=16, num_filters=32)
        self.down3 = Encoder(num_channels=32, num_filters=64)
        self.down4 = Encoder(num_channels=64, num_filters=128)
        self.down5 = Encoder(num_channels=128, num_filters=256)

        self.up1 = Decoder(32, 16)
        self.up2 = Decoder(64, 32)
        self.up3 = Decoder(128, 64)
        self.up4 = Decoder(256, 128)
        self.out_channels = 16

    def forward(self, inputs):
        x1, _ = self.down1(inputs)
        _, x2 = self.down2(x1)
        _, x3 = self.down3(x2)
        _, x4 = self.down4(x3)
        _, x5 = self.down5(x4)

        x = self.up4(x4, x5)
        x = self.up3(x3, x)
        x = self.up2(x2, x)
        x = self.up1(x1, x)
        return x


class Kie_backbone(nn.Layer):
    def __init__(self, in_channels, **kwargs):
        super(Kie_backbone, self).__init__()
        self.out_channels = 16
        self.img_feat = UNet()
        self.maxpool = nn.MaxPool2D(kernel_size=7)

    def bbox2roi(self, bbox_list):
        rois_list = []
        rois_num = []
        for img_id, bboxes in enumerate(bbox_list):
            rois_num.append(bboxes.shape[0])
            rois_list.append(bboxes)
        rois = paddle.concat(rois_list, 0)
        rois_num = paddle.to_tensor(rois_num, dtype='int32')
        return rois, rois_num

    def pre_process(self, img, relations, texts, gt_bboxes, tag, img_size):
        img, relations, texts, gt_bboxes, tag, img_size = (
            img.numpy(), relations.numpy(), texts.numpy(), gt_bboxes.numpy(),
            tag.numpy().tolist(), img_size.numpy())
        temp_relations, temp_texts, temp_gt_bboxes = [], [], []
        h, w = int(np.max(img_size[:, 0])), int(np.max(img_size[:, 1]))
        img = paddle.to_tensor(img[:, :, :h, :w])
        batch = len(tag)
        for i in range(batch):
            num, recoder_len = tag[i][0], tag[i][1]
            temp_relations.append(
                paddle.to_tensor(
                    relations[i, :num, :num, :], dtype='float32'))
            temp_texts.append(
                paddle.to_tensor(
                    texts[i, :num, :recoder_len], dtype='float32'))
            temp_gt_bboxes.append(
                paddle.to_tensor(
                    gt_bboxes[i, :num, ...], dtype='float32'))
        return img, temp_relations, temp_texts, temp_gt_bboxes

    def forward(self, inputs):
        img = inputs[0]
        relations, texts, gt_bboxes, tag, img_size = (
            inputs[1], inputs[2], inputs[3], inputs[5], inputs[-1])
        img, relations, texts, gt_bboxes = self.pre_process(
            img, relations, texts, gt_bboxes, tag, img_size)
        x = self.img_feat(img)
        boxes, rois_num = self.bbox2roi(gt_bboxes)
        feats = paddle.fluid.layers.roi_align(
            x,
            boxes,
            spatial_scale=1.0,
            pooled_height=7,
            pooled_width=7,
            rois_num=rois_num)
        feats = self.maxpool(feats).squeeze(-1).squeeze(-1)
        return [relations, texts, feats]
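
The UNet trunk here produces a full-resolution, 16-channel feature map from which RoI-align then pools one 16-dim visual feature per ground-truth box. A minimal sanity sketch of the trunk on its own (assuming an input size divisible by 16 so the skip connections line up):

    import paddle
    from backend.ppocr.modeling.backbones.kie_unet_sdmgr import UNet

    unet = UNet()
    y = unet(paddle.rand([1, 3, 256, 256]))
    print(y.shape)   # [1, 16, 256, 256]
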
backend/ppocr/modeling/backbones/rec_efficientb3_pren.py (Normal file, 228 lines)
@@ -0,0 +1,228 @@
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Code is adapted from:
https://github.com/RuijieJ/pren/blob/main/Nets/EfficientNet.py
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math
from collections import namedtuple
import paddle
import paddle.nn as nn
import paddle.nn.functional as F

__all__ = ['EfficientNetb3_PREN']


class EffB3Params:
    @staticmethod
    def get_global_params():
        """
        The following are EfficientNet-B3's architecture hyperparameters;
        to fit the scene text recognition task, the resolution (image_size)
        here is changed from 300 to 64.
        """
        GlobalParams = namedtuple('GlobalParams', [
            'drop_connect_rate', 'width_coefficient', 'depth_coefficient',
            'depth_divisor', 'image_size'
        ])
        global_params = GlobalParams(
            drop_connect_rate=0.3,
            width_coefficient=1.2,
            depth_coefficient=1.4,
            depth_divisor=8,
            image_size=64)
        return global_params

    @staticmethod
    def get_block_params():
        BlockParams = namedtuple('BlockParams', [
            'kernel_size', 'num_repeat', 'input_filters', 'output_filters',
            'expand_ratio', 'id_skip', 'se_ratio', 'stride'
        ])
        block_params = [
            BlockParams(3, 1, 32, 16, 1, True, 0.25, 1),
            BlockParams(3, 2, 16, 24, 6, True, 0.25, 2),
            BlockParams(5, 2, 24, 40, 6, True, 0.25, 2),
            BlockParams(3, 3, 40, 80, 6, True, 0.25, 2),
            BlockParams(5, 3, 80, 112, 6, True, 0.25, 1),
            BlockParams(5, 4, 112, 192, 6, True, 0.25, 2),
            BlockParams(3, 1, 192, 320, 6, True, 0.25, 1)
        ]
        return block_params


class EffUtils:
    @staticmethod
    def round_filters(filters, global_params):
        """Calculate and round number of filters based on width multiplier."""
        multiplier = global_params.width_coefficient
        if not multiplier:
            return filters
        divisor = global_params.depth_divisor
        filters *= multiplier
        new_filters = int(filters + divisor / 2) // divisor * divisor
        if new_filters < 0.9 * filters:
            new_filters += divisor
        return int(new_filters)

    @staticmethod
    def round_repeats(repeats, global_params):
        """Round number of block repeats based on depth multiplier."""
        multiplier = global_params.depth_coefficient
        if not multiplier:
            return repeats
        return int(math.ceil(multiplier * repeats))


class ConvBlock(nn.Layer):
    def __init__(self, block_params):
        super(ConvBlock, self).__init__()
        self.block_args = block_params
        self.has_se = (self.block_args.se_ratio is not None) and \
            (0 < self.block_args.se_ratio <= 1)
        self.id_skip = block_params.id_skip

        # expansion phase
        self.input_filters = self.block_args.input_filters
        output_filters = \
            self.block_args.input_filters * self.block_args.expand_ratio
        if self.block_args.expand_ratio != 1:
            self.expand_conv = nn.Conv2D(
                self.input_filters, output_filters, 1, bias_attr=False)
            self.bn0 = nn.BatchNorm(output_filters)

        # depthwise conv phase
        k = self.block_args.kernel_size
        s = self.block_args.stride
        self.depthwise_conv = nn.Conv2D(
            output_filters,
            output_filters,
            groups=output_filters,
            kernel_size=k,
            stride=s,
            padding='same',
            bias_attr=False)
        self.bn1 = nn.BatchNorm(output_filters)

        # squeeze and excitation layer, if desired
        if self.has_se:
            num_squeezed_channels = max(1,
                                        int(self.block_args.input_filters *
                                            self.block_args.se_ratio))
            self.se_reduce = nn.Conv2D(output_filters, num_squeezed_channels, 1)
            self.se_expand = nn.Conv2D(num_squeezed_channels, output_filters, 1)

        # output phase
        self.final_oup = self.block_args.output_filters
        self.project_conv = nn.Conv2D(
            output_filters, self.final_oup, 1, bias_attr=False)
        self.bn2 = nn.BatchNorm(self.final_oup)
        self.swish = nn.Swish()

    def drop_connect(self, inputs, p, training):
        if not training:
            return inputs

        batch_size = inputs.shape[0]
        keep_prob = 1 - p
        random_tensor = keep_prob
        random_tensor += paddle.rand([batch_size, 1, 1, 1], dtype=inputs.dtype)
        random_tensor = paddle.to_tensor(random_tensor, place=inputs.place)
        binary_tensor = paddle.floor(random_tensor)
        output = inputs / keep_prob * binary_tensor
        return output

    def forward(self, inputs, drop_connect_rate=None):
        # expansion and depthwise conv
        x = inputs
        if self.block_args.expand_ratio != 1:
            x = self.swish(self.bn0(self.expand_conv(inputs)))
        x = self.swish(self.bn1(self.depthwise_conv(x)))

        # squeeze and excitation
        if self.has_se:
            x_squeezed = F.adaptive_avg_pool2d(x, 1)
            x_squeezed = self.se_expand(self.swish(self.se_reduce(x_squeezed)))
            x = F.sigmoid(x_squeezed) * x
        x = self.bn2(self.project_conv(x))

        # skip connection and drop connect
        if self.id_skip and self.block_args.stride == 1 and \
                self.input_filters == self.final_oup:
            if drop_connect_rate:
                x = self.drop_connect(
                    x, p=drop_connect_rate, training=self.training)
            x = x + inputs
        return x


class EfficientNetb3_PREN(nn.Layer):
    def __init__(self, in_channels):
        super(EfficientNetb3_PREN, self).__init__()
        self.blocks_params = EffB3Params.get_block_params()
        self.global_params = EffB3Params.get_global_params()
        self.out_channels = []
        # stem
        stem_channels = EffUtils.round_filters(32, self.global_params)
        self.conv_stem = nn.Conv2D(
            in_channels, stem_channels, 3, 2, padding='same', bias_attr=False)
        self.bn0 = nn.BatchNorm(stem_channels)

        self.blocks = []
        # to extract three feature maps for fpn based on efficientnetb3 backbone
        self.concerned_block_idxes = [7, 17, 25]
        concerned_idx = 0
        for i, block_params in enumerate(self.blocks_params):
            block_params = block_params._replace(
                input_filters=EffUtils.round_filters(block_params.input_filters,
                                                     self.global_params),
                output_filters=EffUtils.round_filters(
                    block_params.output_filters, self.global_params),
                num_repeat=EffUtils.round_repeats(block_params.num_repeat,
                                                  self.global_params))
            self.blocks.append(
                self.add_sublayer("{}-0".format(i), ConvBlock(block_params)))
            concerned_idx += 1
            if concerned_idx in self.concerned_block_idxes:
                self.out_channels.append(block_params.output_filters)
            if block_params.num_repeat > 1:
                block_params = block_params._replace(
                    input_filters=block_params.output_filters, stride=1)
                for j in range(block_params.num_repeat - 1):
                    self.blocks.append(
                        self.add_sublayer('{}-{}'.format(i, j + 1),
                                          ConvBlock(block_params)))
                    concerned_idx += 1
                    if concerned_idx in self.concerned_block_idxes:
                        self.out_channels.append(block_params.output_filters)

        self.swish = nn.Swish()

    def forward(self, inputs):
        outs = []

        x = self.swish(self.bn0(self.conv_stem(inputs)))
        for idx, block in enumerate(self.blocks):
            drop_connect_rate = self.global_params.drop_connect_rate
            if drop_connect_rate:
                drop_connect_rate *= float(idx) / len(self.blocks)
            x = block(x, drop_connect_rate=drop_connect_rate)
            if idx in self.concerned_block_idxes:
                outs.append(x)
        return outs
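
drop_connect above implements stochastic depth: a residual branch survives with probability 1 - p, and survivors are rescaled by 1 / keep_prob so the expected output is unchanged. An illustrative stand-alone version of the same arithmetic:

    import paddle

    p = 0.3                                   # drop probability for one block
    keep_prob = 1 - p
    x = paddle.ones([4, 1, 1, 1])
    # floor(keep_prob + U[0,1)) is 1 with probability keep_prob, else 0
    mask = paddle.floor(keep_prob + paddle.rand([4, 1, 1, 1]))
    print((x / keep_prob * mask).squeeze())   # entries are 0 or ~1.43
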
backend/ppocr/modeling/backbones/rec_micronet.py (Normal file, 528 lines; the diff view below is truncated)
@@ -0,0 +1,528 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is referenced from:
https://github.com/liyunsheng13/micronet/blob/main/backbone/micronet.py
https://github.com/liyunsheng13/micronet/blob/main/backbone/activation.py
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
import paddle.nn as nn

from ppocr.modeling.backbones.det_mobilenet_v3 import make_divisible

M0_cfgs = [
    # s, n, c, ks, c1, c2, g1, g2, c3, g3, g4, y1, y2, y3, r
    [2, 1, 8, 3, 2, 2, 0, 4, 8, 2, 2, 2, 0, 1, 1],
    [2, 1, 12, 3, 2, 2, 0, 8, 12, 4, 4, 2, 2, 1, 1],
    [2, 1, 16, 5, 2, 2, 0, 12, 16, 4, 4, 2, 2, 1, 1],
    [1, 1, 32, 5, 1, 4, 4, 4, 32, 4, 4, 2, 2, 1, 1],
    [2, 1, 64, 5, 1, 4, 8, 8, 64, 8, 8, 2, 2, 1, 1],
    [1, 1, 96, 3, 1, 4, 8, 8, 96, 8, 8, 2, 2, 1, 2],
    [1, 1, 384, 3, 1, 4, 12, 12, 0, 0, 0, 2, 2, 1, 2],
]
M1_cfgs = [
    # s, n, c, ks, c1, c2, g1, g2, c3, g3, g4, y1, y2, y3, r
    [2, 1, 8, 3, 2, 2, 0, 6, 8, 2, 2, 2, 0, 1, 1],
    [2, 1, 16, 3, 2, 2, 0, 8, 16, 4, 4, 2, 2, 1, 1],
    [2, 1, 16, 5, 2, 2, 0, 16, 16, 4, 4, 2, 2, 1, 1],
    [1, 1, 32, 5, 1, 6, 4, 4, 32, 4, 4, 2, 2, 1, 1],
    [2, 1, 64, 5, 1, 6, 8, 8, 64, 8, 8, 2, 2, 1, 1],
    [1, 1, 96, 3, 1, 6, 8, 8, 96, 8, 8, 2, 2, 1, 2],
    [1, 1, 576, 3, 1, 6, 12, 12, 0, 0, 0, 2, 2, 1, 2],
]
M2_cfgs = [
    # s, n, c, ks, c1, c2, g1, g2, c3, g3, g4, y1, y2, y3, r
    [2, 1, 12, 3, 2, 2, 0, 8, 12, 4, 4, 2, 0, 1, 1],
    [2, 1, 16, 3, 2, 2, 0, 12, 16, 4, 4, 2, 2, 1, 1],
    [1, 1, 24, 3, 2, 2, 0, 16, 24, 4, 4, 2, 2, 1, 1],
    [2, 1, 32, 5, 1, 6, 6, 6, 32, 4, 4, 2, 2, 1, 1],
    [1, 1, 32, 5, 1, 6, 8, 8, 32, 4, 4, 2, 2, 1, 2],
    [1, 1, 64, 5, 1, 6, 8, 8, 64, 8, 8, 2, 2, 1, 2],
    [2, 1, 96, 5, 1, 6, 8, 8, 96, 8, 8, 2, 2, 1, 2],
    [1, 1, 128, 3, 1, 6, 12, 12, 128, 8, 8, 2, 2, 1, 2],
    [1, 1, 768, 3, 1, 6, 16, 16, 0, 0, 0, 2, 2, 1, 2],
]
M3_cfgs = [
    # s, n, c, ks, c1, c2, g1, g2, c3, g3, g4, y1, y2, y3, r
    [2, 1, 16, 3, 2, 2, 0, 12, 16, 4, 4, 0, 2, 0, 1],
    [2, 1, 24, 3, 2, 2, 0, 16, 24, 4, 4, 0, 2, 0, 1],
    [1, 1, 24, 3, 2, 2, 0, 24, 24, 4, 4, 0, 2, 0, 1],
    [2, 1, 32, 5, 1, 6, 6, 6, 32, 4, 4, 0, 2, 0, 1],
    [1, 1, 32, 5, 1, 6, 8, 8, 32, 4, 4, 0, 2, 0, 2],
    [1, 1, 64, 5, 1, 6, 8, 8, 48, 8, 8, 0, 2, 0, 2],
    [1, 1, 80, 5, 1, 6, 8, 8, 80, 8, 8, 0, 2, 0, 2],
    [1, 1, 80, 5, 1, 6, 10, 10, 80, 8, 8, 0, 2, 0, 2],
    [1, 1, 120, 5, 1, 6, 10, 10, 120, 10, 10, 0, 2, 0, 2],
    [1, 1, 120, 5, 1, 6, 12, 12, 120, 10, 10, 0, 2, 0, 2],
    [1, 1, 144, 3, 1, 6, 12, 12, 144, 12, 12, 0, 2, 0, 2],
    [1, 1, 432, 3, 1, 3, 12, 12, 0, 0, 0, 0, 2, 0, 2],
]


def get_micronet_config(mode):
    return eval(mode + '_cfgs')


class MaxGroupPooling(nn.Layer):
    def __init__(self, channel_per_group=2):
        super(MaxGroupPooling, self).__init__()
        self.channel_per_group = channel_per_group

    def forward(self, x):
        if self.channel_per_group == 1:
            return x
        # max op
        b, c, h, w = x.shape

        # reshape
        y = paddle.reshape(x, [b, c // self.channel_per_group, -1, h, w])
        out = paddle.max(y, axis=2)
        return out


class SpatialSepConvSF(nn.Layer):
    def __init__(self, inp, oups, kernel_size, stride):
        super(SpatialSepConvSF, self).__init__()

        oup1, oup2 = oups
        self.conv = nn.Sequential(
            nn.Conv2D(
                inp,
                oup1, (kernel_size, 1), (stride, 1), (kernel_size // 2, 0),
                bias_attr=False,
                groups=1),
            nn.BatchNorm2D(oup1),
            nn.Conv2D(
                oup1,
                oup1 * oup2, (1, kernel_size), (1, stride),
                (0, kernel_size // 2),
                bias_attr=False,
                groups=oup1),
            nn.BatchNorm2D(oup1 * oup2),
            ChannelShuffle(oup1), )

    def forward(self, x):
        out = self.conv(x)
        return out


class ChannelShuffle(nn.Layer):
    def __init__(self, groups):
        super(ChannelShuffle, self).__init__()
        self.groups = groups

    def forward(self, x):
        b, c, h, w = x.shape

        channels_per_group = c // self.groups

        # reshape
        x = paddle.reshape(x, [b, self.groups, channels_per_group, h, w])

        x = paddle.transpose(x, (0, 2, 1, 3, 4))
        out = paddle.reshape(x, [b, -1, h, w])

        return out

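# Worked illustration of the shuffle above (added comment, not upstream code):
# with groups=2 on channels [0, 1, 2, 3], the reshape groups them as
# [[0, 1], [2, 3]], the (0, 2, 1, 3, 4) transpose interleaves the groups, and
# the final reshape emits [0, 2, 1, 3] -- channels are exchanged between the
# two groups so the following grouped convolutions see mixed features.
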
class StemLayer(nn.Layer):
    def __init__(self, inp, oup, stride, groups=(4, 4)):
        super(StemLayer, self).__init__()

        g1, g2 = groups
        self.stem = nn.Sequential(
            SpatialSepConvSF(inp, groups, 3, stride),
            MaxGroupPooling(2) if g1 * g2 == 2 * oup else nn.ReLU6())

    def forward(self, x):
        out = self.stem(x)
        return out


class DepthSpatialSepConv(nn.Layer):
    def __init__(self, inp, expand, kernel_size, stride):
        super(DepthSpatialSepConv, self).__init__()

        exp1, exp2 = expand

        hidden_dim = inp * exp1
        oup = inp * exp1 * exp2

        self.conv = nn.Sequential(
            nn.Conv2D(
                inp,
                inp * exp1, (kernel_size, 1), (stride, 1),
                (kernel_size // 2, 0),
                bias_attr=False,
                groups=inp),
            nn.BatchNorm2D(inp * exp1),
            nn.Conv2D(
                hidden_dim,
                oup, (1, kernel_size),
                1, (0, kernel_size // 2),
                bias_attr=False,
                groups=hidden_dim),
            nn.BatchNorm2D(oup))

    def forward(self, x):
        x = self.conv(x)
        return x


class GroupConv(nn.Layer):
    def __init__(self, inp, oup, groups=2):
        super(GroupConv, self).__init__()
        self.inp = inp
        self.oup = oup
        self.groups = groups
        self.conv = nn.Sequential(
            nn.Conv2D(
                inp, oup, 1, 1, 0, bias_attr=False, groups=self.groups[0]),
            nn.BatchNorm2D(oup))

    def forward(self, x):
        x = self.conv(x)
        return x


class DepthConv(nn.Layer):
    def __init__(self, inp, oup, kernel_size, stride):
        super(DepthConv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2D(
                inp,
                oup,
                kernel_size,
                stride,
                kernel_size // 2,
                bias_attr=False,
                groups=inp),
            nn.BatchNorm2D(oup))

    def forward(self, x):
        out = self.conv(x)
        return out


class DYShiftMax(nn.Layer):
    def __init__(self,
                 inp,
                 oup,
                 reduction=4,
                 act_max=1.0,
                 act_relu=True,
                 init_a=[0.0, 0.0],
                 init_b=[0.0, 0.0],
                 relu_before_pool=False,
                 g=None,
                 expansion=False):
        super(DYShiftMax, self).__init__()
        self.oup = oup
        self.act_max = act_max * 2
        self.act_relu = act_relu
        self.avg_pool = nn.Sequential(nn.ReLU() if relu_before_pool else
                                      nn.Sequential(), nn.AdaptiveAvgPool2D(1))

        self.exp = 4 if act_relu else 2
        self.init_a = init_a
        self.init_b = init_b

        # determine squeeze
        squeeze = make_divisible(inp // reduction, 4)
        if squeeze < 4:
            squeeze = 4

        self.fc = nn.Sequential(
            nn.Linear(inp, squeeze),
            nn.ReLU(), nn.Linear(squeeze, oup * self.exp), nn.Hardsigmoid())

        if g is None:
            g = (1, 1)  # g is indexed below, so the fallback must be a pair
        self.g = g[1]
        if self.g != 1 and expansion:
            self.g = inp // self.g

        self.gc = inp // self.g
        index = paddle.to_tensor([range(inp)])
        index = paddle.reshape(index, [1, inp, 1, 1])
        index = paddle.reshape(index, [1, self.g, self.gc, 1, 1])
        indexgs = paddle.split(index, [1, self.g - 1], axis=1)
        indexgs = paddle.concat((indexgs[1], indexgs[0]), axis=1)
        indexs = paddle.split(indexgs, [1, self.gc - 1], axis=2)
        indexs = paddle.concat((indexs[1], indexs[0]), axis=2)
        self.index = paddle.reshape(indexs, [inp])
        self.expansion = expansion

    def forward(self, x):
        x_in = x
        x_out = x

        b, c, _, _ = x_in.shape
        y = self.avg_pool(x_in)
        y = paddle.reshape(y, [b, c])
        y = self.fc(y)
        y = paddle.reshape(y, [b, self.oup * self.exp, 1, 1])
        y = (y - 0.5) * self.act_max

        n2, c2, h2, w2 = x_out.shape
        x2 = paddle.to_tensor(x_out.numpy()[:, self.index.numpy(), :, :])

        if self.exp == 4:
            temp = y.shape
            a1, b1, a2, b2 = paddle.split(y, temp[1] // self.oup, axis=1)

            a1 = a1 + self.init_a[0]
            a2 = a2 + self.init_a[1]

            b1 = b1 + self.init_b[0]
            b2 = b2 + self.init_b[1]

            z1 = x_out * a1 + x2 * b1
            z2 = x_out * a2 + x2 * b2

            out = paddle.maximum(z1, z2)

        elif self.exp == 2:
            temp = y.shape
            a1, b1 = paddle.split(y, temp[1] // self.oup, axis=1)
            a1 = a1 + self.init_a[0]
            b1 = b1 + self.init_b[0]
            out = x_out * a1 + x2 * b1

        return out

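# Reading aid for DYShiftMax (added comment, not upstream code): the fc head
# predicts per-channel coefficients (a_k, b_k) from globally pooled features,
# and the output is max_k(a_k * x + b_k * x_shift), where x_shift is x with
# its channel groups circularly rotated via the precomputed `index`. With
# act_relu=True there are two (a, b) pairs (exp == 4); otherwise a single
# pair (exp == 2) and the max is dropped.
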
class DYMicroBlock(nn.Layer):
    def __init__(self,
                 inp,
                 oup,
                 kernel_size=3,
                 stride=1,
                 ch_exp=(2, 2),
                 ch_per_group=4,
                 groups_1x1=(1, 1),
                 depthsep=True,
                 shuffle=False,
                 activation_cfg=None):
        super(DYMicroBlock, self).__init__()

        self.identity = stride == 1 and inp == oup

        y1, y2, y3 = activation_cfg['dy']
        act_reduction = 8 * activation_cfg['ratio']
        init_a = activation_cfg['init_a']
        init_b = activation_cfg['init_b']

        t1 = ch_exp
        gs1 = ch_per_group
        hidden_fft, g1, g2 = groups_1x1
        hidden_dim2 = inp * t1[0] * t1[1]

        if gs1[0] == 0:
            self.layers = nn.Sequential(
                DepthSpatialSepConv(inp, t1, kernel_size, stride),
                DYShiftMax(
                    hidden_dim2,
                    hidden_dim2,
                    act_max=2.0,
                    act_relu=True if y2 == 2 else False,
                    init_a=init_a,
                    reduction=act_reduction,
                    init_b=init_b,
                    g=gs1,
                    expansion=False) if y2 > 0 else nn.ReLU6(),
                ChannelShuffle(gs1[1]) if shuffle else nn.Sequential(),
                ChannelShuffle(hidden_dim2 // 2)
                if shuffle and y2 != 0 else nn.Sequential(),
                GroupConv(hidden_dim2, oup, (g1, g2)),
                DYShiftMax(
                    oup,
                    oup,
                    act_max=2.0,
                    act_relu=False,
                    init_a=[1.0, 0.0],
                    reduction=act_reduction // 2,
                    init_b=[0.0, 0.0],
                    g=(g1, g2),
                    expansion=False) if y3 > 0 else nn.Sequential(),
                ChannelShuffle(g2) if shuffle else nn.Sequential(),
                ChannelShuffle(oup // 2)
                if shuffle and oup % 2 == 0 and y3 != 0 else nn.Sequential(), )
        elif g2 == 0:
            self.layers = nn.Sequential(
                GroupConv(inp, hidden_dim2, gs1),
                DYShiftMax(
                    hidden_dim2,
                    hidden_dim2,
                    act_max=2.0,
                    act_relu=False,
                    init_a=[1.0, 0.0],
                    reduction=act_reduction,
                    init_b=[0.0, 0.0],
                    g=gs1,
                    expansion=False) if y3 > 0 else nn.Sequential(), )
        else:
            self.layers = nn.Sequential(
                GroupConv(inp, hidden_dim2, gs1),
                DYShiftMax(
                    hidden_dim2,
                    hidden_dim2,
                    act_max=2.0,
                    act_relu=True if y1 == 2 else False,
                    init_a=init_a,
                    reduction=act_reduction,
                    init_b=init_b,
                    g=gs1,
                    expansion=False) if y1 > 0 else nn.ReLU6(),
                ChannelShuffle(gs1[1]) if shuffle else nn.Sequential(),
                DepthSpatialSepConv(hidden_dim2, (1, 1), kernel_size, stride)
                if depthsep else
                DepthConv(hidden_dim2, hidden_dim2, kernel_size, stride),
                nn.Sequential(),
                DYShiftMax(
                    hidden_dim2,
                    hidden_dim2,
                    act_max=2.0,
                    act_relu=True if y2 == 2 else False,
                    init_a=init_a,
                    reduction=act_reduction,
                    init_b=init_b,
                    g=gs1,
                    expansion=True) if y2 > 0 else nn.ReLU6(),
                ChannelShuffle(hidden_dim2 // 4)
                if shuffle and y1 != 0 and y2 != 0 else nn.Sequential()
                if y1 == 0 and y2 == 0 else ChannelShuffle(hidden_dim2 // 2),
                GroupConv(hidden_dim2, oup, (g1, g2)),
                DYShiftMax(
                    oup,
                    oup,
                    act_max=2.0,
                    act_relu=False,
                    init_a=[1.0, 0.0],
                    reduction=act_reduction // 2
                    if oup < hidden_dim2 else act_reduction,
                    init_b=[0.0, 0.0],
                    g=(g1, g2),
                    expansion=False) if y3 > 0 else nn.Sequential(),
                ChannelShuffle(g2) if shuffle else nn.Sequential(),
                ChannelShuffle(oup // 2)
                if shuffle and y3 != 0 else nn.Sequential(), )

    def forward(self, x):
        identity = x
        out = self.layers(x)

        if self.identity:
            out = out + identity

        return out


class MicroNet(nn.Layer):
    """
    the MicroNet backbone network for recognition module.
    Args:
        mode(str): {'M0', 'M1', 'M2', 'M3'}
            Four models are proposed based on four different computational costs (4M, 6M, 12M, 21M MAdds)
            Default: 'M3'.
    """

    def __init__(self, mode='M3', **kwargs):
        super(MicroNet, self).__init__()

        self.cfgs = get_micronet_config(mode)

        activation_cfg = {}
        if mode == 'M0':
            input_channel = 4
            stem_groups = 2, 2
            out_ch = 384
            activation_cfg['init_a'] = 1.0, 1.0
            activation_cfg['init_b'] = 0.0, 0.0
        elif mode == 'M1':
            input_channel = 6
            stem_groups = 3, 2
            out_ch = 576
            activation_cfg['init_a'] = 1.0, 1.0
            activation_cfg['init_b'] = 0.0, 0.0
        elif mode == 'M2':
            input_channel = 8
            stem_groups = 4, 2
            out_ch = 768
            activation_cfg['init_a'] = 1.0, 1.0
            activation_cfg['init_b'] = 0.0, 0.0
        elif mode == 'M3':
            input_channel = 12
            stem_groups = 4, 3
            out_ch = 432
            activation_cfg['init_a'] = 1.0, 0.5
            activation_cfg['init_b'] = 0.0, 0.5
        else:
            raise NotImplementedError("mode[" + mode +
                                      "_model] is not implemented!")

        layers = [StemLayer(3, input_channel, stride=2, groups=stem_groups)]

        for idx, val in enumerate(self.cfgs):
            s, n, c, ks, c1, c2, g1, g2, c3, g3, g4, y1, y2, y3, r = val

            t1 = (c1, c2)
            gs1 = (g1, g2)
            gs2 = (c3, g3, g4)
            activation_cfg['dy'] = [y1, y2, y3]
            activation_cfg['ratio'] = r

            output_channel = c
            layers.append(
                DYMicroBlock(
                    input_channel,
                    output_channel,
                    kernel_size=ks,
                    stride=s,
                    ch_exp=t1,
                    ch_per_group=gs1,
                    groups_1x1=gs2,
                    depthsep=True,
                    shuffle=True,
                    activation_cfg=activation_cfg, ))
            input_channel = output_channel
            for i in range(1, n):
                layers.append(
                    DYMicroBlock(
                        input_channel,
                        output_channel,
                        kernel_size=ks,
                        stride=1,
                        ch_exp=t1,
                        ch_per_group=gs1,
                        groups_1x1=gs2,
                        depthsep=True,
                        shuffle=True,
                        activation_cfg=activation_cfg, ))
                input_channel = output_channel
        self.features = nn.Sequential(*layers)

        self.pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0)

        self.out_channels = make_divisible(out_ch)

    def forward(self, x):
        x = self.features(x)
        x = self.pool(x)
        return x
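# Hedged usage sketch (added comment; shapes assume a 32x100 text-line input
# and are derived from the strides above, not from upstream docs):
#
#   import paddle
#   net = MicroNet(mode='M3')
#   feats = net(paddle.randn([1, 3, 32, 100]))
#   # the stem halves H and W, three stride-2 blocks halve H only, and the
#   # final 2x2 max-pool halves both dims: feats is roughly [1, 432, 1, 25]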
138
backend/ppocr/modeling/backbones/rec_mobilenet_v3.py
Normal file
@@ -0,0 +1,138 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle import nn

from ppocr.modeling.backbones.det_mobilenet_v3 import ResidualUnit, ConvBNLayer, make_divisible

__all__ = ['MobileNetV3']


class MobileNetV3(nn.Layer):
    def __init__(self,
                 in_channels=3,
                 model_name='small',
                 scale=0.5,
                 large_stride=None,
                 small_stride=None,
                 disable_se=False,
                 **kwargs):
        super(MobileNetV3, self).__init__()
        self.disable_se = disable_se
        if small_stride is None:
            small_stride = [2, 2, 2, 2]
        if large_stride is None:
            large_stride = [1, 2, 2, 2]

        assert isinstance(large_stride, list), "large_stride type must " \
            "be list but got {}".format(type(large_stride))
        assert isinstance(small_stride, list), "small_stride type must " \
            "be list but got {}".format(type(small_stride))
        assert len(large_stride) == 4, "large_stride length must be " \
            "4 but got {}".format(len(large_stride))
        assert len(small_stride) == 4, "small_stride length must be " \
            "4 but got {}".format(len(small_stride))

        if model_name == "large":
            cfg = [
                # k, exp, c, se, nl, s,
                [3, 16, 16, False, 'relu', large_stride[0]],
                [3, 64, 24, False, 'relu', (large_stride[1], 1)],
                [3, 72, 24, False, 'relu', 1],
                [5, 72, 40, True, 'relu', (large_stride[2], 1)],
                [5, 120, 40, True, 'relu', 1],
                [5, 120, 40, True, 'relu', 1],
                [3, 240, 80, False, 'hardswish', 1],
                [3, 200, 80, False, 'hardswish', 1],
                [3, 184, 80, False, 'hardswish', 1],
                [3, 184, 80, False, 'hardswish', 1],
                [3, 480, 112, True, 'hardswish', 1],
                [3, 672, 112, True, 'hardswish', 1],
                [5, 672, 160, True, 'hardswish', (large_stride[3], 1)],
                [5, 960, 160, True, 'hardswish', 1],
                [5, 960, 160, True, 'hardswish', 1],
            ]
            cls_ch_squeeze = 960
        elif model_name == "small":
            cfg = [
                # k, exp, c, se, nl, s,
                [3, 16, 16, True, 'relu', (small_stride[0], 1)],
                [3, 72, 24, False, 'relu', (small_stride[1], 1)],
                [3, 88, 24, False, 'relu', 1],
                [5, 96, 40, True, 'hardswish', (small_stride[2], 1)],
                [5, 240, 40, True, 'hardswish', 1],
                [5, 240, 40, True, 'hardswish', 1],
                [5, 120, 48, True, 'hardswish', 1],
                [5, 144, 48, True, 'hardswish', 1],
                [5, 288, 96, True, 'hardswish', (small_stride[3], 1)],
                [5, 576, 96, True, 'hardswish', 1],
                [5, 576, 96, True, 'hardswish', 1],
            ]
            cls_ch_squeeze = 576
        else:
            raise NotImplementedError("mode[" + model_name +
                                      "_model] is not implemented!")

        supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25]
        assert scale in supported_scale, \
            "supported scales are {} but input scale is {}".format(supported_scale, scale)

        inplanes = 16
        # conv1
        self.conv1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=make_divisible(inplanes * scale),
            kernel_size=3,
            stride=2,
            padding=1,
            groups=1,
            if_act=True,
            act='hardswish')
        block_list = []
        inplanes = make_divisible(inplanes * scale)
        for (k, exp, c, se, nl, s) in cfg:
            se = se and not self.disable_se
            block_list.append(
                ResidualUnit(
                    in_channels=inplanes,
                    mid_channels=make_divisible(scale * exp),
                    out_channels=make_divisible(scale * c),
                    kernel_size=k,
                    stride=s,
                    use_se=se,
                    act=nl))
            inplanes = make_divisible(scale * c)
        self.blocks = nn.Sequential(*block_list)

        self.conv2 = ConvBNLayer(
            in_channels=inplanes,
            out_channels=make_divisible(scale * cls_ch_squeeze),
            kernel_size=1,
            stride=1,
            padding=0,
            groups=1,
            if_act=True,
            act='hardswish')

        self.pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0)
        self.out_channels = make_divisible(scale * cls_ch_squeeze)

    def forward(self, x):
        x = self.conv1(x)
        x = self.blocks(x)
        x = self.conv2(x)
        x = self.pool(x)
        return x
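# Hedged usage sketch (added comment; the stride list mirrors a common
# PaddleOCR Chinese-rec config, and shapes follow from the (s, 1) strides,
# which downsample height only so the width axis survives as the sequence):
#
#   import paddle
#   net = MobileNetV3(model_name='small', scale=0.5,
#                     small_stride=[1, 2, 2, 2])
#   feats = net(paddle.randn([1, 3, 32, 320]))
#   # conv1 halves H and W, three (2, 1) blocks halve H, the 2x2 max-pool
#   # halves both: feats is roughly [1, 288, 1, 80]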
256
backend/ppocr/modeling/backbones/rec_mv1_enhance.py
Normal file
@@ -0,0 +1,256 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This code is referenced from: https://github.com/PaddlePaddle/PaddleClas/blob/develop/ppcls/arch/backbone/legendary_models/pp_lcnet.py

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math
import numpy as np
import paddle
from paddle import ParamAttr, reshape, transpose
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn import Conv2D, BatchNorm, Linear, Dropout
from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D
from paddle.nn.initializer import KaimingNormal
from paddle.regularizer import L2Decay
from paddle.nn.functional import hardswish, hardsigmoid


class ConvBNLayer(nn.Layer):
    def __init__(self,
                 num_channels,
                 filter_size,
                 num_filters,
                 stride,
                 padding,
                 channels=None,
                 num_groups=1,
                 act='hard_swish'):
        super(ConvBNLayer, self).__init__()

        self._conv = Conv2D(
            in_channels=num_channels,
            out_channels=num_filters,
            kernel_size=filter_size,
            stride=stride,
            padding=padding,
            groups=num_groups,
            weight_attr=ParamAttr(initializer=KaimingNormal()),
            bias_attr=False)

        self._batch_norm = BatchNorm(
            num_filters,
            act=act,
            param_attr=ParamAttr(regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))

    def forward(self, inputs):
        y = self._conv(inputs)
        y = self._batch_norm(y)
        return y


class DepthwiseSeparable(nn.Layer):
    def __init__(self,
                 num_channels,
                 num_filters1,
                 num_filters2,
                 num_groups,
                 stride,
                 scale,
                 dw_size=3,
                 padding=1,
                 use_se=False):
        super(DepthwiseSeparable, self).__init__()
        self.use_se = use_se
        self._depthwise_conv = ConvBNLayer(
            num_channels=num_channels,
            num_filters=int(num_filters1 * scale),
            filter_size=dw_size,
            stride=stride,
            padding=padding,
            num_groups=int(num_groups * scale))
        if use_se:
            self._se = SEModule(int(num_filters1 * scale))
        self._pointwise_conv = ConvBNLayer(
            num_channels=int(num_filters1 * scale),
            filter_size=1,
            num_filters=int(num_filters2 * scale),
            stride=1,
            padding=0)

    def forward(self, inputs):
        y = self._depthwise_conv(inputs)
        if self.use_se:
            y = self._se(y)
        y = self._pointwise_conv(y)
        return y


class MobileNetV1Enhance(nn.Layer):
    def __init__(self,
                 in_channels=3,
                 scale=0.5,
                 last_conv_stride=1,
                 last_pool_type='max',
                 **kwargs):
        super().__init__()
        self.scale = scale
        self.block_list = []

        self.conv1 = ConvBNLayer(
            num_channels=in_channels,  # was hard-coded to 3, silently ignoring in_channels
            filter_size=3,
            channels=3,
            num_filters=int(32 * scale),
            stride=2,
            padding=1)

        conv2_1 = DepthwiseSeparable(
            num_channels=int(32 * scale),
            num_filters1=32,
            num_filters2=64,
            num_groups=32,
            stride=1,
            scale=scale)
        self.block_list.append(conv2_1)

        conv2_2 = DepthwiseSeparable(
            num_channels=int(64 * scale),
            num_filters1=64,
            num_filters2=128,
            num_groups=64,
            stride=1,
            scale=scale)
        self.block_list.append(conv2_2)

        conv3_1 = DepthwiseSeparable(
            num_channels=int(128 * scale),
            num_filters1=128,
            num_filters2=128,
            num_groups=128,
            stride=1,
            scale=scale)
        self.block_list.append(conv3_1)

        conv3_2 = DepthwiseSeparable(
            num_channels=int(128 * scale),
            num_filters1=128,
            num_filters2=256,
            num_groups=128,
            stride=(2, 1),
            scale=scale)
        self.block_list.append(conv3_2)

        conv4_1 = DepthwiseSeparable(
            num_channels=int(256 * scale),
            num_filters1=256,
            num_filters2=256,
            num_groups=256,
            stride=1,
            scale=scale)
        self.block_list.append(conv4_1)

        conv4_2 = DepthwiseSeparable(
            num_channels=int(256 * scale),
            num_filters1=256,
            num_filters2=512,
            num_groups=256,
            stride=(2, 1),
            scale=scale)
        self.block_list.append(conv4_2)

        for _ in range(5):
            conv5 = DepthwiseSeparable(
                num_channels=int(512 * scale),
                num_filters1=512,
                num_filters2=512,
                num_groups=512,
                stride=1,
                dw_size=5,
                padding=2,
                scale=scale,
                use_se=False)
            self.block_list.append(conv5)

        conv5_6 = DepthwiseSeparable(
            num_channels=int(512 * scale),
            num_filters1=512,
            num_filters2=1024,
            num_groups=512,
            stride=(2, 1),
            dw_size=5,
            padding=2,
            scale=scale,
            use_se=True)
        self.block_list.append(conv5_6)

        conv6 = DepthwiseSeparable(
            num_channels=int(1024 * scale),
            num_filters1=1024,
            num_filters2=1024,
            num_groups=1024,
            stride=last_conv_stride,
            dw_size=5,
            padding=2,
            use_se=True,
            scale=scale)
        self.block_list.append(conv6)

        self.block_list = nn.Sequential(*self.block_list)
        if last_pool_type == 'avg':
            self.pool = nn.AvgPool2D(kernel_size=2, stride=2, padding=0)
        else:
            self.pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0)
        self.out_channels = int(1024 * scale)

    def forward(self, inputs):
        y = self.conv1(inputs)
        y = self.block_list(y)
        y = self.pool(y)
        return y

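# Hedged usage sketch (added comment; shapes follow from the strides above --
# conv1 halves H and W, three (2, 1) blocks halve H only, and the final 2x2
# pool halves both dims):
#
#   import paddle
#   net = MobileNetV1Enhance(scale=0.5)
#   feats = net(paddle.randn([1, 3, 32, 320]))
#   # roughly [1, 512, 1, 80]: out_channels is int(1024 * scale)
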
class SEModule(nn.Layer):
    def __init__(self, channel, reduction=4):
        super(SEModule, self).__init__()
        self.avg_pool = AdaptiveAvgPool2D(1)
        self.conv1 = Conv2D(
            in_channels=channel,
            out_channels=channel // reduction,
            kernel_size=1,
            stride=1,
            padding=0,
            weight_attr=ParamAttr(),
            bias_attr=ParamAttr())
        self.conv2 = Conv2D(
            in_channels=channel // reduction,
            out_channels=channel,
            kernel_size=1,
            stride=1,
            padding=0,
            weight_attr=ParamAttr(),
            bias_attr=ParamAttr())

    def forward(self, inputs):
        outputs = self.avg_pool(inputs)
        outputs = self.conv1(outputs)
        outputs = F.relu(outputs)
        outputs = self.conv2(outputs)
        outputs = hardsigmoid(outputs)
        return paddle.multiply(x=inputs, y=outputs)
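# Reading aid for SEModule (added comment): this is the usual squeeze-and-
# excitation gate -- global average pooling squeezes each channel to a scalar,
# the two 1x1 convs form a bottleneck (reduction=4), and the hardsigmoid
# output rescales the input channels, i.e. out = x * gate(x).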
48
backend/ppocr/modeling/backbones/rec_nrtr_mtb.py
Normal file
@@ -0,0 +1,48 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle import nn
import paddle


class MTB(nn.Layer):
    def __init__(self, cnn_num, in_channels):
        super(MTB, self).__init__()
        self.block = nn.Sequential()
        self.out_channels = in_channels
        self.cnn_num = cnn_num
        if self.cnn_num == 2:
            for i in range(self.cnn_num):
                self.block.add_sublayer(
                    'conv_{}'.format(i),
                    nn.Conv2D(
                        in_channels=in_channels
                        if i == 0 else 32 * (2**(i - 1)),
                        out_channels=32 * (2**i),
                        kernel_size=3,
                        stride=2,
                        padding=1))
                self.block.add_sublayer('relu_{}'.format(i), nn.ReLU())
                self.block.add_sublayer('bn_{}'.format(i),
                                        nn.BatchNorm2D(32 * (2**i)))

    def forward(self, images):
        x = self.block(images)
        if self.cnn_num == 2:
            # (b, w, h, c)
            x = paddle.transpose(x, [0, 3, 2, 1])
            x_shape = paddle.shape(x)
            x = paddle.reshape(
                x, [x_shape[0], x_shape[1], x_shape[2] * x_shape[3]])
        return x
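# Shape note for MTB (added comment; the 32x100 input is an assumed example):
# with cnn_num=2, two stride-2 convs take [N, 1, 32, 100] to [N, 64, 8, 25];
# the transpose to (b, w, h, c) and the reshape then give [N, 25, 8 * 64],
# i.e. a width-major sequence of 512-dim features for the NRTR encoder.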
210
backend/ppocr/modeling/backbones/rec_resnet_31.py
Normal file
@@ -0,0 +1,210 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is referenced from:
https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/layers/conv_layer.py
https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/backbones/resnet31_ocr.py
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
from paddle import ParamAttr
import paddle.nn as nn
import paddle.nn.functional as F
import numpy as np

__all__ = ["ResNet31"]


def conv3x3(in_channel, out_channel, stride=1):
    return nn.Conv2D(
        in_channel,
        out_channel,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias_attr=False)


class BasicBlock(nn.Layer):
    expansion = 1

    def __init__(self, in_channels, channels, stride=1, downsample=False):
        super().__init__()
        self.conv1 = conv3x3(in_channels, channels, stride)
        self.bn1 = nn.BatchNorm2D(channels)
        self.relu = nn.ReLU()
        self.conv2 = conv3x3(channels, channels)
        self.bn2 = nn.BatchNorm2D(channels)
        self.downsample = downsample
        if downsample:
            self.downsample = nn.Sequential(
                nn.Conv2D(
                    in_channels,
                    channels * self.expansion,
                    1,
                    stride,
                    bias_attr=False),
                nn.BatchNorm2D(channels * self.expansion), )
        else:
            self.downsample = nn.Sequential()
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class ResNet31(nn.Layer):
    '''
    Args:
        in_channels (int): Number of channels of input image tensor.
        layers (list[int]): List of BasicBlock number for each stage.
        channels (list[int]): List of out_channels of Conv2d layer.
        out_indices (None | Sequence[int]): Indices of output stages.
        last_stage_pool (bool): If True, add `MaxPool2d` layer to last stage.
    '''

    def __init__(self,
                 in_channels=3,
                 layers=[1, 2, 5, 3],
                 channels=[64, 128, 256, 256, 512, 512, 512],
                 out_indices=None,
                 last_stage_pool=False):
        super(ResNet31, self).__init__()
        assert isinstance(in_channels, int)
        assert isinstance(last_stage_pool, bool)

        self.out_indices = out_indices
        self.last_stage_pool = last_stage_pool

        # conv 1 (Conv Conv)
        self.conv1_1 = nn.Conv2D(
            in_channels, channels[0], kernel_size=3, stride=1, padding=1)
        self.bn1_1 = nn.BatchNorm2D(channels[0])
        self.relu1_1 = nn.ReLU()

        self.conv1_2 = nn.Conv2D(
            channels[0], channels[1], kernel_size=3, stride=1, padding=1)
        self.bn1_2 = nn.BatchNorm2D(channels[1])
        self.relu1_2 = nn.ReLU()

        # conv 2 (Max-pooling, Residual block, Conv)
        self.pool2 = nn.MaxPool2D(
            kernel_size=2, stride=2, padding=0, ceil_mode=True)
        self.block2 = self._make_layer(channels[1], channels[2], layers[0])
        self.conv2 = nn.Conv2D(
            channels[2], channels[2], kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2D(channels[2])
        self.relu2 = nn.ReLU()

        # conv 3 (Max-pooling, Residual block, Conv)
        self.pool3 = nn.MaxPool2D(
            kernel_size=2, stride=2, padding=0, ceil_mode=True)
        self.block3 = self._make_layer(channels[2], channels[3], layers[1])
        self.conv3 = nn.Conv2D(
            channels[3], channels[3], kernel_size=3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2D(channels[3])
        self.relu3 = nn.ReLU()

        # conv 4 (Max-pooling, Residual block, Conv)
        self.pool4 = nn.MaxPool2D(
            kernel_size=(2, 1), stride=(2, 1), padding=0, ceil_mode=True)
        self.block4 = self._make_layer(channels[3], channels[4], layers[2])
        self.conv4 = nn.Conv2D(
            channels[4], channels[4], kernel_size=3, stride=1, padding=1)
        self.bn4 = nn.BatchNorm2D(channels[4])
        self.relu4 = nn.ReLU()

        # conv 5 ((Max-pooling), Residual block, Conv)
        self.pool5 = None
        if self.last_stage_pool:
            self.pool5 = nn.MaxPool2D(
                kernel_size=2, stride=2, padding=0, ceil_mode=True)
        self.block5 = self._make_layer(channels[4], channels[5], layers[3])
        self.conv5 = nn.Conv2D(
            channels[5], channels[5], kernel_size=3, stride=1, padding=1)
        self.bn5 = nn.BatchNorm2D(channels[5])
        self.relu5 = nn.ReLU()

        self.out_channels = channels[-1]

    def _make_layer(self, input_channels, output_channels, blocks):
        layers = []
        for _ in range(blocks):
            downsample = None
            if input_channels != output_channels:
                downsample = nn.Sequential(
                    nn.Conv2D(
                        input_channels,
                        output_channels,
                        kernel_size=1,
                        stride=1,
                        bias_attr=False),
                    nn.BatchNorm2D(output_channels), )

            layers.append(
                BasicBlock(
                    input_channels, output_channels, downsample=downsample))
            input_channels = output_channels
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1_1(x)
        x = self.bn1_1(x)
        x = self.relu1_1(x)

        x = self.conv1_2(x)
        x = self.bn1_2(x)
        x = self.relu1_2(x)

        outs = []
        for i in range(4):
            layer_index = i + 2
            pool_layer = getattr(self, f'pool{layer_index}')
            block_layer = getattr(self, f'block{layer_index}')
            conv_layer = getattr(self, f'conv{layer_index}')
            bn_layer = getattr(self, f'bn{layer_index}')
            relu_layer = getattr(self, f'relu{layer_index}')

            if pool_layer is not None:
                x = pool_layer(x)
            x = block_layer(x)
            x = conv_layer(x)
            x = bn_layer(x)
            x = relu_layer(x)

            outs.append(x)

        if self.out_indices is not None:
            return tuple([outs[i] for i in self.out_indices])

        return x
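# Hedged usage sketch (added comment; shapes assume a 32x100 input and
# last_stage_pool=False, following the pool strides above):
#
#   import paddle
#   net = ResNet31()
#   feats = net(paddle.randn([1, 3, 32, 100]))
#   # pool2 and pool3 halve both dims, pool4 halves height only, so feats
#   # is roughly [1, 512, 4, 25]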
143
backend/ppocr/modeling/backbones/rec_resnet_aster.py
Normal file
@@ -0,0 +1,143 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is referenced from:
https://github.com/ayumiymk/aster.pytorch/blob/master/lib/models/resnet_aster.py
"""
import paddle
import paddle.nn as nn

import sys
import math


def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2D(
        in_planes,
        out_planes,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias_attr=False)


def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2D(
        in_planes, out_planes, kernel_size=1, stride=stride, bias_attr=False)


def get_sinusoid_encoding(n_position, feat_dim, wave_length=10000):
    # [n_position]
    positions = paddle.arange(0, n_position)
    # [feat_dim]
    dim_range = paddle.arange(0, feat_dim)
    dim_range = paddle.pow(wave_length, 2 * (dim_range // 2) / feat_dim)
    # [n_position, feat_dim]
    angles = paddle.unsqueeze(
        positions, axis=1) / paddle.unsqueeze(
            dim_range, axis=0)
    angles = paddle.cast(angles, "float32")
    angles[:, 0::2] = paddle.sin(angles[:, 0::2])
    angles[:, 1::2] = paddle.cos(angles[:, 1::2])
    return angles

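# Reading aid for get_sinusoid_encoding (added comment): it builds the usual
# transformer positional table of shape [n_position, feat_dim], with
# angle(p, i) = p / wave_length**(2 * (i // 2) / feat_dim), sine applied on
# even feature indices and cosine on odd ones.
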
class AsterBlock(nn.Layer):
    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(AsterBlock, self).__init__()
        self.conv1 = conv1x1(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2D(planes)
        self.relu = nn.ReLU()
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2D(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out


class ResNet_ASTER(nn.Layer):
    """For aster or crnn"""

    def __init__(self, with_lstm=True, n_group=1, in_channels=3):
        super(ResNet_ASTER, self).__init__()
        self.with_lstm = with_lstm
        self.n_group = n_group

        self.layer0 = nn.Sequential(
            nn.Conv2D(
                in_channels,
                32,
                kernel_size=(3, 3),
                stride=1,
                padding=1,
                bias_attr=False),
            nn.BatchNorm2D(32),
            nn.ReLU())

        self.inplanes = 32
        self.layer1 = self._make_layer(32, 3, [2, 2])  # [16, 50]
        self.layer2 = self._make_layer(64, 4, [2, 2])  # [8, 25]
        self.layer3 = self._make_layer(128, 6, [2, 1])  # [4, 25]
        self.layer4 = self._make_layer(256, 6, [2, 1])  # [2, 25]
        self.layer5 = self._make_layer(512, 3, [2, 1])  # [1, 25]

        if with_lstm:
            self.rnn = nn.LSTM(512, 256, direction="bidirect", num_layers=2)
            self.out_channels = 2 * 256
        else:
            self.out_channels = 512

    def _make_layer(self, planes, blocks, stride):
        downsample = None
        if stride != [1, 1] or self.inplanes != planes:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes, stride), nn.BatchNorm2D(planes))

        layers = []
        layers.append(AsterBlock(self.inplanes, planes, stride, downsample))
        self.inplanes = planes
        for _ in range(1, blocks):
            layers.append(AsterBlock(self.inplanes, planes))
        return nn.Sequential(*layers)

    def forward(self, x):
        x0 = self.layer0(x)
        x1 = self.layer1(x0)
        x2 = self.layer2(x1)
        x3 = self.layer3(x2)
        x4 = self.layer4(x3)
        x5 = self.layer5(x4)

        cnn_feat = x5.squeeze(2)  # [N, c, w]
        cnn_feat = paddle.transpose(cnn_feat, perm=[0, 2, 1])
        if self.with_lstm:
            rnn_feat, _ = self.rnn(cnn_feat)
            return rnn_feat
        else:
            return cnn_feat
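# Hedged usage sketch (added comment; the inline [H, W] comments above track a
# 32x100 input, so the same example is used here):
#
#   import paddle
#   net = ResNet_ASTER(with_lstm=True)
#   seq = net(paddle.randn([1, 3, 32, 100]))
#   # layer5 ends at [1, 512, 1, 25]; squeezing H and running the BiLSTM
#   # gives seq of shape [1, 25, 512]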
307
backend/ppocr/modeling/backbones/rec_resnet_fpn.py
Normal file
@@ -0,0 +1,307 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from paddle import nn, ParamAttr
from paddle.nn import functional as F
import paddle
import numpy as np

__all__ = ["ResNetFPN"]


class ResNetFPN(nn.Layer):
    def __init__(self, in_channels=1, layers=50, **kwargs):
        super(ResNetFPN, self).__init__()
        supported_layers = {
            18: {
                'depth': [2, 2, 2, 2],
                'block_class': BasicBlock
            },
            34: {
                'depth': [3, 4, 6, 3],
                'block_class': BasicBlock
            },
            50: {
                'depth': [3, 4, 6, 3],
                'block_class': BottleneckBlock
            },
            101: {
                'depth': [3, 4, 23, 3],
                'block_class': BottleneckBlock
            },
            152: {
                'depth': [3, 8, 36, 3],
                'block_class': BottleneckBlock
            }
        }
        stride_list = [(2, 2), (2, 2), (1, 1), (1, 1)]
        num_filters = [64, 128, 256, 512]
        self.depth = supported_layers[layers]['depth']
        self.F = []
        self.conv = ConvBNLayer(
            in_channels=in_channels,
            out_channels=64,
            kernel_size=7,
            stride=2,
            act="relu",
            name="conv1")
        self.block_list = []
        in_ch = 64
        if layers >= 50:
            for block in range(len(self.depth)):
                for i in range(self.depth[block]):
                    if layers in [101, 152] and block == 2:
                        if i == 0:
                            conv_name = "res" + str(block + 2) + "a"
                        else:
                            conv_name = "res" + str(block + 2) + "b" + str(i)
                    else:
                        conv_name = "res" + str(block + 2) + chr(97 + i)
                    block_list = self.add_sublayer(
                        "bottleneckBlock_{}_{}".format(block, i),
                        BottleneckBlock(
                            in_channels=in_ch,
                            out_channels=num_filters[block],
                            stride=stride_list[block] if i == 0 else 1,
                            name=conv_name))
                    in_ch = num_filters[block] * 4
                    self.block_list.append(block_list)
                self.F.append(block_list)
        else:
            for block in range(len(self.depth)):
                for i in range(self.depth[block]):
                    conv_name = "res" + str(block + 2) + chr(97 + i)
                    basic_block = self.add_sublayer(
                        conv_name,
                        BasicBlock(
                            in_channels=in_ch,
                            out_channels=num_filters[block],
                            stride=stride_list[block] if i == 0 else 1,
                            is_first=block == i == 0,
                            name=conv_name))
                    in_ch = basic_block.out_channels
                    self.block_list.append(basic_block)
        out_ch_list = [in_ch // 4, in_ch // 2, in_ch]
        self.base_block = []
        self.conv_trans = []
        self.bn_block = []
        for i in [-2, -3]:
            in_channels = out_ch_list[i + 1] + out_ch_list[i]

            self.base_block.append(
                self.add_sublayer(
                    "F_{}_base_block_0".format(i),
                    nn.Conv2D(
                        in_channels=in_channels,
                        out_channels=out_ch_list[i],
                        kernel_size=1,
                        weight_attr=ParamAttr(trainable=True),
                        bias_attr=ParamAttr(trainable=True))))
            self.base_block.append(
                self.add_sublayer(
                    "F_{}_base_block_1".format(i),
                    nn.Conv2D(
                        in_channels=out_ch_list[i],
                        out_channels=out_ch_list[i],
                        kernel_size=3,
                        padding=1,
                        weight_attr=ParamAttr(trainable=True),
                        bias_attr=ParamAttr(trainable=True))))
            self.base_block.append(
                self.add_sublayer(
                    "F_{}_base_block_2".format(i),
                    nn.BatchNorm(
                        num_channels=out_ch_list[i],
                        act="relu",
                        param_attr=ParamAttr(trainable=True),
                        bias_attr=ParamAttr(trainable=True))))
            self.base_block.append(
                self.add_sublayer(
                    "F_{}_base_block_3".format(i),
                    nn.Conv2D(
                        in_channels=out_ch_list[i],
                        out_channels=512,
                        kernel_size=1,
                        bias_attr=ParamAttr(trainable=True),
                        weight_attr=ParamAttr(trainable=True))))
        self.out_channels = 512

    def __call__(self, x):
        x = self.conv(x)
        fpn_list = []
        F = []
        for i in range(len(self.depth)):
            fpn_list.append(np.sum(self.depth[:i + 1]))

        for i, block in enumerate(self.block_list):
            x = block(x)
            for number in fpn_list:
                if i + 1 == number:
                    F.append(x)
        base = F[-1]

        j = 0
        for i, block in enumerate(self.base_block):
            if i % 3 == 0 and i < 6:
                j = j + 1
                b, c, w, h = F[-j - 1].shape
                if [w, h] != list(base.shape[2:]):
                    base = self.conv_trans[j - 1](base)
                    base = self.bn_block[j - 1](base)
                base = paddle.concat([base, F[-j - 1]], axis=1)
            base = block(base)
        return base

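# Reading aid for the FPN fusion above (added comment): fpn_list records the
# cumulative block counts, so F collects the feature map at the end of each
# stage; the base_block convs then walk backwards (i = -2, -3), concatenating
# each shallower stage with the running `base` along channels and fusing it
# with 1x1 -> 3x3 -> BN -> 1x1(512). Note that conv_trans/bn_block stay
# empty, so the shape-mismatch branch assumes stage shapes already agree.
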
class ConvBNLayer(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 groups=1,
                 act=None,
                 name=None):
        super(ConvBNLayer, self).__init__()
        self.conv = nn.Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=2 if stride == (1, 1) else kernel_size,
            dilation=2 if stride == (1, 1) else 1,
            stride=stride,
            padding=(kernel_size - 1) // 2,
            groups=groups,
            weight_attr=ParamAttr(name=name + '.conv2d.output.1.w_0'),
            bias_attr=False, )

        if name == "conv1":
            bn_name = "bn_" + name
        else:
            bn_name = "bn" + name[3:]
        self.bn = nn.BatchNorm(
            num_channels=out_channels,
            act=act,
            param_attr=ParamAttr(name=name + '.output.1.w_0'),
            bias_attr=ParamAttr(name=name + '.output.1.b_0'),
            moving_mean_name=bn_name + "_mean",
            moving_variance_name=bn_name + "_variance")

    def __call__(self, x):
        x = self.conv(x)
        x = self.bn(x)
        return x


class ShortCut(nn.Layer):
    def __init__(self, in_channels, out_channels, stride, name, is_first=False):
        super(ShortCut, self).__init__()
        self.use_conv = True

        if in_channels != out_channels or stride != 1 or is_first:
            if stride == (1, 1):
                self.conv = ConvBNLayer(
                    in_channels, out_channels, 1, 1, name=name)
            else:  # stride == (2, 2)
                self.conv = ConvBNLayer(
                    in_channels, out_channels, 1, stride, name=name)
        else:
            self.use_conv = False

    def forward(self, x):
        if self.use_conv:
            x = self.conv(x)
        return x


class BottleneckBlock(nn.Layer):
    def __init__(self, in_channels, out_channels, stride, name):
        super(BottleneckBlock, self).__init__()
        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            act='relu',
            name=name + "_branch2a")
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            act='relu',
            name=name + "_branch2b")

        self.conv2 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels * 4,
            kernel_size=1,
            act=None,
            name=name + "_branch2c")

        self.short = ShortCut(
            in_channels=in_channels,
            out_channels=out_channels * 4,
            stride=stride,
            is_first=False,
            name=name + "_branch1")
        self.out_channels = out_channels * 4

    def forward(self, x):
        y = self.conv0(x)
        y = self.conv1(y)
        y = self.conv2(y)
        y = y + self.short(x)
        y = F.relu(y)
        return y


class BasicBlock(nn.Layer):
    def __init__(self, in_channels, out_channels, stride, name, is_first):
        super(BasicBlock, self).__init__()
        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3,
            act='relu',
            stride=stride,
            name=name + "_branch2a")
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            act=None,
            name=name + "_branch2b")
        self.short = ShortCut(
            in_channels=in_channels,
            out_channels=out_channels,
            stride=stride,
            is_first=is_first,
            name=name + "_branch1")
        self.out_channels = out_channels

    def forward(self, x):
        y = self.conv0(x)
        y = self.conv1(y)
        y = y + self.short(x)
        return F.relu(y)
286
backend/ppocr/modeling/backbones/rec_resnet_vd.py
Normal file
@@ -0,0 +1,286 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
from paddle import ParamAttr
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
|
||||
__all__ = ["ResNet"]
|
||||
|
||||
|
||||
class ConvBNLayer(nn.Layer):
|
||||
def __init__(
|
||||
self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=1,
|
||||
groups=1,
|
||||
is_vd_mode=False,
|
||||
act=None,
|
||||
name=None, ):
|
||||
super(ConvBNLayer, self).__init__()
|
||||
|
||||
self.is_vd_mode = is_vd_mode
|
||||
self._pool2d_avg = nn.AvgPool2D(
|
||||
kernel_size=stride, stride=stride, padding=0, ceil_mode=True)
|
||||
self._conv = nn.Conv2D(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=kernel_size,
|
||||
stride=1 if is_vd_mode else stride,
|
||||
padding=(kernel_size - 1) // 2,
|
||||
groups=groups,
|
||||
weight_attr=ParamAttr(name=name + "_weights"),
|
||||
bias_attr=False)
|
||||
if name == "conv1":
|
||||
bn_name = "bn_" + name
|
||||
else:
|
||||
bn_name = "bn" + name[3:]
|
||||
self._batch_norm = nn.BatchNorm(
|
||||
out_channels,
|
||||
act=act,
|
||||
param_attr=ParamAttr(name=bn_name + '_scale'),
|
||||
bias_attr=ParamAttr(bn_name + '_offset'),
|
||||
moving_mean_name=bn_name + '_mean',
|
||||
moving_variance_name=bn_name + '_variance')
|
||||
|
||||
def forward(self, inputs):
|
||||
if self.is_vd_mode:
|
||||
inputs = self._pool2d_avg(inputs)
|
||||
y = self._conv(inputs)
|
||||
y = self._batch_norm(y)
|
||||
return y
|
||||
|
||||
|
||||
class BottleneckBlock(nn.Layer):
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
stride,
|
||||
shortcut=True,
|
||||
if_first=False,
|
||||
name=None):
|
||||
super(BottleneckBlock, self).__init__()
|
||||
|
||||
self.conv0 = ConvBNLayer(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=1,
|
||||
act='relu',
|
||||
name=name + "_branch2a")
|
||||
self.conv1 = ConvBNLayer(
|
||||
in_channels=out_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=3,
|
||||
stride=stride,
|
||||
act='relu',
|
||||
name=name + "_branch2b")
|
||||
self.conv2 = ConvBNLayer(
|
||||
in_channels=out_channels,
|
||||
out_channels=out_channels * 4,
|
||||
kernel_size=1,
|
||||
act=None,
|
||||
name=name + "_branch2c")
|
||||
|
||||
if not shortcut:
|
||||
self.short = ConvBNLayer(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels * 4,
|
||||
kernel_size=1,
|
||||
stride=stride,
|
||||
is_vd_mode=not if_first and stride[0] != 1,
|
||||
name=name + "_branch1")
|
||||
|
||||
self.shortcut = shortcut
|
||||
|
||||
def forward(self, inputs):
|
||||
y = self.conv0(inputs)
|
||||
|
||||
conv1 = self.conv1(y)
|
||||
conv2 = self.conv2(conv1)
|
||||
|
||||
if self.shortcut:
|
||||
short = inputs
|
||||
else:
|
||||
short = self.short(inputs)
|
||||
y = paddle.add(x=short, y=conv2)
|
||||
y = F.relu(y)
|
||||
return y
|
||||
|
||||
|
||||
class BasicBlock(nn.Layer):
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
stride,
|
||||
shortcut=True,
|
||||
if_first=False,
|
||||
name=None):
|
||||
super(BasicBlock, self).__init__()
|
||||
self.stride = stride
|
||||
self.conv0 = ConvBNLayer(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=3,
|
||||
stride=stride,
|
||||
act='relu',
|
||||
name=name + "_branch2a")
|
||||
self.conv1 = ConvBNLayer(
|
||||
in_channels=out_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=3,
|
||||
act=None,
|
||||
name=name + "_branch2b")
|
||||
|
||||
if not shortcut:
|
||||
self.short = ConvBNLayer(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=1,
|
||||
stride=stride,
|
||||
is_vd_mode=not if_first and stride[0] != 1,
|
||||
name=name + "_branch1")
|
||||
|
||||
self.shortcut = shortcut
|
||||
|
||||
def forward(self, inputs):
|
||||
y = self.conv0(inputs)
|
||||
conv1 = self.conv1(y)
|
||||
|
||||
if self.shortcut:
|
||||
short = inputs
|
||||
else:
|
||||
short = self.short(inputs)
|
||||
y = paddle.add(x=short, y=conv1)
|
||||
y = F.relu(y)
|
||||
return y
|
||||
|
||||
|
||||
class ResNet(nn.Layer):
    def __init__(self, in_channels=3, layers=50, **kwargs):
        super(ResNet, self).__init__()

        self.layers = layers
        supported_layers = [18, 34, 50, 101, 152, 200]
        assert layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(
                supported_layers, layers)

        if layers == 18:
            depth = [2, 2, 2, 2]
        elif layers == 34 or layers == 50:
            depth = [3, 4, 6, 3]
        elif layers == 101:
            depth = [3, 4, 23, 3]
        elif layers == 152:
            depth = [3, 8, 36, 3]
        elif layers == 200:
            depth = [3, 12, 48, 3]
        num_channels = [64, 256, 512,
                        1024] if layers >= 50 else [64, 64, 128, 256]
        num_filters = [64, 128, 256, 512]

        self.conv1_1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=32,
            kernel_size=3,
            stride=1,
            act='relu',
            name="conv1_1")
        self.conv1_2 = ConvBNLayer(
            in_channels=32,
            out_channels=32,
            kernel_size=3,
            stride=1,
            act='relu',
            name="conv1_2")
        self.conv1_3 = ConvBNLayer(
            in_channels=32,
            out_channels=64,
            kernel_size=3,
            stride=1,
            act='relu',
            name="conv1_3")
        self.pool2d_max = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)

        self.block_list = []
        if layers >= 50:
            for block in range(len(depth)):
                shortcut = False
                for i in range(depth[block]):
                    if layers in [101, 152, 200] and block == 2:
                        if i == 0:
                            conv_name = "res" + str(block + 2) + "a"
                        else:
                            conv_name = "res" + str(block + 2) + "b" + str(i)
                    else:
                        conv_name = "res" + str(block + 2) + chr(97 + i)

                    if i == 0 and block != 0:
                        stride = (2, 1)
                    else:
                        stride = (1, 1)
                    bottleneck_block = self.add_sublayer(
                        'bb_%d_%d' % (block, i),
                        BottleneckBlock(
                            in_channels=num_channels[block]
                            if i == 0 else num_filters[block] * 4,
                            out_channels=num_filters[block],
                            stride=stride,
                            shortcut=shortcut,
                            if_first=block == i == 0,
                            name=conv_name))
                    shortcut = True
                    self.block_list.append(bottleneck_block)
                self.out_channels = num_filters[block] * 4
        else:
            for block in range(len(depth)):
                shortcut = False
                for i in range(depth[block]):
                    conv_name = "res" + str(block + 2) + chr(97 + i)
                    if i == 0 and block != 0:
                        stride = (2, 1)
                    else:
                        stride = (1, 1)

                    basic_block = self.add_sublayer(
                        'bb_%d_%d' % (block, i),
                        BasicBlock(
                            in_channels=num_channels[block]
                            if i == 0 else num_filters[block],
                            out_channels=num_filters[block],
                            stride=stride,
                            shortcut=shortcut,
                            if_first=block == i == 0,
                            name=conv_name))
                    shortcut = True
                    self.block_list.append(basic_block)
                self.out_channels = num_filters[block]
        self.out_pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0)

    def forward(self, inputs):
        y = self.conv1_1(inputs)
        y = self.conv1_2(y)
        y = self.conv1_3(y)
        y = self.pool2d_max(y)
        for block in self.block_list:
            y = block(y)
        y = self.out_pool(y)
        return y
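

# Hedged usage sketch (not part of the original file): with layers=18 the
# stages use stride (2, 1), so only the feature-map height is reduced inside
# the blocks, and out_pool then halves both dimensions. Shapes below assume a
# standard 32x100 recognition input.
def _demo_rec_resnet_shape():
    import paddle
    backbone = ResNet(in_channels=3, layers=18)
    feat = backbone(paddle.randn([1, 3, 32, 100]))
    # Height: 32 -> 16 (max pool) -> 8 -> 4 -> 2 (strided stages) -> 1 (out_pool);
    # width: 100 -> 50 (max pool) -> 25 (out_pool); channels = 512 for layers=18.
    return feat.shape  # [1, 512, 1, 25]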

584
backend/ppocr/modeling/backbones/rec_svtrnet.py
Normal file
@@ -0,0 +1,584 @@
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import paddle
import paddle.nn as nn
from paddle import ParamAttr
from paddle.nn.initializer import Constant, KaimingNormal, Normal, TruncatedNormal

trunc_normal_ = TruncatedNormal(std=.02)
normal_ = Normal
zeros_ = Constant(value=0.)
ones_ = Constant(value=1.)


def drop_path(x, drop_prob=0., training=False):
    """Drop paths (Stochastic Depth) per sample, applied in the main path of
    residual blocks.

    The original name is misleading, as 'Drop Connect' is a different form of
    dropout from a separate paper; see the discussion at
    https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956
    """
    if drop_prob == 0. or not training:
        return x
    keep_prob = paddle.to_tensor(1 - drop_prob)
    shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)
    random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)
    random_tensor = paddle.floor(random_tensor)  # binarize
    output = x.divide(keep_prob) * random_tensor
    return output
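

# Minimal sketch (added for illustration, not in the original file) showing
# the stochastic-depth behaviour of drop_path: identity at eval time, and
# per-sample zeroing with 1/keep_prob rescaling at train time.
def _demo_drop_path():
    import paddle
    x = paddle.ones([4, 8, 64])  # (batch, tokens, dim)
    y_eval = drop_path(x, drop_prob=0.1, training=False)
    assert paddle.allclose(x, y_eval).item()  # no-op outside training
    y_train = drop_path(x, drop_prob=0.1, training=True)
    # Each sample is either all zeros or scaled by 1 / 0.9.
    return y_train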


class ConvBNLayer(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=3,
                 stride=1,
                 padding=0,
                 bias_attr=False,
                 groups=1,
                 act=nn.GELU):
        super().__init__()
        self.conv = nn.Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            weight_attr=paddle.ParamAttr(
                initializer=nn.initializer.KaimingUniform()),
            bias_attr=bias_attr)
        self.norm = nn.BatchNorm2D(out_channels)
        self.act = act()

    def forward(self, inputs):
        out = self.conv(inputs)
        out = self.norm(out)
        out = self.act(out)
        return out


class DropPath(nn.Layer):
    """Drop paths (Stochastic Depth) per sample, applied in the main path of
    residual blocks.
    """

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)


class Identity(nn.Layer):
    def __init__(self):
        super(Identity, self).__init__()

    def forward(self, input):
        return input


class Mlp(nn.Layer):
    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer=nn.GELU,
                 drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class ConvMixer(nn.Layer):
    def __init__(self,
                 dim,
                 num_heads=8,
                 HW=[8, 25],
                 local_k=[3, 3]):
        super().__init__()
        self.HW = HW
        self.dim = dim
        self.local_mixer = nn.Conv2D(
            dim,
            dim,
            local_k,
            1, [local_k[0] // 2, local_k[1] // 2],
            groups=num_heads,
            weight_attr=ParamAttr(initializer=KaimingNormal()))

    def forward(self, x):
        h = self.HW[0]
        w = self.HW[1]
        x = x.transpose([0, 2, 1]).reshape([0, self.dim, h, w])
        x = self.local_mixer(x)
        x = x.flatten(2).transpose([0, 2, 1])
        return x
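

# Hedged usage sketch (not in the original file): ConvMixer treats the token
# sequence as an HW[0] x HW[1] feature map and mixes it with a grouped
# convolution, preserving the sequence shape.
def _demo_conv_mixer():
    import paddle
    mixer = ConvMixer(dim=64, num_heads=8, HW=[8, 25], local_k=[3, 3])
    tokens = paddle.randn([2, 8 * 25, 64])
    return mixer(tokens).shape  # [2, 200, 64]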


class Attention(nn.Layer):
    def __init__(self,
                 dim,
                 num_heads=8,
                 mixer='Global',
                 HW=[8, 25],
                 local_k=[7, 11],
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.HW = HW
        if HW is not None:
            H = HW[0]
            W = HW[1]
            self.N = H * W
            self.C = dim
        if mixer == 'Local' and HW is not None:
            hk = local_k[0]
            wk = local_k[1]
            mask = paddle.ones([H * W, H + hk - 1, W + wk - 1], dtype='float32')
            for h in range(0, H):
                for w in range(0, W):
                    mask[h * W + w, h:h + hk, w:w + wk] = 0.
            mask_paddle = mask[:, hk // 2:H + hk // 2,
                               wk // 2:W + wk // 2].flatten(1)
            mask_inf = paddle.full([H * W, H * W], '-inf', dtype='float32')
            mask = paddle.where(mask_paddle < 1, mask_paddle, mask_inf)
            self.mask = mask.unsqueeze([0, 1])
        self.mixer = mixer

    def forward(self, x):
        if self.HW is not None:
            N = self.N
            C = self.C
        else:
            _, N, C = x.shape
        qkv = self.qkv(x).reshape((0, N, 3, self.num_heads,
                                   C // self.num_heads)).transpose(
                                       (2, 0, 3, 1, 4))
        q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]

        attn = (q.matmul(k.transpose((0, 1, 3, 2))))
        if self.mixer == 'Local':
            attn += self.mask
        attn = nn.functional.softmax(attn, axis=-1)
        attn = self.attn_drop(attn)

        x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((0, N, C))
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
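

# Hedged usage sketch (not in the original file): with mixer='Local' the
# precomputed additive mask confines each token's attention to an hk x wk
# neighbourhood on the HW grid; 'Global' is plain self-attention.
def _demo_attention():
    import paddle
    attn = Attention(dim=64, num_heads=4, mixer='Local',
                     HW=[8, 25], local_k=[7, 11])
    tokens = paddle.randn([2, 8 * 25, 64])
    return attn(tokens).shape  # [2, 200, 64]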


class Block(nn.Layer):
    def __init__(self,
                 dim,
                 num_heads,
                 mixer='Global',
                 local_mixer=[7, 11],
                 HW=[8, 25],
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer='nn.LayerNorm',
                 epsilon=1e-6,
                 prenorm=True):
        super().__init__()
        if isinstance(norm_layer, str):
            self.norm1 = eval(norm_layer)(dim, epsilon=epsilon)
        else:
            self.norm1 = norm_layer(dim)
        if mixer == 'Global' or mixer == 'Local':
            self.mixer = Attention(
                dim,
                num_heads=num_heads,
                mixer=mixer,
                HW=HW,
                local_k=local_mixer,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                attn_drop=attn_drop,
                proj_drop=drop)
        elif mixer == 'Conv':
            self.mixer = ConvMixer(
                dim, num_heads=num_heads, HW=HW, local_k=local_mixer)
        else:
            raise TypeError("The mixer must be one of [Global, Local, Conv]")

        self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
        if isinstance(norm_layer, str):
            self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)
        else:
            self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp_ratio = mlp_ratio
        self.mlp = Mlp(in_features=dim,
                       hidden_features=mlp_hidden_dim,
                       act_layer=act_layer,
                       drop=drop)
        self.prenorm = prenorm

    def forward(self, x):
        if self.prenorm:
            x = self.norm1(x + self.drop_path(self.mixer(x)))
            x = self.norm2(x + self.drop_path(self.mlp(x)))
        else:
            x = x + self.drop_path(self.mixer(self.norm1(x)))
            x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x
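

# Hedged usage sketch (not in the original file): a Block keeps the token
# shape, applying mixer + MLP with residual connections; prenorm=True here
# normalises after each residual sum, prenorm=False before each sub-layer.
def _demo_block():
    import paddle
    blk = Block(dim=64, num_heads=4, mixer='Global', HW=[8, 25])
    tokens = paddle.randn([2, 8 * 25, 64])
    return blk(tokens).shape  # [2, 200, 64]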


class PatchEmbed(nn.Layer):
    """Image to Patch Embedding."""

    def __init__(self,
                 img_size=[32, 100],
                 in_channels=3,
                 embed_dim=768,
                 sub_num=2):
        super().__init__()
        num_patches = (img_size[1] // (2 ** sub_num)) * \
            (img_size[0] // (2 ** sub_num))
        self.img_size = img_size
        self.num_patches = num_patches
        self.embed_dim = embed_dim
        self.norm = None
        if sub_num == 2:
            self.proj = nn.Sequential(
                ConvBNLayer(
                    in_channels=in_channels,
                    out_channels=embed_dim // 2,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    act=nn.GELU,
                    bias_attr=None),
                ConvBNLayer(
                    in_channels=embed_dim // 2,
                    out_channels=embed_dim,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    act=nn.GELU,
                    bias_attr=None))
        if sub_num == 3:
            self.proj = nn.Sequential(
                ConvBNLayer(
                    in_channels=in_channels,
                    out_channels=embed_dim // 4,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    act=nn.GELU,
                    bias_attr=None),
                ConvBNLayer(
                    in_channels=embed_dim // 4,
                    out_channels=embed_dim // 2,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    act=nn.GELU,
                    bias_attr=None),
                ConvBNLayer(
                    in_channels=embed_dim // 2,
                    out_channels=embed_dim,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    act=nn.GELU,
                    bias_attr=None))

    def forward(self, x):
        B, C, H, W = x.shape
        assert H == self.img_size[0] and W == self.img_size[1], \
            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
        x = self.proj(x).flatten(2).transpose((0, 2, 1))
        return x
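

# Hedged usage sketch (not in the original file): with sub_num=2 the two
# stride-2 convolutions shrink each spatial dimension 4x, so a 32x100 image
# becomes (32 // 4) * (100 // 4) = 200 embedded tokens.
def _demo_patch_embed():
    import paddle
    embed = PatchEmbed(img_size=[32, 100], in_channels=3,
                       embed_dim=64, sub_num=2)
    feats = embed(paddle.randn([1, 3, 32, 100]))
    return feats.shape  # [1, 200, 64]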


class SubSample(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 types='Pool',
                 stride=[2, 1],
                 sub_norm='nn.LayerNorm',
                 act=None):
        super().__init__()
        self.types = types
        if types == 'Pool':
            self.avgpool = nn.AvgPool2D(
                kernel_size=[3, 5], stride=stride, padding=[1, 2])
            self.maxpool = nn.MaxPool2D(
                kernel_size=[3, 5], stride=stride, padding=[1, 2])
            self.proj = nn.Linear(in_channels, out_channels)
        else:
            self.conv = nn.Conv2D(
                in_channels,
                out_channels,
                kernel_size=3,
                stride=stride,
                padding=1,
                weight_attr=ParamAttr(initializer=KaimingNormal()))
        self.norm = eval(sub_norm)(out_channels)
        if act is not None:
            self.act = act()
        else:
            self.act = None

    def forward(self, x):
        if self.types == 'Pool':
            x1 = self.avgpool(x)
            x2 = self.maxpool(x)
            x = (x1 + x2) * 0.5
            out = self.proj(x.flatten(2).transpose((0, 2, 1)))
        else:
            x = self.conv(x)
            out = x.flatten(2).transpose((0, 2, 1))
        out = self.norm(out)
        if self.act is not None:
            out = self.act(out)

        return out
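

# Hedged usage sketch (not in the original file): with stride=[2, 1] the
# height of the token grid is halved while the width is kept, and the channel
# count is projected from in_channels to out_channels.
def _demo_sub_sample():
    import paddle
    sub = SubSample(in_channels=64, out_channels=128,
                    types='Pool', stride=[2, 1])
    out = sub(paddle.randn([2, 64, 8, 25]))
    return out.shape  # [2, 4 * 25, 128]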


class SVTRNet(nn.Layer):
    def __init__(
            self,
            img_size=[32, 100],
            in_channels=3,
            embed_dim=[64, 128, 256],
            depth=[3, 6, 3],
            num_heads=[2, 4, 8],
            mixer=['Local'] * 6 + ['Global'] * 6,  # Local atten, Global atten, Conv
            local_mixer=[[7, 11], [7, 11], [7, 11]],
            patch_merging='Conv',  # Conv, Pool, None
            mlp_ratio=4,
            qkv_bias=True,
            qk_scale=None,
            drop_rate=0.,
            last_drop=0.1,
            attn_drop_rate=0.,
            drop_path_rate=0.1,
            norm_layer='nn.LayerNorm',
            sub_norm='nn.LayerNorm',
            epsilon=1e-6,
            out_channels=192,
            out_char_num=25,
            block_unit='Block',
            act='nn.GELU',
            last_stage=True,
            sub_num=2,
            prenorm=True,
            use_lenhead=False,
            **kwargs):
        super().__init__()
        self.img_size = img_size
        self.embed_dim = embed_dim
        self.out_channels = out_channels
        self.prenorm = prenorm
        patch_merging = None if patch_merging != 'Conv' and patch_merging != 'Pool' else patch_merging
        self.patch_embed = PatchEmbed(
            img_size=img_size,
            in_channels=in_channels,
            embed_dim=embed_dim[0],
            sub_num=sub_num)
        num_patches = self.patch_embed.num_patches
        self.HW = [img_size[0] // (2**sub_num), img_size[1] // (2**sub_num)]
        self.pos_embed = self.create_parameter(
            shape=[1, num_patches, embed_dim[0]], default_initializer=zeros_)
        self.add_parameter("pos_embed", self.pos_embed)
        self.pos_drop = nn.Dropout(p=drop_rate)
        Block_unit = eval(block_unit)

        dpr = np.linspace(0, drop_path_rate, sum(depth))
        self.blocks1 = nn.LayerList([
            Block_unit(
                dim=embed_dim[0],
                num_heads=num_heads[0],
                mixer=mixer[0:depth[0]][i],
                HW=self.HW,
                local_mixer=local_mixer[0],
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                act_layer=eval(act),
                attn_drop=attn_drop_rate,
                drop_path=dpr[0:depth[0]][i],
                norm_layer=norm_layer,
                epsilon=epsilon,
                prenorm=prenorm) for i in range(depth[0])
        ])
        if patch_merging is not None:
            self.sub_sample1 = SubSample(
                embed_dim[0],
                embed_dim[1],
                sub_norm=sub_norm,
                stride=[2, 1],
                types=patch_merging)
            HW = [self.HW[0] // 2, self.HW[1]]
        else:
            HW = self.HW
        self.patch_merging = patch_merging
        self.blocks2 = nn.LayerList([
            Block_unit(
                dim=embed_dim[1],
                num_heads=num_heads[1],
                mixer=mixer[depth[0]:depth[0] + depth[1]][i],
                HW=HW,
                local_mixer=local_mixer[1],
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                act_layer=eval(act),
                attn_drop=attn_drop_rate,
                drop_path=dpr[depth[0]:depth[0] + depth[1]][i],
                norm_layer=norm_layer,
                epsilon=epsilon,
                prenorm=prenorm) for i in range(depth[1])
        ])
        if patch_merging is not None:
            self.sub_sample2 = SubSample(
                embed_dim[1],
                embed_dim[2],
                sub_norm=sub_norm,
                stride=[2, 1],
                types=patch_merging)
            HW = [self.HW[0] // 4, self.HW[1]]
        else:
            HW = self.HW
        self.blocks3 = nn.LayerList([
            Block_unit(
                dim=embed_dim[2],
                num_heads=num_heads[2],
                mixer=mixer[depth[0] + depth[1]:][i],
                HW=HW,
                local_mixer=local_mixer[2],
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                act_layer=eval(act),
                attn_drop=attn_drop_rate,
                drop_path=dpr[depth[0] + depth[1]:][i],
                norm_layer=norm_layer,
                epsilon=epsilon,
                prenorm=prenorm) for i in range(depth[2])
        ])
        self.last_stage = last_stage
        if last_stage:
            self.avg_pool = nn.AdaptiveAvgPool2D([1, out_char_num])
            self.last_conv = nn.Conv2D(
                in_channels=embed_dim[2],
                out_channels=self.out_channels,
                kernel_size=1,
                stride=1,
                padding=0,
                bias_attr=False)
            self.hardswish = nn.Hardswish()
            self.dropout = nn.Dropout(p=last_drop, mode="downscale_in_infer")
        if not prenorm:
            self.norm = eval(norm_layer)(embed_dim[-1], epsilon=epsilon)
        self.use_lenhead = use_lenhead
        if use_lenhead:
            self.len_conv = nn.Linear(embed_dim[2], self.out_channels)
            self.hardswish_len = nn.Hardswish()
            self.dropout_len = nn.Dropout(
                p=last_drop, mode="downscale_in_infer")

        trunc_normal_(self.pos_embed)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight)
            if isinstance(m, nn.Linear) and m.bias is not None:
                zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            zeros_(m.bias)
            ones_(m.weight)

    def forward_features(self, x):
        x = self.patch_embed(x)
        x = x + self.pos_embed
        x = self.pos_drop(x)
        for blk in self.blocks1:
            x = blk(x)
        if self.patch_merging is not None:
            x = self.sub_sample1(
                x.transpose([0, 2, 1]).reshape(
                    [0, self.embed_dim[0], self.HW[0], self.HW[1]]))
        for blk in self.blocks2:
            x = blk(x)
        if self.patch_merging is not None:
            x = self.sub_sample2(
                x.transpose([0, 2, 1]).reshape(
                    [0, self.embed_dim[1], self.HW[0] // 2, self.HW[1]]))
        for blk in self.blocks3:
            x = blk(x)
        if not self.prenorm:
            x = self.norm(x)
        return x

    def forward(self, x):
        x = self.forward_features(x)
        if self.use_lenhead:
            len_x = self.len_conv(x.mean(1))
            len_x = self.dropout_len(self.hardswish_len(len_x))
        if self.last_stage:
            if self.patch_merging is not None:
                h = self.HW[0] // 4
            else:
                h = self.HW[0]
            x = self.avg_pool(
                x.transpose([0, 2, 1]).reshape(
                    [0, self.embed_dim[2], h, self.HW[1]]))
            x = self.last_conv(x)
            x = self.hardswish(x)
            x = self.dropout(x)
        if self.use_lenhead:
            return x, len_x
        return x
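

# Hedged end-to-end sketch (not in the original file), using the defaults
# above: three stages with [2, 1]-strided merging reduce the 8x25 token grid
# to 2x25, and the last stage pools it to out_char_num=25 steps with
# out_channels=192 channels.
def _demo_svtrnet():
    import paddle
    model = SVTRNet()
    model.eval()
    feat = model(paddle.randn([1, 3, 32, 100]))
    return feat.shape  # [1, 192, 1, 25]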

172
backend/ppocr/modeling/backbones/vqa_layoutlm.py
Normal file
@@ -0,0 +1,172 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
from paddle import nn

from paddlenlp.transformers import LayoutXLMModel, LayoutXLMForTokenClassification, LayoutXLMForRelationExtraction
from paddlenlp.transformers import LayoutLMModel, LayoutLMForTokenClassification
from paddlenlp.transformers import LayoutLMv2Model, LayoutLMv2ForTokenClassification, LayoutLMv2ForRelationExtraction

__all__ = [
    "LayoutLMForSer", "LayoutLMv2ForSer", "LayoutLMv2ForRe",
    "LayoutXLMForSer", "LayoutXLMForRe"
]

pretrained_model_dict = {
    LayoutXLMModel: 'layoutxlm-base-uncased',
    LayoutLMModel: 'layoutlm-base-uncased',
    LayoutLMv2Model: 'layoutlmv2-base-uncased'
}


class NLPBaseModel(nn.Layer):
    def __init__(self,
                 base_model_class,
                 model_class,
                 type='ser',
                 pretrained=True,
                 checkpoints=None,
                 **kwargs):
        super(NLPBaseModel, self).__init__()
        if checkpoints is not None:
            self.model = model_class.from_pretrained(checkpoints)
        else:
            pretrained_model_name = pretrained_model_dict[base_model_class]
            if pretrained:
                base_model = base_model_class.from_pretrained(
                    pretrained_model_name)
            else:
                base_model = base_model_class(
                    **base_model_class.pretrained_init_configuration[
                        pretrained_model_name])
            if type == 'ser':
                self.model = model_class(
                    base_model, num_classes=kwargs['num_classes'], dropout=None)
            else:
                self.model = model_class(base_model, dropout=None)
        self.out_channels = 1


class LayoutLMForSer(NLPBaseModel):
    def __init__(self, num_classes, pretrained=True, checkpoints=None,
                 **kwargs):
        super(LayoutLMForSer, self).__init__(
            LayoutLMModel,
            LayoutLMForTokenClassification,
            'ser',
            pretrained,
            checkpoints,
            num_classes=num_classes)

    def forward(self, x):
        x = self.model(
            input_ids=x[0],
            bbox=x[2],
            attention_mask=x[4],
            token_type_ids=x[5],
            position_ids=None,
            output_hidden_states=False)
        return x


class LayoutLMv2ForSer(NLPBaseModel):
    def __init__(self, num_classes, pretrained=True, checkpoints=None,
                 **kwargs):
        super(LayoutLMv2ForSer, self).__init__(
            LayoutLMv2Model,
            LayoutLMv2ForTokenClassification,
            'ser',
            pretrained,
            checkpoints,
            num_classes=num_classes)

    def forward(self, x):
        x = self.model(
            input_ids=x[0],
            bbox=x[2],
            image=x[3],
            attention_mask=x[4],
            token_type_ids=x[5],
            position_ids=None,
            head_mask=None,
            labels=None)
        return x[0]


class LayoutXLMForSer(NLPBaseModel):
    def __init__(self, num_classes, pretrained=True, checkpoints=None,
                 **kwargs):
        super(LayoutXLMForSer, self).__init__(
            LayoutXLMModel,
            LayoutXLMForTokenClassification,
            'ser',
            pretrained,
            checkpoints,
            num_classes=num_classes)

    def forward(self, x):
        x = self.model(
            input_ids=x[0],
            bbox=x[2],
            image=x[3],
            attention_mask=x[4],
            token_type_ids=x[5],
            position_ids=None,
            head_mask=None,
            labels=None)
        return x[0]
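

# Hedged usage sketch (not in the original file; the class count is
# illustrative): building a SER backbone fetches the named pretrained weights
# via paddlenlp unless explicit checkpoints are given.
def _demo_layoutxlm_ser():
    return LayoutXLMForSer(num_classes=7, pretrained=True)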


class LayoutLMv2ForRe(NLPBaseModel):
    def __init__(self, pretrained=True, checkpoints=None, **kwargs):
        super(LayoutLMv2ForRe, self).__init__(LayoutLMv2Model,
                                              LayoutLMv2ForRelationExtraction,
                                              're', pretrained, checkpoints)

    def forward(self, x):
        x = self.model(
            input_ids=x[0],
            bbox=x[1],
            labels=None,
            image=x[2],
            attention_mask=x[3],
            token_type_ids=x[4],
            position_ids=None,
            head_mask=None,
            entities=x[5],
            relations=x[6])
        return x


class LayoutXLMForRe(NLPBaseModel):
    def __init__(self, pretrained=True, checkpoints=None, **kwargs):
        super(LayoutXLMForRe, self).__init__(LayoutXLMModel,
                                             LayoutXLMForRelationExtraction,
                                             're', pretrained, checkpoints)

    def forward(self, x):
        x = self.model(
            input_ids=x[0],
            bbox=x[1],
            labels=None,
            image=x[2],
            attention_mask=x[3],
            token_type_ids=x[4],
            position_ids=None,
            head_mask=None,
            entities=x[5],
            relations=x[6])
        return x