Mirror of https://github.com/YaoFANGUK/video-subtitle-remover.git (synced 2026-02-21 00:44:46 +08:00)

Commit: init
backend/ppocr/modeling/backbones/__init__.py (Executable file, 64 lines)
@@ -0,0 +1,64 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

__all__ = ["build_backbone"]


def build_backbone(config, model_type):
    if model_type == "det" or model_type == "table":
        from .det_mobilenet_v3 import MobileNetV3
        from .det_resnet_vd import ResNet
        from .det_resnet_vd_sast import ResNet_SAST
        support_dict = ["MobileNetV3", "ResNet", "ResNet_SAST"]
    elif model_type == "rec" or model_type == "cls":
        from .rec_mobilenet_v3 import MobileNetV3
        from .rec_resnet_vd import ResNet
        from .rec_resnet_fpn import ResNetFPN
        from .rec_mv1_enhance import MobileNetV1Enhance
        from .rec_nrtr_mtb import MTB
        from .rec_resnet_31 import ResNet31
        from .rec_resnet_aster import ResNet_ASTER
        from .rec_micronet import MicroNet
        from .rec_efficientb3_pren import EfficientNetb3_PREN
        from .rec_svtrnet import SVTRNet
        support_dict = [
            'MobileNetV1Enhance', 'MobileNetV3', 'ResNet', 'ResNetFPN', 'MTB',
            "ResNet31", "ResNet_ASTER", 'MicroNet', 'EfficientNetb3_PREN',
            'SVTRNet'
        ]
    elif model_type == "e2e":
        from .e2e_resnet_vd_pg import ResNet
        support_dict = ['ResNet']
    elif model_type == 'kie':
        from .kie_unet_sdmgr import Kie_backbone
        support_dict = ['Kie_backbone']
    elif model_type == "table":
        # NOTE: unreachable -- "table" is already handled by the first branch.
        from .table_resnet_vd import ResNet
        from .table_mobilenet_v3 import MobileNetV3
        support_dict = ["ResNet", "MobileNetV3"]
    elif model_type == 'vqa':
        from .vqa_layoutlm import LayoutLMForSer, LayoutLMv2ForSer, LayoutLMv2ForRe, LayoutXLMForSer, LayoutXLMForRe
        support_dict = [
            "LayoutLMForSer", "LayoutLMv2ForSer", 'LayoutLMv2ForRe',
            "LayoutXLMForSer", 'LayoutXLMForRe'
        ]
    else:
        raise NotImplementedError

    module_name = config.pop("name")
    assert module_name in support_dict, \
        "when model type is {}, backbone only support {}".format(model_type,
                                                                 support_dict)
    module_class = eval(module_name)(**config)
    return module_class
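
For orientation, a minimal sketch of how this factory is typically driven (the config values here are illustrative, not part of this commit, and the import assumes the repository root is on PYTHONPATH):

    import paddle
    from backend.ppocr.modeling.backbones import build_backbone

    # "name" selects the backbone class; the remaining keys become
    # constructor kwargs after config.pop("name").
    config = {"name": "MobileNetV3", "model_name": "large", "scale": 0.5}
    backbone = build_backbone(config, model_type="det")

    x = paddle.rand([1, 3, 640, 640])   # NCHW image batch
    feats = backbone(x)                 # list of multi-scale feature maps
    print([f.shape for f in feats])
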
backend/ppocr/modeling/backbones/det_mobilenet_v3.py (Executable file, 268 lines)
@@ -0,0 +1,268 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle import ParamAttr

__all__ = ['MobileNetV3']


def make_divisible(v, divisor=8, min_value=None):
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v


class MobileNetV3(nn.Layer):
    def __init__(self,
                 in_channels=3,
                 model_name='large',
                 scale=0.5,
                 disable_se=False,
                 **kwargs):
        """
        The MobileNetV3 backbone network for the detection module.
        Args:
            params(dict): the hyperparameters used to build the network
        """
        super(MobileNetV3, self).__init__()

        self.disable_se = disable_se

        if model_name == "large":
            cfg = [
                # k, exp, c, se, nl, s,
                [3, 16, 16, False, 'relu', 1],
                [3, 64, 24, False, 'relu', 2],
                [3, 72, 24, False, 'relu', 1],
                [5, 72, 40, True, 'relu', 2],
                [5, 120, 40, True, 'relu', 1],
                [5, 120, 40, True, 'relu', 1],
                [3, 240, 80, False, 'hardswish', 2],
                [3, 200, 80, False, 'hardswish', 1],
                [3, 184, 80, False, 'hardswish', 1],
                [3, 184, 80, False, 'hardswish', 1],
                [3, 480, 112, True, 'hardswish', 1],
                [3, 672, 112, True, 'hardswish', 1],
                [5, 672, 160, True, 'hardswish', 2],
                [5, 960, 160, True, 'hardswish', 1],
                [5, 960, 160, True, 'hardswish', 1],
            ]
            cls_ch_squeeze = 960
        elif model_name == "small":
            cfg = [
                # k, exp, c, se, nl, s,
                [3, 16, 16, True, 'relu', 2],
                [3, 72, 24, False, 'relu', 2],
                [3, 88, 24, False, 'relu', 1],
                [5, 96, 40, True, 'hardswish', 2],
                [5, 240, 40, True, 'hardswish', 1],
                [5, 240, 40, True, 'hardswish', 1],
                [5, 120, 48, True, 'hardswish', 1],
                [5, 144, 48, True, 'hardswish', 1],
                [5, 288, 96, True, 'hardswish', 2],
                [5, 576, 96, True, 'hardswish', 1],
                [5, 576, 96, True, 'hardswish', 1],
            ]
            cls_ch_squeeze = 576
        else:
            raise NotImplementedError("mode[" + model_name +
                                      "_model] is not implemented!")

        supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25]
        assert scale in supported_scale, \
            "supported scale are {} but input scale is {}".format(supported_scale, scale)
        inplanes = 16
        # conv1
        self.conv = ConvBNLayer(
            in_channels=in_channels,
            out_channels=make_divisible(inplanes * scale),
            kernel_size=3,
            stride=2,
            padding=1,
            groups=1,
            if_act=True,
            act='hardswish')

        self.stages = []
        self.out_channels = []
        block_list = []
        i = 0
        inplanes = make_divisible(inplanes * scale)
        for (k, exp, c, se, nl, s) in cfg:
            se = se and not self.disable_se
            start_idx = 2 if model_name == 'large' else 0
            if s == 2 and i > start_idx:
                self.out_channels.append(inplanes)
                self.stages.append(nn.Sequential(*block_list))
                block_list = []
            block_list.append(
                ResidualUnit(
                    in_channels=inplanes,
                    mid_channels=make_divisible(scale * exp),
                    out_channels=make_divisible(scale * c),
                    kernel_size=k,
                    stride=s,
                    use_se=se,
                    act=nl))
            inplanes = make_divisible(scale * c)
            i += 1
        block_list.append(
            ConvBNLayer(
                in_channels=inplanes,
                out_channels=make_divisible(scale * cls_ch_squeeze),
                kernel_size=1,
                stride=1,
                padding=0,
                groups=1,
                if_act=True,
                act='hardswish'))
        self.stages.append(nn.Sequential(*block_list))
        self.out_channels.append(make_divisible(scale * cls_ch_squeeze))
        for i, stage in enumerate(self.stages):
            self.add_sublayer(sublayer=stage, name="stage{}".format(i))

    def forward(self, x):
        x = self.conv(x)
        out_list = []
        for stage in self.stages:
            x = stage(x)
            out_list.append(x)
        return out_list


class ConvBNLayer(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 groups=1,
                 if_act=True,
                 act=None):
        super(ConvBNLayer, self).__init__()
        self.if_act = if_act
        self.act = act
        self.conv = nn.Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            bias_attr=False)

        self.bn = nn.BatchNorm(num_channels=out_channels, act=None)

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        if self.if_act:
            if self.act == "relu":
                x = F.relu(x)
            elif self.act == "hardswish":
                x = F.hardswish(x)
            else:
                print("The activation function({}) is not supported.".
                      format(self.act))
                exit()
        return x


class ResidualUnit(nn.Layer):
    def __init__(self,
                 in_channels,
                 mid_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 use_se,
                 act=None):
        super(ResidualUnit, self).__init__()
        self.if_shortcut = stride == 1 and in_channels == out_channels
        self.if_se = use_se

        self.expand_conv = ConvBNLayer(
            in_channels=in_channels,
            out_channels=mid_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            if_act=True,
            act=act)
        self.bottleneck_conv = ConvBNLayer(
            in_channels=mid_channels,
            out_channels=mid_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=int((kernel_size - 1) // 2),
            groups=mid_channels,
            if_act=True,
            act=act)
        if self.if_se:
            self.mid_se = SEModule(mid_channels)
        self.linear_conv = ConvBNLayer(
            in_channels=mid_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            if_act=False,
            act=None)

    def forward(self, inputs):
        x = self.expand_conv(inputs)
        x = self.bottleneck_conv(x)
        if self.if_se:
            x = self.mid_se(x)
        x = self.linear_conv(x)
        if self.if_shortcut:
            x = paddle.add(inputs, x)
        return x


class SEModule(nn.Layer):
    def __init__(self, in_channels, reduction=4):
        super(SEModule, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2D(1)
        self.conv1 = nn.Conv2D(
            in_channels=in_channels,
            out_channels=in_channels // reduction,
            kernel_size=1,
            stride=1,
            padding=0)
        self.conv2 = nn.Conv2D(
            in_channels=in_channels // reduction,
            out_channels=in_channels,
            kernel_size=1,
            stride=1,
            padding=0)

    def forward(self, inputs):
        outputs = self.avg_pool(inputs)
        outputs = self.conv1(outputs)
        outputs = F.relu(outputs)
        outputs = self.conv2(outputs)
        outputs = F.hardsigmoid(outputs, slope=0.2, offset=0.5)
        return inputs * outputs
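
make_divisible above snaps a scaled channel count to a multiple of the divisor while never dropping more than roughly 10% below the requested width. Two illustrative values, computed by hand from the function (not taken from this commit):

    from backend.ppocr.modeling.backbones.det_mobilenet_v3 import make_divisible

    print(make_divisible(960 * 0.5))   # 480: already a multiple of 8
    print(make_divisible(16 * 0.35))   # 8: 5.6 rounds up, floored at the divisor
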
backend/ppocr/modeling/backbones/det_resnet_vd.py (Normal file, 351 lines)
@@ -0,0 +1,351 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
from paddle import ParamAttr
import paddle.nn as nn
import paddle.nn.functional as F

from paddle.vision.ops import DeformConv2D
from paddle.regularizer import L2Decay
from paddle.nn.initializer import Normal, Constant, XavierUniform

__all__ = ["ResNet"]


class DeformableConvV2(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 weight_attr=None,
                 bias_attr=None,
                 lr_scale=1,
                 regularizer=None,
                 skip_quant=False,
                 dcn_bias_regularizer=L2Decay(0.),
                 dcn_bias_lr_scale=2.):
        super(DeformableConvV2, self).__init__()
        self.offset_channel = 2 * kernel_size**2 * groups
        self.mask_channel = kernel_size**2 * groups

        if bias_attr:
            # in FCOS-DCN head, specifically need learning_rate and regularizer
            dcn_bias_attr = ParamAttr(
                initializer=Constant(value=0),
                regularizer=dcn_bias_regularizer,
                learning_rate=dcn_bias_lr_scale)
        else:
            # in ResNet backbone, do not need bias
            dcn_bias_attr = False
        self.conv_dcn = DeformConv2D(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=(kernel_size - 1) // 2 * dilation,
            dilation=dilation,
            deformable_groups=groups,
            weight_attr=weight_attr,
            bias_attr=dcn_bias_attr)

        if lr_scale == 1 and regularizer is None:
            offset_bias_attr = ParamAttr(initializer=Constant(0.))
        else:
            offset_bias_attr = ParamAttr(
                initializer=Constant(0.),
                learning_rate=lr_scale,
                regularizer=regularizer)
        self.conv_offset = nn.Conv2D(
            in_channels,
            groups * 3 * kernel_size**2,
            kernel_size,
            stride=stride,
            padding=(kernel_size - 1) // 2,
            weight_attr=ParamAttr(initializer=Constant(0.0)),
            bias_attr=offset_bias_attr)
        if skip_quant:
            self.conv_offset.skip_quant = True

    def forward(self, x):
        offset_mask = self.conv_offset(x)
        offset, mask = paddle.split(
            offset_mask,
            num_or_sections=[self.offset_channel, self.mask_channel],
            axis=1)
        mask = F.sigmoid(mask)
        y = self.conv_dcn(x, offset, mask=mask)
        return y


class ConvBNLayer(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 groups=1,
                 is_vd_mode=False,
                 act=None,
                 is_dcn=False):
        super(ConvBNLayer, self).__init__()

        self.is_vd_mode = is_vd_mode
        self._pool2d_avg = nn.AvgPool2D(
            kernel_size=2, stride=2, padding=0, ceil_mode=True)
        if not is_dcn:
            self._conv = nn.Conv2D(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=kernel_size,
                stride=stride,
                padding=(kernel_size - 1) // 2,
                groups=groups,
                bias_attr=False)
        else:
            self._conv = DeformableConvV2(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=kernel_size,
                stride=stride,
                padding=(kernel_size - 1) // 2,
                groups=2,  # groups,
                bias_attr=False)
        self._batch_norm = nn.BatchNorm(out_channels, act=act)

    def forward(self, inputs):
        if self.is_vd_mode:
            inputs = self._pool2d_avg(inputs)
        y = self._conv(inputs)
        y = self._batch_norm(y)
        return y


class BottleneckBlock(nn.Layer):
    def __init__(
            self,
            in_channels,
            out_channels,
            stride,
            shortcut=True,
            if_first=False,
            is_dcn=False, ):
        super(BottleneckBlock, self).__init__()

        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            act='relu')
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            act='relu',
            is_dcn=is_dcn)
        self.conv2 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels * 4,
            kernel_size=1,
            act=None)

        if not shortcut:
            self.short = ConvBNLayer(
                in_channels=in_channels,
                out_channels=out_channels * 4,
                kernel_size=1,
                stride=1,
                is_vd_mode=False if if_first else True)

        self.shortcut = shortcut

    def forward(self, inputs):
        y = self.conv0(inputs)
        conv1 = self.conv1(y)
        conv2 = self.conv2(conv1)

        if self.shortcut:
            short = inputs
        else:
            short = self.short(inputs)
        y = paddle.add(x=short, y=conv2)
        y = F.relu(y)
        return y


class BasicBlock(nn.Layer):
    def __init__(
            self,
            in_channels,
            out_channels,
            stride,
            shortcut=True,
            if_first=False, ):
        super(BasicBlock, self).__init__()
        self.stride = stride
        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            act='relu')
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            act=None)

        if not shortcut:
            self.short = ConvBNLayer(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
                stride=1,
                is_vd_mode=False if if_first else True)

        self.shortcut = shortcut

    def forward(self, inputs):
        y = self.conv0(inputs)
        conv1 = self.conv1(y)

        if self.shortcut:
            short = inputs
        else:
            short = self.short(inputs)
        y = paddle.add(x=short, y=conv1)
        y = F.relu(y)
        return y


class ResNet(nn.Layer):
    def __init__(self,
                 in_channels=3,
                 layers=50,
                 dcn_stage=None,
                 out_indices=None,
                 **kwargs):
        super(ResNet, self).__init__()

        self.layers = layers
        supported_layers = [18, 34, 50, 101, 152, 200]
        assert layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(
                supported_layers, layers)

        if layers == 18:
            depth = [2, 2, 2, 2]
        elif layers == 34 or layers == 50:
            depth = [3, 4, 6, 3]
        elif layers == 101:
            depth = [3, 4, 23, 3]
        elif layers == 152:
            depth = [3, 8, 36, 3]
        elif layers == 200:
            depth = [3, 12, 48, 3]
        num_channels = [64, 256, 512,
                        1024] if layers >= 50 else [64, 64, 128, 256]
        num_filters = [64, 128, 256, 512]

        self.dcn_stage = dcn_stage if dcn_stage is not None else [
            False, False, False, False
        ]
        self.out_indices = out_indices if out_indices is not None else [
            0, 1, 2, 3
        ]

        self.conv1_1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=32,
            kernel_size=3,
            stride=2,
            act='relu')
        self.conv1_2 = ConvBNLayer(
            in_channels=32,
            out_channels=32,
            kernel_size=3,
            stride=1,
            act='relu')
        self.conv1_3 = ConvBNLayer(
            in_channels=32,
            out_channels=64,
            kernel_size=3,
            stride=1,
            act='relu')
        self.pool2d_max = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)

        self.stages = []
        self.out_channels = []
        if layers >= 50:
            for block in range(len(depth)):
                block_list = []
                shortcut = False
                is_dcn = self.dcn_stage[block]
                for i in range(depth[block]):
                    bottleneck_block = self.add_sublayer(
                        'bb_%d_%d' % (block, i),
                        BottleneckBlock(
                            in_channels=num_channels[block]
                            if i == 0 else num_filters[block] * 4,
                            out_channels=num_filters[block],
                            stride=2 if i == 0 and block != 0 else 1,
                            shortcut=shortcut,
                            if_first=block == i == 0,
                            is_dcn=is_dcn))
                    shortcut = True
                    block_list.append(bottleneck_block)
                if block in self.out_indices:
                    self.out_channels.append(num_filters[block] * 4)
                self.stages.append(nn.Sequential(*block_list))
        else:
            for block in range(len(depth)):
                block_list = []
                shortcut = False
                # is_dcn = self.dcn_stage[block]
                for i in range(depth[block]):
                    basic_block = self.add_sublayer(
                        'bb_%d_%d' % (block, i),
                        BasicBlock(
                            in_channels=num_channels[block]
                            if i == 0 else num_filters[block],
                            out_channels=num_filters[block],
                            stride=2 if i == 0 and block != 0 else 1,
                            shortcut=shortcut,
                            if_first=block == i == 0))
                    shortcut = True
                    block_list.append(basic_block)
                if block in self.out_indices:
                    self.out_channels.append(num_filters[block])
                self.stages.append(nn.Sequential(*block_list))

    def forward(self, inputs):
        y = self.conv1_1(inputs)
        y = self.conv1_2(y)
        y = self.conv1_3(y)
        y = self.pool2d_max(y)
        out = []
        for i, block in enumerate(self.stages):
            y = block(y)
            if i in self.out_indices:
                out.append(y)
        return out
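
A hypothetical shape check for this detection ResNet: out_indices selects which stage outputs are returned, and out_channels is filled to match.

    import paddle
    from backend.ppocr.modeling.backbones.det_resnet_vd import ResNet

    model = ResNet(in_channels=3, layers=18, out_indices=[2, 3])
    print(model.out_channels)            # [256, 512] for the BasicBlock variant
    feats = model(paddle.rand([1, 3, 224, 224]))
    print([f.shape for f in feats])      # two maps, at strides 16 and 32
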
backend/ppocr/modeling/backbones/det_resnet_vd_sast.py (Normal file, 285 lines)
@@ -0,0 +1,285 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
from paddle import ParamAttr
import paddle.nn as nn
import paddle.nn.functional as F

__all__ = ["ResNet_SAST"]


class ConvBNLayer(nn.Layer):
    def __init__(
            self,
            in_channels,
            out_channels,
            kernel_size,
            stride=1,
            groups=1,
            is_vd_mode=False,
            act=None,
            name=None, ):
        super(ConvBNLayer, self).__init__()

        self.is_vd_mode = is_vd_mode
        self._pool2d_avg = nn.AvgPool2D(
            kernel_size=2, stride=2, padding=0, ceil_mode=True)
        self._conv = nn.Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=(kernel_size - 1) // 2,
            groups=groups,
            weight_attr=ParamAttr(name=name + "_weights"),
            bias_attr=False)
        if name == "conv1":
            bn_name = "bn_" + name
        else:
            bn_name = "bn" + name[3:]
        self._batch_norm = nn.BatchNorm(
            out_channels,
            act=act,
            param_attr=ParamAttr(name=bn_name + '_scale'),
            bias_attr=ParamAttr(bn_name + '_offset'),
            moving_mean_name=bn_name + '_mean',
            moving_variance_name=bn_name + '_variance')

    def forward(self, inputs):
        if self.is_vd_mode:
            inputs = self._pool2d_avg(inputs)
        y = self._conv(inputs)
        y = self._batch_norm(y)
        return y


class BottleneckBlock(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 shortcut=True,
                 if_first=False,
                 name=None):
        super(BottleneckBlock, self).__init__()

        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            act='relu',
            name=name + "_branch2a")
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            act='relu',
            name=name + "_branch2b")
        self.conv2 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels * 4,
            kernel_size=1,
            act=None,
            name=name + "_branch2c")

        if not shortcut:
            self.short = ConvBNLayer(
                in_channels=in_channels,
                out_channels=out_channels * 4,
                kernel_size=1,
                stride=1,
                is_vd_mode=False if if_first else True,
                name=name + "_branch1")

        self.shortcut = shortcut

    def forward(self, inputs):
        y = self.conv0(inputs)
        conv1 = self.conv1(y)
        conv2 = self.conv2(conv1)

        if self.shortcut:
            short = inputs
        else:
            short = self.short(inputs)
        y = paddle.add(x=short, y=conv2)
        y = F.relu(y)
        return y


class BasicBlock(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 shortcut=True,
                 if_first=False,
                 name=None):
        super(BasicBlock, self).__init__()
        self.stride = stride
        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            act='relu',
            name=name + "_branch2a")
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            act=None,
            name=name + "_branch2b")

        if not shortcut:
            self.short = ConvBNLayer(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
                stride=1,
                is_vd_mode=False if if_first else True,
                name=name + "_branch1")

        self.shortcut = shortcut

    def forward(self, inputs):
        y = self.conv0(inputs)
        conv1 = self.conv1(y)

        if self.shortcut:
            short = inputs
        else:
            short = self.short(inputs)
        y = paddle.add(x=short, y=conv1)
        y = F.relu(y)
        return y


class ResNet_SAST(nn.Layer):
    def __init__(self, in_channels=3, layers=50, **kwargs):
        super(ResNet_SAST, self).__init__()

        self.layers = layers
        supported_layers = [18, 34, 50, 101, 152, 200]
        assert layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(
                supported_layers, layers)

        if layers == 18:
            depth = [2, 2, 2, 2]
        elif layers == 34 or layers == 50:
            # depth = [3, 4, 6, 3]
            depth = [3, 4, 6, 3, 3]
        elif layers == 101:
            depth = [3, 4, 23, 3]
        elif layers == 152:
            depth = [3, 8, 36, 3]
        elif layers == 200:
            depth = [3, 12, 48, 3]
        # num_channels = [64, 256, 512,
        #                 1024] if layers >= 50 else [64, 64, 128, 256]
        # num_filters = [64, 128, 256, 512]
        num_channels = [64, 256, 512,
                        1024, 2048] if layers >= 50 else [64, 64, 128, 256]
        num_filters = [64, 128, 256, 512, 512]

        self.conv1_1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=32,
            kernel_size=3,
            stride=2,
            act='relu',
            name="conv1_1")
        self.conv1_2 = ConvBNLayer(
            in_channels=32,
            out_channels=32,
            kernel_size=3,
            stride=1,
            act='relu',
            name="conv1_2")
        self.conv1_3 = ConvBNLayer(
            in_channels=32,
            out_channels=64,
            kernel_size=3,
            stride=1,
            act='relu',
            name="conv1_3")
        self.pool2d_max = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)

        self.stages = []
        self.out_channels = [3, 64]
        if layers >= 50:
            for block in range(len(depth)):
                block_list = []
                shortcut = False
                for i in range(depth[block]):
                    if layers in [101, 152] and block == 2:
                        if i == 0:
                            conv_name = "res" + str(block + 2) + "a"
                        else:
                            conv_name = "res" + str(block + 2) + "b" + str(i)
                    else:
                        conv_name = "res" + str(block + 2) + chr(97 + i)
                    bottleneck_block = self.add_sublayer(
                        'bb_%d_%d' % (block, i),
                        BottleneckBlock(
                            in_channels=num_channels[block]
                            if i == 0 else num_filters[block] * 4,
                            out_channels=num_filters[block],
                            stride=2 if i == 0 and block != 0 else 1,
                            shortcut=shortcut,
                            if_first=block == i == 0,
                            name=conv_name))
                    shortcut = True
                    block_list.append(bottleneck_block)
                self.out_channels.append(num_filters[block] * 4)
                self.stages.append(nn.Sequential(*block_list))
        else:
            for block in range(len(depth)):
                block_list = []
                shortcut = False
                for i in range(depth[block]):
                    conv_name = "res" + str(block + 2) + chr(97 + i)
                    basic_block = self.add_sublayer(
                        'bb_%d_%d' % (block, i),
                        BasicBlock(
                            in_channels=num_channels[block]
                            if i == 0 else num_filters[block],
                            out_channels=num_filters[block],
                            stride=2 if i == 0 and block != 0 else 1,
                            shortcut=shortcut,
                            if_first=block == i == 0,
                            name=conv_name))
                    shortcut = True
                    block_list.append(basic_block)
                self.out_channels.append(num_filters[block])
                self.stages.append(nn.Sequential(*block_list))

    def forward(self, inputs):
        out = [inputs]
        y = self.conv1_1(inputs)
        y = self.conv1_2(y)
        y = self.conv1_3(y)
        out.append(y)
        y = self.pool2d_max(y)
        for block in self.stages:
            y = block(y)
            out.append(y)
        return out
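
Unlike the plain detection ResNet, the SAST variant seeds its output list with the raw input and the stem output and adds a fifth stage, so a layers=50 model exposes seven maps. A hypothetical check:

    import paddle
    from backend.ppocr.modeling.backbones.det_resnet_vd_sast import ResNet_SAST

    model = ResNet_SAST(in_channels=3, layers=50)
    print(model.out_channels)   # [3, 64, 256, 512, 1024, 2048, 2048]
    feats = model(paddle.rand([1, 3, 512, 512]))
    print(len(feats))           # 7 feature maps, finest to coarsest
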
backend/ppocr/modeling/backbones/e2e_resnet_vd_pg.py (Normal file, 265 lines)
@@ -0,0 +1,265 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
from paddle import ParamAttr
import paddle.nn as nn
import paddle.nn.functional as F

__all__ = ["ResNet"]


class ConvBNLayer(nn.Layer):
    def __init__(
            self,
            in_channels,
            out_channels,
            kernel_size,
            stride=1,
            groups=1,
            is_vd_mode=False,
            act=None,
            name=None, ):
        super(ConvBNLayer, self).__init__()

        self.is_vd_mode = is_vd_mode
        self._pool2d_avg = nn.AvgPool2D(
            kernel_size=2, stride=2, padding=0, ceil_mode=True)
        self._conv = nn.Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=(kernel_size - 1) // 2,
            groups=groups,
            weight_attr=ParamAttr(name=name + "_weights"),
            bias_attr=False)
        if name == "conv1":
            bn_name = "bn_" + name
        else:
            bn_name = "bn" + name[3:]
        self._batch_norm = nn.BatchNorm(
            out_channels,
            act=act,
            param_attr=ParamAttr(name=bn_name + '_scale'),
            bias_attr=ParamAttr(bn_name + '_offset'),
            moving_mean_name=bn_name + '_mean',
            moving_variance_name=bn_name + '_variance')

    def forward(self, inputs):
        y = self._conv(inputs)
        y = self._batch_norm(y)
        return y


class BottleneckBlock(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 shortcut=True,
                 if_first=False,
                 name=None):
        super(BottleneckBlock, self).__init__()

        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            act='relu',
            name=name + "_branch2a")
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            act='relu',
            name=name + "_branch2b")
        self.conv2 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels * 4,
            kernel_size=1,
            act=None,
            name=name + "_branch2c")

        if not shortcut:
            self.short = ConvBNLayer(
                in_channels=in_channels,
                out_channels=out_channels * 4,
                kernel_size=1,
                stride=stride,
                is_vd_mode=False if if_first else True,
                name=name + "_branch1")

        self.shortcut = shortcut

    def forward(self, inputs):
        y = self.conv0(inputs)
        conv1 = self.conv1(y)
        conv2 = self.conv2(conv1)

        if self.shortcut:
            short = inputs
        else:
            short = self.short(inputs)
        y = paddle.add(x=short, y=conv2)
        y = F.relu(y)
        return y


class BasicBlock(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 shortcut=True,
                 if_first=False,
                 name=None):
        super(BasicBlock, self).__init__()
        self.stride = stride
        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            act='relu',
            name=name + "_branch2a")
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            act=None,
            name=name + "_branch2b")

        if not shortcut:
            self.short = ConvBNLayer(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
                stride=1,
                is_vd_mode=False if if_first else True,
                name=name + "_branch1")

        self.shortcut = shortcut

    def forward(self, inputs):
        y = self.conv0(inputs)
        conv1 = self.conv1(y)

        if self.shortcut:
            short = inputs
        else:
            short = self.short(inputs)
        y = paddle.add(x=short, y=conv1)
        y = F.relu(y)
        return y


class ResNet(nn.Layer):
    def __init__(self, in_channels=3, layers=50, **kwargs):
        super(ResNet, self).__init__()

        self.layers = layers
        supported_layers = [18, 34, 50, 101, 152, 200]
        assert layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(
                supported_layers, layers)

        if layers == 18:
            depth = [2, 2, 2, 2]
        elif layers == 34 or layers == 50:
            # depth = [3, 4, 6, 3]
            depth = [3, 4, 6, 3, 3]
        elif layers == 101:
            depth = [3, 4, 23, 3]
        elif layers == 152:
            depth = [3, 8, 36, 3]
        elif layers == 200:
            depth = [3, 12, 48, 3]
        num_channels = [64, 256, 512, 1024,
                        2048] if layers >= 50 else [64, 64, 128, 256]
        num_filters = [64, 128, 256, 512, 512]

        self.conv1_1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=64,
            kernel_size=7,
            stride=2,
            act='relu',
            name="conv1_1")
        self.pool2d_max = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)

        self.stages = []
        self.out_channels = [3, 64]
        # num_filters = [64, 128, 256, 512, 512]
        if layers >= 50:
            for block in range(len(depth)):
                block_list = []
                shortcut = False
                for i in range(depth[block]):
                    if layers in [101, 152] and block == 2:
                        if i == 0:
                            conv_name = "res" + str(block + 2) + "a"
                        else:
                            conv_name = "res" + str(block + 2) + "b" + str(i)
                    else:
                        conv_name = "res" + str(block + 2) + chr(97 + i)
                    bottleneck_block = self.add_sublayer(
                        'bb_%d_%d' % (block, i),
                        BottleneckBlock(
                            in_channels=num_channels[block]
                            if i == 0 else num_filters[block] * 4,
                            out_channels=num_filters[block],
                            stride=2 if i == 0 and block != 0 else 1,
                            shortcut=shortcut,
                            if_first=block == i == 0,
                            name=conv_name))
                    shortcut = True
                    block_list.append(bottleneck_block)
                self.out_channels.append(num_filters[block] * 4)
                self.stages.append(nn.Sequential(*block_list))
        else:
            for block in range(len(depth)):
                block_list = []
                shortcut = False
                for i in range(depth[block]):
                    conv_name = "res" + str(block + 2) + chr(97 + i)
                    basic_block = self.add_sublayer(
                        'bb_%d_%d' % (block, i),
                        BasicBlock(
                            in_channels=num_channels[block]
                            if i == 0 else num_filters[block],
                            out_channels=num_filters[block],
                            stride=2 if i == 0 and block != 0 else 1,
                            shortcut=shortcut,
                            if_first=block == i == 0,
                            name=conv_name))
                    shortcut = True
                    block_list.append(basic_block)
                self.out_channels.append(num_filters[block])
                self.stages.append(nn.Sequential(*block_list))

    def forward(self, inputs):
        out = [inputs]
        y = self.conv1_1(inputs)
        out.append(y)
        y = self.pool2d_max(y)
        for block in self.stages:
            y = block(y)
            out.append(y)
        return out
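
The PGNet (e2e) trunk follows the same pattern as the SAST variant but swaps the three 3x3 stem convolutions for a single 7x7/stride-2 convolution; it likewise returns the raw input and the stem output. A hypothetical check:

    import paddle
    from backend.ppocr.modeling.backbones.e2e_resnet_vd_pg import ResNet

    model = ResNet(in_channels=3, layers=50)
    print(model.out_channels)             # [3, 64, 256, 512, 1024, 2048, 2048]
    feats = model(paddle.rand([1, 3, 512, 512]))
    print([f.shape[1] for f in feats])    # channel counts match out_channels
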
backend/ppocr/modeling/backbones/kie_unet_sdmgr.py (Normal file, 186 lines)
@@ -0,0 +1,186 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
from paddle import nn
import numpy as np
import cv2

__all__ = ["Kie_backbone"]


class Encoder(nn.Layer):
    def __init__(self, num_channels, num_filters):
        super(Encoder, self).__init__()
        self.conv1 = nn.Conv2D(
            num_channels,
            num_filters,
            kernel_size=3,
            stride=1,
            padding=1,
            bias_attr=False)
        self.bn1 = nn.BatchNorm(num_filters, act='relu')

        self.conv2 = nn.Conv2D(
            num_filters,
            num_filters,
            kernel_size=3,
            stride=1,
            padding=1,
            bias_attr=False)
        self.bn2 = nn.BatchNorm(num_filters, act='relu')

        self.pool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)

    def forward(self, inputs):
        x = self.conv1(inputs)
        x = self.bn1(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x_pooled = self.pool(x)
        return x, x_pooled


class Decoder(nn.Layer):
    def __init__(self, num_channels, num_filters):
        super(Decoder, self).__init__()

        self.conv1 = nn.Conv2D(
            num_channels,
            num_filters,
            kernel_size=3,
            stride=1,
            padding=1,
            bias_attr=False)
        self.bn1 = nn.BatchNorm(num_filters, act='relu')

        self.conv2 = nn.Conv2D(
            num_filters,
            num_filters,
            kernel_size=3,
            stride=1,
            padding=1,
            bias_attr=False)
        self.bn2 = nn.BatchNorm(num_filters, act='relu')

        self.conv0 = nn.Conv2D(
            num_channels,
            num_filters,
            kernel_size=1,
            stride=1,
            padding=0,
            bias_attr=False)
        self.bn0 = nn.BatchNorm(num_filters, act='relu')

    def forward(self, inputs_prev, inputs):
        x = self.conv0(inputs)
        x = self.bn0(x)
        x = paddle.nn.functional.interpolate(
            x, scale_factor=2, mode='bilinear', align_corners=False)
        x = paddle.concat([inputs_prev, x], axis=1)
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.conv2(x)
        x = self.bn2(x)
        return x


class UNet(nn.Layer):
    def __init__(self):
        super(UNet, self).__init__()
        self.down1 = Encoder(num_channels=3, num_filters=16)
        self.down2 = Encoder(num_channels=16, num_filters=32)
        self.down3 = Encoder(num_channels=32, num_filters=64)
        self.down4 = Encoder(num_channels=64, num_filters=128)
        self.down5 = Encoder(num_channels=128, num_filters=256)

        self.up1 = Decoder(32, 16)
        self.up2 = Decoder(64, 32)
        self.up3 = Decoder(128, 64)
        self.up4 = Decoder(256, 128)
        self.out_channels = 16

    def forward(self, inputs):
        x1, _ = self.down1(inputs)
        _, x2 = self.down2(x1)
        _, x3 = self.down3(x2)
        _, x4 = self.down4(x3)
        _, x5 = self.down5(x4)

        x = self.up4(x4, x5)
        x = self.up3(x3, x)
        x = self.up2(x2, x)
        x = self.up1(x1, x)
        return x


class Kie_backbone(nn.Layer):
    def __init__(self, in_channels, **kwargs):
        super(Kie_backbone, self).__init__()
        self.out_channels = 16
        self.img_feat = UNet()
        self.maxpool = nn.MaxPool2D(kernel_size=7)

    def bbox2roi(self, bbox_list):
        rois_list = []
        rois_num = []
        for img_id, bboxes in enumerate(bbox_list):
            rois_num.append(bboxes.shape[0])
            rois_list.append(bboxes)
        rois = paddle.concat(rois_list, 0)
        rois_num = paddle.to_tensor(rois_num, dtype='int32')
        return rois, rois_num

    def pre_process(self, img, relations, texts, gt_bboxes, tag, img_size):
        img, relations, texts, gt_bboxes, tag, img_size = (
            img.numpy(), relations.numpy(), texts.numpy(), gt_bboxes.numpy(),
            tag.numpy().tolist(), img_size.numpy())
        temp_relations, temp_texts, temp_gt_bboxes = [], [], []
        h, w = int(np.max(img_size[:, 0])), int(np.max(img_size[:, 1]))
        img = paddle.to_tensor(img[:, :, :h, :w])
        batch = len(tag)
        for i in range(batch):
            num, recoder_len = tag[i][0], tag[i][1]
            temp_relations.append(
                paddle.to_tensor(
                    relations[i, :num, :num, :], dtype='float32'))
            temp_texts.append(
                paddle.to_tensor(
                    texts[i, :num, :recoder_len], dtype='float32'))
            temp_gt_bboxes.append(
                paddle.to_tensor(
                    gt_bboxes[i, :num, ...], dtype='float32'))
        return img, temp_relations, temp_texts, temp_gt_bboxes

    def forward(self, inputs):
        img = inputs[0]
        relations, texts, gt_bboxes, tag, img_size = (
            inputs[1], inputs[2], inputs[3], inputs[5], inputs[-1])
        img, relations, texts, gt_bboxes = self.pre_process(
            img, relations, texts, gt_bboxes, tag, img_size)
        x = self.img_feat(img)
        boxes, rois_num = self.bbox2roi(gt_bboxes)
        feats = paddle.fluid.layers.roi_align(
            x,
            boxes,
            spatial_scale=1.0,
            pooled_height=7,
            pooled_width=7,
            rois_num=rois_num)
        feats = self.maxpool(feats).squeeze(-1).squeeze(-1)
        return [relations, texts, feats]
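
The UNet trunk here produces a full-resolution, 16-channel feature map from which RoI-align then pools one 16-dim visual feature per ground-truth box. A minimal sanity sketch of the trunk on its own (assuming an input size divisible by 16 so the skip connections line up):

    import paddle
    from backend.ppocr.modeling.backbones.kie_unet_sdmgr import UNet

    unet = UNet()
    y = unet(paddle.rand([1, 3, 256, 256]))
    print(y.shape)   # [1, 16, 256, 256]
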
backend/ppocr/modeling/backbones/rec_efficientb3_pren.py (Normal file, 228 lines)
@@ -0,0 +1,228 @@
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Code is adapted from:
https://github.com/RuijieJ/pren/blob/main/Nets/EfficientNet.py
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math
from collections import namedtuple
import paddle
import paddle.nn as nn
import paddle.nn.functional as F

__all__ = ['EfficientNetb3_PREN']


class EffB3Params:
    @staticmethod
    def get_global_params():
        """
        The following are EfficientNet-B3's architecture hyperparameters;
        to fit the scene text recognition task, the resolution (image_size)
        here is changed from 300 to 64.
        """
        GlobalParams = namedtuple('GlobalParams', [
            'drop_connect_rate', 'width_coefficient', 'depth_coefficient',
            'depth_divisor', 'image_size'
        ])
        global_params = GlobalParams(
            drop_connect_rate=0.3,
            width_coefficient=1.2,
            depth_coefficient=1.4,
            depth_divisor=8,
            image_size=64)
        return global_params

    @staticmethod
    def get_block_params():
        BlockParams = namedtuple('BlockParams', [
            'kernel_size', 'num_repeat', 'input_filters', 'output_filters',
            'expand_ratio', 'id_skip', 'se_ratio', 'stride'
        ])
        block_params = [
            BlockParams(3, 1, 32, 16, 1, True, 0.25, 1),
            BlockParams(3, 2, 16, 24, 6, True, 0.25, 2),
            BlockParams(5, 2, 24, 40, 6, True, 0.25, 2),
            BlockParams(3, 3, 40, 80, 6, True, 0.25, 2),
            BlockParams(5, 3, 80, 112, 6, True, 0.25, 1),
            BlockParams(5, 4, 112, 192, 6, True, 0.25, 2),
            BlockParams(3, 1, 192, 320, 6, True, 0.25, 1)
        ]
        return block_params


class EffUtils:
    @staticmethod
    def round_filters(filters, global_params):
        """Calculate and round number of filters based on width multiplier."""
        multiplier = global_params.width_coefficient
        if not multiplier:
            return filters
        divisor = global_params.depth_divisor
        filters *= multiplier
        new_filters = int(filters + divisor / 2) // divisor * divisor
        if new_filters < 0.9 * filters:
            new_filters += divisor
        return int(new_filters)

    @staticmethod
    def round_repeats(repeats, global_params):
        """Round number of block repeats based on depth multiplier."""
        multiplier = global_params.depth_coefficient
        if not multiplier:
            return repeats
        return int(math.ceil(multiplier * repeats))


class ConvBlock(nn.Layer):
    def __init__(self, block_params):
        super(ConvBlock, self).__init__()
        self.block_args = block_params
        self.has_se = (self.block_args.se_ratio is not None) and \
            (0 < self.block_args.se_ratio <= 1)
        self.id_skip = block_params.id_skip

        # expansion phase
        self.input_filters = self.block_args.input_filters
        output_filters = \
            self.block_args.input_filters * self.block_args.expand_ratio
        if self.block_args.expand_ratio != 1:
            self.expand_conv = nn.Conv2D(
                self.input_filters, output_filters, 1, bias_attr=False)
            self.bn0 = nn.BatchNorm(output_filters)

        # depthwise conv phase
        k = self.block_args.kernel_size
        s = self.block_args.stride
        self.depthwise_conv = nn.Conv2D(
            output_filters,
            output_filters,
            groups=output_filters,
            kernel_size=k,
            stride=s,
            padding='same',
            bias_attr=False)
        self.bn1 = nn.BatchNorm(output_filters)

        # squeeze and excitation layer, if desired
        if self.has_se:
            num_squeezed_channels = max(1,
                                        int(self.block_args.input_filters *
                                            self.block_args.se_ratio))
            self.se_reduce = nn.Conv2D(output_filters, num_squeezed_channels, 1)
            self.se_expand = nn.Conv2D(num_squeezed_channels, output_filters, 1)

        # output phase
        self.final_oup = self.block_args.output_filters
        self.project_conv = nn.Conv2D(
            output_filters, self.final_oup, 1, bias_attr=False)
        self.bn2 = nn.BatchNorm(self.final_oup)
        self.swish = nn.Swish()

    def drop_connect(self, inputs, p, training):
        if not training:
            return inputs

        batch_size = inputs.shape[0]
        keep_prob = 1 - p
        random_tensor = keep_prob
        random_tensor += paddle.rand([batch_size, 1, 1, 1], dtype=inputs.dtype)
        random_tensor = paddle.to_tensor(random_tensor, place=inputs.place)
        binary_tensor = paddle.floor(random_tensor)
        output = inputs / keep_prob * binary_tensor
        return output

    def forward(self, inputs, drop_connect_rate=None):
        # expansion and depthwise conv
        x = inputs
        if self.block_args.expand_ratio != 1:
            x = self.swish(self.bn0(self.expand_conv(inputs)))
        x = self.swish(self.bn1(self.depthwise_conv(x)))

        # squeeze and excitation
        if self.has_se:
            x_squeezed = F.adaptive_avg_pool2d(x, 1)
            x_squeezed = self.se_expand(self.swish(self.se_reduce(x_squeezed)))
            x = F.sigmoid(x_squeezed) * x
        x = self.bn2(self.project_conv(x))

        # skip connection and drop connect
        if self.id_skip and self.block_args.stride == 1 and \
                self.input_filters == self.final_oup:
            if drop_connect_rate:
                x = self.drop_connect(
                    x, p=drop_connect_rate, training=self.training)
            x = x + inputs
        return x


class EfficientNetb3_PREN(nn.Layer):
    def __init__(self, in_channels):
        super(EfficientNetb3_PREN, self).__init__()
        self.blocks_params = EffB3Params.get_block_params()
        self.global_params = EffB3Params.get_global_params()
        self.out_channels = []
        # stem
        stem_channels = EffUtils.round_filters(32, self.global_params)
        self.conv_stem = nn.Conv2D(
            in_channels, stem_channels, 3, 2, padding='same', bias_attr=False)
        self.bn0 = nn.BatchNorm(stem_channels)

        self.blocks = []
        # to extract three feature maps for fpn based on efficientnetb3 backbone
        self.concerned_block_idxes = [7, 17, 25]
        concerned_idx = 0
        for i, block_params in enumerate(self.blocks_params):
            block_params = block_params._replace(
                input_filters=EffUtils.round_filters(block_params.input_filters,
                                                     self.global_params),
                output_filters=EffUtils.round_filters(
                    block_params.output_filters, self.global_params),
                num_repeat=EffUtils.round_repeats(block_params.num_repeat,
                                                  self.global_params))
            self.blocks.append(
                self.add_sublayer("{}-0".format(i), ConvBlock(block_params)))
            concerned_idx += 1
            if concerned_idx in self.concerned_block_idxes:
                self.out_channels.append(block_params.output_filters)
            if block_params.num_repeat > 1:
                block_params = block_params._replace(
                    input_filters=block_params.output_filters, stride=1)
                for j in range(block_params.num_repeat - 1):
                    self.blocks.append(
                        self.add_sublayer('{}-{}'.format(i, j + 1),
                                          ConvBlock(block_params)))
                    concerned_idx += 1
                    if concerned_idx in self.concerned_block_idxes:
                        self.out_channels.append(block_params.output_filters)

        self.swish = nn.Swish()

    def forward(self, inputs):
        outs = []

        x = self.swish(self.bn0(self.conv_stem(inputs)))
        for idx, block in enumerate(self.blocks):
            drop_connect_rate = self.global_params.drop_connect_rate
            if drop_connect_rate:
                drop_connect_rate *= float(idx) / len(self.blocks)
            x = block(x, drop_connect_rate=drop_connect_rate)
            if idx in self.concerned_block_idxes:
                outs.append(x)
        return outs
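
drop_connect above implements stochastic depth: a residual branch survives with probability 1 - p, and survivors are rescaled by 1 / keep_prob so the expected output is unchanged. An illustrative stand-alone version of the same arithmetic:

    import paddle

    p = 0.3                                   # drop probability for one block
    keep_prob = 1 - p
    x = paddle.ones([4, 1, 1, 1])
    # floor(keep_prob + U[0,1)) is 1 with probability keep_prob, else 0
    mask = paddle.floor(keep_prob + paddle.rand([4, 1, 1, 1]))
    print((x / keep_prob * mask).squeeze())   # entries are 0 or ~1.43
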
backend/ppocr/modeling/backbones/rec_micronet.py (Normal file, 528 lines; the diff view below is truncated)
@@ -0,0 +1,528 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is referenced from:
https://github.com/liyunsheng13/micronet/blob/main/backbone/micronet.py
https://github.com/liyunsheng13/micronet/blob/main/backbone/activation.py
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
import paddle.nn as nn

from ppocr.modeling.backbones.det_mobilenet_v3 import make_divisible

M0_cfgs = [
    # s, n, c, ks, c1, c2, g1, g2, c3, g3, g4, y1, y2, y3, r
    [2, 1, 8, 3, 2, 2, 0, 4, 8, 2, 2, 2, 0, 1, 1],
    [2, 1, 12, 3, 2, 2, 0, 8, 12, 4, 4, 2, 2, 1, 1],
    [2, 1, 16, 5, 2, 2, 0, 12, 16, 4, 4, 2, 2, 1, 1],
    [1, 1, 32, 5, 1, 4, 4, 4, 32, 4, 4, 2, 2, 1, 1],
    [2, 1, 64, 5, 1, 4, 8, 8, 64, 8, 8, 2, 2, 1, 1],
    [1, 1, 96, 3, 1, 4, 8, 8, 96, 8, 8, 2, 2, 1, 2],
    [1, 1, 384, 3, 1, 4, 12, 12, 0, 0, 0, 2, 2, 1, 2],
]
M1_cfgs = [
    # s, n, c, ks, c1, c2, g1, g2, c3, g3, g4, y1, y2, y3, r
    [2, 1, 8, 3, 2, 2, 0, 6, 8, 2, 2, 2, 0, 1, 1],
    [2, 1, 16, 3, 2, 2, 0, 8, 16, 4, 4, 2, 2, 1, 1],
    [2, 1, 16, 5, 2, 2, 0, 16, 16, 4, 4, 2, 2, 1, 1],
    [1, 1, 32, 5, 1, 6, 4, 4, 32, 4, 4, 2, 2, 1, 1],
    [2, 1, 64, 5, 1, 6, 8, 8, 64, 8, 8, 2, 2, 1, 1],
    [1, 1, 96, 3, 1, 6, 8, 8, 96, 8, 8, 2, 2, 1, 2],
    [1, 1, 576, 3, 1, 6, 12, 12, 0, 0, 0, 2, 2, 1, 2],
]
M2_cfgs = [
    # s, n, c, ks, c1, c2, g1, g2, c3, g3, g4, y1, y2, y3, r
    [2, 1, 12, 3, 2, 2, 0, 8, 12, 4, 4, 2, 0, 1, 1],
    [2, 1, 16, 3, 2, 2, 0, 12, 16, 4, 4, 2, 2, 1, 1],
    [1, 1, 24, 3, 2, 2, 0, 16, 24, 4, 4, 2, 2, 1, 1],
    [2, 1, 32, 5, 1, 6, 6, 6, 32, 4, 4, 2, 2, 1, 1],
    [1, 1, 32, 5, 1, 6, 8, 8, 32, 4, 4, 2, 2, 1, 2],
    [1, 1, 64, 5, 1, 6, 8, 8, 64, 8, 8, 2, 2, 1, 2],
    [2, 1, 96, 5, 1, 6, 8, 8, 96, 8, 8, 2, 2, 1, 2],
    [1, 1, 128, 3, 1, 6, 12, 12, 128, 8, 8, 2, 2, 1, 2],
    [1, 1, 768, 3, 1, 6, 16, 16, 0, 0, 0, 2, 2, 1, 2],
]
M3_cfgs = [
    # s, n, c, ks, c1, c2, g1, g2, c3, g3, g4, y1, y2, y3, r
    [2, 1, 16, 3, 2, 2, 0, 12, 16, 4, 4, 0, 2, 0, 1],
    [2, 1, 24, 3, 2, 2, 0, 16, 24, 4, 4, 0, 2, 0, 1],
    [1, 1, 24, 3, 2, 2, 0, 24, 24, 4, 4, 0, 2, 0, 1],
    [2, 1, 32, 5, 1, 6, 6, 6, 32, 4, 4, 0, 2, 0, 1],
    [1, 1, 32, 5, 1, 6, 8, 8, 32, 4, 4, 0, 2, 0, 2],
    [1, 1, 64, 5, 1, 6, 8, 8, 48, 8, 8, 0, 2, 0, 2],
    [1, 1, 80, 5, 1, 6, 8, 8, 80, 8, 8, 0, 2, 0, 2],
    [1, 1, 80, 5, 1, 6, 10, 10, 80, 8, 8, 0, 2, 0, 2],
    [1, 1, 120, 5, 1, 6, 10, 10, 120, 10, 10, 0, 2, 0, 2],
    [1, 1, 120, 5, 1, 6, 12, 12, 120, 10, 10, 0, 2, 0, 2],
    [1, 1, 144, 3, 1, 6, 12, 12, 144, 12, 12, 0, 2, 0, 2],
    [1, 1, 432, 3, 1, 3, 12, 12, 0, 0, 0, 0, 2, 0, 2],
]


def get_micronet_config(mode):
    return eval(mode + '_cfgs')


class MaxGroupPooling(nn.Layer):
    def __init__(self, channel_per_group=2):
        super(MaxGroupPooling, self).__init__()
        self.channel_per_group = channel_per_group

    def forward(self, x):
        if self.channel_per_group == 1:
            return x
        # max op
        b, c, h, w = x.shape

        # reshape
        y = paddle.reshape(x, [b, c // self.channel_per_group, -1, h, w])
        out = paddle.max(y, axis=2)
        return out


class SpatialSepConvSF(nn.Layer):
    def __init__(self, inp, oups, kernel_size, stride):
        super(SpatialSepConvSF, self).__init__()

        oup1, oup2 = oups
        self.conv = nn.Sequential(
            nn.Conv2D(
                inp,
                oup1, (kernel_size, 1), (stride, 1), (kernel_size // 2, 0),
                bias_attr=False,
                groups=1),
            nn.BatchNorm2D(oup1),
            nn.Conv2D(
                oup1,
                oup1 * oup2, (1, kernel_size), (1, stride),
                (0, kernel_size // 2),
                bias_attr=False,
                groups=oup1),
            nn.BatchNorm2D(oup1 * oup2),
            ChannelShuffle(oup1), )

    def forward(self, x):
        out = self.conv(x)
        return out


class ChannelShuffle(nn.Layer):
    def __init__(self, groups):
        super(ChannelShuffle, self).__init__()
        self.groups = groups

    def forward(self, x):
        b, c, h, w = x.shape

        channels_per_group = c // self.groups

        # reshape
        x = paddle.reshape(x, [b, self.groups, channels_per_group, h, w])

        x = paddle.transpose(x, (0, 2, 1, 3, 4))
        out = paddle.reshape(x, [b, -1, h, w])

        return out

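# Worked illustration of the shuffle above (added comment, not upstream code):
# with groups=2 on channels [0, 1, 2, 3], the reshape groups them as
# [[0, 1], [2, 3]], the (0, 2, 1, 3, 4) transpose interleaves the groups, and
# the final reshape emits [0, 2, 1, 3] -- channels are exchanged between the
# two groups so the following grouped convolutions see mixed features.
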
class StemLayer(nn.Layer):
    def __init__(self, inp, oup, stride, groups=(4, 4)):
        super(StemLayer, self).__init__()

        g1, g2 = groups
        self.stem = nn.Sequential(
            SpatialSepConvSF(inp, groups, 3, stride),
            MaxGroupPooling(2) if g1 * g2 == 2 * oup else nn.ReLU6())

    def forward(self, x):
        out = self.stem(x)
        return out


class DepthSpatialSepConv(nn.Layer):
    def __init__(self, inp, expand, kernel_size, stride):
        super(DepthSpatialSepConv, self).__init__()

        exp1, exp2 = expand

        hidden_dim = inp * exp1
        oup = inp * exp1 * exp2

        self.conv = nn.Sequential(
            nn.Conv2D(
                inp,
                inp * exp1, (kernel_size, 1), (stride, 1),
                (kernel_size // 2, 0),
                bias_attr=False,
                groups=inp),
            nn.BatchNorm2D(inp * exp1),
            nn.Conv2D(
                hidden_dim,
                oup, (1, kernel_size),
                1, (0, kernel_size // 2),
                bias_attr=False,
                groups=hidden_dim),
            nn.BatchNorm2D(oup))

    def forward(self, x):
        x = self.conv(x)
        return x


class GroupConv(nn.Layer):
    def __init__(self, inp, oup, groups=2):
        super(GroupConv, self).__init__()
        self.inp = inp
        self.oup = oup
        self.groups = groups
        self.conv = nn.Sequential(
            nn.Conv2D(
                inp, oup, 1, 1, 0, bias_attr=False, groups=self.groups[0]),
            nn.BatchNorm2D(oup))

    def forward(self, x):
        x = self.conv(x)
        return x


class DepthConv(nn.Layer):
    def __init__(self, inp, oup, kernel_size, stride):
        super(DepthConv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2D(
                inp,
                oup,
                kernel_size,
                stride,
                kernel_size // 2,
                bias_attr=False,
                groups=inp),
            nn.BatchNorm2D(oup))

    def forward(self, x):
        out = self.conv(x)
        return out


class DYShiftMax(nn.Layer):
    def __init__(self,
                 inp,
                 oup,
                 reduction=4,
                 act_max=1.0,
                 act_relu=True,
                 init_a=[0.0, 0.0],
                 init_b=[0.0, 0.0],
                 relu_before_pool=False,
                 g=None,
                 expansion=False):
        super(DYShiftMax, self).__init__()
        self.oup = oup
        self.act_max = act_max * 2
        self.act_relu = act_relu
        self.avg_pool = nn.Sequential(nn.ReLU() if relu_before_pool else
                                      nn.Sequential(), nn.AdaptiveAvgPool2D(1))

        self.exp = 4 if act_relu else 2
        self.init_a = init_a
        self.init_b = init_b

        # determine squeeze
        squeeze = make_divisible(inp // reduction, 4)
        if squeeze < 4:
            squeeze = 4

        self.fc = nn.Sequential(
            nn.Linear(inp, squeeze),
            nn.ReLU(), nn.Linear(squeeze, oup * self.exp), nn.Hardsigmoid())

        if g is None:
            g = (1, 1)  # g is indexed below, so the fallback must be a pair
        self.g = g[1]
        if self.g != 1 and expansion:
            self.g = inp // self.g

        self.gc = inp // self.g
        index = paddle.to_tensor([range(inp)])
        index = paddle.reshape(index, [1, inp, 1, 1])
        index = paddle.reshape(index, [1, self.g, self.gc, 1, 1])
        indexgs = paddle.split(index, [1, self.g - 1], axis=1)
        indexgs = paddle.concat((indexgs[1], indexgs[0]), axis=1)
        indexs = paddle.split(indexgs, [1, self.gc - 1], axis=2)
        indexs = paddle.concat((indexs[1], indexs[0]), axis=2)
        self.index = paddle.reshape(indexs, [inp])
        self.expansion = expansion

    def forward(self, x):
        x_in = x
        x_out = x

        b, c, _, _ = x_in.shape
        y = self.avg_pool(x_in)
        y = paddle.reshape(y, [b, c])
        y = self.fc(y)
        y = paddle.reshape(y, [b, self.oup * self.exp, 1, 1])
        y = (y - 0.5) * self.act_max

        n2, c2, h2, w2 = x_out.shape
        x2 = paddle.to_tensor(x_out.numpy()[:, self.index.numpy(), :, :])

        if self.exp == 4:
            temp = y.shape
            a1, b1, a2, b2 = paddle.split(y, temp[1] // self.oup, axis=1)

            a1 = a1 + self.init_a[0]
            a2 = a2 + self.init_a[1]

            b1 = b1 + self.init_b[0]
            b2 = b2 + self.init_b[1]

            z1 = x_out * a1 + x2 * b1
            z2 = x_out * a2 + x2 * b2

            out = paddle.maximum(z1, z2)

        elif self.exp == 2:
            temp = y.shape
            a1, b1 = paddle.split(y, temp[1] // self.oup, axis=1)
            a1 = a1 + self.init_a[0]
            b1 = b1 + self.init_b[0]
            out = x_out * a1 + x2 * b1

        return out

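# Reading aid for DYShiftMax (added comment, not upstream code): the fc head
# predicts per-channel coefficients (a_k, b_k) from globally pooled features,
# and the output is max_k(a_k * x + b_k * x_shift), where x_shift is x with
# its channel groups circularly rotated via the precomputed `index`. With
# act_relu=True there are two (a, b) pairs (exp == 4); otherwise a single
# pair (exp == 2) and the max is dropped.
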
class DYMicroBlock(nn.Layer):
    def __init__(self,
                 inp,
                 oup,
                 kernel_size=3,
                 stride=1,
                 ch_exp=(2, 2),
                 ch_per_group=4,
                 groups_1x1=(1, 1),
                 depthsep=True,
                 shuffle=False,
                 activation_cfg=None):
        super(DYMicroBlock, self).__init__()

        self.identity = stride == 1 and inp == oup

        y1, y2, y3 = activation_cfg['dy']
        act_reduction = 8 * activation_cfg['ratio']
        init_a = activation_cfg['init_a']
        init_b = activation_cfg['init_b']

        t1 = ch_exp
        gs1 = ch_per_group
        hidden_fft, g1, g2 = groups_1x1
        hidden_dim2 = inp * t1[0] * t1[1]

        if gs1[0] == 0:
            self.layers = nn.Sequential(
                DepthSpatialSepConv(inp, t1, kernel_size, stride),
                DYShiftMax(
                    hidden_dim2,
                    hidden_dim2,
                    act_max=2.0,
                    act_relu=True if y2 == 2 else False,
                    init_a=init_a,
                    reduction=act_reduction,
                    init_b=init_b,
                    g=gs1,
                    expansion=False) if y2 > 0 else nn.ReLU6(),
                ChannelShuffle(gs1[1]) if shuffle else nn.Sequential(),
                ChannelShuffle(hidden_dim2 // 2)
                if shuffle and y2 != 0 else nn.Sequential(),
                GroupConv(hidden_dim2, oup, (g1, g2)),
                DYShiftMax(
                    oup,
                    oup,
                    act_max=2.0,
                    act_relu=False,
                    init_a=[1.0, 0.0],
                    reduction=act_reduction // 2,
                    init_b=[0.0, 0.0],
                    g=(g1, g2),
                    expansion=False) if y3 > 0 else nn.Sequential(),
                ChannelShuffle(g2) if shuffle else nn.Sequential(),
                ChannelShuffle(oup // 2)
                if shuffle and oup % 2 == 0 and y3 != 0 else nn.Sequential(), )
        elif g2 == 0:
            self.layers = nn.Sequential(
                GroupConv(inp, hidden_dim2, gs1),
                DYShiftMax(
                    hidden_dim2,
                    hidden_dim2,
                    act_max=2.0,
                    act_relu=False,
                    init_a=[1.0, 0.0],
                    reduction=act_reduction,
                    init_b=[0.0, 0.0],
                    g=gs1,
                    expansion=False) if y3 > 0 else nn.Sequential(), )
        else:
            self.layers = nn.Sequential(
                GroupConv(inp, hidden_dim2, gs1),
                DYShiftMax(
                    hidden_dim2,
                    hidden_dim2,
                    act_max=2.0,
                    act_relu=True if y1 == 2 else False,
                    init_a=init_a,
                    reduction=act_reduction,
                    init_b=init_b,
                    g=gs1,
                    expansion=False) if y1 > 0 else nn.ReLU6(),
                ChannelShuffle(gs1[1]) if shuffle else nn.Sequential(),
                DepthSpatialSepConv(hidden_dim2, (1, 1), kernel_size, stride)
                if depthsep else
                DepthConv(hidden_dim2, hidden_dim2, kernel_size, stride),
                nn.Sequential(),
                DYShiftMax(
                    hidden_dim2,
                    hidden_dim2,
                    act_max=2.0,
                    act_relu=True if y2 == 2 else False,
                    init_a=init_a,
                    reduction=act_reduction,
                    init_b=init_b,
                    g=gs1,
                    expansion=True) if y2 > 0 else nn.ReLU6(),
                ChannelShuffle(hidden_dim2 // 4)
                if shuffle and y1 != 0 and y2 != 0 else nn.Sequential()
                if y1 == 0 and y2 == 0 else ChannelShuffle(hidden_dim2 // 2),
                GroupConv(hidden_dim2, oup, (g1, g2)),
                DYShiftMax(
                    oup,
                    oup,
                    act_max=2.0,
                    act_relu=False,
                    init_a=[1.0, 0.0],
                    reduction=act_reduction // 2
                    if oup < hidden_dim2 else act_reduction,
                    init_b=[0.0, 0.0],
                    g=(g1, g2),
                    expansion=False) if y3 > 0 else nn.Sequential(),
                ChannelShuffle(g2) if shuffle else nn.Sequential(),
                ChannelShuffle(oup // 2)
                if shuffle and y3 != 0 else nn.Sequential(), )

    def forward(self, x):
        identity = x
        out = self.layers(x)

        if self.identity:
            out = out + identity

        return out


class MicroNet(nn.Layer):
    """
    the MicroNet backbone network for recognition module.
    Args:
        mode(str): {'M0', 'M1', 'M2', 'M3'}
            Four models are proposed based on four different computational costs (4M, 6M, 12M, 21M MAdds)
            Default: 'M3'.
    """

    def __init__(self, mode='M3', **kwargs):
        super(MicroNet, self).__init__()

        self.cfgs = get_micronet_config(mode)

        activation_cfg = {}
        if mode == 'M0':
            input_channel = 4
            stem_groups = 2, 2
            out_ch = 384
            activation_cfg['init_a'] = 1.0, 1.0
            activation_cfg['init_b'] = 0.0, 0.0
        elif mode == 'M1':
            input_channel = 6
            stem_groups = 3, 2
            out_ch = 576
            activation_cfg['init_a'] = 1.0, 1.0
            activation_cfg['init_b'] = 0.0, 0.0
        elif mode == 'M2':
            input_channel = 8
            stem_groups = 4, 2
            out_ch = 768
            activation_cfg['init_a'] = 1.0, 1.0
            activation_cfg['init_b'] = 0.0, 0.0
        elif mode == 'M3':
            input_channel = 12
            stem_groups = 4, 3
            out_ch = 432
            activation_cfg['init_a'] = 1.0, 0.5
            activation_cfg['init_b'] = 0.0, 0.5
        else:
            raise NotImplementedError("mode[" + mode +
                                      "_model] is not implemented!")

        layers = [StemLayer(3, input_channel, stride=2, groups=stem_groups)]

        for idx, val in enumerate(self.cfgs):
            s, n, c, ks, c1, c2, g1, g2, c3, g3, g4, y1, y2, y3, r = val

            t1 = (c1, c2)
            gs1 = (g1, g2)
            gs2 = (c3, g3, g4)
            activation_cfg['dy'] = [y1, y2, y3]
            activation_cfg['ratio'] = r

            output_channel = c
            layers.append(
                DYMicroBlock(
                    input_channel,
                    output_channel,
                    kernel_size=ks,
                    stride=s,
                    ch_exp=t1,
                    ch_per_group=gs1,
                    groups_1x1=gs2,
                    depthsep=True,
                    shuffle=True,
                    activation_cfg=activation_cfg, ))
            input_channel = output_channel
            for i in range(1, n):
                layers.append(
                    DYMicroBlock(
                        input_channel,
                        output_channel,
                        kernel_size=ks,
                        stride=1,
                        ch_exp=t1,
                        ch_per_group=gs1,
                        groups_1x1=gs2,
                        depthsep=True,
                        shuffle=True,
                        activation_cfg=activation_cfg, ))
                input_channel = output_channel
        self.features = nn.Sequential(*layers)

        self.pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0)

        self.out_channels = make_divisible(out_ch)

    def forward(self, x):
        x = self.features(x)
        x = self.pool(x)
        return x
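# Hedged usage sketch (added comment; shapes assume a 32x100 text-line input
# and are derived from the strides above, not from upstream docs):
#
#   import paddle
#   net = MicroNet(mode='M3')
#   feats = net(paddle.randn([1, 3, 32, 100]))
#   # the stem halves H and W, three stride-2 blocks halve H only, and the
#   # final 2x2 max-pool halves both dims: feats is roughly [1, 432, 1, 25]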
138
backend/ppocr/modeling/backbones/rec_mobilenet_v3.py
Normal file
@@ -0,0 +1,138 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle import nn

from ppocr.modeling.backbones.det_mobilenet_v3 import ResidualUnit, ConvBNLayer, make_divisible

__all__ = ['MobileNetV3']


class MobileNetV3(nn.Layer):
    def __init__(self,
                 in_channels=3,
                 model_name='small',
                 scale=0.5,
                 large_stride=None,
                 small_stride=None,
                 disable_se=False,
                 **kwargs):
        super(MobileNetV3, self).__init__()
        self.disable_se = disable_se
        if small_stride is None:
            small_stride = [2, 2, 2, 2]
        if large_stride is None:
            large_stride = [1, 2, 2, 2]

        assert isinstance(large_stride, list), "large_stride type must " \
            "be list but got {}".format(type(large_stride))
        assert isinstance(small_stride, list), "small_stride type must " \
            "be list but got {}".format(type(small_stride))
        assert len(large_stride) == 4, "large_stride length must be " \
            "4 but got {}".format(len(large_stride))
        assert len(small_stride) == 4, "small_stride length must be " \
            "4 but got {}".format(len(small_stride))

        if model_name == "large":
            cfg = [
                # k, exp, c, se, nl, s,
                [3, 16, 16, False, 'relu', large_stride[0]],
                [3, 64, 24, False, 'relu', (large_stride[1], 1)],
                [3, 72, 24, False, 'relu', 1],
                [5, 72, 40, True, 'relu', (large_stride[2], 1)],
                [5, 120, 40, True, 'relu', 1],
                [5, 120, 40, True, 'relu', 1],
                [3, 240, 80, False, 'hardswish', 1],
                [3, 200, 80, False, 'hardswish', 1],
                [3, 184, 80, False, 'hardswish', 1],
                [3, 184, 80, False, 'hardswish', 1],
                [3, 480, 112, True, 'hardswish', 1],
                [3, 672, 112, True, 'hardswish', 1],
                [5, 672, 160, True, 'hardswish', (large_stride[3], 1)],
                [5, 960, 160, True, 'hardswish', 1],
                [5, 960, 160, True, 'hardswish', 1],
            ]
            cls_ch_squeeze = 960
        elif model_name == "small":
            cfg = [
                # k, exp, c, se, nl, s,
                [3, 16, 16, True, 'relu', (small_stride[0], 1)],
                [3, 72, 24, False, 'relu', (small_stride[1], 1)],
                [3, 88, 24, False, 'relu', 1],
                [5, 96, 40, True, 'hardswish', (small_stride[2], 1)],
                [5, 240, 40, True, 'hardswish', 1],
                [5, 240, 40, True, 'hardswish', 1],
                [5, 120, 48, True, 'hardswish', 1],
                [5, 144, 48, True, 'hardswish', 1],
                [5, 288, 96, True, 'hardswish', (small_stride[3], 1)],
                [5, 576, 96, True, 'hardswish', 1],
                [5, 576, 96, True, 'hardswish', 1],
            ]
            cls_ch_squeeze = 576
        else:
            raise NotImplementedError("mode[" + model_name +
                                      "_model] is not implemented!")

        supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25]
        assert scale in supported_scale, \
            "supported scales are {} but input scale is {}".format(supported_scale, scale)

        inplanes = 16
        # conv1
        self.conv1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=make_divisible(inplanes * scale),
            kernel_size=3,
            stride=2,
            padding=1,
            groups=1,
            if_act=True,
            act='hardswish')
        block_list = []
        inplanes = make_divisible(inplanes * scale)
        for (k, exp, c, se, nl, s) in cfg:
            se = se and not self.disable_se
            block_list.append(
                ResidualUnit(
                    in_channels=inplanes,
                    mid_channels=make_divisible(scale * exp),
                    out_channels=make_divisible(scale * c),
                    kernel_size=k,
                    stride=s,
                    use_se=se,
                    act=nl))
            inplanes = make_divisible(scale * c)
        self.blocks = nn.Sequential(*block_list)

        self.conv2 = ConvBNLayer(
            in_channels=inplanes,
            out_channels=make_divisible(scale * cls_ch_squeeze),
            kernel_size=1,
            stride=1,
            padding=0,
            groups=1,
            if_act=True,
            act='hardswish')

        self.pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0)
        self.out_channels = make_divisible(scale * cls_ch_squeeze)

    def forward(self, x):
        x = self.conv1(x)
        x = self.blocks(x)
        x = self.conv2(x)
        x = self.pool(x)
        return x
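# Hedged usage sketch (added comment; the stride list mirrors a common
# PaddleOCR Chinese-rec config, and shapes follow from the (s, 1) strides,
# which downsample height only so the width axis survives as the sequence):
#
#   import paddle
#   net = MobileNetV3(model_name='small', scale=0.5,
#                     small_stride=[1, 2, 2, 2])
#   feats = net(paddle.randn([1, 3, 32, 320]))
#   # conv1 halves H and W, three (2, 1) blocks halve H, the 2x2 max-pool
#   # halves both: feats is roughly [1, 288, 1, 80]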
256
backend/ppocr/modeling/backbones/rec_mv1_enhance.py
Normal file
@@ -0,0 +1,256 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This code is referenced from: https://github.com/PaddlePaddle/PaddleClas/blob/develop/ppcls/arch/backbone/legendary_models/pp_lcnet.py

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math
import numpy as np
import paddle
from paddle import ParamAttr, reshape, transpose
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn import Conv2D, BatchNorm, Linear, Dropout
from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D
from paddle.nn.initializer import KaimingNormal
from paddle.regularizer import L2Decay
from paddle.nn.functional import hardswish, hardsigmoid


class ConvBNLayer(nn.Layer):
    def __init__(self,
                 num_channels,
                 filter_size,
                 num_filters,
                 stride,
                 padding,
                 channels=None,
                 num_groups=1,
                 act='hard_swish'):
        super(ConvBNLayer, self).__init__()

        self._conv = Conv2D(
            in_channels=num_channels,
            out_channels=num_filters,
            kernel_size=filter_size,
            stride=stride,
            padding=padding,
            groups=num_groups,
            weight_attr=ParamAttr(initializer=KaimingNormal()),
            bias_attr=False)

        self._batch_norm = BatchNorm(
            num_filters,
            act=act,
            param_attr=ParamAttr(regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))

    def forward(self, inputs):
        y = self._conv(inputs)
        y = self._batch_norm(y)
        return y


class DepthwiseSeparable(nn.Layer):
    def __init__(self,
                 num_channels,
                 num_filters1,
                 num_filters2,
                 num_groups,
                 stride,
                 scale,
                 dw_size=3,
                 padding=1,
                 use_se=False):
        super(DepthwiseSeparable, self).__init__()
        self.use_se = use_se
        self._depthwise_conv = ConvBNLayer(
            num_channels=num_channels,
            num_filters=int(num_filters1 * scale),
            filter_size=dw_size,
            stride=stride,
            padding=padding,
            num_groups=int(num_groups * scale))
        if use_se:
            self._se = SEModule(int(num_filters1 * scale))
        self._pointwise_conv = ConvBNLayer(
            num_channels=int(num_filters1 * scale),
            filter_size=1,
            num_filters=int(num_filters2 * scale),
            stride=1,
            padding=0)

    def forward(self, inputs):
        y = self._depthwise_conv(inputs)
        if self.use_se:
            y = self._se(y)
        y = self._pointwise_conv(y)
        return y


class MobileNetV1Enhance(nn.Layer):
    def __init__(self,
                 in_channels=3,
                 scale=0.5,
                 last_conv_stride=1,
                 last_pool_type='max',
                 **kwargs):
        super().__init__()
        self.scale = scale
        self.block_list = []

        self.conv1 = ConvBNLayer(
            num_channels=in_channels,  # was hard-coded to 3, silently ignoring in_channels
            filter_size=3,
            channels=3,
            num_filters=int(32 * scale),
            stride=2,
            padding=1)

        conv2_1 = DepthwiseSeparable(
            num_channels=int(32 * scale),
            num_filters1=32,
            num_filters2=64,
            num_groups=32,
            stride=1,
            scale=scale)
        self.block_list.append(conv2_1)

        conv2_2 = DepthwiseSeparable(
            num_channels=int(64 * scale),
            num_filters1=64,
            num_filters2=128,
            num_groups=64,
            stride=1,
            scale=scale)
        self.block_list.append(conv2_2)

        conv3_1 = DepthwiseSeparable(
            num_channels=int(128 * scale),
            num_filters1=128,
            num_filters2=128,
            num_groups=128,
            stride=1,
            scale=scale)
        self.block_list.append(conv3_1)

        conv3_2 = DepthwiseSeparable(
            num_channels=int(128 * scale),
            num_filters1=128,
            num_filters2=256,
            num_groups=128,
            stride=(2, 1),
            scale=scale)
        self.block_list.append(conv3_2)

        conv4_1 = DepthwiseSeparable(
            num_channels=int(256 * scale),
            num_filters1=256,
            num_filters2=256,
            num_groups=256,
            stride=1,
            scale=scale)
        self.block_list.append(conv4_1)

        conv4_2 = DepthwiseSeparable(
            num_channels=int(256 * scale),
            num_filters1=256,
            num_filters2=512,
            num_groups=256,
            stride=(2, 1),
            scale=scale)
        self.block_list.append(conv4_2)

        for _ in range(5):
            conv5 = DepthwiseSeparable(
                num_channels=int(512 * scale),
                num_filters1=512,
                num_filters2=512,
                num_groups=512,
                stride=1,
                dw_size=5,
                padding=2,
                scale=scale,
                use_se=False)
            self.block_list.append(conv5)

        conv5_6 = DepthwiseSeparable(
            num_channels=int(512 * scale),
            num_filters1=512,
            num_filters2=1024,
            num_groups=512,
            stride=(2, 1),
            dw_size=5,
            padding=2,
            scale=scale,
            use_se=True)
        self.block_list.append(conv5_6)

        conv6 = DepthwiseSeparable(
            num_channels=int(1024 * scale),
            num_filters1=1024,
            num_filters2=1024,
            num_groups=1024,
            stride=last_conv_stride,
            dw_size=5,
            padding=2,
            use_se=True,
            scale=scale)
        self.block_list.append(conv6)

        self.block_list = nn.Sequential(*self.block_list)
        if last_pool_type == 'avg':
            self.pool = nn.AvgPool2D(kernel_size=2, stride=2, padding=0)
        else:
            self.pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0)
        self.out_channels = int(1024 * scale)

    def forward(self, inputs):
        y = self.conv1(inputs)
        y = self.block_list(y)
        y = self.pool(y)
        return y

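# Hedged usage sketch (added comment; shapes follow from the strides above --
# conv1 halves H and W, three (2, 1) blocks halve H only, and the final 2x2
# pool halves both dims):
#
#   import paddle
#   net = MobileNetV1Enhance(scale=0.5)
#   feats = net(paddle.randn([1, 3, 32, 320]))
#   # roughly [1, 512, 1, 80]: out_channels is int(1024 * scale)
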
class SEModule(nn.Layer):
    def __init__(self, channel, reduction=4):
        super(SEModule, self).__init__()
        self.avg_pool = AdaptiveAvgPool2D(1)
        self.conv1 = Conv2D(
            in_channels=channel,
            out_channels=channel // reduction,
            kernel_size=1,
            stride=1,
            padding=0,
            weight_attr=ParamAttr(),
            bias_attr=ParamAttr())
        self.conv2 = Conv2D(
            in_channels=channel // reduction,
            out_channels=channel,
            kernel_size=1,
            stride=1,
            padding=0,
            weight_attr=ParamAttr(),
            bias_attr=ParamAttr())

    def forward(self, inputs):
        outputs = self.avg_pool(inputs)
        outputs = self.conv1(outputs)
        outputs = F.relu(outputs)
        outputs = self.conv2(outputs)
        outputs = hardsigmoid(outputs)
        return paddle.multiply(x=inputs, y=outputs)
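# Reading aid for SEModule (added comment): this is the usual squeeze-and-
# excitation gate -- global average pooling squeezes each channel to a scalar,
# the two 1x1 convs form a bottleneck (reduction=4), and the hardsigmoid
# output rescales the input channels, i.e. out = x * gate(x).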
48
backend/ppocr/modeling/backbones/rec_nrtr_mtb.py
Normal file
@@ -0,0 +1,48 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle import nn
import paddle


class MTB(nn.Layer):
    def __init__(self, cnn_num, in_channels):
        super(MTB, self).__init__()
        self.block = nn.Sequential()
        self.out_channels = in_channels
        self.cnn_num = cnn_num
        if self.cnn_num == 2:
            for i in range(self.cnn_num):
                self.block.add_sublayer(
                    'conv_{}'.format(i),
                    nn.Conv2D(
                        in_channels=in_channels
                        if i == 0 else 32 * (2**(i - 1)),
                        out_channels=32 * (2**i),
                        kernel_size=3,
                        stride=2,
                        padding=1))
                self.block.add_sublayer('relu_{}'.format(i), nn.ReLU())
                self.block.add_sublayer('bn_{}'.format(i),
                                        nn.BatchNorm2D(32 * (2**i)))

    def forward(self, images):
        x = self.block(images)
        if self.cnn_num == 2:
            # (b, w, h, c)
            x = paddle.transpose(x, [0, 3, 2, 1])
            x_shape = paddle.shape(x)
            x = paddle.reshape(
                x, [x_shape[0], x_shape[1], x_shape[2] * x_shape[3]])
        return x
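# Shape note for MTB (added comment; the 32x100 input is an assumed example):
# with cnn_num=2, two stride-2 convs take [N, 1, 32, 100] to [N, 64, 8, 25];
# the transpose to (b, w, h, c) and the reshape then give [N, 25, 8 * 64],
# i.e. a width-major sequence of 512-dim features for the NRTR encoder.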
210
backend/ppocr/modeling/backbones/rec_resnet_31.py
Normal file
@@ -0,0 +1,210 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is referenced from:
https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/layers/conv_layer.py
https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/backbones/resnet31_ocr.py
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
from paddle import ParamAttr
import paddle.nn as nn
import paddle.nn.functional as F
import numpy as np

__all__ = ["ResNet31"]


def conv3x3(in_channel, out_channel, stride=1):
    return nn.Conv2D(
        in_channel,
        out_channel,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias_attr=False)


class BasicBlock(nn.Layer):
    expansion = 1

    def __init__(self, in_channels, channels, stride=1, downsample=False):
        super().__init__()
        self.conv1 = conv3x3(in_channels, channels, stride)
        self.bn1 = nn.BatchNorm2D(channels)
        self.relu = nn.ReLU()
        self.conv2 = conv3x3(channels, channels)
        self.bn2 = nn.BatchNorm2D(channels)
        self.downsample = downsample
        if downsample:
            self.downsample = nn.Sequential(
                nn.Conv2D(
                    in_channels,
                    channels * self.expansion,
                    1,
                    stride,
                    bias_attr=False),
                nn.BatchNorm2D(channels * self.expansion), )
        else:
            self.downsample = nn.Sequential()
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class ResNet31(nn.Layer):
    '''
    Args:
        in_channels (int): Number of channels of input image tensor.
        layers (list[int]): List of BasicBlock number for each stage.
        channels (list[int]): List of out_channels of Conv2d layer.
        out_indices (None | Sequence[int]): Indices of output stages.
        last_stage_pool (bool): If True, add `MaxPool2d` layer to last stage.
    '''

    def __init__(self,
                 in_channels=3,
                 layers=[1, 2, 5, 3],
                 channels=[64, 128, 256, 256, 512, 512, 512],
                 out_indices=None,
                 last_stage_pool=False):
        super(ResNet31, self).__init__()
        assert isinstance(in_channels, int)
        assert isinstance(last_stage_pool, bool)

        self.out_indices = out_indices
        self.last_stage_pool = last_stage_pool

        # conv 1 (Conv Conv)
        self.conv1_1 = nn.Conv2D(
            in_channels, channels[0], kernel_size=3, stride=1, padding=1)
        self.bn1_1 = nn.BatchNorm2D(channels[0])
        self.relu1_1 = nn.ReLU()

        self.conv1_2 = nn.Conv2D(
            channels[0], channels[1], kernel_size=3, stride=1, padding=1)
        self.bn1_2 = nn.BatchNorm2D(channels[1])
        self.relu1_2 = nn.ReLU()

        # conv 2 (Max-pooling, Residual block, Conv)
        self.pool2 = nn.MaxPool2D(
            kernel_size=2, stride=2, padding=0, ceil_mode=True)
        self.block2 = self._make_layer(channels[1], channels[2], layers[0])
        self.conv2 = nn.Conv2D(
            channels[2], channels[2], kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2D(channels[2])
        self.relu2 = nn.ReLU()

        # conv 3 (Max-pooling, Residual block, Conv)
        self.pool3 = nn.MaxPool2D(
            kernel_size=2, stride=2, padding=0, ceil_mode=True)
        self.block3 = self._make_layer(channels[2], channels[3], layers[1])
        self.conv3 = nn.Conv2D(
            channels[3], channels[3], kernel_size=3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2D(channels[3])
        self.relu3 = nn.ReLU()

        # conv 4 (Max-pooling, Residual block, Conv)
        self.pool4 = nn.MaxPool2D(
            kernel_size=(2, 1), stride=(2, 1), padding=0, ceil_mode=True)
        self.block4 = self._make_layer(channels[3], channels[4], layers[2])
        self.conv4 = nn.Conv2D(
            channels[4], channels[4], kernel_size=3, stride=1, padding=1)
        self.bn4 = nn.BatchNorm2D(channels[4])
        self.relu4 = nn.ReLU()

        # conv 5 ((Max-pooling), Residual block, Conv)
        self.pool5 = None
        if self.last_stage_pool:
            self.pool5 = nn.MaxPool2D(
                kernel_size=2, stride=2, padding=0, ceil_mode=True)
        self.block5 = self._make_layer(channels[4], channels[5], layers[3])
        self.conv5 = nn.Conv2D(
            channels[5], channels[5], kernel_size=3, stride=1, padding=1)
        self.bn5 = nn.BatchNorm2D(channels[5])
        self.relu5 = nn.ReLU()

        self.out_channels = channels[-1]

    def _make_layer(self, input_channels, output_channels, blocks):
        layers = []
        for _ in range(blocks):
            downsample = None
            if input_channels != output_channels:
                downsample = nn.Sequential(
                    nn.Conv2D(
                        input_channels,
                        output_channels,
                        kernel_size=1,
                        stride=1,
                        bias_attr=False),
                    nn.BatchNorm2D(output_channels), )

            layers.append(
                BasicBlock(
                    input_channels, output_channels, downsample=downsample))
            input_channels = output_channels
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1_1(x)
        x = self.bn1_1(x)
        x = self.relu1_1(x)

        x = self.conv1_2(x)
        x = self.bn1_2(x)
        x = self.relu1_2(x)

        outs = []
        for i in range(4):
            layer_index = i + 2
            pool_layer = getattr(self, f'pool{layer_index}')
            block_layer = getattr(self, f'block{layer_index}')
            conv_layer = getattr(self, f'conv{layer_index}')
            bn_layer = getattr(self, f'bn{layer_index}')
            relu_layer = getattr(self, f'relu{layer_index}')

            if pool_layer is not None:
                x = pool_layer(x)
            x = block_layer(x)
            x = conv_layer(x)
            x = bn_layer(x)
            x = relu_layer(x)

            outs.append(x)

        if self.out_indices is not None:
            return tuple([outs[i] for i in self.out_indices])

        return x
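# Hedged usage sketch (added comment; shapes assume a 32x100 input and
# last_stage_pool=False, following the pool strides above):
#
#   import paddle
#   net = ResNet31()
#   feats = net(paddle.randn([1, 3, 32, 100]))
#   # pool2 and pool3 halve both dims, pool4 halves height only, so feats
#   # is roughly [1, 512, 4, 25]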
143
backend/ppocr/modeling/backbones/rec_resnet_aster.py
Normal file
@@ -0,0 +1,143 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is referenced from:
https://github.com/ayumiymk/aster.pytorch/blob/master/lib/models/resnet_aster.py
"""
import paddle
import paddle.nn as nn

import sys
import math


def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2D(
        in_planes,
        out_planes,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias_attr=False)


def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2D(
        in_planes, out_planes, kernel_size=1, stride=stride, bias_attr=False)


def get_sinusoid_encoding(n_position, feat_dim, wave_length=10000):
    # [n_position]
    positions = paddle.arange(0, n_position)
    # [feat_dim]
    dim_range = paddle.arange(0, feat_dim)
    dim_range = paddle.pow(wave_length, 2 * (dim_range // 2) / feat_dim)
    # [n_position, feat_dim]
    angles = paddle.unsqueeze(
        positions, axis=1) / paddle.unsqueeze(
            dim_range, axis=0)
    angles = paddle.cast(angles, "float32")
    angles[:, 0::2] = paddle.sin(angles[:, 0::2])
    angles[:, 1::2] = paddle.cos(angles[:, 1::2])
    return angles

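# Reading aid for get_sinusoid_encoding (added comment): it builds the usual
# transformer positional table of shape [n_position, feat_dim], with
# angle(p, i) = p / wave_length**(2 * (i // 2) / feat_dim), sine applied on
# even feature indices and cosine on odd ones.
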
class AsterBlock(nn.Layer):
    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(AsterBlock, self).__init__()
        self.conv1 = conv1x1(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2D(planes)
        self.relu = nn.ReLU()
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2D(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out


class ResNet_ASTER(nn.Layer):
    """For aster or crnn"""

    def __init__(self, with_lstm=True, n_group=1, in_channels=3):
        super(ResNet_ASTER, self).__init__()
        self.with_lstm = with_lstm
        self.n_group = n_group

        self.layer0 = nn.Sequential(
            nn.Conv2D(
                in_channels,
                32,
                kernel_size=(3, 3),
                stride=1,
                padding=1,
                bias_attr=False),
            nn.BatchNorm2D(32),
            nn.ReLU())

        self.inplanes = 32
        self.layer1 = self._make_layer(32, 3, [2, 2])  # [16, 50]
        self.layer2 = self._make_layer(64, 4, [2, 2])  # [8, 25]
        self.layer3 = self._make_layer(128, 6, [2, 1])  # [4, 25]
        self.layer4 = self._make_layer(256, 6, [2, 1])  # [2, 25]
        self.layer5 = self._make_layer(512, 3, [2, 1])  # [1, 25]

        if with_lstm:
            self.rnn = nn.LSTM(512, 256, direction="bidirect", num_layers=2)
            self.out_channels = 2 * 256
        else:
            self.out_channels = 512

    def _make_layer(self, planes, blocks, stride):
        downsample = None
        if stride != [1, 1] or self.inplanes != planes:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes, stride), nn.BatchNorm2D(planes))

        layers = []
        layers.append(AsterBlock(self.inplanes, planes, stride, downsample))
        self.inplanes = planes
        for _ in range(1, blocks):
            layers.append(AsterBlock(self.inplanes, planes))
        return nn.Sequential(*layers)

    def forward(self, x):
        x0 = self.layer0(x)
        x1 = self.layer1(x0)
        x2 = self.layer2(x1)
        x3 = self.layer3(x2)
        x4 = self.layer4(x3)
        x5 = self.layer5(x4)

        cnn_feat = x5.squeeze(2)  # [N, c, w]
        cnn_feat = paddle.transpose(cnn_feat, perm=[0, 2, 1])
        if self.with_lstm:
            rnn_feat, _ = self.rnn(cnn_feat)
            return rnn_feat
        else:
            return cnn_feat
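# Hedged usage sketch (added comment; the inline [H, W] comments above track a
# 32x100 input, so the same example is used here):
#
#   import paddle
#   net = ResNet_ASTER(with_lstm=True)
#   seq = net(paddle.randn([1, 3, 32, 100]))
#   # layer5 ends at [1, 512, 1, 25]; squeezing H and running the BiLSTM
#   # gives seq of shape [1, 25, 512]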
307
backend/ppocr/modeling/backbones/rec_resnet_fpn.py
Normal file
@@ -0,0 +1,307 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from paddle import nn, ParamAttr
from paddle.nn import functional as F
import paddle
import numpy as np

__all__ = ["ResNetFPN"]


class ResNetFPN(nn.Layer):
    def __init__(self, in_channels=1, layers=50, **kwargs):
        super(ResNetFPN, self).__init__()
        supported_layers = {
            18: {
                'depth': [2, 2, 2, 2],
                'block_class': BasicBlock
            },
            34: {
                'depth': [3, 4, 6, 3],
                'block_class': BasicBlock
            },
            50: {
                'depth': [3, 4, 6, 3],
                'block_class': BottleneckBlock
            },
            101: {
                'depth': [3, 4, 23, 3],
                'block_class': BottleneckBlock
            },
            152: {
                'depth': [3, 8, 36, 3],
                'block_class': BottleneckBlock
            }
        }
        stride_list = [(2, 2), (2, 2), (1, 1), (1, 1)]
        num_filters = [64, 128, 256, 512]
        self.depth = supported_layers[layers]['depth']
        self.F = []
        self.conv = ConvBNLayer(
            in_channels=in_channels,
            out_channels=64,
            kernel_size=7,
            stride=2,
            act="relu",
            name="conv1")
        self.block_list = []
        in_ch = 64
        if layers >= 50:
            for block in range(len(self.depth)):
                for i in range(self.depth[block]):
                    if layers in [101, 152] and block == 2:
                        if i == 0:
                            conv_name = "res" + str(block + 2) + "a"
                        else:
                            conv_name = "res" + str(block + 2) + "b" + str(i)
                    else:
                        conv_name = "res" + str(block + 2) + chr(97 + i)
                    block_list = self.add_sublayer(
                        "bottleneckBlock_{}_{}".format(block, i),
                        BottleneckBlock(
                            in_channels=in_ch,
                            out_channels=num_filters[block],
                            stride=stride_list[block] if i == 0 else 1,
                            name=conv_name))
                    in_ch = num_filters[block] * 4
                    self.block_list.append(block_list)
                self.F.append(block_list)
        else:
            for block in range(len(self.depth)):
                for i in range(self.depth[block]):
                    conv_name = "res" + str(block + 2) + chr(97 + i)
                    basic_block = self.add_sublayer(
                        conv_name,
                        BasicBlock(
                            in_channels=in_ch,
                            out_channels=num_filters[block],
                            stride=stride_list[block] if i == 0 else 1,
                            is_first=block == i == 0,
                            name=conv_name))
                    in_ch = basic_block.out_channels
                    self.block_list.append(basic_block)
        out_ch_list = [in_ch // 4, in_ch // 2, in_ch]
        self.base_block = []
        self.conv_trans = []
        self.bn_block = []
        for i in [-2, -3]:
            in_channels = out_ch_list[i + 1] + out_ch_list[i]

            self.base_block.append(
                self.add_sublayer(
                    "F_{}_base_block_0".format(i),
                    nn.Conv2D(
                        in_channels=in_channels,
                        out_channels=out_ch_list[i],
                        kernel_size=1,
                        weight_attr=ParamAttr(trainable=True),
                        bias_attr=ParamAttr(trainable=True))))
            self.base_block.append(
                self.add_sublayer(
                    "F_{}_base_block_1".format(i),
                    nn.Conv2D(
                        in_channels=out_ch_list[i],
                        out_channels=out_ch_list[i],
                        kernel_size=3,
                        padding=1,
                        weight_attr=ParamAttr(trainable=True),
                        bias_attr=ParamAttr(trainable=True))))
            self.base_block.append(
                self.add_sublayer(
                    "F_{}_base_block_2".format(i),
                    nn.BatchNorm(
                        num_channels=out_ch_list[i],
                        act="relu",
                        param_attr=ParamAttr(trainable=True),
                        bias_attr=ParamAttr(trainable=True))))
            self.base_block.append(
                self.add_sublayer(
                    "F_{}_base_block_3".format(i),
                    nn.Conv2D(
                        in_channels=out_ch_list[i],
                        out_channels=512,
                        kernel_size=1,
                        bias_attr=ParamAttr(trainable=True),
                        weight_attr=ParamAttr(trainable=True))))
        self.out_channels = 512

    def __call__(self, x):
        x = self.conv(x)
        fpn_list = []
        F = []
        for i in range(len(self.depth)):
            fpn_list.append(np.sum(self.depth[:i + 1]))

        for i, block in enumerate(self.block_list):
            x = block(x)
            for number in fpn_list:
                if i + 1 == number:
                    F.append(x)
        base = F[-1]

        j = 0
        for i, block in enumerate(self.base_block):
            if i % 3 == 0 and i < 6:
                j = j + 1
                b, c, w, h = F[-j - 1].shape
                if [w, h] != list(base.shape[2:]):
                    base = self.conv_trans[j - 1](base)
                    base = self.bn_block[j - 1](base)
                base = paddle.concat([base, F[-j - 1]], axis=1)
            base = block(base)
        return base

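# Reading aid for the FPN fusion above (added comment): fpn_list records the
# cumulative block counts, so F collects the feature map at the end of each
# stage; the base_block convs then walk backwards (i = -2, -3), concatenating
# each shallower stage with the running `base` along channels and fusing it
# with 1x1 -> 3x3 -> BN -> 1x1(512). Note that conv_trans/bn_block stay
# empty, so the shape-mismatch branch assumes stage shapes already agree.
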
class ConvBNLayer(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 groups=1,
                 act=None,
                 name=None):
        super(ConvBNLayer, self).__init__()
        self.conv = nn.Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=2 if stride == (1, 1) else kernel_size,
            dilation=2 if stride == (1, 1) else 1,
            stride=stride,
            padding=(kernel_size - 1) // 2,
            groups=groups,
            weight_attr=ParamAttr(name=name + '.conv2d.output.1.w_0'),
            bias_attr=False, )

        if name == "conv1":
            bn_name = "bn_" + name
        else:
            bn_name = "bn" + name[3:]
        self.bn = nn.BatchNorm(
            num_channels=out_channels,
            act=act,
            param_attr=ParamAttr(name=name + '.output.1.w_0'),
            bias_attr=ParamAttr(name=name + '.output.1.b_0'),
            moving_mean_name=bn_name + "_mean",
            moving_variance_name=bn_name + "_variance")

    def __call__(self, x):
        x = self.conv(x)
        x = self.bn(x)
        return x


class ShortCut(nn.Layer):
    def __init__(self, in_channels, out_channels, stride, name, is_first=False):
        super(ShortCut, self).__init__()
        self.use_conv = True

        if in_channels != out_channels or stride != 1 or is_first:
            if stride == (1, 1):
                self.conv = ConvBNLayer(
                    in_channels, out_channels, 1, 1, name=name)
            else:  # stride == (2, 2)
                self.conv = ConvBNLayer(
                    in_channels, out_channels, 1, stride, name=name)
        else:
            self.use_conv = False

    def forward(self, x):
        if self.use_conv:
            x = self.conv(x)
        return x


class BottleneckBlock(nn.Layer):
    def __init__(self, in_channels, out_channels, stride, name):
        super(BottleneckBlock, self).__init__()
        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            act='relu',
            name=name + "_branch2a")
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            act='relu',
            name=name + "_branch2b")

        self.conv2 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels * 4,
            kernel_size=1,
            act=None,
            name=name + "_branch2c")

        self.short = ShortCut(
            in_channels=in_channels,
            out_channels=out_channels * 4,
            stride=stride,
            is_first=False,
            name=name + "_branch1")
        self.out_channels = out_channels * 4

    def forward(self, x):
        y = self.conv0(x)
        y = self.conv1(y)
        y = self.conv2(y)
        y = y + self.short(x)
        y = F.relu(y)
        return y


class BasicBlock(nn.Layer):
    def __init__(self, in_channels, out_channels, stride, name, is_first):
        super(BasicBlock, self).__init__()
        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3,
            act='relu',
            stride=stride,
            name=name + "_branch2a")
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            act=None,
            name=name + "_branch2b")
        self.short = ShortCut(
            in_channels=in_channels,
            out_channels=out_channels,
            stride=stride,
            is_first=is_first,
            name=name + "_branch1")
        self.out_channels = out_channels

    def forward(self, x):
        y = self.conv0(x)
        y = self.conv1(y)
        y = y + self.short(x)
        return F.relu(y)
286
backend/ppocr/modeling/backbones/rec_resnet_vd.py
Normal file
@@ -0,0 +1,286 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
from paddle import ParamAttr
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
|
||||
__all__ = ["ResNet"]
|
||||
|
||||
|
||||
class ConvBNLayer(nn.Layer):
|
||||
def __init__(
|
||||
self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=1,
|
||||
groups=1,
|
||||
is_vd_mode=False,
|
||||
act=None,
|
||||
name=None, ):
|
||||
super(ConvBNLayer, self).__init__()
|
||||
|
||||
self.is_vd_mode = is_vd_mode
|
||||
self._pool2d_avg = nn.AvgPool2D(
|
||||
kernel_size=stride, stride=stride, padding=0, ceil_mode=True)
|
||||
self._conv = nn.Conv2D(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=kernel_size,
|
||||
stride=1 if is_vd_mode else stride,
|
||||
padding=(kernel_size - 1) // 2,
|
||||
groups=groups,
|
||||
weight_attr=ParamAttr(name=name + "_weights"),
|
||||
bias_attr=False)
|
||||
if name == "conv1":
|
||||
bn_name = "bn_" + name
|
||||
else:
|
||||
bn_name = "bn" + name[3:]
|
||||
self._batch_norm = nn.BatchNorm(
|
||||
out_channels,
|
||||
act=act,
|
||||
param_attr=ParamAttr(name=bn_name + '_scale'),
|
||||
bias_attr=ParamAttr(bn_name + '_offset'),
|
||||
moving_mean_name=bn_name + '_mean',
|
||||
moving_variance_name=bn_name + '_variance')
|
||||
|
||||
def forward(self, inputs):
|
||||
if self.is_vd_mode:
|
||||
inputs = self._pool2d_avg(inputs)
|
||||
y = self._conv(inputs)
|
||||
y = self._batch_norm(y)
|
||||
return y
|
||||
|
||||
|
||||
class BottleneckBlock(nn.Layer):
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
stride,
|
||||
shortcut=True,
|
||||
if_first=False,
|
||||
name=None):
|
||||
super(BottleneckBlock, self).__init__()
|
||||
|
||||
self.conv0 = ConvBNLayer(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=1,
|
||||
act='relu',
|
||||
name=name + "_branch2a")
|
||||
self.conv1 = ConvBNLayer(
|
||||
in_channels=out_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=3,
|
||||
stride=stride,
|
||||
act='relu',
|
||||
name=name + "_branch2b")
|
||||
self.conv2 = ConvBNLayer(
|
||||
in_channels=out_channels,
|
||||
out_channels=out_channels * 4,
|
||||
kernel_size=1,
|
||||
act=None,
|
||||
name=name + "_branch2c")
|
||||
|
||||
if not shortcut:
|
||||
self.short = ConvBNLayer(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels * 4,
|
||||
kernel_size=1,
|
||||
stride=stride,
|
||||
is_vd_mode=not if_first and stride[0] != 1,
|
||||
name=name + "_branch1")
|
||||
|
||||
self.shortcut = shortcut
|
||||
|
||||
def forward(self, inputs):
|
||||
y = self.conv0(inputs)
|
||||
|
||||
conv1 = self.conv1(y)
|
||||
conv2 = self.conv2(conv1)
|
||||
|
||||
if self.shortcut:
|
||||
short = inputs
|
||||
else:
|
||||
short = self.short(inputs)
|
||||
y = paddle.add(x=short, y=conv2)
|
||||
y = F.relu(y)
|
||||
return y
|
||||
|
||||
|
||||
class BasicBlock(nn.Layer):
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
stride,
|
||||
shortcut=True,
|
||||
if_first=False,
|
||||
name=None):
|
||||
super(BasicBlock, self).__init__()
|
||||
self.stride = stride
|
||||
self.conv0 = ConvBNLayer(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=3,
|
||||
stride=stride,
|
||||
act='relu',
|
||||
name=name + "_branch2a")
|
||||
self.conv1 = ConvBNLayer(
|
||||
in_channels=out_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=3,
|
||||
act=None,
|
||||
name=name + "_branch2b")
|
||||
|
||||
if not shortcut:
|
||||
self.short = ConvBNLayer(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=1,
|
||||
stride=stride,
|
||||
is_vd_mode=not if_first and stride[0] != 1,
|
||||
name=name + "_branch1")
|
||||
|
||||
self.shortcut = shortcut
|
||||
|
||||
def forward(self, inputs):
|
||||
y = self.conv0(inputs)
|
||||
conv1 = self.conv1(y)
|
||||
|
||||
if self.shortcut:
|
||||
short = inputs
|
||||
else:
|
||||
short = self.short(inputs)
|
||||
y = paddle.add(x=short, y=conv1)
|
||||
y = F.relu(y)
|
||||
return y
|
||||
|
||||
|
||||
class ResNet(nn.Layer):
    def __init__(self, in_channels=3, layers=50, **kwargs):
        super(ResNet, self).__init__()

        self.layers = layers
        supported_layers = [18, 34, 50, 101, 152, 200]
        assert layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(
                supported_layers, layers)

        if layers == 18:
            depth = [2, 2, 2, 2]
        elif layers == 34 or layers == 50:
            depth = [3, 4, 6, 3]
        elif layers == 101:
            depth = [3, 4, 23, 3]
        elif layers == 152:
            depth = [3, 8, 36, 3]
        elif layers == 200:
            depth = [3, 12, 48, 3]
        num_channels = [64, 256, 512,
                        1024] if layers >= 50 else [64, 64, 128, 256]
        num_filters = [64, 128, 256, 512]

        self.conv1_1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=32,
            kernel_size=3,
            stride=1,
            act='relu',
            name="conv1_1")
        self.conv1_2 = ConvBNLayer(
            in_channels=32,
            out_channels=32,
            kernel_size=3,
            stride=1,
            act='relu',
            name="conv1_2")
        self.conv1_3 = ConvBNLayer(
            in_channels=32,
            out_channels=64,
            kernel_size=3,
            stride=1,
            act='relu',
            name="conv1_3")
        self.pool2d_max = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)

        self.block_list = []
        if layers >= 50:
            for block in range(len(depth)):
                shortcut = False
                for i in range(depth[block]):
                    if layers in [101, 152, 200] and block == 2:
                        if i == 0:
                            conv_name = "res" + str(block + 2) + "a"
                        else:
                            conv_name = "res" + str(block + 2) + "b" + str(i)
                    else:
                        conv_name = "res" + str(block + 2) + chr(97 + i)

                    if i == 0 and block != 0:
                        stride = (2, 1)
                    else:
                        stride = (1, 1)
                    bottleneck_block = self.add_sublayer(
                        'bb_%d_%d' % (block, i),
                        BottleneckBlock(
                            in_channels=num_channels[block]
                            if i == 0 else num_filters[block] * 4,
                            out_channels=num_filters[block],
                            stride=stride,
                            shortcut=shortcut,
                            if_first=block == i == 0,
                            name=conv_name))
                    shortcut = True
                    self.block_list.append(bottleneck_block)
                self.out_channels = num_filters[block] * 4
        else:
            for block in range(len(depth)):
                shortcut = False
                for i in range(depth[block]):
                    conv_name = "res" + str(block + 2) + chr(97 + i)
                    if i == 0 and block != 0:
                        stride = (2, 1)
                    else:
                        stride = (1, 1)

                    basic_block = self.add_sublayer(
                        'bb_%d_%d' % (block, i),
                        BasicBlock(
                            in_channels=num_channels[block]
                            if i == 0 else num_filters[block],
                            out_channels=num_filters[block],
                            stride=stride,
                            shortcut=shortcut,
                            if_first=block == i == 0,
                            name=conv_name))
                    shortcut = True
                    self.block_list.append(basic_block)
                self.out_channels = num_filters[block]
        self.out_pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0)

    def forward(self, inputs):
        y = self.conv1_1(inputs)
        y = self.conv1_2(y)
        y = self.conv1_3(y)
        y = self.pool2d_max(y)
        for block in self.block_list:
            y = block(y)
        y = self.out_pool(y)
        return y
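

# Hedged usage sketch (not part of the original file): with layers=18 the
# stages use stride (2, 1), so only the feature-map height is reduced inside
# the blocks, and out_pool then halves both dimensions. Shapes below assume a
# standard 32x100 recognition input.
def _demo_rec_resnet_shape():
    import paddle
    backbone = ResNet(in_channels=3, layers=18)
    feat = backbone(paddle.randn([1, 3, 32, 100]))
    # Height: 32 -> 16 (max pool) -> 8 -> 4 -> 2 (strided stages) -> 1 (out_pool);
    # width: 100 -> 50 (max pool) -> 25 (out_pool); channels = 512 for layers=18.
    return feat.shape  # [1, 512, 1, 25]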

584
backend/ppocr/modeling/backbones/rec_svtrnet.py
Normal file
@@ -0,0 +1,584 @@
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import paddle
import paddle.nn as nn
from paddle import ParamAttr
from paddle.nn.initializer import Constant, KaimingNormal, Normal, TruncatedNormal

trunc_normal_ = TruncatedNormal(std=.02)
normal_ = Normal
zeros_ = Constant(value=0.)
ones_ = Constant(value=1.)


def drop_path(x, drop_prob=0., training=False):
    """Drop paths (Stochastic Depth) per sample, applied in the main path of
    residual blocks.

    The original name is misleading, as 'Drop Connect' is a different form of
    dropout from a separate paper; see the discussion at
    https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956
    """
    if drop_prob == 0. or not training:
        return x
    keep_prob = paddle.to_tensor(1 - drop_prob)
    shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)
    random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)
    random_tensor = paddle.floor(random_tensor)  # binarize
    output = x.divide(keep_prob) * random_tensor
    return output
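

# Minimal sketch (added for illustration, not in the original file) showing
# the stochastic-depth behaviour of drop_path: identity at eval time, and
# per-sample zeroing with 1/keep_prob rescaling at train time.
def _demo_drop_path():
    import paddle
    x = paddle.ones([4, 8, 64])  # (batch, tokens, dim)
    y_eval = drop_path(x, drop_prob=0.1, training=False)
    assert paddle.allclose(x, y_eval).item()  # no-op outside training
    y_train = drop_path(x, drop_prob=0.1, training=True)
    # Each sample is either all zeros or scaled by 1 / 0.9.
    return y_train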


class ConvBNLayer(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=3,
                 stride=1,
                 padding=0,
                 bias_attr=False,
                 groups=1,
                 act=nn.GELU):
        super().__init__()
        self.conv = nn.Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            weight_attr=paddle.ParamAttr(
                initializer=nn.initializer.KaimingUniform()),
            bias_attr=bias_attr)
        self.norm = nn.BatchNorm2D(out_channels)
        self.act = act()

    def forward(self, inputs):
        out = self.conv(inputs)
        out = self.norm(out)
        out = self.act(out)
        return out


class DropPath(nn.Layer):
    """Drop paths (Stochastic Depth) per sample, applied in the main path of
    residual blocks.
    """

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)


class Identity(nn.Layer):
    def __init__(self):
        super(Identity, self).__init__()

    def forward(self, input):
        return input


class Mlp(nn.Layer):
    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer=nn.GELU,
                 drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class ConvMixer(nn.Layer):
    def __init__(self,
                 dim,
                 num_heads=8,
                 HW=[8, 25],
                 local_k=[3, 3]):
        super().__init__()
        self.HW = HW
        self.dim = dim
        self.local_mixer = nn.Conv2D(
            dim,
            dim,
            local_k,
            1, [local_k[0] // 2, local_k[1] // 2],
            groups=num_heads,
            weight_attr=ParamAttr(initializer=KaimingNormal()))

    def forward(self, x):
        h = self.HW[0]
        w = self.HW[1]
        x = x.transpose([0, 2, 1]).reshape([0, self.dim, h, w])
        x = self.local_mixer(x)
        x = x.flatten(2).transpose([0, 2, 1])
        return x
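

# Hedged usage sketch (not in the original file): ConvMixer treats the token
# sequence as an HW[0] x HW[1] feature map and mixes it with a grouped
# convolution, preserving the sequence shape.
def _demo_conv_mixer():
    import paddle
    mixer = ConvMixer(dim=64, num_heads=8, HW=[8, 25], local_k=[3, 3])
    tokens = paddle.randn([2, 8 * 25, 64])
    return mixer(tokens).shape  # [2, 200, 64]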


class Attention(nn.Layer):
    def __init__(self,
                 dim,
                 num_heads=8,
                 mixer='Global',
                 HW=[8, 25],
                 local_k=[7, 11],
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.HW = HW
        if HW is not None:
            H = HW[0]
            W = HW[1]
            self.N = H * W
            self.C = dim
        if mixer == 'Local' and HW is not None:
            hk = local_k[0]
            wk = local_k[1]
            mask = paddle.ones([H * W, H + hk - 1, W + wk - 1], dtype='float32')
            for h in range(0, H):
                for w in range(0, W):
                    mask[h * W + w, h:h + hk, w:w + wk] = 0.
            mask_paddle = mask[:, hk // 2:H + hk // 2,
                               wk // 2:W + wk // 2].flatten(1)
            mask_inf = paddle.full([H * W, H * W], '-inf', dtype='float32')
            mask = paddle.where(mask_paddle < 1, mask_paddle, mask_inf)
            self.mask = mask.unsqueeze([0, 1])
        self.mixer = mixer

    def forward(self, x):
        if self.HW is not None:
            N = self.N
            C = self.C
        else:
            _, N, C = x.shape
        qkv = self.qkv(x).reshape((0, N, 3, self.num_heads,
                                   C // self.num_heads)).transpose(
                                       (2, 0, 3, 1, 4))
        q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]

        attn = (q.matmul(k.transpose((0, 1, 3, 2))))
        if self.mixer == 'Local':
            attn += self.mask
        attn = nn.functional.softmax(attn, axis=-1)
        attn = self.attn_drop(attn)

        x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((0, N, C))
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
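

# Hedged usage sketch (not in the original file): with mixer='Local' the
# precomputed additive mask confines each token's attention to an hk x wk
# neighbourhood on the HW grid; 'Global' is plain self-attention.
def _demo_attention():
    import paddle
    attn = Attention(dim=64, num_heads=4, mixer='Local',
                     HW=[8, 25], local_k=[7, 11])
    tokens = paddle.randn([2, 8 * 25, 64])
    return attn(tokens).shape  # [2, 200, 64]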


class Block(nn.Layer):
    def __init__(self,
                 dim,
                 num_heads,
                 mixer='Global',
                 local_mixer=[7, 11],
                 HW=[8, 25],
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer='nn.LayerNorm',
                 epsilon=1e-6,
                 prenorm=True):
        super().__init__()
        if isinstance(norm_layer, str):
            self.norm1 = eval(norm_layer)(dim, epsilon=epsilon)
        else:
            self.norm1 = norm_layer(dim)
        if mixer == 'Global' or mixer == 'Local':
            self.mixer = Attention(
                dim,
                num_heads=num_heads,
                mixer=mixer,
                HW=HW,
                local_k=local_mixer,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                attn_drop=attn_drop,
                proj_drop=drop)
        elif mixer == 'Conv':
            self.mixer = ConvMixer(
                dim, num_heads=num_heads, HW=HW, local_k=local_mixer)
        else:
            raise TypeError("The mixer must be one of [Global, Local, Conv]")

        self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
        if isinstance(norm_layer, str):
            self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)
        else:
            self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp_ratio = mlp_ratio
        self.mlp = Mlp(in_features=dim,
                       hidden_features=mlp_hidden_dim,
                       act_layer=act_layer,
                       drop=drop)
        self.prenorm = prenorm

    def forward(self, x):
        if self.prenorm:
            x = self.norm1(x + self.drop_path(self.mixer(x)))
            x = self.norm2(x + self.drop_path(self.mlp(x)))
        else:
            x = x + self.drop_path(self.mixer(self.norm1(x)))
            x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x
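

# Hedged usage sketch (not in the original file): a Block keeps the token
# shape, applying mixer + MLP with residual connections; prenorm=True here
# normalises after each residual sum, prenorm=False before each sub-layer.
def _demo_block():
    import paddle
    blk = Block(dim=64, num_heads=4, mixer='Global', HW=[8, 25])
    tokens = paddle.randn([2, 8 * 25, 64])
    return blk(tokens).shape  # [2, 200, 64]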


class PatchEmbed(nn.Layer):
    """Image to Patch Embedding."""

    def __init__(self,
                 img_size=[32, 100],
                 in_channels=3,
                 embed_dim=768,
                 sub_num=2):
        super().__init__()
        num_patches = (img_size[1] // (2 ** sub_num)) * \
            (img_size[0] // (2 ** sub_num))
        self.img_size = img_size
        self.num_patches = num_patches
        self.embed_dim = embed_dim
        self.norm = None
        if sub_num == 2:
            self.proj = nn.Sequential(
                ConvBNLayer(
                    in_channels=in_channels,
                    out_channels=embed_dim // 2,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    act=nn.GELU,
                    bias_attr=None),
                ConvBNLayer(
                    in_channels=embed_dim // 2,
                    out_channels=embed_dim,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    act=nn.GELU,
                    bias_attr=None))
        if sub_num == 3:
            self.proj = nn.Sequential(
                ConvBNLayer(
                    in_channels=in_channels,
                    out_channels=embed_dim // 4,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    act=nn.GELU,
                    bias_attr=None),
                ConvBNLayer(
                    in_channels=embed_dim // 4,
                    out_channels=embed_dim // 2,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    act=nn.GELU,
                    bias_attr=None),
                ConvBNLayer(
                    in_channels=embed_dim // 2,
                    out_channels=embed_dim,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    act=nn.GELU,
                    bias_attr=None))

    def forward(self, x):
        B, C, H, W = x.shape
        assert H == self.img_size[0] and W == self.img_size[1], \
            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
        x = self.proj(x).flatten(2).transpose((0, 2, 1))
        return x
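

# Hedged usage sketch (not in the original file): with sub_num=2 the two
# stride-2 convolutions shrink each spatial dimension 4x, so a 32x100 image
# becomes (32 // 4) * (100 // 4) = 200 embedded tokens.
def _demo_patch_embed():
    import paddle
    embed = PatchEmbed(img_size=[32, 100], in_channels=3,
                       embed_dim=64, sub_num=2)
    feats = embed(paddle.randn([1, 3, 32, 100]))
    return feats.shape  # [1, 200, 64]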


class SubSample(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 types='Pool',
                 stride=[2, 1],
                 sub_norm='nn.LayerNorm',
                 act=None):
        super().__init__()
        self.types = types
        if types == 'Pool':
            self.avgpool = nn.AvgPool2D(
                kernel_size=[3, 5], stride=stride, padding=[1, 2])
            self.maxpool = nn.MaxPool2D(
                kernel_size=[3, 5], stride=stride, padding=[1, 2])
            self.proj = nn.Linear(in_channels, out_channels)
        else:
            self.conv = nn.Conv2D(
                in_channels,
                out_channels,
                kernel_size=3,
                stride=stride,
                padding=1,
                weight_attr=ParamAttr(initializer=KaimingNormal()))
        self.norm = eval(sub_norm)(out_channels)
        if act is not None:
            self.act = act()
        else:
            self.act = None

    def forward(self, x):
        if self.types == 'Pool':
            x1 = self.avgpool(x)
            x2 = self.maxpool(x)
            x = (x1 + x2) * 0.5
            out = self.proj(x.flatten(2).transpose((0, 2, 1)))
        else:
            x = self.conv(x)
            out = x.flatten(2).transpose((0, 2, 1))
        out = self.norm(out)
        if self.act is not None:
            out = self.act(out)

        return out
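

# Hedged usage sketch (not in the original file): with stride=[2, 1] the
# height of the token grid is halved while the width is kept, and the channel
# count is projected from in_channels to out_channels.
def _demo_sub_sample():
    import paddle
    sub = SubSample(in_channels=64, out_channels=128,
                    types='Pool', stride=[2, 1])
    out = sub(paddle.randn([2, 64, 8, 25]))
    return out.shape  # [2, 4 * 25, 128]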


class SVTRNet(nn.Layer):
    def __init__(
            self,
            img_size=[32, 100],
            in_channels=3,
            embed_dim=[64, 128, 256],
            depth=[3, 6, 3],
            num_heads=[2, 4, 8],
            mixer=['Local'] * 6 + ['Global'] * 6,  # Local atten, Global atten, Conv
            local_mixer=[[7, 11], [7, 11], [7, 11]],
            patch_merging='Conv',  # Conv, Pool, None
            mlp_ratio=4,
            qkv_bias=True,
            qk_scale=None,
            drop_rate=0.,
            last_drop=0.1,
            attn_drop_rate=0.,
            drop_path_rate=0.1,
            norm_layer='nn.LayerNorm',
            sub_norm='nn.LayerNorm',
            epsilon=1e-6,
            out_channels=192,
            out_char_num=25,
            block_unit='Block',
            act='nn.GELU',
            last_stage=True,
            sub_num=2,
            prenorm=True,
            use_lenhead=False,
            **kwargs):
        super().__init__()
        self.img_size = img_size
        self.embed_dim = embed_dim
        self.out_channels = out_channels
        self.prenorm = prenorm
        patch_merging = None if patch_merging != 'Conv' and patch_merging != 'Pool' else patch_merging
        self.patch_embed = PatchEmbed(
            img_size=img_size,
            in_channels=in_channels,
            embed_dim=embed_dim[0],
            sub_num=sub_num)
        num_patches = self.patch_embed.num_patches
        self.HW = [img_size[0] // (2**sub_num), img_size[1] // (2**sub_num)]
        self.pos_embed = self.create_parameter(
            shape=[1, num_patches, embed_dim[0]], default_initializer=zeros_)
        self.add_parameter("pos_embed", self.pos_embed)
        self.pos_drop = nn.Dropout(p=drop_rate)
        Block_unit = eval(block_unit)

        dpr = np.linspace(0, drop_path_rate, sum(depth))
        self.blocks1 = nn.LayerList([
            Block_unit(
                dim=embed_dim[0],
                num_heads=num_heads[0],
                mixer=mixer[0:depth[0]][i],
                HW=self.HW,
                local_mixer=local_mixer[0],
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                act_layer=eval(act),
                attn_drop=attn_drop_rate,
                drop_path=dpr[0:depth[0]][i],
                norm_layer=norm_layer,
                epsilon=epsilon,
                prenorm=prenorm) for i in range(depth[0])
        ])
        if patch_merging is not None:
            self.sub_sample1 = SubSample(
                embed_dim[0],
                embed_dim[1],
                sub_norm=sub_norm,
                stride=[2, 1],
                types=patch_merging)
            HW = [self.HW[0] // 2, self.HW[1]]
        else:
            HW = self.HW
        self.patch_merging = patch_merging
        self.blocks2 = nn.LayerList([
            Block_unit(
                dim=embed_dim[1],
                num_heads=num_heads[1],
                mixer=mixer[depth[0]:depth[0] + depth[1]][i],
                HW=HW,
                local_mixer=local_mixer[1],
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                act_layer=eval(act),
                attn_drop=attn_drop_rate,
                drop_path=dpr[depth[0]:depth[0] + depth[1]][i],
                norm_layer=norm_layer,
                epsilon=epsilon,
                prenorm=prenorm) for i in range(depth[1])
        ])
        if patch_merging is not None:
            self.sub_sample2 = SubSample(
                embed_dim[1],
                embed_dim[2],
                sub_norm=sub_norm,
                stride=[2, 1],
                types=patch_merging)
            HW = [self.HW[0] // 4, self.HW[1]]
        else:
            HW = self.HW
        self.blocks3 = nn.LayerList([
            Block_unit(
                dim=embed_dim[2],
                num_heads=num_heads[2],
                mixer=mixer[depth[0] + depth[1]:][i],
                HW=HW,
                local_mixer=local_mixer[2],
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                act_layer=eval(act),
                attn_drop=attn_drop_rate,
                drop_path=dpr[depth[0] + depth[1]:][i],
                norm_layer=norm_layer,
                epsilon=epsilon,
                prenorm=prenorm) for i in range(depth[2])
        ])
        self.last_stage = last_stage
        if last_stage:
            self.avg_pool = nn.AdaptiveAvgPool2D([1, out_char_num])
            self.last_conv = nn.Conv2D(
                in_channels=embed_dim[2],
                out_channels=self.out_channels,
                kernel_size=1,
                stride=1,
                padding=0,
                bias_attr=False)
            self.hardswish = nn.Hardswish()
            self.dropout = nn.Dropout(p=last_drop, mode="downscale_in_infer")
        if not prenorm:
            self.norm = eval(norm_layer)(embed_dim[-1], epsilon=epsilon)
        self.use_lenhead = use_lenhead
        if use_lenhead:
            self.len_conv = nn.Linear(embed_dim[2], self.out_channels)
            self.hardswish_len = nn.Hardswish()
            self.dropout_len = nn.Dropout(
                p=last_drop, mode="downscale_in_infer")

        trunc_normal_(self.pos_embed)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight)
            if isinstance(m, nn.Linear) and m.bias is not None:
                zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            zeros_(m.bias)
            ones_(m.weight)

    def forward_features(self, x):
        x = self.patch_embed(x)
        x = x + self.pos_embed
        x = self.pos_drop(x)
        for blk in self.blocks1:
            x = blk(x)
        if self.patch_merging is not None:
            x = self.sub_sample1(
                x.transpose([0, 2, 1]).reshape(
                    [0, self.embed_dim[0], self.HW[0], self.HW[1]]))
        for blk in self.blocks2:
            x = blk(x)
        if self.patch_merging is not None:
            x = self.sub_sample2(
                x.transpose([0, 2, 1]).reshape(
                    [0, self.embed_dim[1], self.HW[0] // 2, self.HW[1]]))
        for blk in self.blocks3:
            x = blk(x)
        if not self.prenorm:
            x = self.norm(x)
        return x

    def forward(self, x):
        x = self.forward_features(x)
        if self.use_lenhead:
            len_x = self.len_conv(x.mean(1))
            len_x = self.dropout_len(self.hardswish_len(len_x))
        if self.last_stage:
            if self.patch_merging is not None:
                h = self.HW[0] // 4
            else:
                h = self.HW[0]
            x = self.avg_pool(
                x.transpose([0, 2, 1]).reshape(
                    [0, self.embed_dim[2], h, self.HW[1]]))
            x = self.last_conv(x)
            x = self.hardswish(x)
            x = self.dropout(x)
        if self.use_lenhead:
            return x, len_x
        return x
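

# Hedged end-to-end sketch (not in the original file), using the defaults
# above: three stages with [2, 1]-strided merging reduce the 8x25 token grid
# to 2x25, and the last stage pools it to out_char_num=25 steps with
# out_channels=192 channels.
def _demo_svtrnet():
    import paddle
    model = SVTRNet()
    model.eval()
    feat = model(paddle.randn([1, 3, 32, 100]))
    return feat.shape  # [1, 192, 1, 25]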

172
backend/ppocr/modeling/backbones/vqa_layoutlm.py
Normal file
@@ -0,0 +1,172 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
from paddle import nn

from paddlenlp.transformers import LayoutXLMModel, LayoutXLMForTokenClassification, LayoutXLMForRelationExtraction
from paddlenlp.transformers import LayoutLMModel, LayoutLMForTokenClassification
from paddlenlp.transformers import LayoutLMv2Model, LayoutLMv2ForTokenClassification, LayoutLMv2ForRelationExtraction

__all__ = [
    "LayoutLMForSer", "LayoutLMv2ForSer", "LayoutLMv2ForRe",
    "LayoutXLMForSer", "LayoutXLMForRe"
]

pretrained_model_dict = {
    LayoutXLMModel: 'layoutxlm-base-uncased',
    LayoutLMModel: 'layoutlm-base-uncased',
    LayoutLMv2Model: 'layoutlmv2-base-uncased'
}


class NLPBaseModel(nn.Layer):
    def __init__(self,
                 base_model_class,
                 model_class,
                 type='ser',
                 pretrained=True,
                 checkpoints=None,
                 **kwargs):
        super(NLPBaseModel, self).__init__()
        if checkpoints is not None:
            self.model = model_class.from_pretrained(checkpoints)
        else:
            pretrained_model_name = pretrained_model_dict[base_model_class]
            if pretrained:
                base_model = base_model_class.from_pretrained(
                    pretrained_model_name)
            else:
                base_model = base_model_class(
                    **base_model_class.pretrained_init_configuration[
                        pretrained_model_name])
            if type == 'ser':
                self.model = model_class(
                    base_model, num_classes=kwargs['num_classes'], dropout=None)
            else:
                self.model = model_class(base_model, dropout=None)
        self.out_channels = 1


class LayoutLMForSer(NLPBaseModel):
    def __init__(self, num_classes, pretrained=True, checkpoints=None,
                 **kwargs):
        super(LayoutLMForSer, self).__init__(
            LayoutLMModel,
            LayoutLMForTokenClassification,
            'ser',
            pretrained,
            checkpoints,
            num_classes=num_classes)

    def forward(self, x):
        x = self.model(
            input_ids=x[0],
            bbox=x[2],
            attention_mask=x[4],
            token_type_ids=x[5],
            position_ids=None,
            output_hidden_states=False)
        return x


class LayoutLMv2ForSer(NLPBaseModel):
    def __init__(self, num_classes, pretrained=True, checkpoints=None,
                 **kwargs):
        super(LayoutLMv2ForSer, self).__init__(
            LayoutLMv2Model,
            LayoutLMv2ForTokenClassification,
            'ser',
            pretrained,
            checkpoints,
            num_classes=num_classes)

    def forward(self, x):
        x = self.model(
            input_ids=x[0],
            bbox=x[2],
            image=x[3],
            attention_mask=x[4],
            token_type_ids=x[5],
            position_ids=None,
            head_mask=None,
            labels=None)
        return x[0]


class LayoutXLMForSer(NLPBaseModel):
    def __init__(self, num_classes, pretrained=True, checkpoints=None,
                 **kwargs):
        super(LayoutXLMForSer, self).__init__(
            LayoutXLMModel,
            LayoutXLMForTokenClassification,
            'ser',
            pretrained,
            checkpoints,
            num_classes=num_classes)

    def forward(self, x):
        x = self.model(
            input_ids=x[0],
            bbox=x[2],
            image=x[3],
            attention_mask=x[4],
            token_type_ids=x[5],
            position_ids=None,
            head_mask=None,
            labels=None)
        return x[0]
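

# Hedged usage sketch (not in the original file; the class count is
# illustrative): building a SER backbone fetches the named pretrained weights
# via paddlenlp unless explicit checkpoints are given.
def _demo_layoutxlm_ser():
    return LayoutXLMForSer(num_classes=7, pretrained=True)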


class LayoutLMv2ForRe(NLPBaseModel):
    def __init__(self, pretrained=True, checkpoints=None, **kwargs):
        super(LayoutLMv2ForRe, self).__init__(LayoutLMv2Model,
                                              LayoutLMv2ForRelationExtraction,
                                              're', pretrained, checkpoints)

    def forward(self, x):
        x = self.model(
            input_ids=x[0],
            bbox=x[1],
            labels=None,
            image=x[2],
            attention_mask=x[3],
            token_type_ids=x[4],
            position_ids=None,
            head_mask=None,
            entities=x[5],
            relations=x[6])
        return x


class LayoutXLMForRe(NLPBaseModel):
    def __init__(self, pretrained=True, checkpoints=None, **kwargs):
        super(LayoutXLMForRe, self).__init__(LayoutXLMModel,
                                             LayoutXLMForRelationExtraction,
                                             're', pretrained, checkpoints)

    def forward(self, x):
        x = self.model(
            input_ids=x[0],
            bbox=x[1],
            labels=None,
            image=x[2],
            attention_mask=x[3],
            token_type_ids=x[4],
            position_ids=None,
            head_mask=None,
            entities=x[5],
            relations=x[6])
        return x