mirror of
https://github.com/YaoFANGUK/video-subtitle-remover.git
synced 2026-03-17 02:17:31 +08:00
init
This commit is contained in:
58
backend/ppocr/modeling/heads/__init__.py
Executable file
58
backend/ppocr/modeling/heads/__init__.py
Executable file
@@ -0,0 +1,58 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
__all__ = ['build_head']


def build_head(config):
    """Build an OCR model head from a configuration dict.

    Args:
        config (dict): head configuration. Must contain a ``'name'`` key
            naming one of the supported head classes; every remaining item
            is forwarded to that class's constructor as a keyword argument.
            Note: ``config`` is mutated (``'name'`` is popped).

    Returns:
        The instantiated head layer.

    Raises:
        AssertionError: if ``config['name']`` is not a supported head.
    """
    # det head
    from .det_db_head import DBHead
    from .det_east_head import EASTHead
    from .det_sast_head import SASTHead
    from .det_pse_head import PSEHead
    from .det_fce_head import FCEHead
    from .e2e_pg_head import PGHead

    # rec head
    from .rec_ctc_head import CTCHead
    from .rec_att_head import AttentionHead
    from .rec_srn_head import SRNHead
    from .rec_nrtr_head import Transformer
    from .rec_sar_head import SARHead
    from .rec_aster_head import AsterHead
    from .rec_pren_head import PRENHead
    from .rec_multi_head import MultiHead

    # cls head
    from .cls_head import ClsHead

    # kie head
    from .kie_sdmgr_head import SDMGRHead

    # table head
    from .table_att_head import TableAttentionHead

    # Explicit name -> class mapping instead of eval(): a malformed or
    # hostile config can no longer execute arbitrary expressions, and the
    # supported set is visible in one place.
    support_dict = {
        'DBHead': DBHead,
        'PSEHead': PSEHead,
        'FCEHead': FCEHead,
        'EASTHead': EASTHead,
        'SASTHead': SASTHead,
        'CTCHead': CTCHead,
        'ClsHead': ClsHead,
        'AttentionHead': AttentionHead,
        'SRNHead': SRNHead,
        'PGHead': PGHead,
        'Transformer': Transformer,
        'TableAttentionHead': TableAttentionHead,
        'SARHead': SARHead,
        'AsterHead': AsterHead,
        'SDMGRHead': SDMGRHead,
        'PRENHead': PRENHead,
        'MultiHead': MultiHead,
    }

    module_name = config.pop('name')
    # Keep AssertionError (same type callers would have seen), but give it a
    # plain string message: the original passed an Exception *instance* as
    # the assert message, which is never raised, only repr()'d.
    assert module_name in support_dict, 'head only support {}'.format(
        list(support_dict))
    module_class = support_dict[module_name](**config)
    return module_class
|
||||
52
backend/ppocr/modeling/heads/cls_head.py
Normal file
52
backend/ppocr/modeling/heads/cls_head.py
Normal file
@@ -0,0 +1,52 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import math
|
||||
import paddle
|
||||
from paddle import nn, ParamAttr
|
||||
import paddle.nn.functional as F
|
||||
|
||||
|
||||
class ClsHead(nn.Layer):
    """Classification head for text-direction (orientation) prediction.

    Global-average-pools the feature map to one vector per sample and
    projects it to ``class_dim`` logits; softmax is applied only at
    inference time.

    Args:
        in_channels (int): channels of the input feature map.
        class_dim (int): number of orientation classes.
    """

    def __init__(self, in_channels, class_dim, **kwargs):
        super(ClsHead, self).__init__()
        self.pool = nn.AdaptiveAvgPool2D(1)
        # Classic 1/sqrt(fan_in) uniform initialization range.
        bound = 1.0 / math.sqrt(in_channels * 1.0)
        self.fc = nn.Linear(
            in_channels,
            class_dim,
            weight_attr=ParamAttr(
                name="fc_0.w_0",
                initializer=nn.initializer.Uniform(-bound, bound)),
            bias_attr=ParamAttr(name="fc_0.b_0"), )

    def forward(self, x, targets=None):
        pooled = self.pool(x)
        # Collapse the 1x1 spatial dims: (N, C, 1, 1) -> (N, C).
        flat = paddle.reshape(pooled, shape=[pooled.shape[0], pooled.shape[1]])
        logits = self.fc(flat)
        if self.training:
            return logits
        # Inference path returns normalized class probabilities.
        return F.softmax(logits, axis=1)
|
||||
118
backend/ppocr/modeling/heads/det_db_head.py
Normal file
118
backend/ppocr/modeling/heads/det_db_head.py
Normal file
@@ -0,0 +1,118 @@
|
||||
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import math
|
||||
import paddle
|
||||
from paddle import nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle import ParamAttr
|
||||
|
||||
|
||||
def get_bias_attr(k):
    """Return a bias ParamAttr initialized uniformly in [-1/sqrt(k), 1/sqrt(k)].

    Args:
        k (int): fan-in used to scale the initialization range.
    """
    bound = 1.0 / math.sqrt(k * 1.0)
    uniform = paddle.nn.initializer.Uniform(-bound, bound)
    return ParamAttr(initializer=uniform)
|
||||
|
||||
|
||||
class Head(nn.Layer):
    """Per-map prediction head used by DBHead.

    A 3x3 conv reduces channels to ``in_channels // 4``, two stride-2
    transposed convs upsample 4x down to a single-channel map, and a
    sigmoid squashes it to [0, 1].

    Args:
        in_channels (int): channels of the input feature map.
        name_list (list[str]): legacy layer names; accepted for interface
            compatibility but not used by this implementation.
        kernel_list (list[int]): kernel sizes of the three conv stages.
    """

    def __init__(self, in_channels, name_list, kernel_list=[3, 2, 2], **kwargs):
        super(Head, self).__init__()

        # Stage 1: 3x3 conv, channel reduction; BN carries the ReLU.
        self.conv1 = nn.Conv2D(
            in_channels=in_channels,
            out_channels=in_channels // 4,
            kernel_size=kernel_list[0],
            padding=int(kernel_list[0] // 2),
            weight_attr=ParamAttr(),
            bias_attr=False)
        self.conv_bn1 = nn.BatchNorm(
            num_channels=in_channels // 4,
            param_attr=ParamAttr(
                initializer=paddle.nn.initializer.Constant(value=1.0)),
            bias_attr=ParamAttr(
                initializer=paddle.nn.initializer.Constant(value=1e-4)),
            act='relu')
        # Stage 2: 2x upsample via transposed conv.
        self.conv2 = nn.Conv2DTranspose(
            in_channels=in_channels // 4,
            out_channels=in_channels // 4,
            kernel_size=kernel_list[1],
            stride=2,
            weight_attr=ParamAttr(
                initializer=paddle.nn.initializer.KaimingUniform()),
            bias_attr=get_bias_attr(in_channels // 4))
        self.conv_bn2 = nn.BatchNorm(
            num_channels=in_channels // 4,
            param_attr=ParamAttr(
                initializer=paddle.nn.initializer.Constant(value=1.0)),
            bias_attr=ParamAttr(
                initializer=paddle.nn.initializer.Constant(value=1e-4)),
            act="relu")
        # Stage 3: 2x upsample to a single-channel prediction map.
        self.conv3 = nn.Conv2DTranspose(
            in_channels=in_channels // 4,
            out_channels=1,
            kernel_size=kernel_list[2],
            stride=2,
            weight_attr=ParamAttr(
                initializer=paddle.nn.initializer.KaimingUniform()),
            bias_attr=get_bias_attr(in_channels // 4), )

    def forward(self, x):
        """Return a single-channel map in [0, 1], spatially 4x the input."""
        x = self.conv1(x)
        x = self.conv_bn1(x)
        x = self.conv2(x)
        x = self.conv_bn2(x)
        x = self.conv3(x)
        x = F.sigmoid(x)
        return x
|
||||
|
||||
|
||||
class DBHead(nn.Layer):
    """Differentiable Binarization (DB) head for text detection.

    See https://arxiv.org/abs/1911.08947

    Args:
        in_channels (int): channels of the input feature map.
        k (int): amplification factor of the differentiable step function.
    """

    def __init__(self, in_channels, k=50, **kwargs):
        super(DBHead, self).__init__()
        self.k = k
        # Legacy layer-name lists, kept for interface compatibility with
        # older checkpoints; Head's constructor does not use them.
        binarize_name_list = [
            'conv2d_56', 'batch_norm_47', 'conv2d_transpose_0', 'batch_norm_48',
            'conv2d_transpose_1', 'binarize'
        ]
        thresh_name_list = [
            'conv2d_57', 'batch_norm_49', 'conv2d_transpose_2', 'batch_norm_50',
            'conv2d_transpose_3', 'thresh'
        ]
        self.binarize = Head(in_channels, binarize_name_list, **kwargs)
        self.thresh = Head(in_channels, thresh_name_list, **kwargs)

    def step_function(self, x, y):
        """Differentiable binarization: sigmoid-like 1 / (1 + exp(-k(x-y)))."""
        return paddle.reciprocal(1 + paddle.exp(-self.k * (x - y)))

    def forward(self, x, targets=None):
        shrink_maps = self.binarize(x)
        # Inference only needs the shrink (probability) map.
        if not self.training:
            return {'maps': shrink_maps}
        threshold_maps = self.thresh(x)
        binary_maps = self.step_function(shrink_maps, threshold_maps)
        fused = paddle.concat([shrink_maps, threshold_maps, binary_maps],
                              axis=1)
        return {'maps': fused}
|
||||
121
backend/ppocr/modeling/heads/det_east_head.py
Normal file
121
backend/ppocr/modeling/heads/det_east_head.py
Normal file
@@ -0,0 +1,121 @@
|
||||
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import math
|
||||
import paddle
|
||||
from paddle import nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle import ParamAttr
|
||||
|
||||
|
||||
class ConvBNLayer(nn.Layer):
    """Conv2D + BatchNorm block (EAST head variant).

    The activation, when requested, is applied inside BatchNorm via its
    ``act`` argument; ``self.if_act`` is stored but not consulted in
    ``forward``.

    Args:
        in_channels (int): input channels.
        out_channels (int): output channels.
        kernel_size (int): conv kernel size.
        stride (int): conv stride.
        padding (int): conv padding.
        groups (int): conv groups.
        if_act (bool): kept for interface compatibility; see note above.
        act (str|None): activation name passed through to BatchNorm.
        name (str): prefix used to name the conv/BN parameters.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 groups=1,
                 if_act=True,
                 act=None,
                 name=None):
        super(ConvBNLayer, self).__init__()
        self.if_act = if_act
        self.act = act
        self.conv = nn.Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            weight_attr=ParamAttr(name=name + '_weights'),
            bias_attr=False)

        # Explicit parameter names keep state-dict keys stable for
        # pretrained checkpoints.
        self.bn = nn.BatchNorm(
            num_channels=out_channels,
            act=act,
            param_attr=ParamAttr(name="bn_" + name + "_scale"),
            bias_attr=ParamAttr(name="bn_" + name + "_offset"),
            moving_mean_name="bn_" + name + "_mean",
            moving_variance_name="bn_" + name + "_variance")

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        return x
|
||||
|
||||
|
||||
class EASTHead(nn.Layer):
    """EAST text-detection head.

    Two shared conv stages feed a 1-channel score branch (sigmoid
    probability map) and an 8-channel geometry branch whose outputs are
    rescaled to [-800, 800].

    Args:
        in_channels (int): channels of the input feature map.
        model_name (str): "large" selects wider intermediate convs.
    """
    def __init__(self, in_channels, model_name, **kwargs):
        super(EASTHead, self).__init__()
        self.model_name = model_name
        # [det_conv1, det_conv2, score, geo] output channel counts.
        if self.model_name == "large":
            num_outputs = [128, 64, 1, 8]
        else:
            num_outputs = [64, 32, 1, 8]

        self.det_conv1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=num_outputs[0],
            kernel_size=3,
            stride=1,
            padding=1,
            if_act=True,
            act='relu',
            name="det_head1")
        self.det_conv2 = ConvBNLayer(
            in_channels=num_outputs[0],
            out_channels=num_outputs[1],
            kernel_size=3,
            stride=1,
            padding=1,
            if_act=True,
            act='relu',
            name="det_head2")
        self.score_conv = ConvBNLayer(
            in_channels=num_outputs[1],
            out_channels=num_outputs[2],
            kernel_size=1,
            stride=1,
            padding=0,
            if_act=False,
            act=None,
            name="f_score")
        self.geo_conv = ConvBNLayer(
            in_channels=num_outputs[1],
            out_channels=num_outputs[3],
            kernel_size=1,
            stride=1,
            padding=0,
            if_act=False,
            act=None,
            name="f_geo")

    def forward(self, x, targets=None):
        """Return dict with 'f_score' (probability) and 'f_geo' (geometry)."""
        f_det = self.det_conv1(x)
        f_det = self.det_conv2(f_det)
        f_score = self.score_conv(f_det)
        f_score = F.sigmoid(f_score)
        f_geo = self.geo_conv(f_det)
        # Map sigmoid output to [-800, 800]; 800 presumably bounds the
        # geometry offsets in pixels — TODO confirm against the EAST paper.
        f_geo = (F.sigmoid(f_geo) - 0.5) * 2 * 800

        pred = {'f_score': f_score, 'f_geo': f_geo}
        return pred
|
||||
99
backend/ppocr/modeling/heads/det_fce_head.py
Normal file
99
backend/ppocr/modeling/heads/det_fce_head.py
Normal file
@@ -0,0 +1,99 @@
|
||||
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
This code is refer from:
|
||||
https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/dense_heads/fce_head.py
|
||||
"""
|
||||
|
||||
from paddle import nn
|
||||
from paddle import ParamAttr
|
||||
import paddle.nn.functional as F
|
||||
from paddle.nn.initializer import Normal
|
||||
import paddle
|
||||
from functools import partial
|
||||
|
||||
|
||||
def multi_apply(func, *args, **kwargs):
    """Apply ``func`` element-wise over the zipped positional iterables.

    Any keyword arguments are bound to ``func`` first. ``func`` is expected
    to return a tuple per element; the per-element tuples are transposed so
    each return position becomes its own list.

    Returns:
        tuple[list, ...]: one list per return position of ``func``.
    """
    bound = partial(func, **kwargs) if kwargs else func
    per_item = (bound(*grouped) for grouped in zip(*args))
    return tuple(list(column) for column in zip(*per_item))
|
||||
|
||||
|
||||
class FCEHead(nn.Layer):
    """The class for implementing FCENet head.

    FCENet(CVPR2021): Fourier Contour Embedding for Arbitrary-shaped Text
    Detection.

    [https://arxiv.org/abs/2104.10442]

    Args:
        in_channels (int): The number of input channels.
        fourier_degree (int): The maximum Fourier transform degree k.
    """

    def __init__(self, in_channels, fourier_degree=5):
        super().__init__()
        assert isinstance(in_channels, int)

        self.downsample_ratio = 1.0
        self.in_channels = in_channels
        self.fourier_degree = fourier_degree
        # 4 = 2 text-region logits + 2 text-center-line logits.
        self.out_channels_cls = 4
        # 2k+1 complex Fourier coefficients, stored as (real, imag) pairs.
        self.out_channels_reg = (2 * self.fourier_degree + 1) * 2

        self.out_conv_cls = nn.Conv2D(
            in_channels=self.in_channels,
            out_channels=self.out_channels_cls,
            kernel_size=3,
            stride=1,
            padding=1,
            groups=1,
            weight_attr=ParamAttr(
                name='cls_weights',
                initializer=Normal(
                    mean=0., std=0.01)),
            bias_attr=True)
        self.out_conv_reg = nn.Conv2D(
            in_channels=self.in_channels,
            out_channels=self.out_channels_reg,
            kernel_size=3,
            stride=1,
            padding=1,
            groups=1,
            weight_attr=ParamAttr(
                name='reg_weights',
                initializer=Normal(
                    mean=0., std=0.01)),
            bias_attr=True)

    def forward(self, feats, targets=None):
        """Run the head on every pyramid level.

        Training returns raw per-level (cls, reg) pairs under 'levels';
        inference returns per-level softmaxed cls maps concatenated with
        the regression map under 'level_{i}' keys.
        """
        cls_res, reg_res = multi_apply(self.forward_single, feats)
        level_num = len(cls_res)
        outs = {}
        if not self.training:
            for i in range(level_num):
                # Channels 0:2 are text-region logits, 2: are
                # text-center-line logits; softmax each pair separately.
                tr_pred = F.softmax(cls_res[i][:, 0:2, :, :], axis=1)
                tcl_pred = F.softmax(cls_res[i][:, 2:, :, :], axis=1)
                outs['level_{}'.format(i)] = paddle.concat(
                    [tr_pred, tcl_pred, reg_res[i]], axis=1)
        else:
            preds = [[cls_res[i], reg_res[i]] for i in range(level_num)]
            outs['levels'] = preds
        return outs

    def forward_single(self, x):
        """Return (classification map, regression map) for one level."""
        cls_predict = self.out_conv_cls(x)
        reg_predict = self.out_conv_reg(x)
        return cls_predict, reg_predict
|
||||
37
backend/ppocr/modeling/heads/det_pse_head.py
Normal file
37
backend/ppocr/modeling/heads/det_pse_head.py
Normal file
@@ -0,0 +1,37 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
This code is refer from:
|
||||
https://github.com/whai362/PSENet/blob/python3/models/head/psenet_head.py
|
||||
"""
|
||||
|
||||
from paddle import nn
|
||||
|
||||
|
||||
class PSEHead(nn.Layer):
    """PSENet segmentation head.

    Refer to:
    https://github.com/whai362/PSENet/blob/python3/models/head/psenet_head.py

    Args:
        in_channels (int): channels of the input feature map.
        hidden_dim (int): channels of the intermediate 3x3 conv.
        out_channels (int): number of predicted kernel maps.
    """

    def __init__(self, in_channels, hidden_dim=256, out_channels=7, **kwargs):
        super(PSEHead, self).__init__()
        self.conv1 = nn.Conv2D(
            in_channels, hidden_dim, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2D(hidden_dim)
        self.relu1 = nn.ReLU()

        self.conv2 = nn.Conv2D(
            hidden_dim, out_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, x, **kwargs):
        # conv -> BN -> ReLU, then a 1x1 projection to the kernel maps.
        hidden = self.relu1(self.bn1(self.conv1(x)))
        return {'maps': self.conv2(hidden)}
|
||||
128
backend/ppocr/modeling/heads/det_sast_head.py
Normal file
128
backend/ppocr/modeling/heads/det_sast_head.py
Normal file
@@ -0,0 +1,128 @@
|
||||
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import math
|
||||
import paddle
|
||||
from paddle import nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle import ParamAttr
|
||||
|
||||
|
||||
class ConvBNLayer(nn.Layer):
    """Conv2D + BatchNorm block (SAST head variant).

    Padding is derived as ``(kernel_size - 1) // 2`` ("same" padding for
    odd kernels). The activation, when requested, is applied inside
    BatchNorm via its ``act`` argument; ``self.if_act`` is stored but not
    consulted in ``forward``.

    Args:
        in_channels (int): input channels.
        out_channels (int): output channels.
        kernel_size (int): conv kernel size.
        stride (int): conv stride.
        groups (int): conv groups.
        if_act (bool): kept for interface compatibility; see note above.
        act (str|None): activation name passed through to BatchNorm.
        name (str): prefix used to name the conv/BN parameters.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 groups=1,
                 if_act=True,
                 act=None,
                 name=None):
        super(ConvBNLayer, self).__init__()
        self.if_act = if_act
        self.act = act
        self.conv = nn.Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=(kernel_size - 1) // 2,
            groups=groups,
            weight_attr=ParamAttr(name=name + '_weights'),
            bias_attr=False)

        # Explicit parameter names keep state-dict keys stable for
        # pretrained checkpoints.
        self.bn = nn.BatchNorm(
            num_channels=out_channels,
            act=act,
            param_attr=ParamAttr(name="bn_" + name + "_scale"),
            bias_attr=ParamAttr(name="bn_" + name + "_offset"),
            moving_mean_name="bn_" + name + "_mean",
            moving_variance_name="bn_" + name + "_variance")

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        return x
|
||||
|
||||
|
||||
class SAST_Header1(nn.Layer):
    """First SAST sub-head: score (1-channel, sigmoid) and border (4-channel)
    branches, each a 4-stage conv stack."""

    def __init__(self, in_channels, **kwargs):
        super(SAST_Header1, self).__init__()
        out_channels = [64, 64, 128]
        self.score_conv = nn.Sequential(
            ConvBNLayer(in_channels, out_channels[0], 1, 1, act='relu', name='f_score1'),
            ConvBNLayer(out_channels[0], out_channels[1], 3, 1, act='relu', name='f_score2'),
            ConvBNLayer(out_channels[1], out_channels[2], 1, 1, act='relu', name='f_score3'),
            ConvBNLayer(out_channels[2], 1, 3, 1, act=None, name='f_score4')
        )
        self.border_conv = nn.Sequential(
            ConvBNLayer(in_channels, out_channels[0], 1, 1, act='relu', name='f_border1'),
            ConvBNLayer(out_channels[0], out_channels[1], 3, 1, act='relu', name='f_border2'),
            ConvBNLayer(out_channels[1], out_channels[2], 1, 1, act='relu', name='f_border3'),
            ConvBNLayer(out_channels[2], 4, 3, 1, act=None, name='f_border4')
        )

    def forward(self, x):
        """Return (f_score, f_border); only f_score is sigmoid-normalized."""
        f_score = self.score_conv(x)
        f_score = F.sigmoid(f_score)
        f_border = self.border_conv(x)
        return f_score, f_border
|
||||
|
||||
|
||||
class SAST_Header2(nn.Layer):
    """Second SAST sub-head: tvo (8-channel) and tco (2-channel) regression
    branches, each a 4-stage conv stack with no output activation."""

    def __init__(self, in_channels, **kwargs):
        super(SAST_Header2, self).__init__()
        out_channels = [64, 64, 128]
        self.tvo_conv = nn.Sequential(
            ConvBNLayer(in_channels, out_channels[0], 1, 1, act='relu', name='f_tvo1'),
            ConvBNLayer(out_channels[0], out_channels[1], 3, 1, act='relu', name='f_tvo2'),
            ConvBNLayer(out_channels[1], out_channels[2], 1, 1, act='relu', name='f_tvo3'),
            ConvBNLayer(out_channels[2], 8, 3, 1, act=None, name='f_tvo4')
        )
        self.tco_conv = nn.Sequential(
            ConvBNLayer(in_channels, out_channels[0], 1, 1, act='relu', name='f_tco1'),
            ConvBNLayer(out_channels[0], out_channels[1], 3, 1, act='relu', name='f_tco2'),
            ConvBNLayer(out_channels[1], out_channels[2], 1, 1, act='relu', name='f_tco3'),
            ConvBNLayer(out_channels[2], 2, 3, 1, act=None, name='f_tco4')
        )

    def forward(self, x):
        """Return (f_tvo, f_tco) raw regression maps."""
        f_tvo = self.tvo_conv(x)
        f_tco = self.tco_conv(x)
        return f_tvo, f_tco
|
||||
|
||||
|
||||
class SASTHead(nn.Layer):
    """SAST detection head.

    Combines the score/border sub-head (SAST_Header1) with the tvo/tco
    sub-head (SAST_Header2) over the same input feature map.

    Args:
        in_channels (int): channels of the input feature map.
    """
    def __init__(self, in_channels, **kwargs):
        super(SASTHead, self).__init__()

        self.head1 = SAST_Header1(in_channels)
        self.head2 = SAST_Header2(in_channels)

    def forward(self, x, targets=None):
        f_score, f_border = self.head1(x)
        f_tvo, f_tco = self.head2(x)

        return {
            'f_score': f_score,
            'f_border': f_border,
            'f_tvo': f_tvo,
            'f_tco': f_tco,
        }
|
||||
253
backend/ppocr/modeling/heads/e2e_pg_head.py
Normal file
253
backend/ppocr/modeling/heads/e2e_pg_head.py
Normal file
@@ -0,0 +1,253 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import math
|
||||
import paddle
|
||||
from paddle import nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle import ParamAttr
|
||||
|
||||
|
||||
class ConvBNLayer(nn.Layer):
    """Conv2D + BatchNorm block (PGNet head variant).

    Unlike the sibling variants, BatchNorm is built with
    ``use_global_stats=False``. The activation, when requested, is applied
    inside BatchNorm via its ``act`` argument; ``self.if_act`` is stored
    but not consulted in ``forward``.

    Args:
        in_channels (int): input channels.
        out_channels (int): output channels.
        kernel_size (int): conv kernel size.
        stride (int): conv stride.
        padding (int): conv padding.
        groups (int): conv groups.
        if_act (bool): kept for interface compatibility; see note above.
        act (str|None): activation name passed through to BatchNorm.
        name (str): prefix used to name the conv/BN parameters.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 groups=1,
                 if_act=True,
                 act=None,
                 name=None):
        super(ConvBNLayer, self).__init__()
        self.if_act = if_act
        self.act = act
        self.conv = nn.Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            weight_attr=ParamAttr(name=name + '_weights'),
            bias_attr=False)

        # Explicit parameter names keep state-dict keys stable for
        # pretrained checkpoints.
        self.bn = nn.BatchNorm(
            num_channels=out_channels,
            act=act,
            param_attr=ParamAttr(name="bn_" + name + "_scale"),
            bias_attr=ParamAttr(name="bn_" + name + "_offset"),
            moving_mean_name="bn_" + name + "_mean",
            moving_variance_name="bn_" + name + "_variance",
            use_global_stats=False)

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        return x
|
||||
|
||||
|
||||
class PGHead(nn.Layer):
    """PGNet end-to-end head.

    Runs four parallel branches over the same feature map:
    score (1 ch, sigmoid), border (4 ch), character classification
    (37 ch — presumably 36 alphanumerics + blank; TODO confirm against the
    PGNet dict), and direction (2 ch).

    Args:
        in_channels (int): channels of the input feature map.
    """

    def __init__(self, in_channels, **kwargs):
        super(PGHead, self).__init__()
        # --- score branch: 1x1 -> 3x3 -> 1x1 ConvBN, then bare 3x3 conv ---
        self.conv_f_score1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=64,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_score{}".format(1))
        self.conv_f_score2 = ConvBNLayer(
            in_channels=64,
            out_channels=64,
            kernel_size=3,
            stride=1,
            padding=1,
            act='relu',
            name="conv_f_score{}".format(2))
        self.conv_f_score3 = ConvBNLayer(
            in_channels=64,
            out_channels=128,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_score{}".format(3))

        self.conv1 = nn.Conv2D(
            in_channels=128,
            out_channels=1,
            kernel_size=3,
            stride=1,
            padding=1,
            groups=1,
            weight_attr=ParamAttr(name="conv_f_score{}".format(4)),
            bias_attr=False)

        # --- border branch ("boder" spelling kept: parameter names depend
        # on it for checkpoint compatibility) ---
        self.conv_f_boder1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=64,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_boder{}".format(1))
        self.conv_f_boder2 = ConvBNLayer(
            in_channels=64,
            out_channels=64,
            kernel_size=3,
            stride=1,
            padding=1,
            act='relu',
            name="conv_f_boder{}".format(2))
        self.conv_f_boder3 = ConvBNLayer(
            in_channels=64,
            out_channels=128,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_boder{}".format(3))
        self.conv2 = nn.Conv2D(
            in_channels=128,
            out_channels=4,
            kernel_size=3,
            stride=1,
            padding=1,
            groups=1,
            weight_attr=ParamAttr(name="conv_f_boder{}".format(4)),
            bias_attr=False)
        # --- character branch: deeper (5 ConvBN stages) and wider ---
        self.conv_f_char1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=128,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_char{}".format(1))
        self.conv_f_char2 = ConvBNLayer(
            in_channels=128,
            out_channels=128,
            kernel_size=3,
            stride=1,
            padding=1,
            act='relu',
            name="conv_f_char{}".format(2))
        self.conv_f_char3 = ConvBNLayer(
            in_channels=128,
            out_channels=256,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_char{}".format(3))
        self.conv_f_char4 = ConvBNLayer(
            in_channels=256,
            out_channels=256,
            kernel_size=3,
            stride=1,
            padding=1,
            act='relu',
            name="conv_f_char{}".format(4))
        self.conv_f_char5 = ConvBNLayer(
            in_channels=256,
            out_channels=256,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_char{}".format(5))
        self.conv3 = nn.Conv2D(
            in_channels=256,
            out_channels=37,
            kernel_size=3,
            stride=1,
            padding=1,
            groups=1,
            weight_attr=ParamAttr(name="conv_f_char{}".format(6)),
            bias_attr=False)

        # --- direction branch ---
        self.conv_f_direc1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=64,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_direc{}".format(1))
        self.conv_f_direc2 = ConvBNLayer(
            in_channels=64,
            out_channels=64,
            kernel_size=3,
            stride=1,
            padding=1,
            act='relu',
            name="conv_f_direc{}".format(2))
        self.conv_f_direc3 = ConvBNLayer(
            in_channels=64,
            out_channels=128,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_direc{}".format(3))
        self.conv4 = nn.Conv2D(
            in_channels=128,
            out_channels=2,
            kernel_size=3,
            stride=1,
            padding=1,
            groups=1,
            weight_attr=ParamAttr(name="conv_f_direc{}".format(4)),
            bias_attr=False)

    def forward(self, x, targets=None):
        """Return dict with 'f_score', 'f_border', 'f_char', 'f_direction'.

        Only f_score is sigmoid-normalized; the other branches are raw.
        """
        f_score = self.conv_f_score1(x)
        f_score = self.conv_f_score2(f_score)
        f_score = self.conv_f_score3(f_score)
        f_score = self.conv1(f_score)
        f_score = F.sigmoid(f_score)

        # f_border
        f_border = self.conv_f_boder1(x)
        f_border = self.conv_f_boder2(f_border)
        f_border = self.conv_f_boder3(f_border)
        f_border = self.conv2(f_border)

        # f_char
        f_char = self.conv_f_char1(x)
        f_char = self.conv_f_char2(f_char)
        f_char = self.conv_f_char3(f_char)
        f_char = self.conv_f_char4(f_char)
        f_char = self.conv_f_char5(f_char)
        f_char = self.conv3(f_char)

        # f_direction
        f_direction = self.conv_f_direc1(x)
        f_direction = self.conv_f_direc2(f_direction)
        f_direction = self.conv_f_direc3(f_direction)
        f_direction = self.conv4(f_direction)

        predicts = {}
        predicts['f_score'] = f_score
        predicts['f_border'] = f_border
        predicts['f_char'] = f_char
        predicts['f_direction'] = f_direction
        return predicts
|
||||
207
backend/ppocr/modeling/heads/kie_sdmgr_head.py
Normal file
207
backend/ppocr/modeling/heads/kie_sdmgr_head.py
Normal file
@@ -0,0 +1,207 @@
|
||||
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# reference from : https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/kie/heads/sdmgr_head.py
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import math
|
||||
import paddle
|
||||
from paddle import nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle import ParamAttr
|
||||
|
||||
|
||||
class SDMGRHead(nn.Layer):
    """Spatial Dual-Modality Graph Reasoning (SDMGR) head for KIE.

    Embeds per-node character sequences with an LSTM, optionally fuses
    visual features, then refines node/edge representations with GNN
    layers (``GNNLayer``, defined below) before classifying nodes and
    edges. ``Block`` (the fusion module) is defined elsewhere in this
    package.

    Args:
        in_channels (int): accepted for builder-interface compatibility;
            not referenced in this implementation.
        num_chars (int): character vocabulary size for the embedding.
        visual_dim (int): visual feature dim fed to the fusion Block.
        fusion_dim (int): hidden dim of the fusion Block.
        node_input (int): char embedding dim (LSTM input size).
        node_embed (int): node feature dim.
        edge_input (int): raw relation feature dim.
        edge_embed (int): edge feature dim.
        num_gnn (int): number of stacked GNN layers.
        num_classes (int): node classification classes.
        bidirectional (bool): halves the LSTM hidden size when True
            (NOTE(review): nn.LSTM is built without a direction argument,
            so only the hidden size changes — confirm this is intended).
    """

    def __init__(self,
                 in_channels,
                 num_chars=92,
                 visual_dim=16,
                 fusion_dim=1024,
                 node_input=32,
                 node_embed=256,
                 edge_input=5,
                 edge_embed=256,
                 num_gnn=2,
                 num_classes=26,
                 bidirectional=False):
        super().__init__()

        self.fusion = Block([visual_dim, node_embed], node_embed, fusion_dim)
        # padding_idx=0: index 0 embeds to zeros.
        self.node_embed = nn.Embedding(num_chars, node_input, 0)
        hidden = node_embed // 2 if bidirectional else node_embed
        self.rnn = nn.LSTM(
            input_size=node_input, hidden_size=hidden, num_layers=1)
        self.edge_embed = nn.Linear(edge_input, edge_embed)
        self.gnn_layers = nn.LayerList(
            [GNNLayer(node_embed, edge_embed) for _ in range(num_gnn)])
        self.node_cls = nn.Linear(node_embed, num_classes)
        self.edge_cls = nn.Linear(edge_embed, 2)

    def forward(self, input, targets):
        """Return (node_cls, edge_cls) logits.

        Args:
            input: (relations, texts, x) — per-sample relation tensors,
                per-sample character-index tensors (presumably padded with
                -1; inferred from the `> -1` mask — TODO confirm), and an
                optional visual feature tensor.
            targets: unused here; kept for the common head interface.
        """
        relations, texts, x = input
        node_nums, char_nums = [], []
        for text in texts:
            node_nums.append(text.shape[0])
            # Count valid (non-padding) characters per node.
            char_nums.append(paddle.sum((text > -1).astype(int), axis=-1))

        # Right-pad every sample's text tensor to the longest sequence,
        # then stack all nodes from all samples into one batch.
        max_num = max([char_num.max() for char_num in char_nums])
        all_nodes = paddle.concat([
            paddle.concat(
                [text, paddle.zeros(
                    (text.shape[0], max_num - text.shape[1]))], -1)
            for text in texts
        ])
        # clip(min=0) maps the -1 padding onto embedding index 0.
        temp = paddle.clip(all_nodes, min=0).astype(int)
        embed_nodes = self.node_embed(temp)
        rnn_nodes, _ = self.rnn(embed_nodes)

        # For each node with at least one character, select the LSTM output
        # at its last valid timestep via a one-hot gather, and scatter it
        # into the per-node feature matrix.
        b, h, w = rnn_nodes.shape
        nodes = paddle.zeros([b, w])
        all_nums = paddle.concat(char_nums)
        valid = paddle.nonzero((all_nums > 0).astype(int))
        temp_all_nums = (
            paddle.gather(all_nums, valid) - 1).unsqueeze(-1).unsqueeze(-1)
        temp_all_nums = paddle.expand(temp_all_nums, [
            temp_all_nums.shape[0], temp_all_nums.shape[1], rnn_nodes.shape[-1]
        ])
        temp_all_nodes = paddle.gather(rnn_nodes, valid)
        N, C, A = temp_all_nodes.shape
        one_hot = F.one_hot(
            temp_all_nums[:, 0, :], num_classes=C).transpose([0, 2, 1])
        one_hot = paddle.multiply(
            temp_all_nodes, one_hot.astype("float32")).sum(axis=1, keepdim=True)
        t = one_hot.expand([N, 1, A]).squeeze(1)
        nodes = paddle.scatter(nodes, valid.squeeze(1), t)

        # Fuse visual features when provided.
        if x is not None:
            nodes = self.fusion([x, nodes])

        # Flatten all pairwise relations into one edge batch and embed.
        all_edges = paddle.concat(
            [rel.reshape([-1, rel.shape[-1]]) for rel in relations])
        embed_edges = self.edge_embed(all_edges.astype('float32'))
        embed_edges = F.normalize(embed_edges)

        for gnn_layer in self.gnn_layers:
            nodes, cat_nodes = gnn_layer(nodes, embed_edges, node_nums)

        node_cls, edge_cls = self.node_cls(nodes), self.edge_cls(cat_nodes)
        return node_cls, edge_cls
|
||||
|
||||
|
||||
class GNNLayer(nn.Layer):
    """One graph message-passing layer.

    For every ordered node pair inside a sample it concatenates
    (src_node, dst_node, edge_feature), computes an attention
    coefficient, and adds the attention-weighted aggregation back onto
    the node features as a residual update.
    """

    def __init__(self, node_dim=256, edge_dim=256):
        super().__init__()
        # Pair feature = src node + dst node + edge -> node_dim.
        self.in_fc = nn.Linear(node_dim * 2 + edge_dim, node_dim)
        self.coef_fc = nn.Linear(node_dim, 1)
        self.out_fc = nn.Linear(node_dim, node_dim)
        self.relu = nn.ReLU()

    def forward(self, nodes, edges, nums):
        """Args: flat node features, flat edge features, and per-sample
        node counts ``nums`` (each sample contributes num**2 ordered
        pairs/edges). Returns ``[updated_nodes, pair_features]``."""
        # Build all ordered (src, dst) node-pair features per sample.
        start, cat_nodes = 0, []
        for num in nums:
            sample_nodes = nodes[start:start + num]
            cat_nodes.append(
                paddle.concat([
                    paddle.expand(sample_nodes.unsqueeze(1), [-1, num, -1]),
                    paddle.expand(sample_nodes.unsqueeze(0), [num, -1, -1])
                ], -1).reshape([num**2, -1]))
            start += num
        cat_nodes = paddle.concat([paddle.concat(cat_nodes), edges], -1)
        cat_nodes = self.relu(self.in_fc(cat_nodes))
        coefs = self.coef_fc(cat_nodes)

        # Attention-weighted aggregation over neighbours; the -1e9 on the
        # diagonal masks out self-loops before the softmax.
        start, residuals = 0, []
        for num in nums:
            residual = F.softmax(
                -paddle.eye(num).unsqueeze(-1) * 1e9 +
                coefs[start:start + num**2].reshape([num, num, -1]), 1)
            residuals.append((residual * cat_nodes[start:start + num**2]
                              .reshape([num, num, -1])).sum(1))
            start += num**2

        # Residual update of the node features.
        nodes += self.relu(self.out_fc(paddle.concat(residuals)))
        return [nodes, cat_nodes]
|
||||
|
||||
|
||||
class Block(nn.Layer):
    """Chunked low-rank bilinear fusion of two input vectors.

    Both inputs are projected to a shared ``mm_dim`` space, split into
    ``chunks`` pieces, and each pair of pieces is merged with a rank-
    ``rank`` bilinear interaction. Signed square-root and L2
    normalization are applied either per-chunk (``pos_norm='before_cat'``)
    or once after concatenation (``'after_cat'``), followed by a final
    linear projection to ``output_dim``.
    """

    def __init__(self,
                 input_dims,
                 output_dim,
                 mm_dim=1600,
                 chunks=20,
                 rank=15,
                 shared=False,
                 dropout_input=0.,
                 dropout_pre_lin=0.,
                 dropout_output=0.,
                 pos_norm='before_cat'):
        super().__init__()
        self.rank = rank
        self.dropout_input = dropout_input
        self.dropout_pre_lin = dropout_pre_lin
        self.dropout_output = dropout_output
        assert (pos_norm in ['before_cat', 'after_cat'])
        self.pos_norm = pos_norm
        # Modules
        self.linear0 = nn.Linear(input_dims[0], mm_dim)
        # shared=True reuses one projection (and merge layers) for both inputs.
        self.linear1 = (self.linear0
                        if shared else nn.Linear(input_dims[1], mm_dim))
        self.merge_linears0 = nn.LayerList()
        self.merge_linears1 = nn.LayerList()
        self.chunks = self.chunk_sizes(mm_dim, chunks)
        for size in self.chunks:
            # Each chunk gets its own rank-expanding merge projection.
            ml0 = nn.Linear(size, size * rank)
            self.merge_linears0.append(ml0)
            ml1 = ml0 if shared else nn.Linear(size, size * rank)
            self.merge_linears1.append(ml1)
        self.linear_out = nn.Linear(mm_dim, output_dim)

    def forward(self, x):
        """Fuse ``x[0]`` and ``x[1]`` (a two-element list of tensors) and
        return the fused ``output_dim`` vector."""
        x0 = self.linear0(x[0])
        x1 = self.linear1(x[1])
        bs = x1.shape[0]
        if self.dropout_input > 0:
            x0 = F.dropout(x0, p=self.dropout_input, training=self.training)
            x1 = F.dropout(x1, p=self.dropout_input, training=self.training)
        x0_chunks = paddle.split(x0, self.chunks, -1)
        x1_chunks = paddle.split(x1, self.chunks, -1)
        zs = []
        for x0_c, x1_c, m0, m1 in zip(x0_chunks, x1_chunks, self.merge_linears0,
                                      self.merge_linears1):
            m = m0(x0_c) * m1(x1_c)  # bs x split_size*rank
            # Sum over the rank dimension -> low-rank bilinear interaction.
            m = m.reshape([bs, self.rank, -1])
            z = paddle.sum(m, 1)
            if self.pos_norm == 'before_cat':
                # Signed square-root then L2-normalize each chunk.
                z = paddle.sqrt(F.relu(z)) - paddle.sqrt(F.relu(-z))
                z = F.normalize(z)
            zs.append(z)
        z = paddle.concat(zs, 1)
        if self.pos_norm == 'after_cat':
            z = paddle.sqrt(F.relu(z)) - paddle.sqrt(F.relu(-z))
            z = F.normalize(z)

        if self.dropout_pre_lin > 0:
            z = F.dropout(z, p=self.dropout_pre_lin, training=self.training)
        z = self.linear_out(z)
        if self.dropout_output > 0:
            z = F.dropout(z, p=self.dropout_output, training=self.training)
        return z

    def chunk_sizes(self, dim, chunks):
        """Split ``dim`` into ``chunks`` near-equal sizes; the last chunk
        absorbs the rounding remainder so the sizes sum to ``dim``."""
        split_size = (dim + chunks - 1) // chunks
        sizes_list = [split_size] * chunks
        sizes_list[-1] = sizes_list[-1] - (sum(sizes_list) - dim)
        return sizes_list
|
||||
163
backend/ppocr/modeling/heads/multiheadAttention.py
Executable file
163
backend/ppocr/modeling/heads/multiheadAttention.py
Executable file
@@ -0,0 +1,163 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import paddle
|
||||
from paddle import nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle.nn import Linear
|
||||
from paddle.nn.initializer import XavierUniform as xavier_uniform_
|
||||
from paddle.nn.initializer import Constant as constant_
|
||||
from paddle.nn.initializer import XavierNormal as xavier_normal_
|
||||
|
||||
# Initializer helpers (module-level, reused by head layers).
zeros_ = constant_(value=0.)
ones_ = constant_(value=1.)


class MultiheadAttention(nn.Layer):
    r"""Allows the model to jointly attend to information
    from different representation subspaces.
    See reference: Attention Is All You Need

    .. math::
        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
        \text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)

    Args:
        embed_dim: total dimension of the model
        num_heads: parallel attention layers, or heads

    """

    def __init__(self,
                 embed_dim,
                 num_heads,
                 dropout=0.,
                 bias=True,
                 add_bias_kv=False,
                 add_zero_attn=False):
        super(MultiheadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
        # Scale factor 1/sqrt(d_k) applied to the queries.
        self.scaling = self.head_dim**-0.5
        self.out_proj = Linear(embed_dim, embed_dim, bias_attr=bias)
        self._reset_parameters()
        # Q/K/V input projections implemented as 1x1 convolutions
        # (see _in_proj_q/_in_proj_k/_in_proj_v below).
        self.conv1 = paddle.nn.Conv2D(
            in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))
        self.conv2 = paddle.nn.Conv2D(
            in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))
        self.conv3 = paddle.nn.Conv2D(
            in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))

    def _reset_parameters(self):
        # Xavier-uniform init for the output projection weight.
        xavier_uniform_(self.out_proj.weight)

    def forward(self,
                query,
                key,
                value,
                key_padding_mask=None,
                incremental_state=None,
                attn_mask=None):
        """
        Inputs of forward function
            query: [target length, batch size, embed dim]
            key: [sequence length, batch size, embed dim]
            value: [sequence length, batch size, embed dim]
            key_padding_mask: if True, mask padding based on batch size
            incremental_state: if provided, previous time steps are cashed
            need_weights: output attn_output_weights
            static_kv: key and value are static

        Outputs of forward function
            attn_output: [target length, batch size, embed dim]
            attn_output_weights: [batch size, target length, sequence length]
        """
        q_shape = paddle.shape(query)
        src_shape = paddle.shape(key)
        q = self._in_proj_q(query)
        k = self._in_proj_k(key)
        v = self._in_proj_v(value)
        q *= self.scaling
        # Reshape [T, B, E] -> [B, heads, T, head_dim] for all three.
        q = paddle.transpose(
            paddle.reshape(
                q, [q_shape[0], q_shape[1], self.num_heads, self.head_dim]),
            [1, 2, 0, 3])
        k = paddle.transpose(
            paddle.reshape(
                k, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]),
            [1, 2, 0, 3])
        v = paddle.transpose(
            paddle.reshape(
                v, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]),
            [1, 2, 0, 3])
        if key_padding_mask is not None:
            assert key_padding_mask.shape[0] == q_shape[1]
            assert key_padding_mask.shape[1] == src_shape[0]
        # Scaled dot-product scores: [B, heads, T, S].
        attn_output_weights = paddle.matmul(q,
                                            paddle.transpose(k, [0, 1, 3, 2]))
        if attn_mask is not None:
            attn_mask = paddle.unsqueeze(paddle.unsqueeze(attn_mask, 0), 0)
            attn_output_weights += attn_mask
        if key_padding_mask is not None:
            attn_output_weights = paddle.reshape(
                attn_output_weights,
                [q_shape[1], self.num_heads, q_shape[0], src_shape[0]])
            # Masked key positions get -inf so softmax assigns them 0 weight.
            key = paddle.unsqueeze(paddle.unsqueeze(key_padding_mask, 1), 2)
            key = paddle.cast(key, 'float32')
            y = paddle.full(
                shape=paddle.shape(key), dtype='float32', fill_value='-inf')
            y = paddle.where(key == 0., key, y)
            attn_output_weights += y
        attn_output_weights = F.softmax(
            attn_output_weights.astype('float32'),
            axis=-1,
            dtype=paddle.float32 if attn_output_weights.dtype == paddle.float16
            else attn_output_weights.dtype)
        attn_output_weights = F.dropout(
            attn_output_weights, p=self.dropout, training=self.training)

        attn_output = paddle.matmul(attn_output_weights, v)
        # Back to [T, B, E] and apply the output projection.
        attn_output = paddle.reshape(
            paddle.transpose(attn_output, [2, 0, 1, 3]),
            [q_shape[0], q_shape[1], self.embed_dim])
        attn_output = self.out_proj(attn_output)

        return attn_output

    def _in_proj_q(self, query):
        # [T, B, E] -> [B, E, 1, T] -> conv1x1 -> back to [T, B, E].
        query = paddle.transpose(query, [1, 2, 0])
        query = paddle.unsqueeze(query, axis=2)
        res = self.conv1(query)
        res = paddle.squeeze(res, axis=2)
        res = paddle.transpose(res, [2, 0, 1])
        return res

    def _in_proj_k(self, key):
        # Same layout dance as _in_proj_q, using conv2.
        key = paddle.transpose(key, [1, 2, 0])
        key = paddle.unsqueeze(key, axis=2)
        res = self.conv2(key)
        res = paddle.squeeze(res, axis=2)
        res = paddle.transpose(res, [2, 0, 1])
        return res

    def _in_proj_v(self, value):
        # Same layout dance as _in_proj_q, using conv3.
        value = paddle.transpose(value, [1, 2, 0])  #(1, 2, 0)
        value = paddle.unsqueeze(value, axis=2)
        res = self.conv3(value)
        res = paddle.squeeze(res, axis=2)
        res = paddle.transpose(res, [2, 0, 1])
        return res
|
||||
393
backend/ppocr/modeling/heads/rec_aster_head.py
Normal file
393
backend/ppocr/modeling/heads/rec_aster_head.py
Normal file
@@ -0,0 +1,393 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
This code is refer from:
|
||||
https://github.com/ayumiymk/aster.pytorch/blob/master/lib/models/attention_recognition_head.py
|
||||
"""
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
|
||||
import paddle
|
||||
from paddle import nn
|
||||
from paddle.nn import functional as F
|
||||
|
||||
|
||||
class AsterHead(nn.Layer):
    """ASTER recognition head: an attention decoder plus a semantic
    embedding branch.

    In training mode the decoder is teacher-forced with the target
    sequence; in inference mode beam search decoding is used.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 sDim,
                 attDim,
                 max_len_labels,
                 time_step=25,
                 beam_width=5,
                 **kwargs):
        super(AsterHead, self).__init__()
        self.num_classes = out_channels
        self.in_planes = in_channels
        self.sDim = sDim
        self.attDim = attDim
        self.max_len_labels = max_len_labels
        self.decoder = AttentionRecognitionHead(in_channels, out_channels, sDim,
                                                attDim, max_len_labels)
        self.time_step = time_step
        self.embeder = Embedding(self.time_step, in_channels)
        self.beam_width = beam_width
        # EOS symbol id used by beam search (num_classes - 3 in this
        # label-space layout).
        self.eos = self.num_classes - 3

    def forward(self, x, targets=None, embed=None):
        """Args:
            x: encoder feature sequence.
            targets: training-time tuple ``(rec_targets, rec_lengths, _)``.
            embed: unused; the embedding is recomputed from ``x``.

        Returns:
            dict with 'rec_pred', 'embedding_vectors' and, at inference
            time, 'rec_pred_scores'.
        """
        return_dict = {}
        embedding_vectors = self.embeder(x)

        if self.training:
            rec_targets, rec_lengths, _ = targets
            rec_pred = self.decoder([x, rec_targets, rec_lengths],
                                    embedding_vectors)
            return_dict['rec_pred'] = rec_pred
            return_dict['embedding_vectors'] = embedding_vectors
        else:
            rec_pred, rec_pred_scores = self.decoder.beam_search(
                x, self.beam_width, self.eos, embedding_vectors)
            return_dict['rec_pred'] = rec_pred
            return_dict['rec_pred_scores'] = rec_pred_scores
            return_dict['embedding_vectors'] = embedding_vectors

        return return_dict
|
||||
|
||||
|
||||
class Embedding(nn.Layer):
    """Project the flattened encoder output (``in_timestep`` steps of
    ``in_planes`` channels each) into a single ``embed_dim``-dimensional,
    word-embedding-like vector."""

    def __init__(self, in_timestep, in_planes, mid_dim=4096, embed_dim=300):
        super(Embedding, self).__init__()
        self.in_timestep = in_timestep
        self.in_planes = in_planes
        self.mid_dim = mid_dim
        self.embed_dim = embed_dim
        # Embed encoder output to a word-embedding-like vector.
        self.eEmbed = nn.Linear(in_timestep * in_planes, self.embed_dim)

    def forward(self, x):
        # Flatten everything after the batch dimension, then project.
        flattened = paddle.reshape(x, [paddle.shape(x)[0], -1])
        return self.eEmbed(flattened)
|
||||
|
||||
|
||||
class AttentionRecognitionHead(nn.Layer):
    """
    input: [b x 16 x 64 x in_planes]
    output: probability sequence: [b x T x num_classes]
    """

    def __init__(self, in_channels, out_channels, sDim, attDim, max_len_labels):
        super(AttentionRecognitionHead, self).__init__()
        self.num_classes = out_channels  # this is the output classes. So it includes the <EOS>.
        self.in_planes = in_channels
        self.sDim = sDim
        self.attDim = attDim
        self.max_len_labels = max_len_labels

        self.decoder = DecoderUnit(
            sDim=sDim, xDim=in_channels, yDim=self.num_classes, attDim=attDim)

    def forward(self, x, embed):
        """Teacher-forced decoding (training): feed the ground-truth
        previous symbol at every step. ``x`` is ``(features, targets,
        lengths)``; returns logits of shape [b, max(lengths), classes]."""
        x, targets, lengths = x
        batch_size = paddle.shape(x)[0]
        # Decoder
        state = self.decoder.get_initial_state(embed)
        outputs = []
        for i in range(max(lengths)):
            if i == 0:
                # First step is fed the <BOS> id (num_classes).
                y_prev = paddle.full(
                    shape=[batch_size], fill_value=self.num_classes)
            else:
                y_prev = targets[:, i - 1]
            output, state = self.decoder(x, state, y_prev)
            outputs.append(output)
        outputs = paddle.concat([_.unsqueeze(1) for _ in outputs], 1)
        return outputs

    # inference stage.
    def sample(self, x):
        """Greedy decoding: at each step feed back the argmax symbol.

        Returns ``(predicted_ids, predicted_scores)`` of shape
        [b, max_len_labels].
        """
        x, _, _ = x
        # bugfix: paddle tensors have no .size(0) method (torch-ism).
        batch_size = paddle.shape(x)[0]
        # Decoder
        state = paddle.zeros([1, batch_size, self.sDim])

        predicted_ids, predicted_scores = [], []
        predicted = None
        for i in range(self.max_len_labels):
            if i == 0:
                y_prev = paddle.full(
                    shape=[batch_size], fill_value=self.num_classes)
            else:
                y_prev = predicted

            output, state = self.decoder(x, state, y_prev)
            output = F.softmax(output, axis=1)
            # bugfix: paddle's Tensor.max returns values only (no indices
            # tuple as in torch); take score and index separately.
            score = paddle.max(output, axis=1)
            predicted = paddle.argmax(output, axis=1)
            predicted_ids.append(predicted.unsqueeze(1))
            predicted_scores.append(score.unsqueeze(1))
        # bugfix: concat takes (tensor_list, axis) — the original passed
        # [list, 1] as the tensor list, which raises at runtime.
        predicted_ids = paddle.concat(predicted_ids, 1)
        predicted_scores = paddle.concat(predicted_scores, 1)
        return predicted_ids, predicted_scores

    def beam_search(self, x, beam_width, eos, embed):
        """Beam-search decoding; returns the best sequence per sample and
        a placeholder score tensor of ones."""

        def _inflate(tensor, times, dim):
            # Repeat `tensor` `times` times along `dim`.
            repeat_dims = [1] * tensor.dim()
            repeat_dims[dim] = times
            output = paddle.tile(tensor, repeat_dims)
            return output

        # https://github.com/IBM/pytorch-seq2seq/blob/fede87655ddce6c94b38886089e05321dc9802af/seq2seq/models/TopKDecoder.py
        batch_size, l, d = x.shape
        # Inflate the encoder features beam_width times per sample.
        x = paddle.tile(
            paddle.transpose(
                x.unsqueeze(1), perm=[1, 0, 2, 3]), [beam_width, 1, 1, 1])
        inflated_encoder_feats = paddle.reshape(
            paddle.transpose(
                x, perm=[1, 0, 2, 3]), [-1, l, d])

        # Initialize the decoder
        state = self.decoder.get_initial_state(embed, tile_times=beam_width)

        pos_index = paddle.reshape(
            paddle.arange(batch_size) * beam_width, shape=[-1, 1])

        # Initialize the scores: only the first beam of each sample starts
        # at 0, the rest at -inf so they are not selected initially.
        sequence_scores = paddle.full(
            shape=[batch_size * beam_width, 1], fill_value=-float('Inf'))
        index = [i * beam_width for i in range(0, batch_size)]
        sequence_scores[index] = 0.0

        # Initialize the input vector with the <BOS> id (num_classes).
        y_prev = paddle.full(
            shape=[batch_size * beam_width], fill_value=self.num_classes)

        # Store decisions for backtracking
        stored_scores = list()
        stored_predecessors = list()
        stored_emitted_symbols = list()

        for i in range(self.max_len_labels):
            output, state = self.decoder(inflated_encoder_feats, state, y_prev)
            state = paddle.unsqueeze(state, axis=0)
            log_softmax_output = paddle.nn.functional.log_softmax(
                output, axis=1)

            sequence_scores = _inflate(sequence_scores, self.num_classes, 1)
            sequence_scores += log_softmax_output
            scores, candidates = paddle.topk(
                paddle.reshape(sequence_scores, [batch_size, -1]),
                beam_width,
                axis=1)

            # Reshape input = (bk, 1) and sequence_scores = (bk, 1)
            y_prev = paddle.reshape(
                candidates % self.num_classes, shape=[batch_size * beam_width])
            sequence_scores = paddle.reshape(
                scores, shape=[batch_size * beam_width, 1])

            # Update fields for next timestep
            pos_index = paddle.expand_as(pos_index, candidates)
            predecessors = paddle.cast(
                candidates / self.num_classes + pos_index, dtype='int64')
            predecessors = paddle.reshape(
                predecessors, shape=[batch_size * beam_width, 1])
            state = paddle.index_select(
                state, index=predecessors.squeeze(), axis=1)

            # Update sequence scores and erase scores for <eos> symbol so
            # that they aren't expanded further.
            stored_scores.append(sequence_scores.clone())
            y_prev = paddle.reshape(y_prev, shape=[-1, 1])
            eos_prev = paddle.full_like(y_prev, fill_value=eos)
            mask = eos_prev == y_prev
            mask = paddle.nonzero(mask)
            if mask.dim() > 0:
                sequence_scores = sequence_scores.numpy()
                mask = mask.numpy()
                sequence_scores[mask] = -float('inf')
                sequence_scores = paddle.to_tensor(sequence_scores)

            # Cache results for backtracking
            stored_predecessors.append(predecessors)
            y_prev = paddle.squeeze(y_prev)
            stored_emitted_symbols.append(y_prev)

        # Do backtracking to return the optimal values
        #====== backtrack ======#
        # Initialize return variables given different types
        p = list()
        l = [[self.max_len_labels] * beam_width for _ in range(batch_size)
             ]  # Placeholder for lengths of top-k sequences

        # the last step output of the beams are not sorted
        # thus they are sorted here
        sorted_score, sorted_idx = paddle.topk(
            paddle.reshape(
                stored_scores[-1], shape=[batch_size, beam_width]),
            beam_width)

        # initialize the sequence scores with the sorted last step beam scores
        s = sorted_score.clone()

        batch_eos_found = [0] * batch_size  # the number of EOS found
        # in the backward loop below for each batch
        t = self.max_len_labels - 1
        # initialize the back pointer with the sorted order of the last step beams.
        # add pos_index for indexing variable with b*k as the first dimension.
        t_predecessors = paddle.reshape(
            sorted_idx + pos_index.expand_as(sorted_idx),
            shape=[batch_size * beam_width])
        while t >= 0:
            # Re-order the variables with the back pointer
            current_symbol = paddle.index_select(
                stored_emitted_symbols[t], index=t_predecessors, axis=0)
            t_predecessors = paddle.index_select(
                stored_predecessors[t].squeeze(), index=t_predecessors, axis=0)
            eos_indices = stored_emitted_symbols[t] == eos
            eos_indices = paddle.nonzero(eos_indices)

            if eos_indices.dim() > 0:
                for i in range(eos_indices.shape[0] - 1, -1, -1):
                    # Indices of the EOS symbol for both variables
                    # with b*k as the first dimension, and b, k for
                    # the first two dimensions
                    idx = eos_indices[i]
                    b_idx = int(idx[0] / beam_width)
                    # The indices of the replacing position
                    # according to the replacement strategy noted above
                    res_k_idx = beam_width - (batch_eos_found[b_idx] %
                                              beam_width) - 1
                    batch_eos_found[b_idx] += 1
                    res_idx = b_idx * beam_width + res_k_idx

                    # Replace the old information in return variables
                    # with the new ended sequence information
                    t_predecessors[res_idx] = stored_predecessors[t][idx[0]]
                    current_symbol[res_idx] = stored_emitted_symbols[t][idx[0]]
                    s[b_idx, res_k_idx] = stored_scores[t][idx[0], 0]
                    l[b_idx][res_k_idx] = t + 1

            # record the back tracked results
            p.append(current_symbol)
            t -= 1

        # Sort and re-order again as the added ended sequences may change
        # the order (very unlikely)
        s, re_sorted_idx = s.topk(beam_width)
        for b_idx in range(batch_size):
            l[b_idx] = [
                l[b_idx][k_idx.item()] for k_idx in re_sorted_idx[b_idx, :]
            ]

        re_sorted_idx = paddle.reshape(
            re_sorted_idx + pos_index.expand_as(re_sorted_idx),
            [batch_size * beam_width])

        # Reverse the sequences and re-order at the same time
        # It is reversed because the backtracking happens in reverse time order
        p = [
            paddle.reshape(
                paddle.index_select(step, re_sorted_idx, 0),
                shape=[batch_size, beam_width, -1]) for step in reversed(p)
        ]
        p = paddle.concat(p, -1)[:, 0, :]
        return p, paddle.ones_like(p)
|
||||
|
||||
|
||||
class AttentionUnit(nn.Layer):
    """Additive (Bahdanau-style) attention scoring unit.

    Projects the feature sequence and the previous decoder state into a
    common ``attDim`` space, combines them with ``tanh``, and returns a
    softmax attention distribution over the T time steps.
    """

    def __init__(self, sDim, xDim, attDim):
        super(AttentionUnit, self).__init__()
        self.sDim = sDim
        self.xDim = xDim
        self.attDim = attDim
        self.sEmbed = nn.Linear(sDim, attDim)
        self.xEmbed = nn.Linear(xDim, attDim)
        self.wEmbed = nn.Linear(attDim, 1)

    def forward(self, x, sPrev):
        batch_size, num_steps, _ = x.shape  # x: [b x T x xDim]

        # Project the features: [(b*T) x xDim] -> [b x T x attDim].
        feat_proj = self.xEmbed(paddle.reshape(x, [-1, self.xDim]))
        feat_proj = paddle.reshape(feat_proj, [batch_size, num_steps, -1])

        # Project the previous state and broadcast it over the T steps.
        state_proj = self.sEmbed(sPrev.squeeze(0))    # [b x attDim]
        state_proj = paddle.unsqueeze(state_proj, 1)  # [b x 1 x attDim]
        state_proj = paddle.expand(
            state_proj, [batch_size, num_steps, self.attDim])

        combined = paddle.reshape(
            paddle.tanh(state_proj + feat_proj), [-1, self.attDim])

        energies = self.wEmbed(combined)              # [(b*T) x 1]
        energies = paddle.reshape(energies, [batch_size, num_steps])
        # Attention weights for each sample in the minibatch.
        return F.softmax(energies, axis=1)
|
||||
|
||||
|
||||
class DecoderUnit(nn.Layer):
    """One GRU decoder step: attends over the feature sequence, embeds
    the previous symbol, and updates the recurrent state."""

    def __init__(self, sDim, xDim, yDim, attDim):
        super(DecoderUnit, self).__init__()
        self.sDim = sDim
        self.xDim = xDim
        self.yDim = yDim
        self.attDim = attDim
        self.emdDim = attDim

        self.attention_unit = AttentionUnit(sDim, xDim, attDim)
        self.tgt_embedding = nn.Embedding(
            yDim + 1, self.emdDim, weight_attr=nn.initializer.Normal(
                std=0.01))  # the last is used for <BOS>
        self.gru = nn.GRUCell(input_size=xDim + self.emdDim, hidden_size=sDim)
        self.fc = nn.Linear(
            sDim,
            yDim,
            weight_attr=nn.initializer.Normal(std=0.01),
            bias_attr=nn.initializer.Constant(value=0))
        # Maps the 300-d semantic embedding to the initial GRU state.
        self.embed_fc = nn.Linear(300, self.sDim)

    def get_initial_state(self, embed, tile_times=1):
        """Build the initial GRU state from the 300-d semantic embedding,
        optionally tiled ``tile_times`` times per sample (beam search)."""
        assert embed.shape[1] == 300
        state = self.embed_fc(embed)  # N * sDim
        if tile_times != 1:
            state = state.unsqueeze(1)
            trans_state = paddle.transpose(state, perm=[1, 0, 2])
            state = paddle.tile(trans_state, repeat_times=[tile_times, 1, 1])
            trans_state = paddle.transpose(state, perm=[1, 0, 2])
            state = paddle.reshape(trans_state, shape=[-1, self.sDim])
        state = state.unsqueeze(0)  # 1 * N * sDim
        return state

    def forward(self, x, sPrev, yPrev):
        """Args: feature sequence ``x``, previous state ``sPrev``
        (1 * N * sDim), previous symbol ids ``yPrev``.
        Returns ``(logits, new_state)``."""
        # x: feature sequence from the image decoder.
        batch_size, T, _ = x.shape
        alpha = self.attention_unit(x, sPrev)
        # Context vector: attention-weighted sum of the features.
        context = paddle.squeeze(paddle.matmul(alpha.unsqueeze(1), x), axis=1)
        yPrev = paddle.cast(yPrev, dtype="int64")
        yProj = self.tgt_embedding(yPrev)

        concat_context = paddle.concat([yProj, context], 1)
        concat_context = paddle.squeeze(concat_context, 1)
        sPrev = paddle.squeeze(sPrev, 0)
        output, state = self.gru(concat_context, sPrev)
        output = paddle.squeeze(output, axis=1)
        output = self.fc(output)
        return output, state
|
||||
202
backend/ppocr/modeling/heads/rec_att_head.py
Normal file
202
backend/ppocr/modeling/heads/rec_att_head.py
Normal file
@@ -0,0 +1,202 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
import numpy as np
|
||||
|
||||
|
||||
class AttentionHead(nn.Layer):
    """Attention-based recognition head using a GRU attention cell.

    With ``targets`` supplied the decoder is teacher-forced; otherwise it
    feeds back its own argmax prediction at each step (greedy decoding).
    """

    def __init__(self, in_channels, out_channels, hidden_size, **kwargs):
        super(AttentionHead, self).__init__()
        self.input_size = in_channels
        self.hidden_size = hidden_size
        self.num_classes = out_channels

        self.attention_cell = AttentionGRUCell(
            in_channels, hidden_size, out_channels, use_gru=False)
        self.generator = nn.Linear(hidden_size, out_channels)

    def _char_to_onehot(self, input_char, onehot_dim):
        # One-hot encode the previous character ids for the cell input.
        input_ont_hot = F.one_hot(input_char, onehot_dim)
        return input_ont_hot

    def forward(self, inputs, targets=None, batch_max_length=25):
        """Args: encoder features ``inputs``; optional ground-truth
        ``targets`` (teacher forcing); max decode length.
        Returns logits (training) or softmax probabilities (eval)."""
        batch_size = paddle.shape(inputs)[0]
        num_steps = batch_max_length

        hidden = paddle.zeros((batch_size, self.hidden_size))
        output_hiddens = []

        if targets is not None:
            # Teacher forcing: feed ground-truth chars at every step.
            for i in range(num_steps):
                char_onehots = self._char_to_onehot(
                    targets[:, i], onehot_dim=self.num_classes)
                (outputs, hidden), alpha = self.attention_cell(hidden, inputs,
                                                               char_onehots)
                output_hiddens.append(paddle.unsqueeze(outputs, axis=1))
            output = paddle.concat(output_hiddens, axis=1)
            probs = self.generator(output)
        else:
            # Greedy decoding, starting from symbol id 0.
            targets = paddle.zeros(shape=[batch_size], dtype="int32")
            probs = None
            char_onehots = None
            outputs = None
            alpha = None

            for i in range(num_steps):
                char_onehots = self._char_to_onehot(
                    targets, onehot_dim=self.num_classes)
                (outputs, hidden), alpha = self.attention_cell(hidden, inputs,
                                                               char_onehots)
                probs_step = self.generator(outputs)
                if probs is None:
                    probs = paddle.unsqueeze(probs_step, axis=1)
                else:
                    probs = paddle.concat(
                        [probs, paddle.unsqueeze(
                            probs_step, axis=1)], axis=1)
                # Feed the argmax prediction back as the next input.
                next_input = probs_step.argmax(axis=1)
                targets = next_input
        if not self.training:
            probs = paddle.nn.functional.softmax(probs, axis=2)
        return probs
|
||||
|
||||
|
||||
class AttentionGRUCell(nn.Layer):
    """Single GRU decoding step with additive attention over the encoder
    feature sequence ``batch_H``."""

    def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False):
        super(AttentionGRUCell, self).__init__()
        self.i2h = nn.Linear(input_size, hidden_size, bias_attr=False)
        self.h2h = nn.Linear(hidden_size, hidden_size)
        self.score = nn.Linear(hidden_size, 1, bias_attr=False)

        self.rnn = nn.GRUCell(
            input_size=input_size + num_embeddings, hidden_size=hidden_size)

        self.hidden_size = hidden_size

    def forward(self, prev_hidden, batch_H, char_onehots):
        # Additive attention: e = w^T tanh(W_i * H + W_h * s_{t-1}).
        encoded_feats = self.i2h(batch_H)
        state_proj = paddle.unsqueeze(self.h2h(prev_hidden), axis=1)
        energies = self.score(
            paddle.tanh(paddle.add(encoded_feats, state_proj)))

        attn = paddle.transpose(F.softmax(energies, axis=1), [0, 2, 1])
        # Context vector: attention-weighted sum of encoder features.
        context_vec = paddle.squeeze(paddle.mm(attn, batch_H), axis=1)
        gru_input = paddle.concat([context_vec, char_onehots], 1)

        next_hidden = self.rnn(gru_input, prev_hidden)

        return next_hidden, attn
|
||||
|
||||
|
||||
class AttentionLSTM(nn.Layer):
    """Attention-based recognition head with an LSTM decoder cell.

    With ``targets`` provided (training), decoding uses teacher forcing;
    otherwise it decodes greedily for ``batch_max_length`` steps.
    """

    def __init__(self, in_channels, out_channels, hidden_size, **kwargs):
        super(AttentionLSTM, self).__init__()
        self.input_size = in_channels
        self.hidden_size = hidden_size
        self.num_classes = out_channels

        # use_gru=False -> the cell below wraps an nn.LSTMCell.
        self.attention_cell = AttentionLSTMCell(
            in_channels, hidden_size, out_channels, use_gru=False)
        self.generator = nn.Linear(hidden_size, out_channels)

    def _char_to_onehot(self, input_char, onehot_dim):
        # Encode character indices as one-hot vectors for the attention cell.
        input_ont_hot = F.one_hot(input_char, onehot_dim)
        return input_ont_hot

    def forward(self, inputs, targets=None, batch_max_length=25):
        batch_size = inputs.shape[0]
        num_steps = batch_max_length

        # LSTM state is an (h, c) pair, both zero-initialized.
        hidden = (paddle.zeros((batch_size, self.hidden_size)), paddle.zeros(
            (batch_size, self.hidden_size)))
        output_hiddens = []

        if targets is not None:
            # Teacher forcing: feed the ground-truth i-th character each step.
            for i in range(num_steps):
                # one-hot vectors for a i-th char
                char_onehots = self._char_to_onehot(
                    targets[:, i], onehot_dim=self.num_classes)
                hidden, alpha = self.attention_cell(hidden, inputs,
                                                    char_onehots)

                # nn.LSTMCell returns (h, (h, c)); keep only the (h, c) pair
                # as the state for the next step.
                hidden = (hidden[1][0], hidden[1][1])
                output_hiddens.append(paddle.unsqueeze(hidden[0], axis=1))
            output = paddle.concat(output_hiddens, axis=1)
            probs = self.generator(output)

        else:
            # Greedy decoding: start from index 0 (presumably the
            # start/blank token — TODO confirm against the label decoder).
            targets = paddle.zeros(shape=[batch_size], dtype="int32")
            probs = None

            for i in range(num_steps):
                char_onehots = self._char_to_onehot(
                    targets, onehot_dim=self.num_classes)
                hidden, alpha = self.attention_cell(hidden, inputs,
                                                    char_onehots)
                # hidden[0] here is the cell's raw output h for this step.
                probs_step = self.generator(hidden[0])
                hidden = (hidden[1][0], hidden[1][1])
                if probs is None:
                    probs = paddle.unsqueeze(probs_step, axis=1)
                else:
                    probs = paddle.concat(
                        [probs, paddle.unsqueeze(
                            probs_step, axis=1)], axis=1)

                # The most likely class of this step feeds the next step.
                next_input = probs_step.argmax(axis=1)

                targets = next_input

        # NOTE(review): unlike some sibling heads, no softmax is applied in
        # eval mode here — raw logits are returned; verify downstream decode.
        return probs
|
||||
|
||||
|
||||
class AttentionLSTMCell(nn.Layer):
    """One attention decoding step backed by an LSTM (or GRU) cell.

    Scores each encoder time step against the previous hidden state,
    pools a context vector, and feeds [context; char one-hot] into the
    recurrent cell.
    """

    def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False):
        super(AttentionLSTMCell, self).__init__()
        # Projections used by the additive attention score.
        self.i2h = nn.Linear(input_size, hidden_size, bias_attr=False)
        self.h2h = nn.Linear(hidden_size, hidden_size)
        self.score = nn.Linear(hidden_size, 1, bias_attr=False)
        # The recurrent cell consumes [context; char one-hot].
        cell_cls = nn.GRUCell if use_gru else nn.LSTMCell
        self.rnn = cell_cls(
            input_size=input_size + num_embeddings, hidden_size=hidden_size)

        self.hidden_size = hidden_size

    def forward(self, prev_hidden, batch_H, char_onehots):
        # prev_hidden is an (h, c) pair; attention uses only h.
        encoder_proj = self.i2h(batch_H)
        hidden_proj = paddle.unsqueeze(self.h2h(prev_hidden[0]), axis=1)
        scores = self.score(paddle.tanh(paddle.add(encoder_proj, hidden_proj)))

        # Normalize over time, pool the encoder outputs into a context vector.
        alpha = F.softmax(scores, axis=1)
        alpha = paddle.transpose(alpha, [0, 2, 1])
        context = paddle.squeeze(paddle.mm(alpha, batch_H), axis=1)
        rnn_input = paddle.concat([context, char_onehots], 1)
        cur_hidden = self.rnn(rnn_input, prev_hidden)

        return cur_hidden, alpha
|
||||
87
backend/ppocr/modeling/heads/rec_ctc_head.py
Executable file
87
backend/ppocr/modeling/heads/rec_ctc_head.py
Executable file
@@ -0,0 +1,87 @@
|
||||
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import math
|
||||
|
||||
import paddle
|
||||
from paddle import ParamAttr, nn
|
||||
from paddle.nn import functional as F
|
||||
|
||||
|
||||
def get_para_bias_attr(l2_decay, k):
    """Build a [weight_attr, bias_attr] pair of ParamAttr objects.

    Both attributes share an L2 regularizer with coefficient ``l2_decay``
    and a uniform initializer on [-1/sqrt(k), 1/sqrt(k)] (fan-in scaling).
    """
    regularizer = paddle.regularizer.L2Decay(l2_decay)
    bound = 1.0 / math.sqrt(k * 1.0)
    initializer = nn.initializer.Uniform(-bound, bound)
    return [
        ParamAttr(regularizer=regularizer, initializer=initializer)
        for _ in range(2)
    ]
|
||||
|
||||
|
||||
class CTCHead(nn.Layer):
    """Fully-connected classification head for CTC text recognition.

    When ``mid_channels`` is None, a single linear layer maps
    ``in_channels`` to ``out_channels``; otherwise two stacked linear
    layers are used with ``mid_channels`` in between.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 fc_decay=0.0004,
                 mid_channels=None,
                 return_feats=False,
                 **kwargs):
        super(CTCHead, self).__init__()
        if mid_channels is None:
            # Single projection: in_channels -> out_channels.
            weight_attr, bias_attr = get_para_bias_attr(
                l2_decay=fc_decay, k=in_channels)
            self.fc = nn.Linear(
                in_channels,
                out_channels,
                weight_attr=weight_attr,
                bias_attr=bias_attr)
        else:
            # Two stacked projections: in_channels -> mid -> out_channels.
            weight_attr1, bias_attr1 = get_para_bias_attr(
                l2_decay=fc_decay, k=in_channels)
            self.fc1 = nn.Linear(
                in_channels,
                mid_channels,
                weight_attr=weight_attr1,
                bias_attr=bias_attr1)

            weight_attr2, bias_attr2 = get_para_bias_attr(
                l2_decay=fc_decay, k=mid_channels)
            self.fc2 = nn.Linear(
                mid_channels,
                out_channels,
                weight_attr=weight_attr2,
                bias_attr=bias_attr2)
        self.out_channels = out_channels
        self.mid_channels = mid_channels
        self.return_feats = return_feats

    def forward(self, x, targets=None):
        if self.mid_channels is None:
            predicts = self.fc(x)
        else:
            # With a mid layer, x is rebound to the intermediate features.
            x = self.fc1(x)
            predicts = self.fc2(x)

        if self.return_feats:
            # NOTE: with mid_channels=None, x is the raw input, not an
            # intermediate feature — only fc1 output counts as "feats".
            result = (x, predicts)
        else:
            result = predicts
        if not self.training:
            # Eval mode returns softmax probabilities and — note — this
            # overrides return_feats: the (feats, predicts) tuple is dropped.
            predicts = F.softmax(predicts, axis=2)
            result = predicts

        return result
|
||||
73
backend/ppocr/modeling/heads/rec_multi_head.py
Normal file
73
backend/ppocr/modeling/heads/rec_multi_head.py
Normal file
@@ -0,0 +1,73 @@
|
||||
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import math
|
||||
import paddle
|
||||
from paddle import ParamAttr
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
|
||||
from ppocr.modeling.necks.rnn import Im2Seq, EncoderWithRNN, EncoderWithFC, SequenceEncoder, EncoderWithSVTR
|
||||
from .rec_ctc_head import CTCHead
|
||||
from .rec_sar_head import SARHead
|
||||
|
||||
|
||||
class MultiHead(nn.Layer):
    """Multi-branch recognition head combining a CTC branch and a SAR branch.

    ``head_list`` (popped from kwargs) is a list of single-key dicts, e.g.
    ``[{'CTCHead': {...}}, {'SARHead': {...}}]``; at least two entries are
    required. In eval mode only the CTC branch output is returned.
    """

    def __init__(self, in_channels, out_channels_list, **kwargs):
        super().__init__()
        self.head_list = kwargs.pop('head_list')
        self.gtc_head = 'sar'
        assert len(self.head_list) >= 2
        for idx, head_name in enumerate(self.head_list):
            name = list(head_name)[0]
            if name == 'SARHead':
                # SAR (attention) branch. Call the class directly rather
                # than eval(name): the branch condition already pins the
                # class, and eval on config-provided strings is unsafe.
                sar_args = self.head_list[idx][name]
                self.sar_head = SARHead(
                    in_channels=in_channels,
                    out_channels=out_channels_list['SARLabelDecode'],
                    **sar_args)
            elif name == 'CTCHead':
                # CTC branch: sequence-encoder neck followed by a linear head.
                self.encoder_reshape = Im2Seq(in_channels)
                neck_args = self.head_list[idx][name]['Neck']
                encoder_type = neck_args.pop('name')
                self.encoder = encoder_type
                self.ctc_encoder = SequenceEncoder(
                    in_channels=in_channels,
                    encoder_type=encoder_type,
                    **neck_args)
                head_args = self.head_list[idx][name]['Head']
                self.ctc_head = CTCHead(
                    in_channels=self.ctc_encoder.out_channels,
                    out_channels=out_channels_list['CTCLabelDecode'],
                    **head_args)
            else:
                raise NotImplementedError(
                    '{} is not supported in MultiHead yet'.format(name))

    def forward(self, x, targets=None):
        ctc_encoder = self.ctc_encoder(x)
        ctc_out = self.ctc_head(ctc_encoder, targets)
        head_out = dict()
        head_out['ctc'] = ctc_out
        head_out['ctc_neck'] = ctc_encoder
        # eval mode: only the CTC prediction is consumed downstream.
        if not self.training:
            return ctc_out
        if self.gtc_head == 'sar':
            # NOTE(review): assumes targets[0] is the CTC label and the
            # remainder feeds the SAR head — confirm against the dataloader.
            sar_out = self.sar_head(x, targets[1:])
            head_out['sar'] = sar_out
            return head_out
        else:
            return head_out
|
||||
826
backend/ppocr/modeling/heads/rec_nrtr_head.py
Normal file
826
backend/ppocr/modeling/heads/rec_nrtr_head.py
Normal file
@@ -0,0 +1,826 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
import paddle
|
||||
import copy
|
||||
from paddle import nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle.nn import LayerList
|
||||
from paddle.nn.initializer import XavierNormal as xavier_uniform_
|
||||
from paddle.nn import Dropout, Linear, LayerNorm, Conv2D
|
||||
import numpy as np
|
||||
from ppocr.modeling.heads.multiheadAttention import MultiheadAttention
|
||||
from paddle.nn.initializer import Constant as constant_
|
||||
from paddle.nn.initializer import XavierNormal as xavier_normal_
|
||||
|
||||
# Module-level in-place constant initializers (paddle.nn.initializer.Constant);
# zeros_ is used by Transformer._init_weights for conv biases.
zeros_ = constant_(value=0.)
ones_ = constant_(value=1.)
|
||||
|
||||
|
||||
class Transformer(nn.Layer):
    """A transformer model. User is able to modify the attributes as needed. The architechture
    is based on the paper "Attention Is All You Need". Ashish Vaswani, Noam Shazeer,
    Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and
    Illia Polosukhin. 2017. Attention is all you need. In Advances in Neural Information
    Processing Systems, pages 6000-6010.

    Args:
        d_model: the number of expected features in the encoder/decoder inputs (default=512).
        nhead: the number of heads in the multiheadattention models (default=8).
        num_encoder_layers: the number of sub-encoder-layers in the encoder (default=6).
        num_decoder_layers: the number of sub-decoder-layers in the decoder (default=6).
        dim_feedforward: the dimension of the feedforward network model (default=2048).
        dropout: the dropout value (default=0.1).
        custom_encoder: custom encoder (default=None).
        custom_decoder: custom decoder (default=None).
        beam_size: beam width; 0 selects greedy decoding at inference.

    """

    def __init__(self,
                 d_model=512,
                 nhead=8,
                 num_encoder_layers=6,
                 beam_size=0,
                 num_decoder_layers=6,
                 dim_feedforward=1024,
                 attention_dropout_rate=0.0,
                 residual_dropout_rate=0.1,
                 custom_encoder=None,
                 custom_decoder=None,
                 in_channels=0,
                 out_channels=0,
                 scale_embedding=True):
        super(Transformer, self).__init__()
        # +1 extends the vocab for an extra special token (padding_idx=0).
        self.out_channels = out_channels + 1
        self.embedding = Embeddings(
            d_model=d_model,
            vocab=self.out_channels,
            padding_idx=0,
            scale_embedding=scale_embedding)
        self.positional_encoding = PositionalEncoding(
            dropout=residual_dropout_rate,
            dim=d_model, )
        if custom_encoder is not None:
            self.encoder = custom_encoder
        else:
            if num_encoder_layers > 0:
                encoder_layer = TransformerEncoderLayer(
                    d_model, nhead, dim_feedforward, attention_dropout_rate,
                    residual_dropout_rate)
                self.encoder = TransformerEncoder(encoder_layer,
                                                  num_encoder_layers)
            else:
                # No encoder: the backbone features are used as memory directly.
                self.encoder = None

        if custom_decoder is not None:
            self.decoder = custom_decoder
        else:
            decoder_layer = TransformerDecoderLayer(
                d_model, nhead, dim_feedforward, attention_dropout_rate,
                residual_dropout_rate)
            self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers)

        self._reset_parameters()
        self.beam_size = beam_size
        self.d_model = d_model
        self.nhead = nhead
        # Output projection to vocabulary logits, normal-init with 1/sqrt(d).
        self.tgt_word_prj = nn.Linear(
            d_model, self.out_channels, bias_attr=False)
        w0 = np.random.normal(0.0, d_model**-0.5,
                              (d_model, self.out_channels)).astype(np.float32)
        self.tgt_word_prj.weight.set_value(w0)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        # Xavier-normal init for conv weights, zero for conv biases.
        if isinstance(m, nn.Conv2D):
            xavier_normal_(m.weight)
            if m.bias is not None:
                zeros_(m.bias)

    def forward_train(self, src, tgt):
        # Teacher forcing: drop the last target token (shifted-right input).
        tgt = tgt[:, :-1]

        tgt_key_padding_mask = self.generate_padding_mask(tgt)
        # Switch to sequence-first layout [T, N, E] expected by the layers.
        tgt = self.embedding(tgt).transpose([1, 0, 2])
        tgt = self.positional_encoding(tgt)
        tgt_mask = self.generate_square_subsequent_mask(tgt.shape[0])

        if self.encoder is not None:
            src = self.positional_encoding(src.transpose([1, 0, 2]))
            memory = self.encoder(src)
        else:
            memory = src.squeeze(2).transpose([2, 0, 1])
        output = self.decoder(
            tgt,
            memory,
            tgt_mask=tgt_mask,
            memory_mask=None,
            tgt_key_padding_mask=tgt_key_padding_mask,
            memory_key_padding_mask=None)
        output = output.transpose([1, 0, 2])
        logit = self.tgt_word_prj(output)
        return logit

    def forward(self, src, targets=None):
        """Take in and process masked source/target sequences.
        Args:
            src: the sequence to the encoder (required).
            tgt: the sequence to the decoder (required).
        Shape:
            - src: :math:`(S, N, E)`.
            - tgt: :math:`(T, N, E)`.
        Examples:
            >>> output = transformer_model(src, tgt)
        """

        if self.training:
            # targets = [label_ids, lengths]; +2 leaves room for SOS/EOS.
            max_len = targets[1].max()
            tgt = targets[0][:, :2 + max_len]
            return self.forward_train(src, tgt)
        else:
            if self.beam_size > 0:
                return self.forward_beam(src)
            else:
                return self.forward_test(src)

    def forward_test(self, src):
        # Greedy autoregressive decoding, max 25 steps (hard-coded).
        bs = paddle.shape(src)[0]
        if self.encoder is not None:
            src = self.positional_encoding(paddle.transpose(src, [1, 0, 2]))
            memory = self.encoder(src)
        else:
            memory = paddle.transpose(paddle.squeeze(src, 2), [2, 0, 1])
        # Token 2 is presumably <SOS>, token 3 <EOS> — TODO confirm against
        # the label decoder's special-token mapping.
        dec_seq = paddle.full((bs, 1), 2, dtype=paddle.int64)
        dec_prob = paddle.full((bs, 1), 1., dtype=paddle.float32)
        for len_dec_seq in range(1, 25):
            dec_seq_embed = paddle.transpose(self.embedding(dec_seq), [1, 0, 2])
            dec_seq_embed = self.positional_encoding(dec_seq_embed)
            tgt_mask = self.generate_square_subsequent_mask(
                paddle.shape(dec_seq_embed)[0])
            output = self.decoder(
                dec_seq_embed,
                memory,
                tgt_mask=tgt_mask,
                memory_mask=None,
                tgt_key_padding_mask=None,
                memory_key_padding_mask=None)
            dec_output = paddle.transpose(output, [1, 0, 2])
            # Only the last step's distribution drives the next token.
            dec_output = dec_output[:, -1, :]
            word_prob = F.softmax(self.tgt_word_prj(dec_output), axis=1)
            preds_idx = paddle.argmax(word_prob, axis=1)
            # Stop early only when the whole batch predicts <EOS>.
            if paddle.equal_all(
                    preds_idx,
                    paddle.full(
                        paddle.shape(preds_idx), 3, dtype='int64')):
                break
            preds_prob = paddle.max(word_prob, axis=1)
            dec_seq = paddle.concat(
                [dec_seq, paddle.reshape(preds_idx, [-1, 1])], axis=1)
            dec_prob = paddle.concat(
                [dec_prob, paddle.reshape(preds_prob, [-1, 1])], axis=1)
        return [dec_seq, dec_prob]

    def forward_beam(self, images):
        ''' Translation work in one batch '''

        def get_inst_idx_to_tensor_position_map(inst_idx_list):
            ''' Indicate the position of an instance in a tensor. '''
            return {
                inst_idx: tensor_position
                for tensor_position, inst_idx in enumerate(inst_idx_list)
            }

        def collect_active_part(beamed_tensor, curr_active_inst_idx,
                                n_prev_active_inst, n_bm):
            ''' Collect tensor parts associated to active instances. '''

            beamed_tensor_shape = paddle.shape(beamed_tensor)
            n_curr_active_inst = len(curr_active_inst_idx)
            new_shape = (n_curr_active_inst * n_bm, beamed_tensor_shape[1],
                         beamed_tensor_shape[2])

            # Flatten per-instance, select the surviving rows, restore shape.
            beamed_tensor = beamed_tensor.reshape([n_prev_active_inst, -1])
            beamed_tensor = beamed_tensor.index_select(
                curr_active_inst_idx, axis=0)
            beamed_tensor = beamed_tensor.reshape(new_shape)

            return beamed_tensor

        def collate_active_info(src_enc, inst_idx_to_position_map,
                                active_inst_idx_list):
            # Sentences which are still active are collected,
            # so the decoder will not run on completed sentences.

            n_prev_active_inst = len(inst_idx_to_position_map)
            active_inst_idx = [
                inst_idx_to_position_map[k] for k in active_inst_idx_list
            ]
            active_inst_idx = paddle.to_tensor(active_inst_idx, dtype='int64')
            active_src_enc = collect_active_part(
                src_enc.transpose([1, 0, 2]), active_inst_idx,
                n_prev_active_inst, n_bm).transpose([1, 0, 2])
            active_inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(
                active_inst_idx_list)
            return active_src_enc, active_inst_idx_to_position_map

        def beam_decode_step(inst_dec_beams, len_dec_seq, enc_output,
                             inst_idx_to_position_map, n_bm,
                             memory_key_padding_mask):
            ''' Decode and update beam status, and then return active beam idx '''

            def prepare_beam_dec_seq(inst_dec_beams, len_dec_seq):
                dec_partial_seq = [
                    b.get_current_state() for b in inst_dec_beams if not b.done
                ]
                dec_partial_seq = paddle.stack(dec_partial_seq)
                dec_partial_seq = dec_partial_seq.reshape([-1, len_dec_seq])
                return dec_partial_seq

            def predict_word(dec_seq, enc_output, n_active_inst, n_bm,
                             memory_key_padding_mask):
                dec_seq = paddle.transpose(self.embedding(dec_seq), [1, 0, 2])
                dec_seq = self.positional_encoding(dec_seq)
                tgt_mask = self.generate_square_subsequent_mask(
                    paddle.shape(dec_seq)[0])
                dec_output = self.decoder(
                    dec_seq,
                    enc_output,
                    tgt_mask=tgt_mask,
                    tgt_key_padding_mask=None,
                    memory_key_padding_mask=memory_key_padding_mask, )
                dec_output = paddle.transpose(dec_output, [1, 0, 2])
                dec_output = dec_output[:,
                                        -1, :]  # Pick the last step: (bh * bm) * d_h
                word_prob = F.softmax(self.tgt_word_prj(dec_output), axis=1)
                word_prob = paddle.reshape(word_prob, [n_active_inst, n_bm, -1])
                return word_prob

            def collect_active_inst_idx_list(inst_beams, word_prob,
                                             inst_idx_to_position_map):
                active_inst_idx_list = []
                for inst_idx, inst_position in inst_idx_to_position_map.items():
                    is_inst_complete = inst_beams[inst_idx].advance(word_prob[
                        inst_position])
                    if not is_inst_complete:
                        active_inst_idx_list += [inst_idx]

                return active_inst_idx_list

            n_active_inst = len(inst_idx_to_position_map)
            dec_seq = prepare_beam_dec_seq(inst_dec_beams, len_dec_seq)
            word_prob = predict_word(dec_seq, enc_output, n_active_inst, n_bm,
                                     None)
            # Update the beam with predicted word prob information and collect incomplete instances
            active_inst_idx_list = collect_active_inst_idx_list(
                inst_dec_beams, word_prob, inst_idx_to_position_map)
            return active_inst_idx_list

        def collect_hypothesis_and_scores(inst_dec_beams, n_best):
            all_hyp, all_scores = [], []
            for inst_idx in range(len(inst_dec_beams)):
                scores, tail_idxs = inst_dec_beams[inst_idx].sort_scores()
                all_scores += [scores[:n_best]]
                hyps = [
                    inst_dec_beams[inst_idx].get_hypothesis(i)
                    for i in tail_idxs[:n_best]
                ]
                all_hyp += [hyps]
            return all_hyp, all_scores

        with paddle.no_grad():
            #-- Encode
            if self.encoder is not None:
                src = self.positional_encoding(images.transpose([1, 0, 2]))
                src_enc = self.encoder(src)
            else:
                src_enc = images.squeeze(2).transpose([0, 2, 1])

            n_bm = self.beam_size
            src_shape = paddle.shape(src_enc)
            # NOTE(review): batch size is fixed to 1 for beam search here.
            inst_dec_beams = [Beam(n_bm) for _ in range(1)]
            active_inst_idx_list = list(range(1))
            # Repeat data for beam search
            src_enc = paddle.tile(src_enc, [1, n_bm, 1])
            inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(
                active_inst_idx_list)
            # Decode
            for len_dec_seq in range(1, 25):
                src_enc_copy = src_enc.clone()
                active_inst_idx_list = beam_decode_step(
                    inst_dec_beams, len_dec_seq, src_enc_copy,
                    inst_idx_to_position_map, n_bm, None)
                if not active_inst_idx_list:
                    break  # all instances have finished their path to <EOS>
                src_enc, inst_idx_to_position_map = collate_active_info(
                    src_enc_copy, inst_idx_to_position_map,
                    active_inst_idx_list)
            batch_hyp, batch_scores = collect_hypothesis_and_scores(inst_dec_beams,
                                                                    1)
            result_hyp = []
            hyp_scores = []
            for bs_hyp, score in zip(batch_hyp, batch_scores):
                l = len(bs_hyp[0])
                # Pad to 25 tokens with <EOS>=3; length-normalize the score.
                bs_hyp_pad = bs_hyp[0] + [3] * (25 - l)
                result_hyp.append(bs_hyp_pad)
                score = float(score) / l
                hyp_score = [score for _ in range(25)]
                hyp_scores.append(hyp_score)
            return [
                paddle.to_tensor(
                    np.array(result_hyp), dtype=paddle.int64),
                paddle.to_tensor(hyp_scores)
            ]

    def generate_square_subsequent_mask(self, sz):
        """Generate a square mask for the sequence. The masked positions are filled with float('-inf').
        Unmasked positions are filled with float(0.0).
        """
        mask = paddle.zeros([sz, sz], dtype='float32')
        mask_inf = paddle.triu(
            paddle.full(
                shape=[sz, sz], dtype='float32', fill_value='-inf'),
            diagonal=1)
        mask = mask + mask_inf
        return mask

    def generate_padding_mask(self, x):
        # True where x equals the padding index (0).
        padding_mask = paddle.equal(x, paddle.to_tensor(0, dtype=x.dtype))
        return padding_mask

    def _reset_parameters(self):
        """Initiate parameters in the transformer model."""

        # Xavier-init every weight matrix (skip 1-D bias/norm params).
        for p in self.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)
|
||||
|
||||
|
||||
class TransformerEncoder(nn.Layer):
    """TransformerEncoder is a stack of N encoder layers.

    Args:
        encoder_layer: an instance of the TransformerEncoderLayer() class (required).
        num_layers: the number of sub-encoder-layers in the encoder (required).
    """

    def __init__(self, encoder_layer, num_layers):
        super(TransformerEncoder, self).__init__()
        # Each layer is an independent deep copy of the prototype layer.
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers

    def forward(self, src):
        """Pass the input through each encoder layer in turn.

        Args:
            src: the sequence to the encoder (required), shape [S, N, E].
        """
        output = src
        for layer in self.layers:
            output = layer(output, src_mask=None, src_key_padding_mask=None)
        return output
|
||||
|
||||
|
||||
class TransformerDecoder(nn.Layer):
    """TransformerDecoder is a stack of N decoder layers.

    Args:
        decoder_layer: an instance of the TransformerDecoderLayer() class (required).
        num_layers: the number of sub-decoder-layers in the decoder (required).
    """

    def __init__(self, decoder_layer, num_layers):
        super(TransformerDecoder, self).__init__()
        # Each layer is an independent deep copy of the prototype layer.
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers

    def forward(self,
                tgt,
                memory,
                tgt_mask=None,
                memory_mask=None,
                tgt_key_padding_mask=None,
                memory_key_padding_mask=None):
        """Pass the inputs (and masks) through each decoder layer in turn.

        Args:
            tgt: the sequence to the decoder (required).
            memory: the sequence from the last layer of the encoder (required).
            tgt_mask: the mask for the tgt sequence (optional).
            memory_mask: the mask for the memory sequence (optional).
            tgt_key_padding_mask: the mask for the tgt keys per batch (optional).
            memory_key_padding_mask: the mask for the memory keys per batch (optional).
        """
        output = tgt
        for layer in self.layers:
            output = layer(
                output,
                memory,
                tgt_mask=tgt_mask,
                memory_mask=memory_mask,
                tgt_key_padding_mask=tgt_key_padding_mask,
                memory_key_padding_mask=memory_key_padding_mask)
        return output
|
||||
|
||||
|
||||
class TransformerEncoderLayer(nn.Layer):
    """TransformerEncoderLayer is made up of self-attn and feedforward network.
    This standard encoder layer is based on the paper "Attention Is All You Need".
    Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
    Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in
    Neural Information Processing Systems, pages 6000-6010. Users may modify or implement
    in a different way during application.

    The position-wise feed-forward network is implemented with two 1x1
    convolutions instead of linear layers.

    Args:
        d_model: the number of expected features in the input (required).
        nhead: the number of heads in the multiheadattention models (required).
        dim_feedforward: the dimension of the feedforward network model (default=2048).
        dropout: the dropout value (default=0.1).

    """

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 attention_dropout_rate=0.0,
                 residual_dropout_rate=0.1):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(
            d_model, nhead, dropout=attention_dropout_rate)

        # 1x1 convs acting as the position-wise feed-forward network.
        self.conv1 = Conv2D(
            in_channels=d_model,
            out_channels=dim_feedforward,
            kernel_size=(1, 1))
        self.conv2 = Conv2D(
            in_channels=dim_feedforward,
            out_channels=d_model,
            kernel_size=(1, 1))

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = Dropout(residual_dropout_rate)
        self.dropout2 = Dropout(residual_dropout_rate)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        """Pass the input through the endocder layer.
        Args:
            src: the sequnce to the encoder layer (required), shape [S, N, E].
            src_mask: the mask for the src sequence (optional).
            src_key_padding_mask: the mask for the src keys per batch (optional).
        """
        # Self-attention sub-layer with residual + post-norm.
        src2 = self.self_attn(
            src,
            src,
            src,
            attn_mask=src_mask,
            key_padding_mask=src_key_padding_mask)
        src = src + self.dropout1(src2)
        src = self.norm1(src)

        # Feed-forward sub-layer: [S, N, E] -> [N, E, 1, S] for the 1x1
        # convs, then back; the transposes must mirror each other exactly.
        src = paddle.transpose(src, [1, 2, 0])
        src = paddle.unsqueeze(src, 2)
        src2 = self.conv2(F.relu(self.conv1(src)))
        src2 = paddle.squeeze(src2, 2)
        src2 = paddle.transpose(src2, [2, 0, 1])
        src = paddle.squeeze(src, 2)
        src = paddle.transpose(src, [2, 0, 1])

        # Residual + post-norm around the feed-forward output.
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src
|
||||
|
||||
|
||||
class TransformerDecoderLayer(nn.Layer):
    """TransformerDecoderLayer is made up of self-attn, multi-head-attn and feedforward network.
    This standard decoder layer is based on the paper "Attention Is All You Need".
    Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
    Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in
    Neural Information Processing Systems, pages 6000-6010. Users may modify or implement
    in a different way during application.

    The position-wise feed-forward network is implemented with two 1x1
    convolutions instead of linear layers.

    Args:
        d_model: the number of expected features in the input (required).
        nhead: the number of heads in the multiheadattention models (required).
        dim_feedforward: the dimension of the feedforward network model (default=2048).
        dropout: the dropout value (default=0.1).

    """

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 attention_dropout_rate=0.0,
                 residual_dropout_rate=0.1):
        super(TransformerDecoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(
            d_model, nhead, dropout=attention_dropout_rate)
        self.multihead_attn = MultiheadAttention(
            d_model, nhead, dropout=attention_dropout_rate)

        # 1x1 convs acting as the position-wise feed-forward network.
        self.conv1 = Conv2D(
            in_channels=d_model,
            out_channels=dim_feedforward,
            kernel_size=(1, 1))
        self.conv2 = Conv2D(
            in_channels=dim_feedforward,
            out_channels=d_model,
            kernel_size=(1, 1))

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.norm3 = LayerNorm(d_model)
        self.dropout1 = Dropout(residual_dropout_rate)
        self.dropout2 = Dropout(residual_dropout_rate)
        self.dropout3 = Dropout(residual_dropout_rate)

    def forward(self,
                tgt,
                memory,
                tgt_mask=None,
                memory_mask=None,
                tgt_key_padding_mask=None,
                memory_key_padding_mask=None):
        """Pass the inputs (and mask) through the decoder layer.

        Args:
            tgt: the sequence to the decoder layer (required).
            memory: the sequnce from the last layer of the encoder (required).
            tgt_mask: the mask for the tgt sequence (optional).
            memory_mask: the mask for the memory sequence (optional).
            tgt_key_padding_mask: the mask for the tgt keys per batch (optional).
            memory_key_padding_mask: the mask for the memory keys per batch (optional).

        """
        # Masked self-attention over the target, residual + post-norm.
        tgt2 = self.self_attn(
            tgt,
            tgt,
            tgt,
            attn_mask=tgt_mask,
            key_padding_mask=tgt_key_padding_mask)
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)
        # Cross-attention over the encoder memory, residual + post-norm.
        tgt2 = self.multihead_attn(
            tgt,
            memory,
            memory,
            attn_mask=memory_mask,
            key_padding_mask=memory_key_padding_mask)
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)

        # default
        # Feed-forward sub-layer: [T, N, E] -> [N, E, 1, T] for the 1x1
        # convs, then back; the transposes must mirror each other exactly.
        tgt = paddle.transpose(tgt, [1, 2, 0])
        tgt = paddle.unsqueeze(tgt, 2)
        tgt2 = self.conv2(F.relu(self.conv1(tgt)))
        tgt2 = paddle.squeeze(tgt2, 2)
        tgt2 = paddle.transpose(tgt2, [2, 0, 1])
        tgt = paddle.squeeze(tgt, 2)
        tgt = paddle.transpose(tgt, [2, 0, 1])

        # Residual + post-norm around the feed-forward output.
        tgt = tgt + self.dropout3(tgt2)
        tgt = self.norm3(tgt)
        return tgt
|
||||
|
||||
|
||||
def _get_clones(module, N):
    """Return a LayerList holding N independent deep copies of *module*."""
    return LayerList([copy.deepcopy(module) for _ in range(N)])
class PositionalEncoding(nn.Layer):
    """Add fixed sinusoidal position information to a sequence of embeddings.

    The encoding has the same dimension as the token embeddings so the two
    can simply be summed; even channels carry sine terms and odd channels
    carry cosine terms at geometrically spaced frequencies.

    Args:
        dropout: dropout probability applied after the addition.
        dim: the embedding dimension.
        max_len: maximum supported sequence length (default=5000).

    Examples:
        >>> pos_encoder = PositionalEncoding(dropout, dim)
    """

    def __init__(self, dropout, dim, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Precompute the (max_len, dim) sine/cosine table once as a buffer.
        table = paddle.zeros([max_len, dim])
        pos = paddle.arange(0, max_len, dtype=paddle.float32).unsqueeze(1)
        inv_freq = paddle.exp(
            paddle.arange(0, dim, 2).astype('float32') *
            (-math.log(10000.0) / dim))
        table[:, 0::2] = paddle.sin(pos * inv_freq)
        table[:, 1::2] = paddle.cos(pos * inv_freq)
        # Reshape to (max_len, 1, dim) so it broadcasts over the batch axis
        # of a time-major (T, B, C) input.
        table = paddle.transpose(paddle.unsqueeze(table, 0), [1, 0, 2])
        self.register_buffer('pe', table)

    def forward(self, x):
        """Add the positional encodings to *x*.

        Shape:
            x: [sequence length, batch size, embed dim]
            output: [sequence length, batch size, embed dim]

        Examples:
            >>> output = pos_encoder(x)
        """
        x = x + self.pe[:paddle.shape(x)[0], :]
        return self.dropout(x)
class PositionalEncoding_2d(nn.Layer):
    """2-D sinusoidal positional encoding for feature maps.

    A shared 1-D sine/cosine table is reused along both the width and the
    height axis; each axis gets a content-dependent channel-wise gate
    (adaptive average pool + linear) before being added to the feature map.
    The result is flattened into a time-major sequence.

    Args:
        dropout: dropout probability applied to the output.
        dim: number of feature channels (embed dim).
        max_len: maximum supported extent of either spatial axis (default=5000).

    Examples:
        >>> pos_encoder = PositionalEncoding_2d(dropout, dim)
    """

    def __init__(self, dropout, dim, max_len=5000):
        super(PositionalEncoding_2d, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Shared 1-D table of shape (max_len, 1, dim), registered as a buffer.
        pe = paddle.zeros([max_len, dim])
        position = paddle.arange(0, max_len, dtype=paddle.float32).unsqueeze(1)
        div_term = paddle.exp(
            paddle.arange(0, dim, 2).astype('float32') *
            (-math.log(10000.0) / dim))
        pe[:, 0::2] = paddle.sin(position * div_term)
        pe[:, 1::2] = paddle.cos(position * div_term)
        pe = paddle.transpose(paddle.unsqueeze(pe, 0), [1, 0, 2])
        self.register_buffer('pe', pe)

        # Per-axis channel gates, weights initialised to all ones so the
        # encoding starts out un-attenuated.
        self.avg_pool_1 = nn.AdaptiveAvgPool2D((1, 1))
        self.linear1 = nn.Linear(dim, dim)
        self.linear1.weight.data.fill_(1.)
        self.avg_pool_2 = nn.AdaptiveAvgPool2D((1, 1))
        self.linear2 = nn.Linear(dim, dim)
        self.linear2.weight.data.fill_(1.)

    def forward(self, x):
        """Add 2-D positional information to *x* and flatten it.

        Args:
            x: feature map, assumed [batch, channels, height, width]
               — TODO confirm against callers.

        Returns:
            Sequence of shape [height * width, batch, channels] after dropout.
        """
        # Width-axis encoding, gated per channel.
        w_pe = self.pe[:paddle.shape(x)[-1], :]
        # NOTE(review): .squeeze() drops ALL singleton axes, so with batch
        # size 1 the batch axis vanishes too — confirm callers never use bsz=1.
        w1 = self.linear1(self.avg_pool_1(x).squeeze()).unsqueeze(0)
        w_pe = w_pe * w1
        w_pe = paddle.transpose(w_pe, [1, 2, 0])
        w_pe = paddle.unsqueeze(w_pe, 2)

        # Height-axis encoding, gated per channel.
        # BUG FIX: the original sliced with `paddle.shape(x).shape[-2]`;
        # paddle.shape(x) is a 1-D tensor of the input's dimensions, so its
        # own `.shape` has a single entry and indexing [-2] is invalid. The
        # intended index is the height entry of the shape tensor itself,
        # mirroring the width branch above.
        h_pe = self.pe[:paddle.shape(x)[-2], :]
        w2 = self.linear2(self.avg_pool_2(x).squeeze()).unsqueeze(0)
        h_pe = h_pe * w2
        h_pe = paddle.transpose(h_pe, [1, 2, 0])
        h_pe = paddle.unsqueeze(h_pe, 3)

        x = x + w_pe + h_pe
        # Flatten (B, C, H, W) -> (H*W, B, C): time-major sequence.
        x = paddle.transpose(
            paddle.reshape(x,
                           [x.shape[0], x.shape[1], x.shape[2] * x.shape[3]]),
            [2, 0, 1])

        return self.dropout(x)
class Embeddings(nn.Layer):
    """Token embedding table with optional sqrt(d_model) output scaling."""

    def __init__(self, d_model, vocab, padding_idx, scale_embedding):
        super(Embeddings, self).__init__()
        self.embedding = nn.Embedding(vocab, d_model, padding_idx=padding_idx)
        # Initialise weights from N(0, d_model^-0.5).
        init_w = np.random.normal(0.0, d_model**-0.5,
                                  (vocab, d_model)).astype(np.float32)
        self.embedding.weight.set_value(init_w)
        self.d_model = d_model
        self.scale_embedding = scale_embedding

    def forward(self, x):
        emb = self.embedding(x)
        if self.scale_embedding:
            # Scale so embeddings and positional encodings have comparable
            # magnitude (standard transformer convention).
            return emb * math.sqrt(self.d_model)
        return emb
class Beam():
    ''' Beam search bookkeeping for a single sample. '''

    def __init__(self, size, device=False):
        self.size = size
        self._done = False
        # Cumulative score of every hypothesis on the beam.
        self.scores = paddle.zeros((size, ), dtype=paddle.float32)
        self.all_scores = []
        # Backpointers into the previous beam, one tensor per time-step.
        self.prev_ks = []
        # Token chosen at each time-step; token id 2 is the start symbol.
        self.next_ys = [paddle.full((size, ), 0, dtype=paddle.int64)]
        self.next_ys[0][0] = 2

    def get_current_state(self):
        "Get the outputs for the current timestep."
        return self.get_tentative_hypothesis()

    def get_current_origin(self):
        "Get the backpointers for the current timestep."
        return self.prev_ks[-1]

    @property
    def done(self):
        return self._done

    def advance(self, word_prob):
        "Update beam status and check if finished or not."
        vocab_size = word_prob.shape[1]

        # Accumulate scores from previous steps (first step: beam 0 only).
        if len(self.prev_ks) > 0:
            candidates = word_prob + self.scores.unsqueeze(1).expand_as(
                word_prob)
        else:
            candidates = word_prob[0]

        flat = candidates.reshape([-1])
        top_scores, top_ids = flat.topk(self.size, 0, True, True)  # 1st sort
        self.all_scores.append(self.scores)
        self.scores = top_scores

        # top_ids indexes the flattened (beam x vocab) grid; recover the
        # source beam and the word id for every surviving hypothesis.
        beam_origin = top_ids // vocab_size
        self.prev_ks.append(beam_origin)
        self.next_ys.append(top_ids - beam_origin * vocab_size)

        # Finished once the best hypothesis emits the EOS token (id 3).
        if self.next_ys[-1][0] == 3:
            self._done = True
            self.all_scores.append(self.scores)

        return self._done

    def sort_scores(self):
        "Sort the scores."
        return self.scores, paddle.to_tensor(
            [i for i in range(int(self.scores.shape[0]))], dtype='int32')

    def get_the_best_score_and_idx(self):
        "Get the score of the best in the beam."
        scores, ids = self.sort_scores()
        return scores[1], ids[1]

    def get_tentative_hypothesis(self):
        "Get the decoded sequence for the current timestep."
        if len(self.next_ys) == 1:
            return self.next_ys[0].unsqueeze(1)
        _, keys = self.sort_scores()
        # Prepend the start symbol (id 2) to every walked-back hypothesis.
        hyps = [[2] + self.get_hypothesis(k) for k in keys]
        return paddle.to_tensor(hyps, dtype='int64')

    def get_hypothesis(self, k):
        """ Walk back to construct the full hypothesis. """
        tokens = []
        for step in range(len(self.prev_ks) - 1, -1, -1):
            tokens.append(self.next_ys[step + 1][k])
            k = self.prev_ks[step][k]
        return [t.item() for t in reversed(tokens)]
34
backend/ppocr/modeling/heads/rec_pren_head.py
Normal file
34
backend/ppocr/modeling/heads/rec_pren_head.py
Normal file
@@ -0,0 +1,34 @@
|
||||
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from paddle import nn
|
||||
from paddle.nn import functional as F
|
||||
|
||||
|
||||
class PRENHead(nn.Layer):
    """PREN recognition head: a single linear classifier over the features.

    At inference time the logits are normalised with softmax along the
    class axis; during training the raw logits are returned for the loss.
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super(PRENHead, self).__init__()
        self.linear = nn.Linear(in_channels, out_channels)

    def forward(self, x, targets=None):
        logits = self.linear(x)
        if self.training:
            return logits
        return F.softmax(logits, axis=2)
410
backend/ppocr/modeling/heads/rec_sar_head.py
Normal file
410
backend/ppocr/modeling/heads/rec_sar_head.py
Normal file
@@ -0,0 +1,410 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
This code is refer from:
|
||||
https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/encoders/sar_encoder.py
|
||||
https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/decoders/sar_decoder.py
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import math
|
||||
import paddle
|
||||
from paddle import ParamAttr
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
|
||||
|
||||
class SAREncoder(nn.Layer):
    """SAR holistic-feature encoder.

    Max-pools the backbone feature map along the height axis, runs a
    two-layer RNN over the width axis, and projects the last valid hidden
    state into one holistic feature per image.

    Args:
        enc_bi_rnn (bool): If True, use bidirectional RNN in encoder.
        enc_drop_rnn (float): Dropout probability of RNN layer in encoder.
        enc_gru (bool): If True, use GRU, else LSTM in encoder.
        d_model (int): Dim of channels from backbone.
        d_enc (int): Dim of encoder RNN layer.
        mask (bool): If True, mask padding in RNN sequence.
    """

    def __init__(self,
                 enc_bi_rnn=False,
                 enc_drop_rnn=0.1,
                 enc_gru=False,
                 d_model=512,
                 d_enc=512,
                 mask=True,
                 **kwargs):
        super().__init__()
        assert isinstance(enc_bi_rnn, bool)
        assert isinstance(enc_drop_rnn, (int, float))
        assert 0 <= enc_drop_rnn < 1.0
        assert isinstance(enc_gru, bool)
        assert isinstance(d_model, int)
        assert isinstance(d_enc, int)
        assert isinstance(mask, bool)

        self.enc_bi_rnn = enc_bi_rnn
        self.enc_drop_rnn = enc_drop_rnn
        self.mask = mask

        # Two-layer RNN over the width axis of the pooled feature map.
        rnn_cfg = dict(
            input_size=d_model,
            hidden_size=d_enc,
            num_layers=2,
            time_major=False,
            dropout=enc_drop_rnn,
            direction='bidirectional' if enc_bi_rnn else 'forward')
        self.rnn_encoder = nn.GRU(**rnn_cfg) if enc_gru else nn.LSTM(**rnn_cfg)

        # Global feature transformation.
        encoder_rnn_out_size = d_enc * (int(enc_bi_rnn) + 1)
        self.linear = nn.Linear(encoder_rnn_out_size, encoder_rnn_out_size)

    def forward(self, feat, img_metas=None):
        """Encode *feat* (bsz, C, H, W) into a holistic feature (bsz, C)."""
        if img_metas is not None:
            assert len(img_metas[0]) == feat.shape[0]

        valid_ratios = None
        if img_metas is not None and self.mask:
            # The last entry of img_metas carries per-image valid-width ratios.
            valid_ratios = img_metas[-1]

        # Collapse the height axis, keeping one vector per column.
        h_feat = feat.shape[2]  # bsz c h w
        feat_v = F.max_pool2d(
            feat, kernel_size=(h_feat, 1), stride=1, padding=0)
        feat_v = feat_v.squeeze(2)  # bsz * C * W
        feat_v = paddle.transpose(feat_v, perm=[0, 2, 1])  # bsz * W * C
        holistic_feat = self.rnn_encoder(feat_v)[0]  # bsz * T * C

        if valid_ratios is not None:
            # Pick the hidden state at the last valid time-step per image.
            valid_hf = []
            T = holistic_feat.shape[1]
            for idx in range(len(valid_ratios)):
                valid_step = min(T, math.ceil(T * valid_ratios[idx])) - 1
                valid_hf.append(holistic_feat[idx, valid_step, :])
            valid_hf = paddle.stack(valid_hf, axis=0)
        else:
            valid_hf = holistic_feat[:, -1, :]  # bsz * C
        return self.linear(valid_hf)  # bsz * C
class BaseDecoder(nn.Layer):
    """Common train/test dispatch for SAR decoders.

    Subclasses implement forward_train / forward_test; forward routes to
    one of them based on *train_mode* and records the mode on the instance.
    """

    def __init__(self, **kwargs):
        super().__init__()

    def forward_train(self, feat, out_enc, targets, img_metas):
        raise NotImplementedError

    def forward_test(self, feat, out_enc, img_metas):
        raise NotImplementedError

    def forward(self,
                feat,
                out_enc,
                label=None,
                img_metas=None,
                train_mode=True):
        # Remember the mode so helpers (e.g. prediction dropout) can branch.
        self.train_mode = train_mode
        if not train_mode:
            return self.forward_test(feat, out_enc, img_metas)
        return self.forward_train(feat, out_enc, label, img_metas)
class ParallelSARDecoder(BaseDecoder):
    """Parallel 2-D-attention decoder of SAR.

    Args:
        out_channels (int): Output class number.
        enc_bi_rnn (bool): If True, use bidirectional RNN in encoder.
        dec_bi_rnn (bool): If True, use bidirectional RNN in decoder.
        dec_drop_rnn (float): Dropout of RNN layer in decoder.
        dec_gru (bool): If True, use GRU, else LSTM in decoder.
        d_model (int): Dim of channels from backbone.
        d_enc (int): Dim of encoder RNN layer.
        d_k (int): Dim of channels of attention module.
        pred_dropout (float): Dropout probability of prediction layer.
        max_text_length (int): Maximum sequence length for decoding.
        mask (bool): If True, mask padding in feature map.
        pred_concat (bool): If True, concat glimpse feature from
            attention with holistic feature and hidden state.
    """

    def __init__(
            self,
            out_channels,  # 90 + unknown + start + padding
            enc_bi_rnn=False,
            dec_bi_rnn=False,
            dec_drop_rnn=0.0,
            dec_gru=False,
            d_model=512,
            d_enc=512,
            d_k=64,
            pred_dropout=0.1,
            max_text_length=30,
            mask=True,
            pred_concat=True,
            **kwargs):
        super().__init__()

        self.num_classes = out_channels
        self.enc_bi_rnn = enc_bi_rnn
        self.d_k = d_k
        # Start/padding token ids live at the top of the class range.
        self.start_idx = out_channels - 2
        self.padding_idx = out_channels - 1
        self.max_seq_len = max_text_length
        self.mask = mask
        self.pred_concat = pred_concat

        encoder_rnn_out_size = d_enc * (int(enc_bi_rnn) + 1)
        decoder_rnn_out_size = encoder_rnn_out_size * (int(dec_bi_rnn) + 1)

        # 2D attention layer
        self.conv1x1_1 = nn.Linear(decoder_rnn_out_size, d_k)
        self.conv3x3_1 = nn.Conv2D(
            d_model, d_k, kernel_size=3, stride=1, padding=1)
        self.conv1x1_2 = nn.Linear(d_k, 1)

        # Decoder RNN layer
        if dec_bi_rnn:
            direction = 'bidirectional'
        else:
            direction = 'forward'

        kwargs = dict(
            input_size=encoder_rnn_out_size,
            hidden_size=encoder_rnn_out_size,
            num_layers=2,
            time_major=False,
            dropout=dec_drop_rnn,
            direction=direction)
        if dec_gru:
            self.rnn_decoder = nn.GRU(**kwargs)
        else:
            self.rnn_decoder = nn.LSTM(**kwargs)

        # Decoder input embedding
        self.embedding = nn.Embedding(
            self.num_classes,
            encoder_rnn_out_size,
            padding_idx=self.padding_idx)

        # Prediction layer
        self.pred_dropout = nn.Dropout(pred_dropout)
        # The padding class is excluded from prediction.
        pred_num_classes = self.num_classes - 1
        if pred_concat:
            fc_in_channel = decoder_rnn_out_size + d_model + encoder_rnn_out_size
        else:
            fc_in_channel = d_model
        self.prediction = nn.Linear(fc_in_channel, pred_num_classes)

    def _2d_attention(self,
                      decoder_input,
                      feat,
                      holistic_feat,
                      valid_ratios=None):
        """Run the decoder RNN and attend over the 2-D feature map.

        Returns per-step class logits (dropout applied in train mode).
        """
        y = self.rnn_decoder(decoder_input)[0]
        # y: bsz * (seq_len + 1) * hidden_size

        attn_query = self.conv1x1_1(y)  # bsz * (seq_len + 1) * attn_size
        bsz, seq_len, attn_size = attn_query.shape
        attn_query = paddle.unsqueeze(attn_query, axis=[3, 4])
        # (bsz, seq_len + 1, attn_size, 1, 1)

        attn_key = self.conv3x3_1(feat)
        # bsz * attn_size * h * w
        attn_key = attn_key.unsqueeze(1)
        # bsz * 1 * attn_size * h * w

        # Additive attention: broadcast query over the spatial grid.
        attn_weight = paddle.tanh(paddle.add(attn_key, attn_query))

        # bsz * (seq_len + 1) * attn_size * h * w
        attn_weight = paddle.transpose(attn_weight, perm=[0, 1, 3, 4, 2])
        # bsz * (seq_len + 1) * h * w * attn_size
        attn_weight = self.conv1x1_2(attn_weight)
        # bsz * (seq_len + 1) * h * w * 1
        bsz, T, h, w, c = attn_weight.shape
        assert c == 1

        if valid_ratios is not None:
            # cal mask of attention weight: -inf beyond the valid width so
            # softmax assigns zero probability to padded columns.
            for i in range(len(valid_ratios)):
                valid_width = min(w, math.ceil(w * valid_ratios[i]))
                if valid_width < w:
                    attn_weight[i, :, :, valid_width:, :] = float('-inf')

        attn_weight = paddle.reshape(attn_weight, [bsz, T, -1])
        attn_weight = F.softmax(attn_weight, axis=-1)

        attn_weight = paddle.reshape(attn_weight, [bsz, T, h, w, c])
        attn_weight = paddle.transpose(attn_weight, perm=[0, 1, 4, 2, 3])
        # attn_weight: bsz * T * c * h * w
        # feat: bsz * c * h * w
        attn_feat = paddle.sum(paddle.multiply(feat.unsqueeze(1), attn_weight),
                               (3, 4),
                               keepdim=False)
        # bsz * (seq_len + 1) * C

        # Linear transformation
        if self.pred_concat:
            hf_c = holistic_feat.shape[-1]
            holistic_feat = paddle.expand(
                holistic_feat, shape=[bsz, seq_len, hf_c])
            y = self.prediction(paddle.concat((y, attn_feat, holistic_feat), 2))
        else:
            y = self.prediction(attn_feat)
        # bsz * (seq_len + 1) * num_classes
        if self.train_mode:
            y = self.pred_dropout(y)

        return y

    def forward_train(self, feat, out_enc, label, img_metas):
        '''
        img_metas: [label, valid_ratio]
        '''
        if img_metas is not None:
            assert len(img_metas[0]) == feat.shape[0]

        valid_ratios = None
        if img_metas is not None and self.mask:
            valid_ratios = img_metas[-1]

        lab_embedding = self.embedding(label)
        # bsz * seq_len * emb_dim
        out_enc = out_enc.unsqueeze(1)
        # bsz * 1 * emb_dim
        # Teacher forcing: holistic feature followed by label embeddings.
        in_dec = paddle.concat((out_enc, lab_embedding), axis=1)
        # bsz * (seq_len + 1) * C
        out_dec = self._2d_attention(
            in_dec, feat, out_enc, valid_ratios=valid_ratios)
        # bsz * (seq_len + 1) * num_classes

        # Drop the step that consumed the holistic feature.
        return out_dec[:, 1:, :]  # bsz * seq_len * num_classes

    def forward_test(self, feat, out_enc, img_metas):
        """Greedy step-by-step decoding, feeding back the argmax token."""
        if img_metas is not None:
            assert len(img_metas[0]) == feat.shape[0]

        valid_ratios = None
        if img_metas is not None and self.mask:
            valid_ratios = img_metas[-1]

        seq_len = self.max_seq_len
        bsz = feat.shape[0]
        start_token = paddle.full(
            (bsz, ), fill_value=self.start_idx, dtype='int64')
        # bsz
        start_token = self.embedding(start_token)
        # bsz * emb_dim
        emb_dim = start_token.shape[1]
        start_token = start_token.unsqueeze(1)
        start_token = paddle.expand(start_token, shape=[bsz, seq_len, emb_dim])
        # bsz * seq_len * emb_dim
        out_enc = out_enc.unsqueeze(1)
        # bsz * 1 * emb_dim
        decoder_input = paddle.concat((out_enc, start_token), axis=1)
        # bsz * (seq_len + 1) * emb_dim

        outputs = []
        for i in range(1, seq_len + 1):
            decoder_output = self._2d_attention(
                decoder_input, feat, out_enc, valid_ratios=valid_ratios)
            char_output = decoder_output[:, i, :]  # bsz * num_classes
            char_output = F.softmax(char_output, -1)
            outputs.append(char_output)
            max_idx = paddle.argmax(char_output, axis=1, keepdim=False)
            char_embedding = self.embedding(max_idx)  # bsz * emb_dim
            if i < seq_len:
                # Feed the predicted token into the next decoding step.
                decoder_input[:, i + 1, :] = char_embedding

        outputs = paddle.stack(outputs, 1)  # bsz * seq_len * num_classes

        return outputs
class SARHead(nn.Layer):
    """Show, Attend and Read (SAR) head: holistic encoder + 2D-attention decoder."""

    def __init__(self,
                 in_channels,
                 out_channels,
                 enc_dim=512,
                 max_text_length=30,
                 enc_bi_rnn=False,
                 enc_drop_rnn=0.1,
                 enc_gru=False,
                 dec_bi_rnn=False,
                 dec_drop_rnn=0.0,
                 dec_gru=False,
                 d_k=512,
                 pred_dropout=0.1,
                 pred_concat=True,
                 **kwargs):
        super(SARHead, self).__init__()

        # Encoder producing one holistic feature per image.
        self.encoder = SAREncoder(
            enc_bi_rnn=enc_bi_rnn,
            enc_drop_rnn=enc_drop_rnn,
            enc_gru=enc_gru,
            d_model=in_channels,
            d_enc=enc_dim)

        # Parallel 2D-attention decoder.
        self.decoder = ParallelSARDecoder(
            out_channels=out_channels,
            enc_bi_rnn=enc_bi_rnn,
            dec_bi_rnn=dec_bi_rnn,
            dec_drop_rnn=dec_drop_rnn,
            dec_gru=dec_gru,
            d_model=in_channels,
            d_enc=enc_dim,
            d_k=d_k,
            pred_dropout=pred_dropout,
            max_text_length=max_text_length,
            pred_concat=pred_concat)

    def forward(self, feat, targets=None):
        '''
        targets (img_metas): [label, valid_ratio]
        '''
        holistic_feat = self.encoder(feat, targets)  # bsz c

        if not self.training:
            # Greedy decoding path; returns (bsz, seq_len, num_classes).
            return self.decoder(
                feat,
                holistic_feat,
                label=None,
                img_metas=targets,
                train_mode=False)

        # Teacher-forced decoding with the ground-truth label sequence.
        gt_label = paddle.to_tensor(targets[0], dtype='int64')
        return self.decoder(feat, holistic_feat, gt_label, img_metas=targets)
280
backend/ppocr/modeling/heads/rec_srn_head.py
Normal file
280
backend/ppocr/modeling/heads/rec_srn_head.py
Normal file
@@ -0,0 +1,280 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import math
|
||||
import paddle
|
||||
from paddle import nn, ParamAttr
|
||||
from paddle.nn import functional as F
|
||||
import paddle.fluid as fluid
|
||||
import numpy as np
|
||||
from .self_attention import WrapEncoderForFeature
|
||||
from .self_attention import WrapEncoder
|
||||
from paddle.static import Program
|
||||
from ppocr.modeling.backbones.rec_resnet_fpn import ResNetFPN
|
||||
import paddle.fluid.framework as framework
|
||||
|
||||
from collections import OrderedDict
|
||||
gradient_clip = 10
|
||||
|
||||
|
||||
class PVAM(nn.Layer):
    """Parallel Visual Attention Module of SRN.

    Encodes the flattened backbone feature map with a transformer encoder,
    then reads out one attended feature per target position via additive
    attention between reading-order embeddings and the word features.
    """

    def __init__(self, in_channels, char_num, max_text_length, num_heads,
                 num_encoder_tus, hidden_dims):
        super(PVAM, self).__init__()
        self.char_num = char_num
        self.max_length = max_text_length
        self.num_heads = num_heads
        self.num_encoder_TUs = num_encoder_tus
        self.hidden_dims = hidden_dims
        # Transformer encoder over the flattened spatial sequence.
        t = 256
        c = 512
        self.wrap_encoder_for_feature = WrapEncoderForFeature(
            src_vocab_size=1,
            max_length=t,
            n_layer=self.num_encoder_TUs,
            n_head=self.num_heads,
            d_key=int(self.hidden_dims / self.num_heads),
            d_value=int(self.hidden_dims / self.num_heads),
            d_model=self.hidden_dims,
            d_inner_hid=self.hidden_dims,
            prepostprocess_dropout=0.1,
            attention_dropout=0.1,
            relu_dropout=0.1,
            preprocess_cmd="n",
            postprocess_cmd="da",
            weight_sharing=True)

        # Additive-attention parameters for parallel reading.
        self.flatten0 = paddle.nn.Flatten(start_axis=0, stop_axis=1)
        self.fc0 = paddle.nn.Linear(
            in_features=in_channels,
            out_features=in_channels, )
        self.emb = paddle.nn.Embedding(
            num_embeddings=self.max_length, embedding_dim=in_channels)
        self.flatten1 = paddle.nn.Flatten(start_axis=0, stop_axis=2)
        self.fc1 = paddle.nn.Linear(
            in_features=in_channels, out_features=1, bias_attr=False)

    def forward(self, inputs, encoder_word_pos, gsrm_word_pos):
        b, c, h, w = inputs.shape
        # Flatten the spatial grid: (b, c, h, w) -> (b, h*w, c).
        feat_seq = paddle.transpose(
            paddle.reshape(inputs, shape=[-1, c, h * w]), perm=[0, 2, 1])

        # Transformer encoder over the spatial sequence.
        enc_inputs = [feat_seq, encoder_word_pos, None]
        word_features = self.wrap_encoder_for_feature(enc_inputs)

        # Additive attention between reading-order embeddings and features.
        b, t, c = word_features.shape
        word_features = self.fc0(word_features)
        feat_tiled = paddle.tile(
            paddle.reshape(word_features, [-1, 1, t, c]),
            [1, self.max_length, 1, 1])
        pos_tiled = paddle.tile(
            paddle.reshape(
                self.emb(gsrm_word_pos), [-1, self.max_length, 1, c]),
            [1, 1, t, 1])
        attention_weight = self.fc1(F.tanh(pos_tiled + feat_tiled))
        attention_weight = F.softmax(
            paddle.reshape(attention_weight, shape=[-1, self.max_length, t]),
            axis=-1)
        # Weighted sum over spatial positions: [b, max_length, c].
        return paddle.matmul(attention_weight, word_features)
class GSRM(nn.Layer):
    """Global Semantic Reasoning Module of SRN.

    First projects PVAM features to per-position character logits
    (visual-to-semantic embedding), then reasons over the predicted token
    sequence with two transformer encoders (forward- and backward-shifted
    views) whose embedding table is shared; the output is projected back
    onto the vocabulary through that same shared embedding.
    """

    def __init__(self, in_channels, char_num, max_text_length, num_heads,
                 num_encoder_tus, num_decoder_tus, hidden_dims):
        super(GSRM, self).__init__()
        self.char_num = char_num
        self.max_length = max_text_length
        self.num_heads = num_heads
        self.num_encoder_TUs = num_encoder_tus
        self.num_decoder_TUs = num_decoder_tus
        self.hidden_dims = hidden_dims

        # Visual-to-semantic projection onto the character vocabulary.
        self.fc0 = paddle.nn.Linear(
            in_features=in_channels, out_features=self.char_num)
        # Forward-direction reasoning transformer (+1 vocab slot for padding).
        self.wrap_encoder0 = WrapEncoder(
            src_vocab_size=self.char_num + 1,
            max_length=self.max_length,
            n_layer=self.num_decoder_TUs,
            n_head=self.num_heads,
            d_key=int(self.hidden_dims / self.num_heads),
            d_value=int(self.hidden_dims / self.num_heads),
            d_model=self.hidden_dims,
            d_inner_hid=self.hidden_dims,
            prepostprocess_dropout=0.1,
            attention_dropout=0.1,
            relu_dropout=0.1,
            preprocess_cmd="n",
            postprocess_cmd="da",
            weight_sharing=True)

        # Backward-direction reasoning transformer.
        self.wrap_encoder1 = WrapEncoder(
            src_vocab_size=self.char_num + 1,
            max_length=self.max_length,
            n_layer=self.num_decoder_TUs,
            n_head=self.num_heads,
            d_key=int(self.hidden_dims / self.num_heads),
            d_value=int(self.hidden_dims / self.num_heads),
            d_model=self.hidden_dims,
            d_inner_hid=self.hidden_dims,
            prepostprocess_dropout=0.1,
            attention_dropout=0.1,
            relu_dropout=0.1,
            preprocess_cmd="n",
            postprocess_cmd="da",
            weight_sharing=True)

        # Tie the output projection to the shared input embedding table.
        self.mul = lambda x: paddle.matmul(x=x,
                                           y=self.wrap_encoder0.prepare_decoder.emb0.weight,
                                           transpose_y=True)

    def forward(self, inputs, gsrm_word_pos, gsrm_slf_attn_bias1,
                gsrm_slf_attn_bias2):
        """Return (gsrm_features, word_out, gsrm_out).

        inputs: PVAM features of shape [b, t, c].
        word_out / gsrm_out are flattened [b*t, char_num] logits.
        """
        # ===== GSRM Visual-to-semantic embedding block =====
        b, t, c = inputs.shape
        pvam_features = paddle.reshape(inputs, [-1, c])
        word_out = self.fc0(pvam_features)
        # Hard decision per position, fed into the reasoning transformers.
        word_ids = paddle.argmax(F.softmax(word_out), axis=1)
        word_ids = paddle.reshape(x=word_ids, shape=[-1, t, 1])

        #===== GSRM Semantic reasoning block =====
        """
        This module is achieved through bi-transformers,
        ngram_feature1 is the froward one, ngram_fetaure2 is the backward one
        """
        pad_idx = self.char_num

        # Forward view: shift tokens right by one, padding the front.
        word1 = paddle.cast(word_ids, "float32")
        word1 = F.pad(word1, [1, 0], value=1.0 * pad_idx, data_format="NLC")
        word1 = paddle.cast(word1, "int64")
        word1 = word1[:, :-1, :]
        # Backward view: the unshifted token sequence.
        word2 = word_ids

        enc_inputs_1 = [word1, gsrm_word_pos, gsrm_slf_attn_bias1]
        enc_inputs_2 = [word2, gsrm_word_pos, gsrm_slf_attn_bias2]

        gsrm_feature1 = self.wrap_encoder0(enc_inputs_1)
        gsrm_feature2 = self.wrap_encoder1(enc_inputs_2)

        # Shift the backward features left by one and combine both directions.
        gsrm_feature2 = F.pad(gsrm_feature2, [0, 1],
                              value=0.,
                              data_format="NLC")
        gsrm_feature2 = gsrm_feature2[:, 1:, ]
        gsrm_features = gsrm_feature1 + gsrm_feature2

        # Project back onto the vocabulary via the tied embedding weights.
        gsrm_out = self.mul(gsrm_features)

        b, t, c = gsrm_out.shape
        gsrm_out = paddle.reshape(gsrm_out, [-1, c])

        return gsrm_features, word_out, gsrm_out
class VSFD(nn.Layer):
    """Visual-Semantic Fusion Decoder of SRN.

    Learns a per-channel gate from the concatenated visual (PVAM) and
    semantic (GSRM) features and fuses the two streams before the final
    character classifier.
    """

    def __init__(self, in_channels=512, pvam_ch=512, char_num=38):
        super(VSFD, self).__init__()
        self.char_num = char_num
        self.fc0 = paddle.nn.Linear(
            in_features=in_channels * 2, out_features=pvam_ch)
        self.fc1 = paddle.nn.Linear(
            in_features=pvam_ch, out_features=self.char_num)

    def forward(self, pvam_feature, gsrm_feature):
        b, t, c1 = pvam_feature.shape
        b, t, c2 = gsrm_feature.shape
        # Gate in [0, 1] computed from both feature streams.
        fused = paddle.concat([pvam_feature, gsrm_feature], axis=2)
        fused = paddle.reshape(fused, shape=[-1, c1 + c2])
        gate = F.sigmoid(self.fc0(fused))
        gate = paddle.reshape(gate, shape=[-1, t, c1])
        # Convex combination of visual and semantic features, then classify.
        combined = gate * pvam_feature + (1.0 - gate) * gsrm_feature
        combined = paddle.reshape(combined, shape=[-1, c1])
        return self.fc1(combined)
class SRNHead(nn.Layer):
    """SRN (Semantic Reasoning Network) recognition head.

    Pipeline: PVAM extracts per-character visual features, GSRM reasons
    over the preliminary decoding to produce semantic features, and VSFD
    fuses both streams into the final character distribution.

    Args:
        in_channels (int): channel count of the backbone feature.
        out_channels (int): character-set size (number of classes).
        max_text_length (int): maximum decoded sequence length.
        num_heads (int): attention heads in the transformer units.
        num_encoder_TUs (int): transformer units in the encoders.
        num_decoder_TUs (int): transformer units in the GSRM decoder.
        hidden_dims (int): hidden size of the transformer units.
    """

    def __init__(self, in_channels, out_channels, max_text_length, num_heads,
                 num_encoder_TUs, num_decoder_TUs, hidden_dims, **kwargs):
        super(SRNHead, self).__init__()
        self.char_num = out_channels
        self.max_length = max_text_length
        self.num_heads = num_heads
        self.num_encoder_TUs = num_encoder_TUs
        self.num_decoder_TUs = num_decoder_TUs
        self.hidden_dims = hidden_dims

        # Parallel Visual Attention Module.
        self.pvam = PVAM(
            in_channels=in_channels,
            char_num=self.char_num,
            max_text_length=self.max_length,
            num_heads=self.num_heads,
            num_encoder_tus=self.num_encoder_TUs,
            hidden_dims=self.hidden_dims)

        # Global Semantic Reasoning Module.
        self.gsrm = GSRM(
            in_channels=in_channels,
            char_num=self.char_num,
            max_text_length=self.max_length,
            num_heads=self.num_heads,
            num_encoder_tus=self.num_encoder_TUs,
            num_decoder_tus=self.num_decoder_TUs,
            hidden_dims=self.hidden_dims)
        # Visual-Semantic Fusion Decoder.
        self.vsfd = VSFD(in_channels=in_channels, char_num=self.char_num)

        # Tie the word-embedding tables of GSRM's two encoders so both
        # share a single embedding matrix.
        self.gsrm.wrap_encoder1.prepare_decoder.emb0 = self.gsrm.wrap_encoder0.prepare_decoder.emb0

    def forward(self, inputs, targets=None):
        # The last four entries of `targets` carry the precomputed position
        # indices and self-attention bias masks, in this fixed order.
        others = targets[-4:]
        encoder_word_pos = others[0]
        gsrm_word_pos = others[1]
        gsrm_slf_attn_bias1 = others[2]
        gsrm_slf_attn_bias2 = others[3]

        pvam_feature = self.pvam(inputs, encoder_word_pos, gsrm_word_pos)

        gsrm_feature, word_out, gsrm_out = self.gsrm(
            pvam_feature, gsrm_word_pos, gsrm_slf_attn_bias1,
            gsrm_slf_attn_bias2)

        final_out = self.vsfd(pvam_feature, gsrm_feature)
        # Convert logits to probabilities only at inference time.
        if not self.training:
            final_out = F.softmax(final_out, axis=1)

        # Greedy decoding: index of the top class per position.
        _, decoded_out = paddle.topk(final_out, k=1)

        predicts = OrderedDict([
            ('predict', final_out),
            ('pvam_feature', pvam_feature),
            ('decoded_out', decoded_out),
            ('word_out', word_out),
            ('gsrm_out', gsrm_out),
        ])

        return predicts
|
||||
406
backend/ppocr/modeling/heads/self_attention.py
Normal file
406
backend/ppocr/modeling/heads/self_attention.py
Normal file
@@ -0,0 +1,406 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import math
|
||||
|
||||
import paddle
|
||||
from paddle import ParamAttr, nn
|
||||
from paddle import nn, ParamAttr
|
||||
from paddle.nn import functional as F
|
||||
import paddle.fluid as fluid
|
||||
import numpy as np
|
||||
gradient_clip = 10
|
||||
|
||||
|
||||
class WrapEncoderForFeature(nn.Layer):
    """Transformer encoder driven directly by convolutional features.

    Unlike ``WrapEncoder`` no word-embedding lookup is performed: the input
    features are used as-is and only a learned positional embedding is
    added before the encoder stack.
    """

    def __init__(self,
                 src_vocab_size,
                 max_length,
                 n_layer,
                 n_head,
                 d_key,
                 d_value,
                 d_model,
                 d_inner_hid,
                 prepostprocess_dropout,
                 attention_dropout,
                 relu_dropout,
                 preprocess_cmd,
                 postprocess_cmd,
                 weight_sharing,
                 bos_idx=0):
        super(WrapEncoderForFeature, self).__init__()

        # Positional-embedding front-end for raw features.
        self.prepare_encoder = PrepareEncoder(
            src_vocab_size,
            d_model,
            max_length,
            prepostprocess_dropout,
            bos_idx=bos_idx,
            word_emb_param_name="src_word_emb_table")
        # Transformer encoder stack.
        self.encoder = Encoder(n_layer, n_head, d_key, d_value, d_model,
                               d_inner_hid, prepostprocess_dropout,
                               attention_dropout, relu_dropout, preprocess_cmd,
                               postprocess_cmd)

    def forward(self, enc_inputs):
        features, positions, attn_bias = enc_inputs
        embedded = self.prepare_encoder(features, positions)
        return self.encoder(embedded, attn_bias)
|
||||
|
||||
|
||||
class WrapEncoder(nn.Layer):
    """
    Word embedder followed by a transformer encoder stack.
    """

    def __init__(self,
                 src_vocab_size,
                 max_length,
                 n_layer,
                 n_head,
                 d_key,
                 d_value,
                 d_model,
                 d_inner_hid,
                 prepostprocess_dropout,
                 attention_dropout,
                 relu_dropout,
                 preprocess_cmd,
                 postprocess_cmd,
                 weight_sharing,
                 bos_idx=0):
        super(WrapEncoder, self).__init__()

        # Token + positional embedding front-end.
        self.prepare_decoder = PrepareDecoder(
            src_vocab_size,
            d_model,
            max_length,
            prepostprocess_dropout,
            bos_idx=bos_idx)
        # Transformer encoder stack.
        self.encoder = Encoder(n_layer, n_head, d_key, d_value, d_model,
                               d_inner_hid, prepostprocess_dropout,
                               attention_dropout, relu_dropout, preprocess_cmd,
                               postprocess_cmd)

    def forward(self, enc_inputs):
        words, positions, attn_bias = enc_inputs
        embedded = self.prepare_decoder(words, positions)
        return self.encoder(embedded, attn_bias)
|
||||
|
||||
|
||||
class Encoder(nn.Layer):
    """
    Stack of ``n_layer`` transformer encoder layers, followed by a final
    pre/post-process step (layer normalization by default).
    """

    def __init__(self,
                 n_layer,
                 n_head,
                 d_key,
                 d_value,
                 d_model,
                 d_inner_hid,
                 prepostprocess_dropout,
                 attention_dropout,
                 relu_dropout,
                 preprocess_cmd="n",
                 postprocess_cmd="da"):

        super(Encoder, self).__init__()

        # Register each layer as a sublayer so its parameters are tracked;
        # names stay "layer_0", "layer_1", ... for state-dict compatibility.
        self.encoder_layers = [
            self.add_sublayer(
                "layer_%d" % idx,
                EncoderLayer(n_head, d_key, d_value, d_model, d_inner_hid,
                             prepostprocess_dropout, attention_dropout,
                             relu_dropout, preprocess_cmd, postprocess_cmd))
            for idx in range(n_layer)
        ]
        self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
                                             prepostprocess_dropout)

    def forward(self, enc_input, attn_bias):
        out = enc_input
        for layer in self.encoder_layers:
            out = layer(out, attn_bias)
        return self.processer(out)
|
||||
|
||||
|
||||
class EncoderLayer(nn.Layer):
    """
    One transformer encoder layer: self-attention then a feed-forward
    network, each wrapped in configurable pre/post-processing
    (layer norm / dropout / residual, selected by the cmd strings).
    """

    def __init__(self,
                 n_head,
                 d_key,
                 d_value,
                 d_model,
                 d_inner_hid,
                 prepostprocess_dropout,
                 attention_dropout,
                 relu_dropout,
                 preprocess_cmd="n",
                 postprocess_cmd="da"):

        super(EncoderLayer, self).__init__()
        # Self-attention sub-block.
        self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
                                                 prepostprocess_dropout)
        self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
                                            attention_dropout)
        self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model,
                                                  prepostprocess_dropout)
        # Feed-forward sub-block.
        self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
                                                 prepostprocess_dropout)
        self.ffn = FFN(d_inner_hid, d_model, relu_dropout)
        self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model,
                                                  prepostprocess_dropout)

    def forward(self, enc_input, attn_bias):
        # Self-attention with residual around the pre-processed input.
        normed = self.preprocesser1(enc_input)
        attn_out = self.self_attn(normed, None, None, attn_bias)
        attn_out = self.postprocesser1(attn_out, enc_input)

        # Feed-forward with residual around the attention output.
        ffn_out = self.ffn(self.preprocesser2(attn_out))
        return self.postprocesser2(ffn_out, attn_out)
|
||||
|
||||
|
||||
class MultiHeadAttention(nn.Layer):
    """
    Multi-Head Attention

    Scaled dot-product attention over ``n_head`` heads. The optional
    ``cache`` dict enables incremental decoding:
      * ``static_k`` / ``static_v``: encoder K/V for cross-attention,
        computed once and reused on subsequent calls.
      * ``k`` / ``v``: running K/V history for decoder self-attention,
        extended by one step per call.
    """

    def __init__(self, d_key, d_value, d_model, n_head=1, dropout_rate=0.):
        super(MultiHeadAttention, self).__init__()
        self.n_head = n_head
        self.d_key = d_key
        self.d_value = d_value
        self.d_model = d_model
        self.dropout_rate = dropout_rate
        # Bias-free linear projections for Q, K, V and the output.
        self.q_fc = paddle.nn.Linear(
            in_features=d_model, out_features=d_key * n_head, bias_attr=False)
        self.k_fc = paddle.nn.Linear(
            in_features=d_model, out_features=d_key * n_head, bias_attr=False)
        self.v_fc = paddle.nn.Linear(
            in_features=d_model, out_features=d_value * n_head, bias_attr=False)
        self.proj_fc = paddle.nn.Linear(
            in_features=d_value * n_head, out_features=d_model, bias_attr=False)

    def _prepare_qkv(self, queries, keys, values, cache=None):
        """Project inputs to per-head Q/K/V shaped [B, n_head, T, d]."""
        # NOTE(review): forward() substitutes queries for None keys before
        # calling this, so from forward() this branch never sees keys=None;
        # harmless when cache is None. Confirm intent if adding new callers.
        if keys is None:  # self-attention
            keys, values = queries, queries
            static_kv = False
        else:  # cross-attention
            static_kv = True

        q = self.q_fc(queries)
        # A 0 in the target shape keeps the corresponding input dimension.
        q = paddle.reshape(x=q, shape=[0, 0, self.n_head, self.d_key])
        q = paddle.transpose(x=q, perm=[0, 2, 1, 3])

        if cache is not None and static_kv and "static_k" in cache:
            # for encoder-decoder attention in inference and has cached
            k = cache["static_k"]
            v = cache["static_v"]
        else:
            k = self.k_fc(keys)
            v = self.v_fc(values)
            k = paddle.reshape(x=k, shape=[0, 0, self.n_head, self.d_key])
            k = paddle.transpose(x=k, perm=[0, 2, 1, 3])
            v = paddle.reshape(x=v, shape=[0, 0, self.n_head, self.d_value])
            v = paddle.transpose(x=v, perm=[0, 2, 1, 3])

        if cache is not None:
            if static_kv and not "static_k" in cache:
                # for encoder-decoder attention in inference and has not cached
                cache["static_k"], cache["static_v"] = k, v
            elif not static_kv:
                # for decoder self-attention in inference
                cache_k, cache_v = cache["k"], cache["v"]
                k = paddle.concat([cache_k, k], axis=2)
                v = paddle.concat([cache_v, v], axis=2)
                cache["k"], cache["v"] = k, v

        return q, k, v

    def forward(self, queries, keys, values, attn_bias, cache=None):
        # compute q ,k ,v
        keys = queries if keys is None else keys
        values = keys if values is None else values
        q, k, v = self._prepare_qkv(queries, keys, values, cache)

        # scale dot product attention
        product = paddle.matmul(x=q, y=k, transpose_y=True)
        # NOTE(review): scales by d_model**-0.5 instead of the usual
        # per-head d_key**-0.5 -- appears deliberate; confirm before changing.
        product = product * self.d_model**-0.5
        if attn_bias is not None:
            product += attn_bias
        weights = F.softmax(product)
        if self.dropout_rate:
            weights = F.dropout(
                weights, p=self.dropout_rate, mode="downscale_in_infer")
        out = paddle.matmul(weights, v)

        # combine heads
        out = paddle.transpose(out, perm=[0, 2, 1, 3])
        out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])

        # project to output
        out = self.proj_fc(out)

        return out
|
||||
|
||||
|
||||
class PrePostProcessLayer(nn.Layer):
    """
    PrePostProcessLayer

    Applies a sequence of operations selected character-by-character from
    `process_cmd`: 'a' -> residual add, 'n' -> layer norm, 'd' -> dropout.
    E.g. "da" performs dropout then adds the residual.
    """

    def __init__(self, process_cmd, d_model, dropout_rate):
        super(PrePostProcessLayer, self).__init__()
        self.process_cmd = process_cmd
        # One callable per command character, applied in order in forward().
        self.functors = []
        for cmd in self.process_cmd:
            if cmd == "a":  # add residual connection
                self.functors.append(lambda x, y: x + y if y is not None else x)
            elif cmd == "n":  # add layer normalization
                # add_sublayer registers the LayerNorm so its parameters are
                # tracked; the name depends on the current sublayer count.
                self.functors.append(
                    self.add_sublayer(
                        "layer_norm_%d" % len(self.sublayers()),
                        paddle.nn.LayerNorm(
                            normalized_shape=d_model,
                            weight_attr=fluid.ParamAttr(
                                initializer=fluid.initializer.Constant(1.)),
                            bias_attr=fluid.ParamAttr(
                                initializer=fluid.initializer.Constant(0.)))))
            elif cmd == "d":  # add dropout
                # No-op when dropout_rate is falsy.
                self.functors.append(lambda x: F.dropout(
                    x, p=dropout_rate, mode="downscale_in_infer")
                    if dropout_rate else x)

    def forward(self, x, residual=None):
        # Apply functors in process_cmd order; only the residual-add
        # step ('a') consumes the `residual` argument.
        for i, cmd in enumerate(self.process_cmd):
            if cmd == "a":
                x = self.functors[i](x, residual)
            else:
                x = self.functors[i](x)
        return x
|
||||
|
||||
|
||||
class PrepareEncoder(nn.Layer):
    """Add a learned positional embedding to already-embedded features.

    The incoming ``src_word`` tensor is used directly as the feature
    embedding -- no vocabulary lookup happens here, so ``src_vocab_size``,
    ``bos_idx`` and the ``*_param_name`` arguments are accepted only for
    signature parity with ``PrepareDecoder``.
    """

    def __init__(self,
                 src_vocab_size,
                 src_emb_dim,
                 src_max_len,
                 dropout_rate=0,
                 bos_idx=0,
                 word_emb_param_name=None,
                 pos_enc_param_name=None):
        super(PrepareEncoder, self).__init__()
        self.src_emb_dim = src_emb_dim
        self.src_max_len = src_max_len
        # Positional embedding table: one row per sequence position.
        self.emb = paddle.nn.Embedding(
            num_embeddings=self.src_max_len, embedding_dim=self.src_emb_dim)
        self.dropout_rate = dropout_rate

    def forward(self, src_word, src_pos):
        # paddle.cast replaces deprecated fluid.layers.cast, consistent
        # with the paddle.cast calls used elsewhere in this file.
        src_word_emb = paddle.cast(src_word, 'float32')
        # Scale embeddings by sqrt(d) before adding position information.
        src_word_emb = paddle.scale(x=src_word_emb, scale=self.src_emb_dim**0.5)
        src_pos = paddle.squeeze(src_pos, axis=-1)
        src_pos_enc = self.emb(src_pos)
        # Positional encodings receive no gradient through this path.
        src_pos_enc.stop_gradient = True
        enc_input = src_word_emb + src_pos_enc
        if self.dropout_rate:
            out = F.dropout(
                x=enc_input, p=self.dropout_rate, mode="downscale_in_infer")
        else:
            out = enc_input
        return out
|
||||
|
||||
|
||||
class PrepareDecoder(nn.Layer):
    """Look up word embeddings, scale them, and add positional embeddings.

    Args:
        src_vocab_size: vocabulary size of the word-embedding table.
        src_emb_dim: embedding dimension.
        src_max_len: maximum sequence length (positional table size).
        dropout_rate: dropout on the summed embedding; 0 disables it.
        bos_idx: index used as padding_idx of the word embedding.
        word_emb_param_name / pos_enc_param_name: optional parameter names
            (enables weight sharing, e.g. the tied emb0 in SRNHead).
    """

    def __init__(self,
                 src_vocab_size,
                 src_emb_dim,
                 src_max_len,
                 dropout_rate=0,
                 bos_idx=0,
                 word_emb_param_name=None,
                 pos_enc_param_name=None):
        super(PrepareDecoder, self).__init__()
        self.src_emb_dim = src_emb_dim
        # Word embedding; rows initialized ~N(0, d**-0.5), bos_idx padded.
        self.emb0 = paddle.nn.Embedding(
            num_embeddings=src_vocab_size,
            embedding_dim=self.src_emb_dim,
            padding_idx=bos_idx,
            weight_attr=paddle.ParamAttr(
                name=word_emb_param_name,
                initializer=nn.initializer.Normal(0., src_emb_dim**-0.5)))
        # Positional embedding: one row per sequence position.
        self.emb1 = paddle.nn.Embedding(
            num_embeddings=src_max_len,
            embedding_dim=self.src_emb_dim,
            weight_attr=paddle.ParamAttr(name=pos_enc_param_name))
        self.dropout_rate = dropout_rate

    def forward(self, src_word, src_pos):
        # paddle.cast replaces deprecated fluid.layers.cast, consistent
        # with the paddle.cast calls used elsewhere in this file.
        src_word = paddle.cast(src_word, 'int64')
        src_word = paddle.squeeze(src_word, axis=-1)
        src_word_emb = self.emb0(src_word)
        # Scale embeddings by sqrt(d) before adding position information.
        src_word_emb = paddle.scale(x=src_word_emb, scale=self.src_emb_dim**0.5)
        src_pos = paddle.squeeze(src_pos, axis=-1)
        src_pos_enc = self.emb1(src_pos)
        # Positional encodings receive no gradient through this path.
        src_pos_enc.stop_gradient = True
        enc_input = src_word_emb + src_pos_enc
        if self.dropout_rate:
            out = F.dropout(
                x=enc_input, p=self.dropout_rate, mode="downscale_in_infer")
        else:
            out = enc_input
        return out
|
||||
|
||||
|
||||
class FFN(nn.Layer):
    """
    Position-wise feed-forward network:
    Linear -> ReLU -> (optional dropout) -> Linear.
    """

    def __init__(self, d_inner_hid, d_model, dropout_rate):
        super(FFN, self).__init__()
        self.dropout_rate = dropout_rate
        # Expand to the inner dimension, then project back to d_model.
        self.fc1 = paddle.nn.Linear(
            in_features=d_model, out_features=d_inner_hid)
        self.fc2 = paddle.nn.Linear(
            in_features=d_inner_hid, out_features=d_model)

    def forward(self, x):
        inner = F.relu(self.fc1(x))
        if self.dropout_rate:
            inner = F.dropout(
                inner, p=self.dropout_rate, mode="downscale_in_infer")
        return self.fc2(inner)
|
||||
246
backend/ppocr/modeling/heads/table_att_head.py
Normal file
246
backend/ppocr/modeling/heads/table_att_head.py
Normal file
@@ -0,0 +1,246 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
import numpy as np
|
||||
|
||||
|
||||
class TableAttentionHead(nn.Layer):
    """Attention head for table structure recognition.

    Decodes a structure-token sequence (and per-step cell location
    predictions) from backbone features via a GRU attention cell.

    Args:
        in_channels (list): backbone output channels; the last entry used.
        hidden_size (int): hidden size of the attention GRU cell.
        loc_type (int): 1 -> regress locations from the GRU output only;
            otherwise concatenate projected visual features first.
        in_max_len (int): input max side length; selects the input size of
            the location-feature projection (640 / 800 / other).
        max_text_length (int): stored but unused here -- config parity.
        max_elem_length (int): maximum number of structure tokens decoded.
        max_cell_num (int): stored but unused here -- config parity.
    """

    def __init__(self,
                 in_channels,
                 hidden_size,
                 loc_type,
                 in_max_len=488,
                 max_text_length=100,
                 max_elem_length=800,
                 max_cell_num=500,
                 **kwargs):
        super(TableAttentionHead, self).__init__()
        self.input_size = in_channels[-1]
        self.hidden_size = hidden_size
        # Size of the structure-token vocabulary.
        self.elem_num = 30
        self.max_text_length = max_text_length
        self.max_elem_length = max_elem_length
        self.max_cell_num = max_cell_num

        self.structure_attention_cell = AttentionGRUCell(
            self.input_size, hidden_size, self.elem_num, use_gru=False)
        self.structure_generator = nn.Linear(hidden_size, self.elem_num)
        self.loc_type = loc_type
        self.in_max_len = in_max_len

        if self.loc_type == 1:
            self.loc_generator = nn.Linear(hidden_size, 4)
        else:
            # Flattened feature-map width depends on input resolution;
            # project it to max_elem_length + 1 decode steps.
            if self.in_max_len == 640:
                self.loc_fea_trans = nn.Linear(400, self.max_elem_length + 1)
            elif self.in_max_len == 800:
                self.loc_fea_trans = nn.Linear(625, self.max_elem_length + 1)
            else:
                self.loc_fea_trans = nn.Linear(256, self.max_elem_length + 1)
            self.loc_generator = nn.Linear(self.input_size + hidden_size, 4)

    def _char_to_onehot(self, input_char, onehot_dim):
        # One-hot encode the previous token id for the GRU cell input.
        input_ont_hot = F.one_hot(input_char, onehot_dim)
        return input_ont_hot

    def forward(self, inputs, targets=None):
        # if and else branch are both needed when you want to assign a variable
        # if you modify the var in just one branch, then the modification will not work.
        fea = inputs[-1]
        if len(fea.shape) == 3:
            pass
        else:
            # Flatten spatial dims: (N, C, H, W) -> (N, C, H*W) -> (N, T, C).
            last_shape = int(np.prod(fea.shape[2:]))  # gry added
            fea = paddle.reshape(fea, [fea.shape[0], fea.shape[1], last_shape])
            fea = fea.transpose([0, 2, 1])  # (NTC)(batch, width, channels)
        batch_size = fea.shape[0]

        hidden = paddle.zeros((batch_size, self.hidden_size))
        output_hiddens = []
        if self.training and targets is not None:
            # Teacher forcing: feed the ground-truth token at every step.
            structure = targets[0]
            for i in range(self.max_elem_length + 1):
                elem_onehots = self._char_to_onehot(
                    structure[:, i], onehot_dim=self.elem_num)
                (outputs, hidden), alpha = self.structure_attention_cell(
                    hidden, fea, elem_onehots)
                output_hiddens.append(paddle.unsqueeze(outputs, axis=1))
            output = paddle.concat(output_hiddens, axis=1)
            structure_probs = self.structure_generator(output)
            if self.loc_type == 1:
                loc_preds = self.loc_generator(output)
                loc_preds = F.sigmoid(loc_preds)
            else:
                # Concatenate projected visual features with GRU outputs
                # before regressing the 4 location values.
                loc_fea = fea.transpose([0, 2, 1])
                loc_fea = self.loc_fea_trans(loc_fea)
                loc_fea = loc_fea.transpose([0, 2, 1])
                loc_concat = paddle.concat([output, loc_fea], axis=2)
                loc_preds = self.loc_generator(loc_concat)
                loc_preds = F.sigmoid(loc_preds)
        else:
            # Greedy autoregressive decoding, seeded with token id 0.
            temp_elem = paddle.zeros(shape=[batch_size], dtype="int32")
            structure_probs = None
            loc_preds = None
            elem_onehots = None
            outputs = None
            alpha = None
            max_elem_length = paddle.to_tensor(self.max_elem_length)
            i = 0
            while i < max_elem_length + 1:
                elem_onehots = self._char_to_onehot(
                    temp_elem, onehot_dim=self.elem_num)
                (outputs, hidden), alpha = self.structure_attention_cell(
                    hidden, fea, elem_onehots)
                output_hiddens.append(paddle.unsqueeze(outputs, axis=1))
                structure_probs_step = self.structure_generator(outputs)
                # Previous prediction becomes the next step's input token.
                temp_elem = structure_probs_step.argmax(axis=1, dtype="int32")
                i += 1

            output = paddle.concat(output_hiddens, axis=1)
            structure_probs = self.structure_generator(output)
            structure_probs = F.softmax(structure_probs)
            if self.loc_type == 1:
                loc_preds = self.loc_generator(output)
                loc_preds = F.sigmoid(loc_preds)
            else:
                loc_fea = fea.transpose([0, 2, 1])
                loc_fea = self.loc_fea_trans(loc_fea)
                loc_fea = loc_fea.transpose([0, 2, 1])
                loc_concat = paddle.concat([output, loc_fea], axis=2)
                loc_preds = self.loc_generator(loc_concat)
                loc_preds = F.sigmoid(loc_preds)
        return {'structure_probs': structure_probs, 'loc_preds': loc_preds}
|
||||
|
||||
|
||||
class AttentionGRUCell(nn.Layer):
    """One attention decoding step backed by a GRU cell.

    Note: the ``use_gru`` flag is accepted for signature parity but not
    consulted -- a GRUCell is always constructed here.
    """

    def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False):
        super(AttentionGRUCell, self).__init__()
        # Additive-attention projections and scorer.
        self.i2h = nn.Linear(input_size, hidden_size, bias_attr=False)
        self.h2h = nn.Linear(hidden_size, hidden_size)
        self.score = nn.Linear(hidden_size, 1, bias_attr=False)
        self.rnn = nn.GRUCell(
            input_size=input_size + num_embeddings, hidden_size=hidden_size)
        self.hidden_size = hidden_size

    def forward(self, prev_hidden, batch_H, char_onehots):
        # Additive attention: score each encoder position against h_{t-1}.
        projected = paddle.add(
            self.i2h(batch_H),
            paddle.unsqueeze(self.h2h(prev_hidden), axis=1))
        energy = self.score(paddle.tanh(projected))

        weights = F.softmax(energy, axis=1)
        weights = paddle.transpose(weights, [0, 2, 1])
        # Attention-weighted context over the encoder sequence.
        context = paddle.squeeze(paddle.mm(weights, batch_H), axis=1)
        step_input = paddle.concat([context, char_onehots], 1)
        next_hidden = self.rnn(step_input, prev_hidden)
        return next_hidden, weights
|
||||
|
||||
|
||||
class AttentionLSTM(nn.Layer):
    """Attention-based LSTM decoder head.

    At each step an attention context over `inputs` is combined with the
    one-hot of the previous character and fed to an LSTM cell; the cell
    output is projected to class logits.
    """

    def __init__(self, in_channels, out_channels, hidden_size, **kwargs):
        super(AttentionLSTM, self).__init__()
        self.input_size = in_channels
        self.hidden_size = hidden_size
        self.num_classes = out_channels

        self.attention_cell = AttentionLSTMCell(
            in_channels, hidden_size, out_channels, use_gru=False)
        self.generator = nn.Linear(hidden_size, out_channels)

    def _char_to_onehot(self, input_char, onehot_dim):
        # One-hot encode the previous character id for the cell input.
        input_ont_hot = F.one_hot(input_char, onehot_dim)
        return input_ont_hot

    def forward(self, inputs, targets=None, batch_max_length=25):
        batch_size = inputs.shape[0]
        num_steps = batch_max_length

        # Initial LSTM state as an (h, c) pair of zeros.
        hidden = (paddle.zeros((batch_size, self.hidden_size)), paddle.zeros(
            (batch_size, self.hidden_size)))
        output_hiddens = []

        if targets is not None:
            # Teacher forcing: feed the ground-truth previous character.
            for i in range(num_steps):
                # one-hot vectors for a i-th char
                char_onehots = self._char_to_onehot(
                    targets[:, i], onehot_dim=self.num_classes)
                hidden, alpha = self.attention_cell(hidden, inputs,
                                                    char_onehots)

                # The cell returns (output, (h, c)); carry (h, c) forward.
                hidden = (hidden[1][0], hidden[1][1])
                output_hiddens.append(paddle.unsqueeze(hidden[0], axis=1))
            output = paddle.concat(output_hiddens, axis=1)
            probs = self.generator(output)

        else:
            # Greedy decoding seeded with character id 0.
            targets = paddle.zeros(shape=[batch_size], dtype="int32")
            probs = None

            for i in range(num_steps):
                char_onehots = self._char_to_onehot(
                    targets, onehot_dim=self.num_classes)
                hidden, alpha = self.attention_cell(hidden, inputs,
                                                    char_onehots)
                # Logits from the raw cell output before the state swap.
                probs_step = self.generator(hidden[0])
                hidden = (hidden[1][0], hidden[1][1])
                if probs is None:
                    probs = paddle.unsqueeze(probs_step, axis=1)
                else:
                    probs = paddle.concat(
                        [probs, paddle.unsqueeze(
                            probs_step, axis=1)], axis=1)

                # The prediction becomes the next step's input character.
                next_input = probs_step.argmax(axis=1)

                targets = next_input

        return probs
|
||||
|
||||
|
||||
class AttentionLSTMCell(nn.Layer):
    """One attention decoding step backed by an LSTM (or GRU) cell."""

    def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False):
        super(AttentionLSTMCell, self).__init__()
        # Additive-attention projections and scorer.
        self.i2h = nn.Linear(input_size, hidden_size, bias_attr=False)
        self.h2h = nn.Linear(hidden_size, hidden_size)
        self.score = nn.Linear(hidden_size, 1, bias_attr=False)
        # Despite the name, use_gru=True selects a GRU cell.
        if not use_gru:
            self.rnn = nn.LSTMCell(
                input_size=input_size + num_embeddings, hidden_size=hidden_size)
        else:
            self.rnn = nn.GRUCell(
                input_size=input_size + num_embeddings, hidden_size=hidden_size)

        self.hidden_size = hidden_size

    def forward(self, prev_hidden, batch_H, char_onehots):
        # Additive attention: score each encoder position against h_{t-1}
        # (prev_hidden[0] is the hidden part of the (h, c) state).
        projected = paddle.add(
            self.i2h(batch_H),
            paddle.unsqueeze(self.h2h(prev_hidden[0]), axis=1))
        energy = self.score(paddle.tanh(projected))

        weights = F.softmax(energy, axis=1)
        weights = paddle.transpose(weights, [0, 2, 1])
        # Attention-weighted context over the encoder sequence.
        context = paddle.squeeze(paddle.mm(weights, batch_H), axis=1)
        step_input = paddle.concat([context, char_onehots], 1)
        next_hidden = self.rnn(step_input, prev_hidden)

        return next_hidden, weights
|
||||
Reference in New Issue
Block a user