mirror of
https://github.com/YaoFANGUK/video-subtitle-remover.git
synced 2026-02-16 05:01:06 +08:00
init
This commit is contained in:
109
backend/ppocr/data/__init__.py
Normal file
109
backend/ppocr/data/__init__.py
Normal file
@@ -0,0 +1,109 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import os
|
||||
import sys
|
||||
import numpy as np
|
||||
import skimage
|
||||
import paddle
|
||||
import signal
|
||||
import random
|
||||
|
||||
__dir__ = os.path.dirname(os.path.abspath(__file__))
|
||||
sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
|
||||
|
||||
import copy
|
||||
from paddle.io import Dataset, DataLoader, BatchSampler, DistributedBatchSampler
|
||||
import paddle.distributed as dist
|
||||
|
||||
from ppocr.data.imaug import transform, create_operators
|
||||
from ppocr.data.simple_dataset import SimpleDataSet
|
||||
from ppocr.data.lmdb_dataset import LMDBDataSet
|
||||
from ppocr.data.pgnet_dataset import PGDataSet
|
||||
from ppocr.data.pubtab_dataset import PubTabDataSet
|
||||
|
||||
__all__ = ['build_dataloader', 'transform', 'create_operators']
|
||||
|
||||
|
||||
def term_mp(sig_num, frame):
    """Signal handler: kill every process in our process group.

    Installed for SIGINT/SIGTERM by build_dataloader so that Ctrl+C also
    terminates any DataLoader worker subprocesses.
    """
    pid = os.getpid()
    pgid = os.getpgid(pid)
    print("main proc {} exit, kill process group " "{}".format(pid, pgid))
    # SIGKILL the whole group, including this process itself.
    os.killpg(pgid, signal.SIGKILL)
|
||||
|
||||
|
||||
def build_dataloader(config, mode, device, logger, seed=None):
    """Build a paddle.io.DataLoader for the dataset described by *config*.

    Args:
        config (dict): full config; ``config[mode]['dataset']`` selects the
            dataset class and ``config[mode]['loader']`` the loader options.
        mode (str): one of 'Train', 'Eval', 'Test'.
        device: paddle place(s) passed to DataLoader as ``places``.
        logger: logger forwarded to the dataset constructor.
        seed (int, optional): forwarded to the dataset constructor.

    Returns:
        paddle.io.DataLoader over the constructed dataset.
    """
    # Deep-copy so dataset/loader construction cannot mutate the caller's config.
    config = copy.deepcopy(config)

    support_dict = [
        'SimpleDataSet', 'LMDBDataSet', 'PGDataSet', 'PubTabDataSet'
    ]
    module_name = config[mode]['dataset']['name']
    # NOTE(review): asserts are stripped under `python -O`; validation relies
    # on them. eval() below is safe only because module_name is checked
    # against this whitelist first.
    assert module_name in support_dict, Exception(
        'DataSet only support {}'.format(support_dict))
    assert mode in ['Train', 'Eval', 'Test'
                    ], "Mode should be Train, Eval or Test."

    # Resolve the dataset class (imported at module level) by name.
    dataset = eval(module_name)(config, mode, logger, seed)
    loader_config = config[mode]['loader']
    batch_size = loader_config['batch_size_per_card']
    drop_last = loader_config['drop_last']
    shuffle = loader_config['shuffle']
    num_workers = loader_config['num_workers']
    # Shared-memory transfer between workers defaults to on.
    if 'use_shared_memory' in loader_config.keys():
        use_shared_memory = loader_config['use_shared_memory']
    else:
        use_shared_memory = True

    if mode == "Train":
        # Distribute data to multiple cards
        batch_sampler = DistributedBatchSampler(
            dataset=dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            drop_last=drop_last)
    else:
        # Distribute data to single card
        batch_sampler = BatchSampler(
            dataset=dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            drop_last=drop_last)

    # Optional custom collator looked up by name in the sibling collate_fn
    # module (e.g. 'DictCollator'); the name is instantiated with no args.
    if 'collate_fn' in loader_config:
        from . import collate_fn
        collate_fn = getattr(collate_fn, loader_config['collate_fn'])()
    else:
        collate_fn = None
    data_loader = DataLoader(
        dataset=dataset,
        batch_sampler=batch_sampler,
        places=device,
        num_workers=num_workers,
        return_list=True,
        use_shared_memory=use_shared_memory,
        collate_fn=collate_fn)

    # support exit using ctrl+c
    signal.signal(signal.SIGINT, term_mp)
    signal.signal(signal.SIGTERM, term_mp)

    return data_loader
|
||||
72
backend/ppocr/data/collate_fn.py
Normal file
72
backend/ppocr/data/collate_fn.py
Normal file
@@ -0,0 +1,72 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import paddle
|
||||
import numbers
|
||||
import numpy as np
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
class DictCollator(object):
    """Collate a batch of dict samples into one dict of batched values.

    Numeric fields (ndarray / paddle.Tensor / numbers.Number) are converted
    to paddle Tensors; every other field is kept as a plain Python list.
    """

    def __call__(self, batch):
        # todo:support batch operators
        collected = defaultdict(list)
        tensor_keys = []
        for sample in batch:
            for key, value in sample.items():
                is_numeric = isinstance(
                    value, (np.ndarray, paddle.Tensor, numbers.Number))
                if is_numeric and key not in tensor_keys:
                    tensor_keys.append(key)
                collected[key].append(value)
        for key in tensor_keys:
            collected[key] = paddle.to_tensor(collected[key])
        return collected
|
||||
|
||||
|
||||
class ListCollator(object):
    """Collate a batch of sequence samples into a list of batched columns.

    Columns holding numeric values (ndarray / paddle.Tensor /
    numbers.Number) are converted to paddle Tensors; the rest stay lists.
    """

    def __call__(self, batch):
        # todo:support batch operators
        columns = defaultdict(list)
        tensor_idxs = []
        for sample in batch:
            for idx, value in enumerate(sample):
                is_numeric = isinstance(
                    value, (np.ndarray, paddle.Tensor, numbers.Number))
                if is_numeric and idx not in tensor_idxs:
                    tensor_idxs.append(idx)
                columns[idx].append(value)
        for idx in tensor_idxs:
            columns[idx] = paddle.to_tensor(columns[idx])
        return list(columns.values())
|
||||
|
||||
|
||||
class SSLRotateCollate(object):
    """Collate SSL-rotation samples by concatenating each field along axis 0.

    bach: [
        [(4*3xH*W), (4,)]
        [(4*3xH*W), (4,)]
        ...
    ]
    """

    def __call__(self, batch):
        # zip(*batch) groups the i-th field of every sample into one column.
        columns = zip(*batch)
        return [np.concatenate(column, axis=0) for column in columns]
|
||||
26
backend/ppocr/data/imaug/ColorJitter.py
Normal file
26
backend/ppocr/data/imaug/ColorJitter.py
Normal file
@@ -0,0 +1,26 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from paddle.vision.transforms import ColorJitter as pp_ColorJitter
|
||||
|
||||
__all__ = ['ColorJitter']
|
||||
|
||||
class ColorJitter(object):
    """Data-dict wrapper around paddle.vision's ColorJitter augmentation.

    Applies random brightness/contrast/saturation/hue jitter to the
    'image' entry of the sample dict; extra config keys are ignored.
    """

    def __init__(self, brightness=0, contrast=0, saturation=0, hue=0, **kwargs):
        self.aug = pp_ColorJitter(brightness, contrast, saturation, hue)

    def __call__(self, data):
        data['image'] = self.aug(data['image'])
        return data
|
||||
74
backend/ppocr/data/imaug/__init__.py
Normal file
74
backend/ppocr/data/imaug/__init__.py
Normal file
@@ -0,0 +1,74 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .iaa_augment import IaaAugment
|
||||
from .make_border_map import MakeBorderMap
|
||||
from .make_shrink_map import MakeShrinkMap
|
||||
from .random_crop_data import EastRandomCropData, RandomCropImgMask
|
||||
from .make_pse_gt import MakePseGt
|
||||
|
||||
from .rec_img_aug import RecAug, RecConAug, RecResizeImg, ClsResizeImg, \
|
||||
SRNRecResizeImg, NRTRRecResizeImg, SARRecResizeImg, PRENResizeImg
|
||||
from .ssl_img_aug import SSLRotateResize
|
||||
from .randaugment import RandAugment
|
||||
from .copy_paste import CopyPaste
|
||||
from .ColorJitter import ColorJitter
|
||||
from .operators import *
|
||||
from .label_ops import *
|
||||
|
||||
from .east_process import *
|
||||
from .sast_process import *
|
||||
from .pg_process import *
|
||||
from .gen_table_mask import *
|
||||
|
||||
from .vqa import *
|
||||
|
||||
from .fce_aug import *
|
||||
from .fce_targets import FCENetTargets
|
||||
|
||||
|
||||
def transform(data, ops=None):
    """Apply each operator in *ops* to *data* in sequence.

    Returns None as soon as any operator returns None (sample rejected);
    otherwise returns the fully transformed data. A missing *ops* means
    no transformation.
    """
    for op in (ops or []):
        data = op(data)
        if data is None:
            return None
    return data
|
||||
|
||||
|
||||
def create_operators(op_param_list, global_config=None):
    """Instantiate data operators from a YAML-style config list.

    Args:
        op_param_list (list): list of single-key dicts, e.g.
            ``[{'OpName': {'param': value}}, ...]``; a ``None`` value means
            the operator takes no parameters.
        global_config (dict, optional): keyword arguments merged into every
            operator's parameters (overriding per-op values of the same key).

    Returns:
        list: the instantiated operator objects, in config order.
    """
    assert isinstance(op_param_list, list), ('operator config should be a list')
    ops = []
    for operator in op_param_list:
        assert isinstance(operator,
                          dict) and len(operator) == 1, "yaml format error"
        # Single-key dict: unpacking yields the operator class name.
        (op_name,) = operator
        param = {} if operator[op_name] is None else operator[op_name]
        if global_config is not None:
            param.update(global_config)
        # Resolve the class by name among the ops imported at module level.
        ops.append(eval(op_name)(**param))
    return ops
|
||||
170
backend/ppocr/data/imaug/copy_paste.py
Normal file
170
backend/ppocr/data/imaug/copy_paste.py
Normal file
@@ -0,0 +1,170 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import copy
|
||||
import cv2
|
||||
import random
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from shapely.geometry import Polygon
|
||||
|
||||
from ppocr.data.imaug.iaa_augment import IaaAugment
|
||||
from ppocr.data.imaug.random_crop_data import is_poly_outside_rect
|
||||
from tools.infer.utility import get_rotate_crop_image
|
||||
|
||||
|
||||
class CopyPaste(object):
    """Copy-paste augmentation for text detection.

    Crops text instances from an auxiliary sample (``data['ext_data']``) and
    pastes them at random, rotated positions onto the current image,
    appending the corresponding polygons and ignore tags.
    """

    def __init__(self, objects_paste_ratio=0.2, limit_paste=True, **kwargs):
        # Number of auxiliary samples the dataset must attach as 'ext_data'.
        self.ext_data_num = 1
        # Fraction of ext polygons considered for pasting (capped at 30 below).
        self.objects_paste_ratio = objects_paste_ratio
        # If True, only paste where no existing polygon would be covered.
        self.limit_paste = limit_paste
        augmenter_args = [{'type': 'Resize', 'args': {'size': [0.5, 3]}}]
        self.aug = IaaAugment(augmenter_args)

    def __call__(self, data):
        # Number of points per polygon; pasted boxes are padded to match.
        point_num = data['polys'].shape[1]
        src_img = data['image']
        src_polys = data['polys'].tolist()
        src_ignores = data['ignore_tags'].tolist()
        ext_data = data['ext_data'][0]
        ext_image = ext_data['image']
        ext_polys = ext_data['polys']
        ext_ignores = ext_data['ignore_tags']

        # Candidate instances: only non-ignored ext polygons.
        indexs = [i for i in range(len(ext_ignores)) if not ext_ignores[i]]
        select_num = max(
            1, min(int(self.objects_paste_ratio * len(ext_polys)), 30))

        random.shuffle(indexs)
        select_idxs = indexs[:select_num]
        # NOTE(review): fancy indexing assumes ext_polys/ext_ignores are
        # numpy arrays here (they were not .tolist()'ed) — confirm upstream.
        select_polys = ext_polys[select_idxs]
        select_ignores = ext_ignores[select_idxs]

        # Work in RGB/PIL space for alpha-masked pasting.
        src_img = cv2.cvtColor(src_img, cv2.COLOR_BGR2RGB)
        ext_image = cv2.cvtColor(ext_image, cv2.COLOR_BGR2RGB)
        src_img = Image.fromarray(src_img).convert('RGBA')
        for poly, tag in zip(select_polys, select_ignores):
            box_img = get_rotate_crop_image(ext_image, poly)

            src_img, box = self.paste_img(src_img, box_img, src_polys)
            if box is not None:
                box = box.tolist()
                # Pad the 4-point pasted box up to point_num by repeating
                # the last point, so it stacks with the existing polys.
                for _ in range(len(box), point_num):
                    box.append(box[-1])
                src_polys.append(box)
                src_ignores.append(tag)
        # Back to BGR ndarray.
        # NOTE(review): np.array(src_img) is RGBA (4 channels) while
        # COLOR_RGB2BGR nominally takes 3 — confirm cv2 drops alpha as intended.
        src_img = cv2.cvtColor(np.array(src_img), cv2.COLOR_RGB2BGR)
        h, w = src_img.shape[:2]
        src_polys = np.array(src_polys)
        # Clamp all polygon coordinates into the image.
        src_polys[:, :, 0] = np.clip(src_polys[:, :, 0], 0, w)
        src_polys[:, :, 1] = np.clip(src_polys[:, :, 1], 0, h)
        data['image'] = src_img
        data['polys'] = src_polys
        data['ignore_tags'] = np.array(src_ignores)
        return data

    def paste_img(self, src_img, box_img, src_polys):
        """Paste box_img onto src_img at a random rotation and position.

        Returns (src_img, box) where box is the pasted quad in src_img
        coordinates, or (src_img, None) when no placement was possible.
        """
        box_img_pil = Image.fromarray(box_img).convert('RGBA')
        src_w, src_h = src_img.size
        box_w, box_h = box_img_pil.size

        angle = np.random.randint(0, 360)
        box = np.array([[[0, 0], [box_w, 0], [box_w, box_h], [0, box_h]]])
        box = rotate_bbox(box_img, box, angle)[0]
        box_img_pil = box_img_pil.rotate(angle, expand=1)
        box_w, box_h = box_img_pil.width, box_img_pil.height
        # Rotated crop no longer fits inside the source image.
        if src_w - box_w < 0 or src_h - box_h < 0:
            return src_img, None

        paste_x, paste_y = self.select_coord(src_polys, box, src_w - box_w,
                                             src_h - box_h)
        if paste_x is None:
            return src_img, None
        # Shift the quad into the chosen paste position.
        box[:, 0] += paste_x
        box[:, 1] += paste_y
        # Use the rotation's alpha channel as the paste mask.
        r, g, b, A = box_img_pil.split()
        src_img.paste(box_img_pil, (paste_x, paste_y), mask=A)

        return src_img, box

    def select_coord(self, src_polys, box, endx, endy):
        """Pick a paste origin in [0, endx] x [0, endy].

        With limit_paste, retries up to 50 times for a spot whose bounding
        rect overlaps no existing polygon; returns (None, None) on failure.
        """
        if self.limit_paste:
            xmin, ymin, xmax, ymax = box[:, 0].min(), box[:, 1].min(
            ), box[:, 0].max(), box[:, 1].max()
            for _ in range(50):
                paste_x = random.randint(0, endx)
                paste_y = random.randint(0, endy)
                xmin1 = xmin + paste_x
                xmax1 = xmax + paste_x
                ymin1 = ymin + paste_y
                ymax1 = ymax + paste_y

                num_poly_in_rect = 0
                for poly in src_polys:
                    if not is_poly_outside_rect(poly, xmin1, ymin1,
                                                xmax1 - xmin1, ymax1 - ymin1):
                        num_poly_in_rect += 1
                        break
                if num_poly_in_rect == 0:
                    return paste_x, paste_y
            return None, None
        else:
            paste_x = random.randint(0, endx)
            paste_y = random.randint(0, endy)
            return paste_x, paste_y
|
||||
|
||||
|
||||
def get_union(pD, pG):
    """Area of the union of two polygons given as point sequences."""
    poly_d = Polygon(pD)
    poly_g = Polygon(pG)
    return poly_d.union(poly_g).area
|
||||
|
||||
|
||||
def get_intersection_over_union(pD, pG):
    """Intersection-over-union of two polygons given as point sequences."""
    inter = get_intersection(pD, pG)
    union = get_union(pD, pG)
    return inter / union
|
||||
|
||||
|
||||
def get_intersection(pD, pG):
    """Area of the intersection of two polygons given as point sequences."""
    poly_d = Polygon(pD)
    poly_g = Polygon(pG)
    return poly_d.intersection(poly_g).area
|
||||
|
||||
|
||||
def rotate_bbox(img, text_polys, angle, scale=1):
    """
    from https://github.com/WenmuZhou/DBNet.pytorch/blob/master/data_loader/modules/augment.py

    Rotate quadrilateral boxes by *angle* degrees about the centre of the
    rotation-expanded canvas of *img* (the same transform PIL's
    ``rotate(angle, expand=1)`` applies to the image).

    Args:
        img: np.ndarray
        text_polys: np.ndarray N*4*2
        angle: int
        scale: int

    Returns:
        np.ndarray (float32) of the rotated boxes.
    """
    h, w = img.shape[0], img.shape[1]

    # Size of the expanded canvas that fully contains the rotated image.
    rangle = np.deg2rad(angle)
    nw = abs(np.sin(rangle) * h) + abs(np.cos(rangle) * w)
    nh = abs(np.cos(rangle) * h) + abs(np.sin(rangle) * w)
    rot_mat = cv2.getRotationMatrix2D((nw * 0.5, nh * 0.5), angle, scale)
    # Shift so the original image sits centred on the expanded canvas.
    rot_move = np.dot(rot_mat, np.array([(nw - w) * 0.5, (nh - h) * 0.5, 0]))
    rot_mat[0, 2] += rot_move[0]
    rot_mat[1, 2] += rot_move[1]

    # ---------------------- rotate box ----------------------
    rotated = []
    for bbox in text_polys:
        corners = [
            np.dot(rot_mat, np.array([bbox[i, 0], bbox[i, 1], 1]))
            for i in range(4)
        ]
        rotated.append(corners)
    return np.array(rotated, dtype=np.float32)
|
||||
436
backend/ppocr/data/imaug/east_process.py
Normal file
436
backend/ppocr/data/imaug/east_process.py
Normal file
@@ -0,0 +1,436 @@
|
||||
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
"""
|
||||
This code is refered from:
|
||||
https://github.com/songdejia/EAST/blob/master/data_utils.py
|
||||
"""
|
||||
import math
|
||||
import cv2
|
||||
import numpy as np
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
|
||||
__all__ = ['EASTProcessTrain']
|
||||
|
||||
|
||||
class EASTProcessTrain(object):
|
||||
def __init__(self,
|
||||
image_shape=[512, 512],
|
||||
background_ratio=0.125,
|
||||
min_crop_side_ratio=0.1,
|
||||
min_text_size=10,
|
||||
**kwargs):
|
||||
self.input_size = image_shape[1]
|
||||
self.random_scale = np.array([0.5, 1, 2.0, 3.0])
|
||||
self.background_ratio = background_ratio
|
||||
self.min_crop_side_ratio = min_crop_side_ratio
|
||||
self.min_text_size = min_text_size
|
||||
|
||||
def preprocess(self, im):
|
||||
input_size = self.input_size
|
||||
im_shape = im.shape
|
||||
im_size_min = np.min(im_shape[0:2])
|
||||
im_size_max = np.max(im_shape[0:2])
|
||||
im_scale = float(input_size) / float(im_size_max)
|
||||
im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale)
|
||||
img_mean = [0.485, 0.456, 0.406]
|
||||
img_std = [0.229, 0.224, 0.225]
|
||||
# im = im[:, :, ::-1].astype(np.float32)
|
||||
im = im / 255
|
||||
im -= img_mean
|
||||
im /= img_std
|
||||
new_h, new_w, _ = im.shape
|
||||
im_padded = np.zeros((input_size, input_size, 3), dtype=np.float32)
|
||||
im_padded[:new_h, :new_w, :] = im
|
||||
im_padded = im_padded.transpose((2, 0, 1))
|
||||
im_padded = im_padded[np.newaxis, :]
|
||||
return im_padded, im_scale
|
||||
|
||||
def rotate_im_poly(self, im, text_polys):
|
||||
"""
|
||||
rotate image with 90 / 180 / 270 degre
|
||||
"""
|
||||
im_w, im_h = im.shape[1], im.shape[0]
|
||||
dst_im = im.copy()
|
||||
dst_polys = []
|
||||
rand_degree_ratio = np.random.rand()
|
||||
rand_degree_cnt = 1
|
||||
if 0.333 < rand_degree_ratio < 0.666:
|
||||
rand_degree_cnt = 2
|
||||
elif rand_degree_ratio > 0.666:
|
||||
rand_degree_cnt = 3
|
||||
for i in range(rand_degree_cnt):
|
||||
dst_im = np.rot90(dst_im)
|
||||
rot_degree = -90 * rand_degree_cnt
|
||||
rot_angle = rot_degree * math.pi / 180.0
|
||||
n_poly = text_polys.shape[0]
|
||||
cx, cy = 0.5 * im_w, 0.5 * im_h
|
||||
ncx, ncy = 0.5 * dst_im.shape[1], 0.5 * dst_im.shape[0]
|
||||
for i in range(n_poly):
|
||||
wordBB = text_polys[i]
|
||||
poly = []
|
||||
for j in range(4):
|
||||
sx, sy = wordBB[j][0], wordBB[j][1]
|
||||
dx = math.cos(rot_angle) * (sx - cx)\
|
||||
- math.sin(rot_angle) * (sy - cy) + ncx
|
||||
dy = math.sin(rot_angle) * (sx - cx)\
|
||||
+ math.cos(rot_angle) * (sy - cy) + ncy
|
||||
poly.append([dx, dy])
|
||||
dst_polys.append(poly)
|
||||
dst_polys = np.array(dst_polys, dtype=np.float32)
|
||||
return dst_im, dst_polys
|
||||
|
||||
def polygon_area(self, poly):
|
||||
"""
|
||||
compute area of a polygon
|
||||
:param poly:
|
||||
:return:
|
||||
"""
|
||||
edge = [(poly[1][0] - poly[0][0]) * (poly[1][1] + poly[0][1]),
|
||||
(poly[2][0] - poly[1][0]) * (poly[2][1] + poly[1][1]),
|
||||
(poly[3][0] - poly[2][0]) * (poly[3][1] + poly[2][1]),
|
||||
(poly[0][0] - poly[3][0]) * (poly[0][1] + poly[3][1])]
|
||||
return np.sum(edge) / 2.
|
||||
|
||||
def check_and_validate_polys(self, polys, tags, img_height, img_width):
|
||||
"""
|
||||
check so that the text poly is in the same direction,
|
||||
and also filter some invalid polygons
|
||||
:param polys:
|
||||
:param tags:
|
||||
:return:
|
||||
"""
|
||||
h, w = img_height, img_width
|
||||
if polys.shape[0] == 0:
|
||||
return polys
|
||||
polys[:, :, 0] = np.clip(polys[:, :, 0], 0, w - 1)
|
||||
polys[:, :, 1] = np.clip(polys[:, :, 1], 0, h - 1)
|
||||
|
||||
validated_polys = []
|
||||
validated_tags = []
|
||||
for poly, tag in zip(polys, tags):
|
||||
p_area = self.polygon_area(poly)
|
||||
#invalid poly
|
||||
if abs(p_area) < 1:
|
||||
continue
|
||||
if p_area > 0:
|
||||
#'poly in wrong direction'
|
||||
if not tag:
|
||||
tag = True #reversed cases should be ignore
|
||||
poly = poly[(0, 3, 2, 1), :]
|
||||
validated_polys.append(poly)
|
||||
validated_tags.append(tag)
|
||||
return np.array(validated_polys), np.array(validated_tags)
|
||||
|
||||
def draw_img_polys(self, img, polys):
|
||||
if len(img.shape) == 4:
|
||||
img = np.squeeze(img, axis=0)
|
||||
if img.shape[0] == 3:
|
||||
img = img.transpose((1, 2, 0))
|
||||
img[:, :, 2] += 123.68
|
||||
img[:, :, 1] += 116.78
|
||||
img[:, :, 0] += 103.94
|
||||
cv2.imwrite("tmp.jpg", img)
|
||||
img = cv2.imread("tmp.jpg")
|
||||
for box in polys:
|
||||
box = box.astype(np.int32).reshape((-1, 1, 2))
|
||||
cv2.polylines(img, [box], True, color=(255, 255, 0), thickness=2)
|
||||
import random
|
||||
ino = random.randint(0, 100)
|
||||
cv2.imwrite("tmp_%d.jpg" % ino, img)
|
||||
return
|
||||
|
||||
def shrink_poly(self, poly, r):
|
||||
"""
|
||||
fit a poly inside the origin poly, maybe bugs here...
|
||||
used for generate the score map
|
||||
:param poly: the text poly
|
||||
:param r: r in the paper
|
||||
:return: the shrinked poly
|
||||
"""
|
||||
# shrink ratio
|
||||
R = 0.3
|
||||
# find the longer pair
|
||||
dist0 = np.linalg.norm(poly[0] - poly[1])
|
||||
dist1 = np.linalg.norm(poly[2] - poly[3])
|
||||
dist2 = np.linalg.norm(poly[0] - poly[3])
|
||||
dist3 = np.linalg.norm(poly[1] - poly[2])
|
||||
if dist0 + dist1 > dist2 + dist3:
|
||||
# first move (p0, p1), (p2, p3), then (p0, p3), (p1, p2)
|
||||
## p0, p1
|
||||
theta = np.arctan2((poly[1][1] - poly[0][1]),
|
||||
(poly[1][0] - poly[0][0]))
|
||||
poly[0][0] += R * r[0] * np.cos(theta)
|
||||
poly[0][1] += R * r[0] * np.sin(theta)
|
||||
poly[1][0] -= R * r[1] * np.cos(theta)
|
||||
poly[1][1] -= R * r[1] * np.sin(theta)
|
||||
## p2, p3
|
||||
theta = np.arctan2((poly[2][1] - poly[3][1]),
|
||||
(poly[2][0] - poly[3][0]))
|
||||
poly[3][0] += R * r[3] * np.cos(theta)
|
||||
poly[3][1] += R * r[3] * np.sin(theta)
|
||||
poly[2][0] -= R * r[2] * np.cos(theta)
|
||||
poly[2][1] -= R * r[2] * np.sin(theta)
|
||||
## p0, p3
|
||||
theta = np.arctan2((poly[3][0] - poly[0][0]),
|
||||
(poly[3][1] - poly[0][1]))
|
||||
poly[0][0] += R * r[0] * np.sin(theta)
|
||||
poly[0][1] += R * r[0] * np.cos(theta)
|
||||
poly[3][0] -= R * r[3] * np.sin(theta)
|
||||
poly[3][1] -= R * r[3] * np.cos(theta)
|
||||
## p1, p2
|
||||
theta = np.arctan2((poly[2][0] - poly[1][0]),
|
||||
(poly[2][1] - poly[1][1]))
|
||||
poly[1][0] += R * r[1] * np.sin(theta)
|
||||
poly[1][1] += R * r[1] * np.cos(theta)
|
||||
poly[2][0] -= R * r[2] * np.sin(theta)
|
||||
poly[2][1] -= R * r[2] * np.cos(theta)
|
||||
else:
|
||||
## p0, p3
|
||||
# print poly
|
||||
theta = np.arctan2((poly[3][0] - poly[0][0]),
|
||||
(poly[3][1] - poly[0][1]))
|
||||
poly[0][0] += R * r[0] * np.sin(theta)
|
||||
poly[0][1] += R * r[0] * np.cos(theta)
|
||||
poly[3][0] -= R * r[3] * np.sin(theta)
|
||||
poly[3][1] -= R * r[3] * np.cos(theta)
|
||||
## p1, p2
|
||||
theta = np.arctan2((poly[2][0] - poly[1][0]),
|
||||
(poly[2][1] - poly[1][1]))
|
||||
poly[1][0] += R * r[1] * np.sin(theta)
|
||||
poly[1][1] += R * r[1] * np.cos(theta)
|
||||
poly[2][0] -= R * r[2] * np.sin(theta)
|
||||
poly[2][1] -= R * r[2] * np.cos(theta)
|
||||
## p0, p1
|
||||
theta = np.arctan2((poly[1][1] - poly[0][1]),
|
||||
(poly[1][0] - poly[0][0]))
|
||||
poly[0][0] += R * r[0] * np.cos(theta)
|
||||
poly[0][1] += R * r[0] * np.sin(theta)
|
||||
poly[1][0] -= R * r[1] * np.cos(theta)
|
||||
poly[1][1] -= R * r[1] * np.sin(theta)
|
||||
## p2, p3
|
||||
theta = np.arctan2((poly[2][1] - poly[3][1]),
|
||||
(poly[2][0] - poly[3][0]))
|
||||
poly[3][0] += R * r[3] * np.cos(theta)
|
||||
poly[3][1] += R * r[3] * np.sin(theta)
|
||||
poly[2][0] -= R * r[2] * np.cos(theta)
|
||||
poly[2][1] -= R * r[2] * np.sin(theta)
|
||||
return poly
|
||||
|
||||
def generate_quad(self, im_size, polys, tags):
|
||||
"""
|
||||
Generate quadrangle.
|
||||
"""
|
||||
h, w = im_size
|
||||
poly_mask = np.zeros((h, w), dtype=np.uint8)
|
||||
score_map = np.zeros((h, w), dtype=np.uint8)
|
||||
# (x1, y1, ..., x4, y4, short_edge_norm)
|
||||
geo_map = np.zeros((h, w, 9), dtype=np.float32)
|
||||
# mask used during traning, to ignore some hard areas
|
||||
training_mask = np.ones((h, w), dtype=np.uint8)
|
||||
for poly_idx, poly_tag in enumerate(zip(polys, tags)):
|
||||
poly = poly_tag[0]
|
||||
tag = poly_tag[1]
|
||||
|
||||
r = [None, None, None, None]
|
||||
for i in range(4):
|
||||
dist1 = np.linalg.norm(poly[i] - poly[(i + 1) % 4])
|
||||
dist2 = np.linalg.norm(poly[i] - poly[(i - 1) % 4])
|
||||
r[i] = min(dist1, dist2)
|
||||
# score map
|
||||
shrinked_poly = self.shrink_poly(
|
||||
poly.copy(), r).astype(np.int32)[np.newaxis, :, :]
|
||||
cv2.fillPoly(score_map, shrinked_poly, 1)
|
||||
cv2.fillPoly(poly_mask, shrinked_poly, poly_idx + 1)
|
||||
# if the poly is too small, then ignore it during training
|
||||
poly_h = min(
|
||||
np.linalg.norm(poly[0] - poly[3]),
|
||||
np.linalg.norm(poly[1] - poly[2]))
|
||||
poly_w = min(
|
||||
np.linalg.norm(poly[0] - poly[1]),
|
||||
np.linalg.norm(poly[2] - poly[3]))
|
||||
if min(poly_h, poly_w) < self.min_text_size:
|
||||
cv2.fillPoly(training_mask,
|
||||
poly.astype(np.int32)[np.newaxis, :, :], 0)
|
||||
|
||||
if tag:
|
||||
cv2.fillPoly(training_mask,
|
||||
poly.astype(np.int32)[np.newaxis, :, :], 0)
|
||||
|
||||
xy_in_poly = np.argwhere(poly_mask == (poly_idx + 1))
|
||||
# geo map.
|
||||
y_in_poly = xy_in_poly[:, 0]
|
||||
x_in_poly = xy_in_poly[:, 1]
|
||||
poly[:, 0] = np.minimum(np.maximum(poly[:, 0], 0), w)
|
||||
poly[:, 1] = np.minimum(np.maximum(poly[:, 1], 0), h)
|
||||
for pno in range(4):
|
||||
geo_channel_beg = pno * 2
|
||||
geo_map[y_in_poly, x_in_poly, geo_channel_beg] =\
|
||||
x_in_poly - poly[pno, 0]
|
||||
geo_map[y_in_poly, x_in_poly, geo_channel_beg+1] =\
|
||||
y_in_poly - poly[pno, 1]
|
||||
geo_map[y_in_poly, x_in_poly, 8] = \
|
||||
1.0 / max(min(poly_h, poly_w), 1.0)
|
||||
return score_map, geo_map, training_mask
|
||||
|
||||
def crop_area(self, im, polys, tags, crop_background=False, max_tries=50):
|
||||
"""
|
||||
make random crop from the input image
|
||||
:param im:
|
||||
:param polys:
|
||||
:param tags:
|
||||
:param crop_background:
|
||||
:param max_tries:
|
||||
:return:
|
||||
"""
|
||||
h, w, _ = im.shape
|
||||
pad_h = h // 10
|
||||
pad_w = w // 10
|
||||
h_array = np.zeros((h + pad_h * 2), dtype=np.int32)
|
||||
w_array = np.zeros((w + pad_w * 2), dtype=np.int32)
|
||||
for poly in polys:
|
||||
poly = np.round(poly, decimals=0).astype(np.int32)
|
||||
minx = np.min(poly[:, 0])
|
||||
maxx = np.max(poly[:, 0])
|
||||
w_array[minx + pad_w:maxx + pad_w] = 1
|
||||
miny = np.min(poly[:, 1])
|
||||
maxy = np.max(poly[:, 1])
|
||||
h_array[miny + pad_h:maxy + pad_h] = 1
|
||||
# ensure the cropped area not across a text
|
||||
h_axis = np.where(h_array == 0)[0]
|
||||
w_axis = np.where(w_array == 0)[0]
|
||||
if len(h_axis) == 0 or len(w_axis) == 0:
|
||||
return im, polys, tags
|
||||
|
||||
for i in range(max_tries):
|
||||
xx = np.random.choice(w_axis, size=2)
|
||||
xmin = np.min(xx) - pad_w
|
||||
xmax = np.max(xx) - pad_w
|
||||
xmin = np.clip(xmin, 0, w - 1)
|
||||
xmax = np.clip(xmax, 0, w - 1)
|
||||
yy = np.random.choice(h_axis, size=2)
|
||||
ymin = np.min(yy) - pad_h
|
||||
ymax = np.max(yy) - pad_h
|
||||
ymin = np.clip(ymin, 0, h - 1)
|
||||
ymax = np.clip(ymax, 0, h - 1)
|
||||
if xmax - xmin < self.min_crop_side_ratio * w or \
|
||||
ymax - ymin < self.min_crop_side_ratio * h:
|
||||
# area too small
|
||||
continue
|
||||
if polys.shape[0] != 0:
|
||||
poly_axis_in_area = (polys[:, :, 0] >= xmin)\
|
||||
& (polys[:, :, 0] <= xmax)\
|
||||
& (polys[:, :, 1] >= ymin)\
|
||||
& (polys[:, :, 1] <= ymax)
|
||||
selected_polys = np.where(
|
||||
np.sum(poly_axis_in_area, axis=1) == 4)[0]
|
||||
else:
|
||||
selected_polys = []
|
||||
|
||||
if len(selected_polys) == 0:
|
||||
# no text in this area
|
||||
if crop_background:
|
||||
im = im[ymin:ymax + 1, xmin:xmax + 1, :]
|
||||
polys = []
|
||||
tags = []
|
||||
return im, polys, tags
|
||||
else:
|
||||
continue
|
||||
|
||||
im = im[ymin:ymax + 1, xmin:xmax + 1, :]
|
||||
polys = polys[selected_polys]
|
||||
tags = tags[selected_polys]
|
||||
polys[:, :, 0] -= xmin
|
||||
polys[:, :, 1] -= ymin
|
||||
return im, polys, tags
|
||||
return im, polys, tags
|
||||
|
||||
def crop_background_infor(self, im, text_polys, text_tags):
|
||||
im, text_polys, text_tags = self.crop_area(
|
||||
im, text_polys, text_tags, crop_background=True)
|
||||
|
||||
if len(text_polys) > 0:
|
||||
return None
|
||||
# pad and resize image
|
||||
input_size = self.input_size
|
||||
im, ratio = self.preprocess(im)
|
||||
score_map = np.zeros((input_size, input_size), dtype=np.float32)
|
||||
geo_map = np.zeros((input_size, input_size, 9), dtype=np.float32)
|
||||
training_mask = np.ones((input_size, input_size), dtype=np.float32)
|
||||
return im, score_map, geo_map, training_mask
|
||||
|
||||
def crop_foreground_infor(self, im, text_polys, text_tags):
|
||||
im, text_polys, text_tags = self.crop_area(
|
||||
im, text_polys, text_tags, crop_background=False)
|
||||
|
||||
if text_polys.shape[0] == 0:
|
||||
return None
|
||||
#continue for all ignore case
|
||||
if np.sum((text_tags * 1.0)) >= text_tags.size:
|
||||
return None
|
||||
# pad and resize image
|
||||
input_size = self.input_size
|
||||
im, ratio = self.preprocess(im)
|
||||
text_polys[:, :, 0] *= ratio
|
||||
text_polys[:, :, 1] *= ratio
|
||||
_, _, new_h, new_w = im.shape
|
||||
# print(im.shape)
|
||||
# self.draw_img_polys(im, text_polys)
|
||||
score_map, geo_map, training_mask = self.generate_quad(
|
||||
(new_h, new_w), text_polys, text_tags)
|
||||
return im, score_map, geo_map, training_mask
|
||||
|
||||
def __call__(self, data):
    """Build one EAST training sample: augment, crop and generate targets.

    Args:
        data (dict): must contain 'image', 'polys' and 'ignore_tags'.

    Returns:
        The same dict augmented with 'score_map', 'geo_map' and
        'training_mask' (all at 1/4 input resolution), or ``None`` when
        the sample cannot be used and should be dropped.
    """
    im = data['image']
    text_polys = data['polys']
    text_tags = data['ignore_tags']
    if im is None:
        return None
    if text_polys.shape[0] == 0:
        return None

    # add rotate cases: random rotation applied with probability 0.5
    if np.random.rand() < 0.5:
        im, text_polys = self.rotate_im_poly(im, text_polys)
    h, w, _ = im.shape
    text_polys, text_tags = self.check_and_validate_polys(text_polys,
                                                          text_tags, h, w)
    if text_polys.shape[0] == 0:
        return None

    # random scale this image
    rd_scale = np.random.choice(self.random_scale)
    im = cv2.resize(im, dsize=None, fx=rd_scale, fy=rd_scale)
    text_polys *= rd_scale
    # Sample either a pure-background patch or a text-bearing patch,
    # balanced by self.background_ratio.
    if np.random.rand() < self.background_ratio:
        outs = self.crop_background_infor(im, text_polys, text_tags)
    else:
        outs = self.crop_foreground_infor(im, text_polys, text_tags)

    if outs is None:
        return None
    im, score_map, geo_map, training_mask = outs
    # Targets are supervised at 1/4 of the input resolution, hence the
    # [::4, ::4] subsampling below.
    score_map = score_map[np.newaxis, ::4, ::4].astype(np.float32)
    # Move the geo channel axis to the front: (H, W, 9) -> (9, H, W).
    geo_map = np.swapaxes(geo_map, 1, 2)
    geo_map = np.swapaxes(geo_map, 1, 0)
    geo_map = geo_map[:, ::4, ::4].astype(np.float32)
    training_mask = training_mask[np.newaxis, ::4, ::4]
    training_mask = training_mask.astype(np.float32)

    # preprocess returned a batched image; store the single sample.
    data['image'] = im[0]
    data['score_map'] = score_map
    data['geo_map'] = geo_map
    data['training_mask'] = training_mask
    return data
564
backend/ppocr/data/imaug/fce_aug.py
Normal file
564
backend/ppocr/data/imaug/fce_aug.py
Normal file
@@ -0,0 +1,564 @@
|
||||
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
This code is refer from:
|
||||
https://github.com/open-mmlab/mmocr/blob/main/mmocr/datasets/pipelines/transforms.py
|
||||
"""
|
||||
import numpy as np
|
||||
from PIL import Image, ImageDraw
|
||||
import cv2
|
||||
from shapely.geometry import Polygon
|
||||
import math
|
||||
from ppocr.utils.poly_nms import poly_intersection
|
||||
|
||||
|
||||
class RandomScaling:
    """Randomly rescale an image (and its polygons) while keeping aspect.

    The longer image side is mapped to ``size`` and then multiplied by a
    factor drawn uniformly from ``scale``.
    """

    def __init__(self, size=800, scale=(3. / 4, 5. / 2), **kwargs):
        """Random scale the image while keeping aspect.

        Args:
            size (int) : Base size before scaling.
            scale (tuple(float)) : The range of scaling.
        """
        assert isinstance(size, int)
        assert isinstance(scale, float) or isinstance(scale, tuple)
        self.size = size
        # A scalar scale s is interpreted as the symmetric range (1-s, 1+s).
        if isinstance(scale, tuple):
            self.scale = scale
        else:
            self.scale = (1 - scale, 1 + scale)

    def __call__(self, data):
        img = data['image']
        polys = data['polys']
        h, w, _ = img.shape

        # One isotropic factor for both axes, sampled from the scale range.
        rand_ratio = np.random.uniform(min(self.scale), max(self.scale))
        factor = self.size * 1.0 / max(h, w) * rand_ratio
        factors = np.array([factor, factor])
        target_hw = (int(h * factors[1]), int(w * factors[0]))
        # cv2.resize expects (width, height), so reverse (h, w).
        img = cv2.resize(img, target_hw[::-1])

        data['image'] = img
        polys[:, :, 0::2] = polys[:, :, 0::2] * factors[1]
        polys[:, :, 1::2] = polys[:, :, 1::2] * factors[0]
        data['polys'] = polys

        return data
class RandomCropFlip:
    """Randomly crop a text-free rectangle of the image and flip it in place.

    With probability ``crop_ratio`` a rectangular patch whose borders do not
    cut through any text polygon is selected and flipped (horizontally,
    vertically, or both).  Polygons fully inside the patch are flipped with
    it; polygons completely outside are left untouched.
    """

    def __init__(self,
                 pad_ratio=0.1,
                 crop_ratio=0.5,
                 iter_num=1,
                 min_area_ratio=0.2,
                 **kwargs):
        """Random crop and flip a patch of the image.

        Args:
            pad_ratio (float): Virtual padding ratio used while collecting
                croppable rows/columns.
            crop_ratio (float): The probability of applying the operation.
            iter_num (int): Number of operations.
            min_area_ratio (float): Minimal area ratio between cropped patch
                and original image.
        """
        assert isinstance(crop_ratio, float)
        assert isinstance(iter_num, int)
        assert isinstance(min_area_ratio, float)

        self.pad_ratio = pad_ratio
        # Tolerance when comparing polygon/patch intersection areas.
        self.epsilon = 1e-2
        self.crop_ratio = crop_ratio
        self.iter_num = iter_num
        self.min_area_ratio = min_area_ratio

    def __call__(self, results):
        for i in range(self.iter_num):
            results = self.random_crop_flip(results)

        return results

    def random_crop_flip(self, results):
        """Apply one crop-and-flip attempt to ``results`` (mutated in place)."""
        image = results['image']
        polygons = results['polys']
        ignore_tags = results['ignore_tags']
        if len(polygons) == 0:
            return results

        if np.random.random() >= self.crop_ratio:
            return results

        h, w, _ = image.shape
        area = h * w
        pad_h = int(h * self.pad_ratio)
        pad_w = int(w * self.pad_ratio)
        h_axis, w_axis = self.generate_crop_target(image, polygons, pad_h,
                                                   pad_w)
        if len(h_axis) == 0 or len(w_axis) == 0:
            return results

        # Try up to 50 times to find a box that is big enough and does not
        # partially overlap any polygon.
        attempt = 0
        while attempt < 50:
            attempt += 1
            polys_keep = []
            polys_new = []
            ignore_tags_keep = []
            ignore_tags_new = []
            xx = np.random.choice(w_axis, size=2)
            xmin = np.min(xx) - pad_w
            xmax = np.max(xx) - pad_w
            xmin = np.clip(xmin, 0, w - 1)
            xmax = np.clip(xmax, 0, w - 1)
            yy = np.random.choice(h_axis, size=2)
            ymin = np.min(yy) - pad_h
            ymax = np.max(yy) - pad_h
            ymin = np.clip(ymin, 0, h - 1)
            ymax = np.clip(ymax, 0, h - 1)
            if (xmax - xmin) * (ymax - ymin) < area * self.min_area_ratio:
                # area too small
                continue

            pts = np.stack([[xmin, xmax, xmax, xmin],
                            [ymin, ymin, ymax, ymax]]).T.astype(np.int32)
            pp = Polygon(pts)
            fail_flag = False
            for polygon, ignore_tag in zip(polygons, ignore_tags):
                ppi = Polygon(polygon.reshape(-1, 2))
                ppiou, _ = poly_intersection(ppi, pp, buffer=0)
                if np.abs(ppiou - float(ppi.area)) > self.epsilon and \
                        np.abs(ppiou) > self.epsilon:
                    # Polygon is cut by the patch border: invalid crop.
                    fail_flag = True
                    break
                elif np.abs(ppiou - float(ppi.area)) < self.epsilon:
                    # Polygon lies fully inside the patch: must be flipped.
                    polys_new.append(polygon)
                    ignore_tags_new.append(ignore_tag)
                else:
                    # Polygon lies completely outside the patch: unchanged.
                    polys_keep.append(polygon)
                    ignore_tags_keep.append(ignore_tag)

            if fail_flag:
                continue
            else:
                break

        cropped = image[ymin:ymax, xmin:xmax, :]
        # 0: horizontal flip, 1: vertical flip, 2: both.
        select_type = np.random.randint(3)
        if select_type == 0:
            img = np.ascontiguousarray(cropped[:, ::-1])
        elif select_type == 1:
            img = np.ascontiguousarray(cropped[::-1, :])
        else:
            img = np.ascontiguousarray(cropped[::-1, ::-1])
        image[ymin:ymax, xmin:xmax, :] = img
        # BUGFIX: write back under the pipeline's canonical 'image' key.
        # The original stored it as results['img'], leaving a stray key
        # (it only worked by accident because `image` is mutated in place).
        results['image'] = image

        if len(polys_new) != 0:
            height, width, _ = cropped.shape
            if select_type == 0:
                for idx, polygon in enumerate(polys_new):
                    poly = polygon.reshape(-1, 2)
                    # Mirror x coordinates around the patch's vertical axis.
                    poly[:, 0] = width - poly[:, 0] + 2 * xmin
                    polys_new[idx] = poly
            elif select_type == 1:
                for idx, polygon in enumerate(polys_new):
                    poly = polygon.reshape(-1, 2)
                    # Mirror y coordinates around the patch's horizontal axis.
                    poly[:, 1] = height - poly[:, 1] + 2 * ymin
                    polys_new[idx] = poly
            else:
                for idx, polygon in enumerate(polys_new):
                    poly = polygon.reshape(-1, 2)
                    poly[:, 0] = width - poly[:, 0] + 2 * xmin
                    poly[:, 1] = height - poly[:, 1] + 2 * ymin
                    polys_new[idx] = poly
            polygons = polys_keep + polys_new
            ignore_tags = ignore_tags_keep + ignore_tags_new
        results['polys'] = np.array(polygons)
        results['ignore_tags'] = ignore_tags

        return results

    def generate_crop_target(self, image, all_polys, pad_h, pad_w):
        """Generate crop target and make sure not to crop the polygon
        instances.

        Args:
            image (ndarray): The image waited to be crop.
            all_polys (list[list[ndarray]]): All polygons including ground
                truth polygons and ground truth ignored polygons.
            pad_h (int): Padding length of height.
            pad_w (int): Padding length of width.
        Returns:
            h_axis (ndarray): Vertical cropping range.
            w_axis (ndarray): Horizontal cropping range.
        """
        h, w, _ = image.shape
        # 1 marks rows/columns covered by some text box; 0 means croppable.
        h_array = np.zeros((h + pad_h * 2), dtype=np.int32)
        w_array = np.zeros((w + pad_w * 2), dtype=np.int32)

        text_polys = []
        for polygon in all_polys:
            # Reduce each polygon to its minimum-area bounding rectangle.
            rect = cv2.minAreaRect(polygon.astype(np.int32).reshape(-1, 2))
            box = cv2.boxPoints(rect)
            box = np.int0(box)
            text_polys.append([box[0], box[1], box[2], box[3]])

        polys = np.array(text_polys, dtype=np.int32)
        for poly in polys:
            poly = np.round(poly, decimals=0).astype(np.int32)
            minx = np.min(poly[:, 0])
            maxx = np.max(poly[:, 0])
            w_array[minx + pad_w:maxx + pad_w] = 1
            miny = np.min(poly[:, 1])
            maxy = np.max(poly[:, 1])
            h_array[miny + pad_h:maxy + pad_h] = 1

        h_axis = np.where(h_array == 0)[0]
        w_axis = np.where(w_array == 0)[0]
        return h_axis, w_axis
class RandomCropPolyInstances:
    """Randomly crop images and make sure to contain at least one intact
    instance."""

    def __init__(self, crop_ratio=5.0 / 8.0, min_side_ratio=0.4, **kwargs):
        # crop_ratio: probability of applying the crop at all.
        # min_side_ratio: minimum crop side length, as a fraction of the
        # corresponding image side.
        super().__init__()
        self.crop_ratio = crop_ratio
        self.min_side_ratio = min_side_ratio

    def sample_valid_start_end(self, valid_array, min_len, max_start, min_end):
        """Sample a (start, end) pair along one axis.

        ``valid_array`` is a 0/1 mask where 1 marks coordinates that do not
        cut through any polygon.  The sampled span is at least ``min_len``
        long, starts no later than ``max_start`` and ends no earlier than
        ``min_end`` (so the pre-selected "must keep" instance stays intact).
        """
        assert isinstance(min_len, int)
        assert len(valid_array) > min_len

        # --- sample the start coordinate ---
        start_array = valid_array.copy()
        max_start = min(len(start_array) - min_len, max_start)
        start_array[max_start:] = 0
        start_array[0] = 1
        # Sign changes of the diff mark the boundaries of runs of 1s.
        diff_array = np.hstack([0, start_array]) - np.hstack([start_array, 0])
        region_starts = np.where(diff_array < 0)[0]
        region_ends = np.where(diff_array > 0)[0]
        # Pick a random valid run, then a random coordinate inside it.
        region_ind = np.random.randint(0, len(region_starts))
        start = np.random.randint(region_starts[region_ind],
                                  region_ends[region_ind])

        # --- sample the end coordinate, constrained by the chosen start ---
        end_array = valid_array.copy()
        min_end = max(start + min_len, min_end)
        end_array[:min_end] = 0
        end_array[-1] = 1
        diff_array = np.hstack([0, end_array]) - np.hstack([end_array, 0])
        region_starts = np.where(diff_array < 0)[0]
        region_ends = np.where(diff_array > 0)[0]
        region_ind = np.random.randint(0, len(region_starts))
        end = np.random.randint(region_starts[region_ind],
                                region_ends[region_ind])
        return start, end

    def sample_crop_box(self, img_size, results):
        """Generate crop box and make sure not to crop the polygon instances.

        Args:
            img_size (tuple(int)): The image size (h, w).
            results (dict): The results dict.

        Returns:
            ndarray ``[x1, y1, x2, y2]`` of the sampled crop box.
        """

        assert isinstance(img_size, tuple)
        h, w = img_size[:2]

        key_masks = results['polys']

        # 1 marks coordinates where a crop border may pass.
        x_valid_array = np.ones(w, dtype=np.int32)
        y_valid_array = np.ones(h, dtype=np.int32)

        # Randomly pick one instance that must stay fully inside the crop;
        # its bounding box constrains max start / min end below.
        selected_mask = key_masks[np.random.randint(0, len(key_masks))]
        selected_mask = selected_mask.reshape((-1, 2)).astype(np.int32)
        max_x_start = max(np.min(selected_mask[:, 0]) - 2, 0)
        min_x_end = min(np.max(selected_mask[:, 0]) + 3, w - 1)
        max_y_start = max(np.min(selected_mask[:, 1]) - 2, 0)
        min_y_end = min(np.max(selected_mask[:, 1]) + 3, h - 1)

        # Forbid crop borders inside any polygon (with a 2px margin).
        for mask in key_masks:
            mask = mask.reshape((-1, 2)).astype(np.int32)
            clip_x = np.clip(mask[:, 0], 0, w - 1)
            clip_y = np.clip(mask[:, 1], 0, h - 1)
            min_x, max_x = np.min(clip_x), np.max(clip_x)
            min_y, max_y = np.min(clip_y), np.max(clip_y)

            x_valid_array[min_x - 2:max_x + 3] = 0
            y_valid_array[min_y - 2:max_y + 3] = 0

        min_w = int(w * self.min_side_ratio)
        min_h = int(h * self.min_side_ratio)

        x1, x2 = self.sample_valid_start_end(x_valid_array, min_w, max_x_start,
                                             min_x_end)
        y1, y2 = self.sample_valid_start_end(y_valid_array, min_h, max_y_start,
                                             min_y_end)

        return np.array([x1, y1, x2, y2])

    def crop_img(self, img, bbox):
        # bbox is [x1, y1, x2, y2]; both corners must lie inside the image.
        assert img.ndim == 3
        h, w, _ = img.shape
        assert 0 <= bbox[1] < bbox[3] <= h
        assert 0 <= bbox[0] < bbox[2] <= w
        return img[bbox[1]:bbox[3], bbox[0]:bbox[2]]

    def __call__(self, results):
        image = results['image']
        polygons = results['polys']
        ignore_tags = results['ignore_tags']
        if len(polygons) < 1:
            return results

        if np.random.random_sample() < self.crop_ratio:

            crop_box = self.sample_crop_box(image.shape, results)
            img = self.crop_img(image, crop_box)
            results['image'] = img
            # crop and filter masks
            x1, y1, x2, y2 = crop_box
            w = max(x2 - x1, 1)
            h = max(y2 - y1, 1)
            # Shift polygon coordinates into the crop's frame.
            polygons[:, :, 0::2] = polygons[:, :, 0::2] - x1
            polygons[:, :, 1::2] = polygons[:, :, 1::2] - y1

            valid_masks_list = []
            valid_tags_list = []
            for ind, polygon in enumerate(polygons):
                # Keep polygons that lie (almost) entirely inside the crop,
                # allowing a 4px overhang which is then clipped away.
                if (polygon[:, ::2] > -4).all() and (
                        polygon[:, ::2] < w + 4).all() and (
                            polygon[:, 1::2] > -4).all() and (
                                polygon[:, 1::2] < h + 4).all():
                    polygon[:, ::2] = np.clip(polygon[:, ::2], 0, w)
                    polygon[:, 1::2] = np.clip(polygon[:, 1::2], 0, h)
                    valid_masks_list.append(polygon)
                    valid_tags_list.append(ignore_tags[ind])

            results['polys'] = np.array(valid_masks_list)
            results['ignore_tags'] = valid_tags_list

        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        return repr_str
class RandomRotatePolyInstances:
    def __init__(self,
                 rotate_ratio=0.5,
                 max_angle=10,
                 pad_with_fixed_color=False,
                 pad_value=(0, 0, 0),
                 **kwargs):
        """Randomly rotate images and polygon masks.

        Args:
            rotate_ratio (float): The ratio of samples to operate rotation.
            max_angle (int): The maximum rotation angle.
            pad_with_fixed_color (bool): The flag for whether to pad rotated
                image with fixed value. If set to False, the rotated image will
                be padded onto cropped image.
            pad_value (tuple(int)): The color value for padding rotated image.
        """
        self.rotate_ratio = rotate_ratio
        self.max_angle = max_angle
        self.pad_with_fixed_color = pad_with_fixed_color
        self.pad_value = pad_value

    def rotate(self, center, points, theta, center_shift=(0, 0)):
        # rotate points.
        # Works in a y-up coordinate system (hence the sign flips), then
        # converts back to image coordinates and applies center_shift so the
        # points land on the enlarged rotation canvas.
        (center_x, center_y) = center
        center_y = -center_y
        x, y = points[:, ::2], points[:, 1::2]
        y = -y

        theta = theta / 180 * math.pi
        cos = math.cos(theta)
        sin = math.sin(theta)

        x = (x - center_x)
        y = (y - center_y)

        _x = center_x + x * cos - y * sin + center_shift[0]
        _y = -(center_y + x * sin + y * cos) + center_shift[1]

        # NOTE: points is modified in place and also returned.
        points[:, ::2], points[:, 1::2] = _x, _y
        return points

    def cal_canvas_size(self, ori_size, degree):
        """Return the (h, w) of the axis-aligned canvas that fully contains
        the image after rotating it by ``degree`` degrees."""
        assert isinstance(ori_size, tuple)
        angle = degree * math.pi / 180.0
        h, w = ori_size[:2]

        cos = math.cos(angle)
        sin = math.sin(angle)
        canvas_h = int(w * math.fabs(sin) + h * math.fabs(cos))
        canvas_w = int(w * math.fabs(cos) + h * math.fabs(sin))

        canvas_size = (canvas_h, canvas_w)
        return canvas_size

    def sample_angle(self, max_angle):
        # Uniform sample in [-max_angle, max_angle).
        angle = np.random.random_sample() * 2 * max_angle - max_angle
        return angle

    def rotate_img(self, img, angle, canvas_size):
        """Rotate ``img`` by ``angle`` degrees onto a canvas of
        ``canvas_size``; border is either a fixed color or a resized random
        patch of the image itself."""
        h, w = img.shape[:2]
        rotation_matrix = cv2.getRotationMatrix2D((w / 2, h / 2), angle, 1)
        # Translate so the rotated image is centered on the larger canvas.
        rotation_matrix[0, 2] += int((canvas_size[1] - w) / 2)
        rotation_matrix[1, 2] += int((canvas_size[0] - h) / 2)

        if self.pad_with_fixed_color:
            target_img = cv2.warpAffine(
                img,
                rotation_matrix, (canvas_size[1], canvas_size[0]),
                flags=cv2.INTER_NEAREST,
                borderValue=self.pad_value)
        else:
            # Fill the border with a random crop of the image, so the padding
            # has natural texture instead of a flat color.
            mask = np.zeros_like(img)
            (h_ind, w_ind) = (np.random.randint(0, h * 7 // 8),
                              np.random.randint(0, w * 7 // 8))
            img_cut = img[h_ind:(h_ind + h // 9), w_ind:(w_ind + w // 9)]
            img_cut = cv2.resize(img_cut, (canvas_size[1], canvas_size[0]))

            # mask is 1 exactly where warpAffine left border pixels.
            mask = cv2.warpAffine(
                mask,
                rotation_matrix, (canvas_size[1], canvas_size[0]),
                borderValue=[1, 1, 1])
            target_img = cv2.warpAffine(
                img,
                rotation_matrix, (canvas_size[1], canvas_size[0]),
                borderValue=[0, 0, 0])
            target_img = target_img + img_cut * mask

        return target_img

    def __call__(self, results):
        if np.random.random_sample() < self.rotate_ratio:
            image = results['image']
            polygons = results['polys']
            h, w = image.shape[:2]

            angle = self.sample_angle(self.max_angle)
            canvas_size = self.cal_canvas_size((h, w), angle)
            center_shift = (int((canvas_size[1] - w) / 2), int(
                (canvas_size[0] - h) / 2))
            image = self.rotate_img(image, angle, canvas_size)
            results['image'] = image
            # rotate polygons
            rotated_masks = []
            for mask in polygons:
                rotated_mask = self.rotate((w / 2, h / 2), mask, angle,
                                           center_shift)
                rotated_masks.append(rotated_mask)
            results['polys'] = np.array(rotated_masks)

        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        return repr_str
class SquareResizePad:
    """Resize or pad images (and their polygons) to a square shape."""

    def __init__(self,
                 target_size,
                 pad_ratio=0.6,
                 pad_with_fixed_color=False,
                 pad_value=(0, 0, 0),
                 **kwargs):
        """Resize or pad images to be square shape.

        Args:
            target_size (int): The target size of square shaped image.
            pad_ratio (float): Probability of the keep-ratio-then-pad branch
                (otherwise the image is stretched to a square directly).
            pad_with_fixed_color (bool): The flag for whether to pad rotated
                image with fixed value. If set to False, the rescales image will
                be padded onto cropped image.
            pad_value (tuple(int)): The color value for padding rotated image.
        """
        assert isinstance(target_size, int)
        assert isinstance(pad_ratio, float)
        assert isinstance(pad_with_fixed_color, bool)
        assert isinstance(pad_value, tuple)

        self.target_size = target_size
        self.pad_ratio = pad_ratio
        self.pad_with_fixed_color = pad_with_fixed_color
        self.pad_value = pad_value

    def resize_img(self, img, keep_ratio=True):
        """Resize so the longer side equals target_size; returns the resized
        image and its new (h, w)."""
        h, w, _ = img.shape
        if keep_ratio:
            t_h = self.target_size if h >= w else int(h * self.target_size / w)
            t_w = self.target_size if h <= w else int(w * self.target_size / h)
        else:
            t_h = t_w = self.target_size
        img = cv2.resize(img, (t_w, t_h))
        return img, (t_h, t_w)

    def square_pad(self, img):
        """Pad ``img`` to a square; returns the padded image and the (x, y)
        offset of the original image inside it."""
        h, w = img.shape[:2]
        if h == w:
            return img, (0, 0)
        pad_size = max(h, w)
        if self.pad_with_fixed_color:
            expand_img = np.ones((pad_size, pad_size, 3), dtype=np.uint8)
            expand_img[:] = self.pad_value
        else:
            # Fill with a resized random crop of the image itself so the
            # padding has natural texture.
            (h_ind, w_ind) = (np.random.randint(0, h * 7 // 8),
                              np.random.randint(0, w * 7 // 8))
            img_cut = img[h_ind:(h_ind + h // 9), w_ind:(w_ind + w // 9)]
            expand_img = cv2.resize(img_cut, (pad_size, pad_size))
        # Center the original image inside the square canvas.
        if h > w:
            y0, x0 = 0, (h - w) // 2
        else:
            y0, x0 = (w - h) // 2, 0
        expand_img[y0:y0 + h, x0:x0 + w] = img
        offset = (x0, y0)

        return expand_img, offset

    def square_pad_mask(self, points, offset):
        """Shift flat [x0, y0, x1, y1, ...] points by the padding offset."""
        x0, y0 = offset
        pad_points = points.copy()
        pad_points[::2] = pad_points[::2] + x0
        pad_points[1::2] = pad_points[1::2] + y0
        return pad_points

    def __call__(self, results):
        image = results['image']
        polygons = results['polys']
        h, w = image.shape[:2]

        if np.random.random_sample() < self.pad_ratio:
            image, out_size = self.resize_img(image, keep_ratio=True)
            image, offset = self.square_pad(image)
        else:
            image, out_size = self.resize_img(image, keep_ratio=False)
            offset = (0, 0)
        results['image'] = image
        try:
            polygons[:, :, 0::2] = polygons[:, :, 0::2] * out_size[
                1] / w + offset[0]
            polygons[:, :, 1::2] = polygons[:, :, 1::2] * out_size[
                0] / h + offset[1]
        except Exception:
            # BUGFIX: was a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit.  Polygon scaling is best-effort:
            # it fails when `polys` is empty or ragged, and we deliberately
            # keep the polygons unscaled in that case.
            pass
        results['polys'] = polygons

        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        return repr_str
658
backend/ppocr/data/imaug/fce_targets.py
Normal file
658
backend/ppocr/data/imaug/fce_targets.py
Normal file
@@ -0,0 +1,658 @@
|
||||
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
This code is refer from:
|
||||
https://github.com/open-mmlab/mmocr/blob/main/mmocr/datasets/pipelines/textdet_targets/fcenet_targets.py
|
||||
"""
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
from numpy.fft import fft
|
||||
from numpy.linalg import norm
|
||||
import sys
|
||||
|
||||
|
||||
class FCENetTargets:
|
||||
"""Generate the ground truth targets of FCENet: Fourier Contour Embedding
|
||||
for Arbitrary-Shaped Text Detection.
|
||||
|
||||
[https://arxiv.org/abs/2104.10442]
|
||||
|
||||
Args:
|
||||
fourier_degree (int): The maximum Fourier transform degree k.
|
||||
resample_step (float): The step size for resampling the text center
|
||||
line (TCL). It's better not to exceed half of the minimum width.
|
||||
center_region_shrink_ratio (float): The shrink ratio of text center
|
||||
region.
|
||||
level_size_divisors (tuple(int)): The downsample ratio on each level.
|
||||
level_proportion_range (tuple(tuple(int))): The range of text sizes
|
||||
assigned to each level.
|
||||
"""
|
||||
|
||||
def __init__(self,
             fourier_degree=5,
             resample_step=4.0,
             center_region_shrink_ratio=0.3,
             level_size_divisors=(8, 16, 32),
             level_proportion_range=((0, 0.25), (0.2, 0.65), (0.55, 1.0)),
             orientation_thr=2.0,
             **kwargs):
    """Configure FCENet target generation.

    Args:
        fourier_degree (int): Maximum Fourier transform degree k.
        resample_step (float): Step size for resampling the text center line.
        center_region_shrink_ratio (float): Shrink ratio of the text center
            region.
        level_size_divisors (tuple(int)): Downsample ratio of each pyramid
            level; must be the same length as ``level_proportion_range``.
        level_proportion_range (tuple(tuple)): Range of relative text sizes
            assigned to each level.
        orientation_thr (float): Threshold used by ``find_head_tail`` to
            distinguish head/tail edges of a quadrangle.
    """
    super().__init__()
    assert isinstance(level_size_divisors, tuple)
    assert isinstance(level_proportion_range, tuple)
    # One proportion range per pyramid level.
    assert len(level_size_divisors) == len(level_proportion_range)
    self.fourier_degree = fourier_degree
    self.resample_step = resample_step
    self.center_region_shrink_ratio = center_region_shrink_ratio
    self.level_size_divisors = level_size_divisors
    self.level_proportion_range = level_proportion_range

    self.orientation_thr = orientation_thr
def vector_angle(self, vec1, vec2):
    """Return the angle(s), in radians, between ``vec1`` and ``vec2``.

    Each argument may be a single 2-D vector or a batch of vectors
    (``(N, 2)``); vectors are normalized with a 1e-8 epsilon to avoid
    division by zero, and the cosine is clipped to [-1, 1] before arccos.
    """
    def _unit(vec):
        # Normalize along the last axis; batched input needs the norm
        # reshaped to a column for broadcasting.
        denom = norm(vec, axis=-1) + 1e-8
        if vec.ndim > 1:
            denom = denom.reshape((-1, 1))
        return vec / denom

    cos_val = np.sum(_unit(vec1) * _unit(vec2), axis=-1)
    return np.arccos(np.clip(cos_val, -1.0, 1.0))
def resample_line(self, line, n):
    """Resample n points on a line.

    Args:
        line (ndarray): The points composing a line.
        n (int): The resampled points number.

    Returns:
        resampled_line (ndarray): The points composing the resampled line
            (``n + 1`` points: both endpoints plus ``n - 1`` interior
            points at equal arc-length spacing).
    """
    assert line.ndim == 2
    assert line.shape[0] >= 2
    assert line.shape[1] == 2
    assert isinstance(n, int)
    assert n > 0

    # Per-segment lengths and their running total.
    seg_lengths = [norm(line[k + 1] - line[k]) for k in range(len(line) - 1)]
    total_len = sum(seg_lengths)
    cum_lengths = np.cumsum([0.0] + seg_lengths)
    # Epsilon keeps the last interior target strictly below total_len.
    step = total_len / (float(n) + 1e-8)

    edge_idx = 0
    points = [line[0]]
    for k in range(1, n):
        target = k * step
        # Advance to the segment that contains the target arc length.
        while target >= cum_lengths[edge_idx + 1]:
            edge_idx += 1
        frac = (target - cum_lengths[edge_idx]) / seg_lengths[edge_idx]
        interp = line[edge_idx] + (line[edge_idx + 1] - line[edge_idx]) * frac
        points.append(interp)

    points.append(line[-1])
    return np.array(points)
def reorder_poly_edge(self, points):
    """Get the respective points composing head edge, tail edge, top
    sideline and bottom sideline.

    Args:
        points (ndarray): The points composing a text polygon.

    Returns:
        head_edge (ndarray): The two points composing the head edge of text
            polygon.
        tail_edge (ndarray): The two points composing the tail edge of text
            polygon.
        top_sideline (ndarray): The points composing top curved sideline of
            text polygon.
        bot_sideline (ndarray): The points composing bottom curved sideline
            of text polygon.
    """

    assert points.ndim == 2
    assert points.shape[0] >= 4
    assert points.shape[1] == 2

    head_inds, tail_inds = self.find_head_tail(points, self.orientation_thr)
    head_edge, tail_edge = points[head_inds], points[tail_inds]

    # Duplicate the point list so a sideline can wrap past index 0
    # without special-casing.
    pad_points = np.vstack([points, points])
    if tail_inds[1] < 1:
        # Tail end wrapped to index 0: address it as len(points) in the
        # doubled array instead.
        tail_inds[1] = len(points)
    sideline1 = pad_points[head_inds[1]:tail_inds[1]]
    sideline2 = pad_points[tail_inds[1]:(head_inds[1] + len(points))]
    # The sideline whose mean y is smaller is on top (image y grows down).
    sideline_mean_shift = np.mean(
        sideline1, axis=0) - np.mean(
            sideline2, axis=0)

    if sideline_mean_shift[1] > 0:
        top_sideline, bot_sideline = sideline2, sideline1
    else:
        top_sideline, bot_sideline = sideline1, sideline2

    return head_edge, tail_edge, top_sideline, bot_sideline
def find_head_tail(self, points, orientation_thr):
    """Find the head edge and tail edge of a text polygon.

    Args:
        points (ndarray): The points composing a text polygon.
        orientation_thr (float): The threshold for distinguishing between
            head edge and tail edge among the horizontal and vertical edges
            of a quadrangle.

    Returns:
        head_inds (list): The indexes of two points composing head edge.
        tail_inds (list): The indexes of two points composing tail edge.
    """

    assert points.ndim == 2
    assert points.shape[0] >= 4
    assert points.shape[1] == 2
    assert isinstance(orientation_thr, float)

    if len(points) > 4:
        # General polygon: score every edge as a head/tail candidate.
        pad_points = np.vstack([points, points[0]])
        edge_vec = pad_points[1:] - pad_points[:-1]

        theta_sum = []
        adjacent_vec_theta = []
        for i, edge_vec1 in enumerate(edge_vec):
            # Angles between each edge and its two neighbours; sharp turns
            # on both sides suggest a short head/tail edge.
            adjacent_ind = [x % len(edge_vec) for x in [i - 1, i + 1]]
            adjacent_edge_vec = edge_vec[adjacent_ind]
            temp_theta_sum = np.sum(
                self.vector_angle(edge_vec1, adjacent_edge_vec))
            temp_adjacent_theta = self.vector_angle(adjacent_edge_vec[0],
                                                    adjacent_edge_vec[1])
            theta_sum.append(temp_theta_sum)
            adjacent_vec_theta.append(temp_adjacent_theta)
        theta_sum_score = np.array(theta_sum) / np.pi
        adjacent_theta_score = np.array(adjacent_vec_theta) / np.pi
        poly_center = np.mean(points, axis=0)
        # Edges far from the polygon center also score higher.
        edge_dist = np.maximum(
            norm(
                pad_points[1:] - poly_center, axis=-1),
            norm(
                pad_points[:-1] - poly_center, axis=-1))
        dist_score = edge_dist / np.max(edge_dist)
        position_score = np.zeros(len(edge_vec))
        # Weighted combination of angle-turn, neighbour-angle and distance.
        score = 0.5 * theta_sum_score + 0.15 * adjacent_theta_score
        score += 0.35 * dist_score
        if len(points) % 2 == 0:
            # For even point counts, nudge the two "expected" opposite
            # edges (middle and last) as head/tail candidates.
            position_score[(len(score) // 2 - 1)] += 1
            position_score[-1] += 1
        score += 0.1 * position_score
        pad_score = np.concatenate([score, score])
        # Score every (head, tail) edge pair; the Gaussian prior favours
        # tails roughly opposite the head.
        score_matrix = np.zeros((len(score), len(score) - 3))
        x = np.arange(len(score) - 3) / float(len(score) - 4)
        gaussian = 1. / (np.sqrt(2. * np.pi) * 0.5) * np.exp(-np.power(
            (x - 0.5) / 0.5, 2.) / 2)
        gaussian = gaussian / np.max(gaussian)
        for i in range(len(score)):
            score_matrix[i, :] = score[i] + pad_score[(i + 2):(i + len(
                score) - 1)] * gaussian * 0.3

        head_start, tail_increment = np.unravel_index(score_matrix.argmax(),
                                                      score_matrix.shape)
        tail_start = (head_start + tail_increment + 2) % len(points)
        head_end = (head_start + 1) % len(points)
        tail_end = (tail_start + 1) % len(points)

        # Normalize ordering so head precedes tail.
        if head_end > tail_end:
            head_start, tail_start = tail_start, head_start
            head_end, tail_end = tail_end, head_end
        head_inds = [head_start, head_end]
        tail_inds = [tail_start, tail_end]
    else:
        # Quadrangle: classify edge pairs by slope, then pick the shorter
        # pair (head/tail) unless the polygon is strongly vertical.
        # NOTE(review): relies on self.vector_slope, defined elsewhere in
        # this class.
        if self.vector_slope(points[1] - points[0]) + self.vector_slope(
                points[3] - points[2]) < self.vector_slope(points[
                    2] - points[1]) + self.vector_slope(points[0] - points[
                        3]):
            horizontal_edge_inds = [[0, 1], [2, 3]]
            vertical_edge_inds = [[3, 0], [1, 2]]
        else:
            horizontal_edge_inds = [[3, 0], [1, 2]]
            vertical_edge_inds = [[0, 1], [2, 3]]

        vertical_len_sum = norm(points[vertical_edge_inds[0][0]] - points[
            vertical_edge_inds[0][1]]) + norm(points[vertical_edge_inds[1][
                0]] - points[vertical_edge_inds[1][1]])
        horizontal_len_sum = norm(points[horizontal_edge_inds[0][
            0]] - points[horizontal_edge_inds[0][1]]) + norm(points[
                horizontal_edge_inds[1][0]] - points[horizontal_edge_inds[1]
                [1]])

        if vertical_len_sum > horizontal_len_sum * orientation_thr:
            head_inds = horizontal_edge_inds[0]
            tail_inds = horizontal_edge_inds[1]
        else:
            head_inds = vertical_edge_inds[0]
            tail_inds = vertical_edge_inds[1]

    return head_inds, tail_inds
def resample_sidelines(self, sideline1, sideline2, resample_step):
    """Resample a pair of sidelines so both carry the same number of points.

    The shared point count is derived from the average polyline length of
    the two sidelines divided by ``resample_step``.

    Args:
        sideline1 (ndarray): Points of one sideline, shape (N, 2).
        sideline2 (ndarray): Points of the opposite sideline, shape (M, 2).
        resample_step (float): Target spacing between resampled points.

    Returns:
        resampled_line1 (ndarray): The resampled line 1.
        resampled_line2 (ndarray): The resampled line 2.
    """
    assert sideline1.ndim == sideline2.ndim == 2
    assert sideline1.shape[1] == sideline2.shape[1] == 2
    assert sideline1.shape[0] >= 2
    assert sideline2.shape[0] >= 2
    assert isinstance(resample_step, float)

    def _polyline_length(line):
        # Sum of the Euclidean lengths of all consecutive segments.
        return sum(norm(b - a) for a, b in zip(line[:-1], line[1:]))

    avg_length = (_polyline_length(sideline1) +
                  _polyline_length(sideline2)) / 2
    # At least one point is always produced, even for very short lines.
    point_num = max(int(float(avg_length) / resample_step), 1)

    return (self.resample_line(sideline1, point_num),
            self.resample_line(sideline2, point_num))
|
||||
|
||||
def generate_center_region_mask(self, img_size, text_polys):
    """Generate text center region mask.

    For each polygon: split its boundary into top/bottom sidelines,
    resample them to equal point counts, shrink both ends, then shrink
    towards the center line and rasterize the resulting quads.

    Args:
        img_size (tuple): The image size of (height, width).
        text_polys (list[list[ndarray]]): The list of text polygons.

    Returns:
        center_region_mask (ndarray): The text center region mask,
        uint8 of shape (height, width); 1 inside center regions.
    """

    assert isinstance(img_size, tuple)
    # assert check_argument.is_2dlist(text_polys)

    h, w = img_size

    center_region_mask = np.zeros((h, w), np.uint8)

    center_region_boxes = []
    for poly in text_polys:
        # assert len(poly) == 1
        polygon_points = poly.reshape(-1, 2)
        _, _, top_line, bot_line = self.reorder_poly_edge(polygon_points)
        resampled_top_line, resampled_bot_line = self.resample_sidelines(
            top_line, bot_line, self.resample_step)
        # Reverse the bottom line so both sidelines run in the same
        # direction; their mean is then the text center line.
        resampled_bot_line = resampled_bot_line[::-1]
        center_line = (resampled_top_line + resampled_bot_line) / 2

        # Shrink the head and tail of the center region by a quarter of
        # the local text height, expressed in resample steps.
        line_head_shrink_len = norm(resampled_top_line[0] -
                                    resampled_bot_line[0]) / 4.0
        line_tail_shrink_len = norm(resampled_top_line[-1] -
                                    resampled_bot_line[-1]) / 4.0
        head_shrink_num = int(line_head_shrink_len // self.resample_step)
        tail_shrink_num = int(line_tail_shrink_len // self.resample_step)
        # Only shrink when enough points remain to form at least one quad.
        if len(center_line) > head_shrink_num + tail_shrink_num + 2:
            center_line = center_line[head_shrink_num:len(center_line) -
                                      tail_shrink_num]
            resampled_top_line = resampled_top_line[head_shrink_num:len(
                resampled_top_line) - tail_shrink_num]
            resampled_bot_line = resampled_bot_line[head_shrink_num:len(
                resampled_bot_line) - tail_shrink_num]

        # Build one quad per center-line segment, pulled towards the
        # center line by center_region_shrink_ratio.
        for i in range(0, len(center_line) - 1):
            tl = center_line[i] + (resampled_top_line[i] - center_line[i]
                                   ) * self.center_region_shrink_ratio
            tr = center_line[i + 1] + (resampled_top_line[i + 1] -
                                       center_line[i + 1]
                                       ) * self.center_region_shrink_ratio
            br = center_line[i + 1] + (resampled_bot_line[i + 1] -
                                       center_line[i + 1]
                                       ) * self.center_region_shrink_ratio
            bl = center_line[i] + (resampled_bot_line[i] - center_line[i]
                                   ) * self.center_region_shrink_ratio
            current_center_box = np.vstack([tl, tr, br,
                                            bl]).astype(np.int32)
            center_region_boxes.append(current_center_box)

    cv2.fillPoly(center_region_mask, center_region_boxes, 1)
    return center_region_mask
|
||||
|
||||
def resample_polygon(self, polygon, n=400):
    """Resample one polygon with n points on its boundary.

    Points are distributed across edges in proportion to edge length;
    each edge then receives evenly spaced points (endpoint excluded).

    Args:
        polygon (ndarray): Input polygon points, shape (N, 2).
        n (int): The total number of resampled boundary points.

    Returns:
        ndarray: The resampled polygon points.
    """
    # Length of every edge, wrapping from the last vertex to the first.
    edge_lengths = []
    for idx, start in enumerate(polygon):
        end = polygon[(idx + 1) % len(polygon)]
        edge_lengths.append(
            ((start[0] - end[0])**2 + (start[1] - end[1])**2)**0.5)

    perimeter = sum(edge_lengths)
    # Integer number of points assigned to each edge.
    points_per_edge = ((np.array(edge_lengths) /
                        (perimeter + 1e-8)) * n).astype(np.int32)

    resampled = []
    for idx, start in enumerate(polygon):
        count = points_per_edge[idx]
        if count == 0:
            # Very short edges may be dropped entirely.
            continue
        end = polygon[(idx + 1) % len(polygon)]
        step = (end - start) / count
        resampled.extend(start + step * j for j in range(count))

    return np.array(resampled)
|
||||
|
||||
def normalize_polygon(self, polygon):
    """Cyclically shift a polygon to a canonical start vertex.

    Among the eight vertices whose centroid-relative x coordinate has the
    smallest absolute value, the one with the smallest centroid-relative
    y becomes the new start point. Point order is preserved.

    Args:
        polygon (ndarray): The origin polygon, shape (N, 2).

    Returns:
        ndarray: The same points, rotated to start at the chosen vertex.
    """
    # Work in coordinates centred on the polygon's mean point.
    centered = polygon - polygon.mean(axis=0)
    abs_x = np.abs(centered[:, 0])
    ys = centered[:, 1]
    order_by_abs_x = np.argsort(abs_x)
    pick = np.argmin(ys[order_by_abs_x[:8]])
    start = order_by_abs_x[pick]
    # Rotate the vertex sequence without reordering it.
    return np.concatenate([polygon[start:], polygon[:start]])
|
||||
|
||||
def poly2fourier(self, polygon, fourier_degree):
    """Compute the 2K+1 Fourier coefficients of a polygon boundary.

    Args:
        polygon (ndarray): An input polygon, shape (N, 2).
        fourier_degree (int): The maximum Fourier degree K.

    Returns:
        ndarray(complex): Coefficients ordered c_{-K} .. c_0 .. c_{+K}.
    """
    # Encode 2-D points as complex numbers: x + i*y.
    complex_pts = polygon[:, 0] + 1j * polygon[:, 1]
    spectrum = fft(complex_pts) / complex_pts.size
    # Negative-frequency terms sit at the tail of the FFT output;
    # stitch them in front of the non-negative terms.
    return np.hstack(
        (spectrum[-fourier_degree:], spectrum[:fourier_degree + 1]))
|
||||
|
||||
def clockwise(self, c, fourier_degree):
    """Orient Fourier coefficients so the reconstruction is clockwise.

    Direction is decided by whichever first-order coefficient
    (c_{+1} vs c_{-1}) has the larger magnitude; a tie falls back to the
    second-order pair. Reversing the array flips traversal direction.

    Args:
        c (ndarray(complex)): Coefficients ordered c_{-K} .. c_{+K}.
        fourier_degree (int): The maximum Fourier degree K.

    Returns:
        ndarray(complex): Coefficients of the clockwise-oriented polygon.
    """
    k = fourier_degree
    if np.abs(c[k + 1]) != np.abs(c[k - 1]):
        # First-order pair decides the orientation.
        return c if np.abs(c[k + 1]) > np.abs(c[k - 1]) else c[::-1]
    # First-order magnitudes tie: fall back to the second-order pair.
    return c if np.abs(c[k + 2]) > np.abs(c[k - 2]) else c[::-1]
|
||||
|
||||
def cal_fourier_signature(self, polygon, fourier_degree):
    """Calculate Fourier signature from input polygon.

    Args:
        polygon (ndarray): The input polygon.
        fourier_degree (int): The maximum Fourier degree K.

    Returns:
        ndarray: Array of shape (2K+1, 2) holding the real and imaginary
        parts of the 2K+1 Fourier coefficients.
    """
    # Densify the boundary, then fix a canonical start vertex so the
    # signature does not depend on where the annotation begins.
    canonical = self.normalize_polygon(self.resample_polygon(polygon))

    coeff = self.clockwise(
        self.poly2fourier(canonical, fourier_degree), fourier_degree)

    # Split complex coefficients into (real, imag) column pairs.
    return np.hstack([
        np.real(coeff).reshape((-1, 1)),
        np.imag(coeff).reshape((-1, 1)),
    ])
|
||||
|
||||
def generate_fourier_maps(self, img_size, text_polys):
    """Generate Fourier coefficient maps.

    Every pixel inside a text polygon stores that polygon's 2k+1 Fourier
    coefficients (real and imaginary parts in two separate map stacks).

    Args:
        img_size (tuple): The image size of (height, width).
        text_polys (list[list[ndarray]]): The list of text polygons.

    Returns:
        fourier_real_map (ndarray): The Fourier coefficient real part maps.
        fourier_image_map (ndarray): The Fourier coefficient image part
            maps.
    """

    assert isinstance(img_size, tuple)

    h, w = img_size
    k = self.fourier_degree
    real_map = np.zeros((k * 2 + 1, h, w), dtype=np.float32)
    imag_map = np.zeros((k * 2 + 1, h, w), dtype=np.float32)

    for poly in text_polys:
        mask = np.zeros((h, w), dtype=np.uint8)
        polygon = np.array(poly).reshape((1, -1, 2))
        cv2.fillPoly(mask, polygon.astype(np.int32), 1)
        fourier_coeff = self.cal_fourier_signature(polygon[0], k)
        for i in range(-k, k + 1):
            if i != 0:
                # Non-zero degree: write the coefficient inside the text
                # region, keep previously written values elsewhere.
                real_map[i + k, :, :] = mask * fourier_coeff[i + k, 0] + (
                    1 - mask) * real_map[i + k, :, :]
                imag_map[i + k, :, :] = mask * fourier_coeff[i + k, 1] + (
                    1 - mask) * imag_map[i + k, :, :]
            else:
                # Degree 0 (channel k): store the degree-0 coefficient
                # minus the pixel's own coordinate, i.e. a per-pixel
                # offset rather than an absolute value.
                yx = np.argwhere(mask > 0.5)
                k_ind = np.ones((len(yx)), dtype=np.int64) * k
                y, x = yx[:, 0], yx[:, 1]
                real_map[k_ind, y, x] = fourier_coeff[k, 0] - x
                imag_map[k_ind, y, x] = fourier_coeff[k, 1] - y

    return real_map, imag_map
|
||||
|
||||
def generate_text_region_mask(self, img_size, text_polys):
    """Rasterize all text polygons into a single binary mask.

    Args:
        img_size (tuple): The image size (height, width).
        text_polys (list[list[ndarray]]): The list of text polygons.

    Returns:
        text_region_mask (ndarray): uint8 mask of shape (height, width);
        1 inside any text polygon, 0 elsewhere.
    """
    assert isinstance(img_size, tuple)

    height, width = img_size
    text_region_mask = np.zeros((height, width), dtype=np.uint8)

    for poly in text_polys:
        # fillPoly expects int32 contours shaped (1, N, 2).
        contour = np.array(poly, dtype=np.int32).reshape((1, -1, 2))
        cv2.fillPoly(text_region_mask, contour, 1)

    return text_region_mask
|
||||
|
||||
def generate_effective_mask(self, mask_size: tuple, polygons_ignore):
    """Build a mask that is 1 everywhere except inside ignored polygons.

    Args:
        mask_size (tuple): The mask size (height, width).
        polygons_ignore (list[ndarray]): The list of ignored text
            polygons.

    Returns:
        mask (ndarray): uint8 mask of (height, width); 0 inside ignored
        regions, 1 elsewhere.
    """
    mask = np.ones(mask_size, dtype=np.uint8)

    for poly in polygons_ignore:
        # Normalise the polygon to an int32 (1, N, 2) contour and erase it.
        contour = poly.reshape(-1, 2).astype(np.int32).reshape(1, -1, 2)
        cv2.fillPoly(mask, contour, 0)

    return mask
|
||||
|
||||
def generate_level_targets(self, img_size, text_polys, ignore_polys):
    """Generate ground truth target on each level.

    Args:
        img_size (list[int]): Shape of input image.
        text_polys (list[list[ndarray]]): A list of ground truth polygons.
        ignore_polys (list[list[ndarray]]): A list of ignored polygons.

    Returns:
        level_maps (list(ndarray)): A list of ground targets, one stacked
        array per pyramid level (text region, center region, effective
        mask, Fourier real/imag maps).
    """
    h, w = img_size
    lv_size_divs = self.level_size_divisors
    lv_proportion_range = self.level_proportion_range
    lv_text_polys = [[] for i in range(len(lv_size_divs))]
    lv_ignore_polys = [[] for i in range(len(lv_size_divs))]
    level_maps = []

    def _assign_to_levels(polys, lv_buckets):
        # Route each polygon to the level(s) whose proportion range
        # contains the polygon's size relative to the image height,
        # rescaling coordinates by that level's size divisor.
        for poly in polys:
            # Bug fix: `np.int` was removed in NumPy 1.24; use int32,
            # which is also what cv2.boundingRect expects.
            polygon = np.array(poly, dtype=np.int32).reshape((1, -1, 2))
            _, _, box_w, box_h = cv2.boundingRect(polygon)
            proportion = max(box_h, box_w) / (h + 1e-8)
            for ind, proportion_range in enumerate(lv_proportion_range):
                if proportion_range[0] < proportion < proportion_range[1]:
                    lv_buckets[ind].append(poly / lv_size_divs[ind])

    _assign_to_levels(text_polys, lv_text_polys)
    _assign_to_levels(ignore_polys, lv_ignore_polys)

    for ind, size_divisor in enumerate(lv_size_divs):
        current_level_maps = []
        level_img_size = (h // size_divisor, w // size_divisor)

        text_region = self.generate_text_region_mask(
            level_img_size, lv_text_polys[ind])[None]
        current_level_maps.append(text_region)

        center_region = self.generate_center_region_mask(
            level_img_size, lv_text_polys[ind])[None]
        current_level_maps.append(center_region)

        effective_mask = self.generate_effective_mask(
            level_img_size, lv_ignore_polys[ind])[None]
        current_level_maps.append(effective_mask)

        fourier_real_map, fourier_image_maps = self.generate_fourier_maps(
            level_img_size, lv_text_polys[ind])
        current_level_maps.append(fourier_real_map)
        current_level_maps.append(fourier_image_maps)

        level_maps.append(np.concatenate(current_level_maps))

    return level_maps
|
||||
|
||||
def generate_targets(self, results):
    """Generate the ground truth targets for FCENet.

    Args:
        results (dict): The input result dictionary. Must contain
            'image', 'polys' and 'ignore_tags'.

    Returns:
        results (dict): The same dictionary, extended with 'p3_maps',
        'p4_maps' and 'p5_maps' level targets.
    """
    assert isinstance(results, dict)
    image = results['image']
    polygons = results['polys']
    ignore_tags = results['ignore_tags']
    h, w, _ = image.shape

    # Split polygons into kept and ignored groups.
    polygon_masks = []
    polygon_masks_ignore = []
    for tag, polygon in zip(ignore_tags, polygons):
        target = polygon_masks_ignore if tag is True else polygon_masks
        target.append(polygon)

    level_maps = self.generate_level_targets((h, w), polygon_masks,
                                             polygon_masks_ignore)

    # One target stack per FPN level.
    results['p3_maps'] = level_maps[0]
    results['p4_maps'] = level_maps[1]
    results['p5_maps'] = level_maps[2]

    return results
|
||||
|
||||
def __call__(self, results):
    """Apply FCENet target generation to one data sample."""
    return self.generate_targets(results)
|
||||
244
backend/ppocr/data/imaug/gen_table_mask.py
Normal file
244
backend/ppocr/data/imaug/gen_table_mask.py
Normal file
@@ -0,0 +1,244 @@
|
||||
"""
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import sys
|
||||
import six
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
|
||||
class GenTableMask(object):
    """Generate a mask marking text regions inside table cells.

    Each cell bbox image is split into line/word sub-boxes via projection
    analysis; each sub-box is slightly shrunk and painted into a mask
    (single-channel when ``mask_type == 1``, else into the image itself).
    """

    def __init__(self, shrink_h_max, shrink_w_max, mask_type=0, **kwargs):
        # Bug fix: the original hard-coded both limits to 5, silently
        # ignoring the constructor arguments. Honour them instead
        # (callers passing the previous effective value 5 are unaffected).
        self.shrink_h_max = shrink_h_max  # max vertical shrink in pixels
        self.shrink_w_max = shrink_w_max  # max horizontal shrink in pixels
        self.mask_type = mask_type

    def projection(self, erosion, h, w, spilt_threshold=0):
        """Split a binary image into row spans via horizontal projection.

        Args:
            erosion (ndarray): Binary image (255 = foreground), (h, w).
            h (int): Number of rows to scan.
            w (int): Number of columns to scan.
            spilt_threshold (int): Minimum per-row foreground count for a
                row to count as text.

        Returns:
            tuple(list, ndarray): (start, end) row spans of text runs and
            the projection histogram image (0 where bars are drawn).
        """
        # Foreground pixel count per row (vectorized; the original used a
        # per-pixel Python double loop).
        project_val_array = (erosion[:h, :w] == 255).sum(axis=1).tolist()

        # Walk the histogram and cut it into text / blank runs.
        start_idx = 0   # row where the current text run started
        end_idx = 0     # row where the current text run ended
        in_text = False  # currently inside a text run?
        box_list = []
        for i in range(len(project_val_array)):
            if in_text == False and project_val_array[i] > spilt_threshold:
                # Entered a text region.
                in_text = True
                start_idx = i
            elif project_val_array[i] <= spilt_threshold and in_text == True:
                # Left the text region.
                end_idx = i
                in_text = False
                if end_idx - start_idx <= 2:
                    # Runs of <= 2 rows are treated as noise.
                    continue
                box_list.append((start_idx, end_idx + 1))

        if in_text:
            # The final text run extends to the bottom edge.
            box_list.append((start_idx, h - 1))

        # Draw the projection histogram: zero out the first count pixels
        # of each row.
        projection_map = np.ones_like(erosion)
        counts = np.asarray(project_val_array)
        col_idx = np.arange(w)
        projection_map[:h, :w][col_idx[None, :] < counts[:, None]] = 0
        return box_list, projection_map

    def projection_cx(self, box_img):
        """Split a cell image into word-level boxes.

        Args:
            box_img (ndarray): BGR image of one table cell.

        Returns:
            list[list[int]]: [w_start, h_start, w_end, h_end] sub-boxes.
        """
        box_gray_img = cv2.cvtColor(box_img, cv2.COLOR_BGR2GRAY)
        h, w = box_gray_img.shape
        # Binarize, inverted so text becomes foreground (255).
        ret, thresh1 = cv2.threshold(box_gray_img, 200, 255,
                                     cv2.THRESH_BINARY_INV)
        # Vertical erosion for wide cells, to separate touching lines.
        if h < w:
            kernel = np.ones((2, 1), np.uint8)
            erode = cv2.erode(thresh1, kernel, iterations=1)
        else:
            erode = thresh1
        # Horizontal dilation to merge characters into word blobs.
        kernel = np.ones((1, 5), np.uint8)
        erosion = cv2.dilate(erode, kernel, iterations=1)

        # Row spans of text lines. The original re-implemented the exact
        # same histogram/split logic inline; reuse projection() instead.
        box_list, projection_map = self.projection(erosion, h, w)

        split_bbox_list = []
        if len(box_list) > 1:
            for i, (h_start, h_end) in enumerate(box_list):
                if i == 0:
                    h_start = 0
                if i == len(box_list):
                    # NOTE(review): this condition can never be true
                    # (enumerate yields i < len(box_list)), so the last
                    # span is never extended to h. Kept as-is to preserve
                    # behavior; confirm intent upstream.
                    h_end = h
                word_img = erosion[h_start:h_end + 1, :]
                word_h, word_w = word_img.shape
                # Vertical projection (via transpose) to trim columns.
                w_split_list, w_projection_map = self.projection(
                    word_img.T, word_w, word_h)
                w_start, w_end = w_split_list[0][0], w_split_list[-1][1]
                if h_start > 0:
                    h_start -= 1
                h_end += 1
                word_img = box_img[h_start:h_end + 1:, w_start:w_end + 1, :]
                split_bbox_list.append([w_start, h_start, w_end, h_end])
        else:
            # Zero or one text run: keep the whole cell.
            split_bbox_list.append([0, 0, w, h])
        return split_bbox_list

    def shrink_bbox(self, bbox):
        """Shrink a bbox by 10% per side, capped by the shrink limits.

        Degenerate results (sides crossing) fall back to the original
        extent on that axis.
        """
        left, top, right, bottom = bbox
        sh_h = min(max(int((bottom - top) * 0.1), 1), self.shrink_h_max)
        sh_w = min(max(int((right - left) * 0.1), 1), self.shrink_w_max)
        left_new = left + sh_w
        right_new = right - sh_w
        top_new = top + sh_h
        bottom_new = bottom - sh_h
        if left_new >= right_new:
            left_new = left
            right_new = right
        if top_new >= bottom_new:
            top_new = top
            bottom_new = bottom
        return [left_new, top_new, right_new, bottom_new]

    def __call__(self, data):
        """Paint shrunk word boxes of every cell into a mask.

        With ``mask_type == 1`` a float mask is stored under 'mask_img';
        otherwise the boxes are painted white into a 3-channel image
        stored back under 'image'.
        """
        img = data['image']
        cells = data['cells']
        height, width = img.shape[0:2]
        if self.mask_type == 1:
            mask_img = np.zeros((height, width), dtype=np.float32)
        else:
            mask_img = np.zeros((height, width, 3), dtype=np.float32)
        cell_num = len(cells)
        for cno in range(cell_num):
            if "bbox" in cells[cno]:
                bbox = cells[cno]['bbox']
                left, top, right, bottom = bbox
                box_img = img[top:bottom, left:right, :].copy()
                split_bbox_list = self.projection_cx(box_img)
                # Shift cell-local sub-boxes into image coordinates.
                for sno in range(len(split_bbox_list)):
                    split_bbox_list[sno][0] += left
                    split_bbox_list[sno][1] += top
                    split_bbox_list[sno][2] += left
                    split_bbox_list[sno][3] += top

                for sno in range(len(split_bbox_list)):
                    left, top, right, bottom = split_bbox_list[sno]
                    left, top, right, bottom = self.shrink_bbox(
                        [left, top, right, bottom])
                    if self.mask_type == 1:
                        mask_img[top:bottom, left:right] = 1.0
                        data['mask_img'] = mask_img
                    else:
                        mask_img[top:bottom, left:right, :] = (255, 255, 255)
                        data['image'] = mask_img
        return data
|
||||
|
||||
class ResizeTableImage(object):
    """Resize a table image so its longest side equals ``max_len``,
    rescaling every cell bbox by the same ratio."""

    def __init__(self, max_len, **kwargs):
        super(ResizeTableImage, self).__init__()
        self.max_len = max_len

    def get_img_bbox(self, cells):
        """Collect the bbox of every cell that has one, in cell order."""
        return [cell['bbox'] for cell in cells if 'bbox' in cell]

    def resize_img_table(self, img, bbox_list, max_len):
        """Resize the image and scale all bboxes by the same factor."""
        height, width = img.shape[0:2]
        ratio = max_len / (max(height, width) * 1.0)
        img_new = cv2.resize(img, (int(width * ratio), int(height * ratio)))
        bbox_list_new = []
        for bbox in bbox_list:
            left, top, right, bottom = bbox.copy()
            bbox_list_new.append([
                int(left * ratio),
                int(top * ratio),
                int(right * ratio),
                int(bottom * ratio),
            ])
        return img_new, bbox_list_new

    def __call__(self, data):
        img = data['image']
        cells = data['cells'] if 'cells' in data else []
        bbox_list = self.get_img_bbox(cells)
        img_new, bbox_list_new = self.resize_img_table(img, bbox_list,
                                                       self.max_len)
        data['image'] = img_new
        # Write resized bboxes back in the same order they were collected.
        bno = 0
        for cno in range(len(cells)):
            if "bbox" in data['cells'][cno]:
                data['cells'][cno]['bbox'] = bbox_list_new[bno]
                bno += 1
        data['max_len'] = self.max_len
        return data
|
||||
|
||||
class PaddingTableImage(object):
    """Zero-pad a table image to a square canvas of side ``max_len``."""

    def __init__(self, **kwargs):
        super(PaddingTableImage, self).__init__()

    def __call__(self, data):
        img = data['image']
        max_len = data['max_len']
        height, width = img.shape[0:2]
        # Paste the image into the top-left corner of a black square.
        canvas = np.zeros((max_len, max_len, 3), dtype=np.float32)
        canvas[0:height, 0:width, :] = img.copy()
        data['image'] = canvas
        return data
|
||||
|
||||
105
backend/ppocr/data/imaug/iaa_augment.py
Normal file
105
backend/ppocr/data/imaug/iaa_augment.py
Normal file
@@ -0,0 +1,105 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
This code is refer from:
|
||||
https://github.com/WenmuZhou/DBNet.pytorch/blob/master/data_loader/modules/iaa_augment.py
|
||||
"""
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import numpy as np
|
||||
import imgaug
|
||||
import imgaug.augmenters as iaa
|
||||
|
||||
|
||||
class AugmenterBuilder(object):
    """Build imgaug augmenters from a declarative list/dict spec."""

    def __init__(self):
        pass

    def build(self, args, root=True):
        """Recursively translate ``args`` into an imgaug augmenter.

        - None or empty -> None
        - top-level list -> ``iaa.Sequential`` of its built children
        - nested list ``[name, *posargs]`` -> ``iaa.<name>(*posargs)``
        - dict ``{'type': name, 'args': {...}}`` -> ``iaa.<name>(**kwargs)``
        """
        if args is None or len(args) == 0:
            return None
        if isinstance(args, list):
            if root:
                children = [self.build(item, root=False) for item in args]
                return iaa.Sequential(children)
            name, *params = args
            return getattr(iaa, name)(
                *[self.to_tuple_if_list(p) for p in params])
        if isinstance(args, dict):
            factory = getattr(iaa, args['type'])
            kwargs = {
                key: self.to_tuple_if_list(value)
                for key, value in args['args'].items()
            }
            return factory(**kwargs)
        raise RuntimeError('unknown augmenter arg: ' + str(args))

    def to_tuple_if_list(self, obj):
        """Lists in specs denote ranges; imgaug expects tuples for those."""
        return tuple(obj) if isinstance(obj, list) else obj
|
||||
|
||||
|
||||
class IaaAugment():
    """Apply an imgaug pipeline to an image and its polygon annotations."""

    def __init__(self, augmenter_args=None, **kwargs):
        if augmenter_args is None:
            # Default pipeline: random horizontal flip, small rotation,
            # random rescale.
            augmenter_args = [{
                'type': 'Fliplr',
                'args': {
                    'p': 0.5
                }
            }, {
                'type': 'Affine',
                'args': {
                    'rotate': [-10, 10]
                }
            }, {
                'type': 'Resize',
                'args': {
                    'size': [0.5, 3]
                }
            }]
        self.augmenter = AugmenterBuilder().build(augmenter_args)

    def __call__(self, data):
        image = data['image']
        shape = image.shape
        if self.augmenter:
            # Freeze the random state so the image and its keypoints get
            # the exact same transform.
            det = self.augmenter.to_deterministic()
            data['image'] = det.augment_image(image)
            data = self.may_augment_annotation(det, data, shape)
        return data

    def may_augment_annotation(self, aug, data, shape):
        """Transform every polygon in ``data['polys']`` with ``aug``."""
        if aug is None:
            return data
        data['polys'] = np.array([
            self.may_augment_poly(aug, shape, poly)
            for poly in data['polys']
        ])
        return data

    def may_augment_poly(self, aug, img_shape, poly):
        """Run one polygon through the augmenter as imgaug keypoints."""
        keypoints = [imgaug.Keypoint(p[0], p[1]) for p in poly]
        moved = aug.augment_keypoints(
            [imgaug.KeypointsOnImage(
                keypoints, shape=img_shape)])[0].keypoints
        return [(kp.x, kp.y) for kp in moved]
|
||||
1041
backend/ppocr/data/imaug/label_ops.py
Normal file
1041
backend/ppocr/data/imaug/label_ops.py
Normal file
File diff suppressed because it is too large
Load Diff
173
backend/ppocr/data/imaug/make_border_map.py
Normal file
173
backend/ppocr/data/imaug/make_border_map.py
Normal file
@@ -0,0 +1,173 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
This code is refer from:
|
||||
https://github.com/WenmuZhou/DBNet.pytorch/blob/master/data_loader/modules/make_border_map.py
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import numpy as np
|
||||
import cv2
|
||||
|
||||
np.seterr(divide='ignore', invalid='ignore')
|
||||
import pyclipper
|
||||
from shapely.geometry import Polygon
|
||||
import sys
|
||||
import warnings
|
||||
|
||||
warnings.simplefilter("ignore")
|
||||
|
||||
__all__ = ['MakeBorderMap']
|
||||
|
||||
|
||||
class MakeBorderMap(object):
|
||||
def __init__(self,
|
||||
shrink_ratio=0.4,
|
||||
thresh_min=0.3,
|
||||
thresh_max=0.7,
|
||||
**kwargs):
|
||||
self.shrink_ratio = shrink_ratio
|
||||
self.thresh_min = thresh_min
|
||||
self.thresh_max = thresh_max
|
||||
|
||||
def __call__(self, data):
|
||||
|
||||
img = data['image']
|
||||
text_polys = data['polys']
|
||||
ignore_tags = data['ignore_tags']
|
||||
|
||||
canvas = np.zeros(img.shape[:2], dtype=np.float32)
|
||||
mask = np.zeros(img.shape[:2], dtype=np.float32)
|
||||
|
||||
for i in range(len(text_polys)):
|
||||
if ignore_tags[i]:
|
||||
continue
|
||||
self.draw_border_map(text_polys[i], canvas, mask=mask)
|
||||
canvas = canvas * (self.thresh_max - self.thresh_min) + self.thresh_min
|
||||
|
||||
data['threshold_map'] = canvas
|
||||
data['threshold_mask'] = mask
|
||||
return data
|
||||
|
||||
def draw_border_map(self, polygon, canvas, mask):
|
||||
polygon = np.array(polygon)
|
||||
assert polygon.ndim == 2
|
||||
assert polygon.shape[1] == 2
|
||||
|
||||
polygon_shape = Polygon(polygon)
|
||||
if polygon_shape.area <= 0:
|
||||
return
|
||||
distance = polygon_shape.area * (
|
||||
1 - np.power(self.shrink_ratio, 2)) / polygon_shape.length
|
||||
subject = [tuple(l) for l in polygon]
|
||||
padding = pyclipper.PyclipperOffset()
|
||||
padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
|
||||
|
||||
padded_polygon = np.array(padding.Execute(distance)[0])
|
||||
cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0)
|
||||
|
||||
xmin = padded_polygon[:, 0].min()
|
||||
xmax = padded_polygon[:, 0].max()
|
||||
ymin = padded_polygon[:, 1].min()
|
||||
ymax = padded_polygon[:, 1].max()
|
||||
width = xmax - xmin + 1
|
||||
height = ymax - ymin + 1
|
||||
|
||||
polygon[:, 0] = polygon[:, 0] - xmin
|
||||
polygon[:, 1] = polygon[:, 1] - ymin
|
||||
|
||||
xs = np.broadcast_to(
|
||||
np.linspace(
|
||||
0, width - 1, num=width).reshape(1, width), (height, width))
|
||||
ys = np.broadcast_to(
|
||||
np.linspace(
|
||||
0, height - 1, num=height).reshape(height, 1), (height, width))
|
||||
|
||||
distance_map = np.zeros(
|
||||
(polygon.shape[0], height, width), dtype=np.float32)
|
||||
for i in range(polygon.shape[0]):
|
||||
j = (i + 1) % polygon.shape[0]
|
||||
absolute_distance = self._distance(xs, ys, polygon[i], polygon[j])
|
||||
distance_map[i] = np.clip(absolute_distance / distance, 0, 1)
|
||||
distance_map = distance_map.min(axis=0)
|
||||
|
||||
xmin_valid = min(max(0, xmin), canvas.shape[1] - 1)
|
||||
xmax_valid = min(max(0, xmax), canvas.shape[1] - 1)
|
||||
ymin_valid = min(max(0, ymin), canvas.shape[0] - 1)
|
||||
ymax_valid = min(max(0, ymax), canvas.shape[0] - 1)
|
||||
canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1] = np.fmax(
|
||||
1 - distance_map[ymin_valid - ymin:ymax_valid - ymax + height,
|
||||
xmin_valid - xmin:xmax_valid - xmax + width],
|
||||
canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1])
|
||||
|
||||
def _distance(self, xs, ys, point_1, point_2):
|
||||
'''
|
||||
compute the distance from point to a line
|
||||
ys: coordinates in the first axis
|
||||
xs: coordinates in the second axis
|
||||
point_1, point_2: (x, y), the end of the line
|
||||
'''
|
||||
height, width = xs.shape[:2]
|
||||
square_distance_1 = np.square(xs - point_1[0]) + np.square(ys - point_1[
|
||||
1])
|
||||
square_distance_2 = np.square(xs - point_2[0]) + np.square(ys - point_2[
|
||||
1])
|
||||
square_distance = np.square(point_1[0] - point_2[0]) + np.square(
|
||||
point_1[1] - point_2[1])
|
||||
|
||||
cosin = (square_distance - square_distance_1 - square_distance_2) / (
|
||||
2 * np.sqrt(square_distance_1 * square_distance_2))
|
||||
square_sin = 1 - np.square(cosin)
|
||||
square_sin = np.nan_to_num(square_sin)
|
||||
result = np.sqrt(square_distance_1 * square_distance_2 * square_sin /
|
||||
square_distance)
|
||||
|
||||
result[cosin <
|
||||
0] = np.sqrt(np.fmin(square_distance_1, square_distance_2))[cosin
|
||||
< 0]
|
||||
# self.extend_line(point_1, point_2, result)
|
||||
return result
|
||||
|
||||
    def extend_line(self, point_1, point_2, result, shrink_ratio):
        """Draw extensions of segment (point_1, point_2) past both endpoints
        onto `result`, each extension (1 + shrink_ratio) times the segment
        length, and return the two extended endpoints.

        NOTE(review): line color 4096.0 looks like a sentinel value on a
        float canvas -- confirm against callers.
        """
        # Extrapolate beyond point_1, away from point_2.
        ex_point_1 = (int(
            round(point_1[0] + (point_1[0] - point_2[0]) * (1 + shrink_ratio))),
                      int(
                          round(point_1[1] + (point_1[1] - point_2[1]) * (
                              1 + shrink_ratio))))
        cv2.line(
            result,
            tuple(ex_point_1),
            tuple(point_1),
            4096.0,
            1,
            lineType=cv2.LINE_AA,
            shift=0)
        # Extrapolate beyond point_2, away from point_1.
        ex_point_2 = (int(
            round(point_2[0] + (point_2[0] - point_1[0]) * (1 + shrink_ratio))),
                      int(
                          round(point_2[1] + (point_2[1] - point_1[1]) * (
                              1 + shrink_ratio))))
        cv2.line(
            result,
            tuple(ex_point_2),
            tuple(point_2),
            4096.0,
            1,
            lineType=cv2.LINE_AA,
            shift=0)
        return ex_point_1, ex_point_2
|
||||
106
backend/ppocr/data/imaug/make_pse_gt.py
Normal file
106
backend/ppocr/data/imaug/make_pse_gt.py
Normal file
@@ -0,0 +1,106 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import pyclipper
|
||||
from shapely.geometry import Polygon
|
||||
|
||||
__all__ = ['MakePseGt']
|
||||
|
||||
|
||||
class MakePseGt(object):
    """Generate PSENet training targets: a stack of progressively shrunken
    text-kernel maps plus a mask that zeroes out ignored regions.
    """

    def __init__(self, kernel_num=7, size=640, min_shrink_ratio=0.4, **kwargs):
        # Number of shrink levels rasterized per text instance.
        self.kernel_num = kernel_num
        # Smallest shrink rate, reached at the last kernel.
        self.min_shrink_ratio = min_shrink_ratio
        # Minimum short-edge length the image is upscaled to.
        self.size = size

    def __call__(self, data):
        image = data['image']
        text_polys = data['polys']
        ignore_tags = data['ignore_tags']

        h, w, _ = image.shape
        short_edge = min(h, w)
        if short_edge < self.size:
            # keep short_size >= self.size
            scale = self.size / short_edge
            image = cv2.resize(image, dsize=None, fx=scale, fy=scale)
            text_polys *= scale

        gt_kernels = []
        for i in range(1, self.kernel_num + 1):
            # s1->sn, from big to small: rate interpolates linearly down to
            # min_shrink_ratio.
            rate = 1.0 - (1.0 - self.min_shrink_ratio) / (self.kernel_num - 1
                                                          ) * i
            text_kernel, ignore_tags = self.generate_kernel(
                image.shape[0:2], rate, text_polys, ignore_tags)
            gt_kernels.append(text_kernel)

        # Zero out ignored polygons so they do not contribute to the loss.
        training_mask = np.ones(image.shape[0:2], dtype='uint8')
        for i in range(text_polys.shape[0]):
            if ignore_tags[i]:
                cv2.fillPoly(training_mask,
                             text_polys[i].astype(np.int32)[np.newaxis, :, :],
                             0)

        # generate_kernel fills instance ids (i + 1); the targets only need
        # a binary text / non-text map.
        gt_kernels = np.array(gt_kernels)
        gt_kernels[gt_kernels > 0] = 1

        data['image'] = image
        data['polys'] = text_polys
        data['gt_kernels'] = gt_kernels[0:]
        data['gt_text'] = gt_kernels[0]
        data['mask'] = training_mask.astype('float32')
        return data

    def generate_kernel(self,
                        img_size,
                        shrink_ratio,
                        text_polys,
                        ignore_tags=None):
        """
        Rasterize one shrink level of every polygon into an instance-id map.

        Refer to part of the code:
        https://github.com/open-mmlab/mmocr/blob/main/mmocr/datasets/pipelines/textdet_targets/base_textdet_targets.py
        """

        h, w = img_size
        text_kernel = np.zeros((h, w), dtype=np.float32)
        for i, poly in enumerate(text_polys):
            polygon = Polygon(poly)
            # Vatti-offset distance producing the requested shrink ratio.
            distance = polygon.area * (1 - shrink_ratio * shrink_ratio) / (
                polygon.length + 1e-6)
            subject = [tuple(l) for l in poly]
            pco = pyclipper.PyclipperOffset()
            pco.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
            shrinked = np.array(pco.Execute(-distance))

            # A polygon that vanishes when shrunk is marked as ignored.
            if len(shrinked) == 0 or shrinked.size == 0:
                if ignore_tags is not None:
                    ignore_tags[i] = True
                continue
            try:
                shrinked = np.array(shrinked[0]).reshape(-1, 2)
            except Exception:  # narrowed from a bare `except:`; same fallback
                if ignore_tags is not None:
                    ignore_tags[i] = True
                continue
            # Instance id i + 1 (0 is background).
            cv2.fillPoly(text_kernel, [shrinked.astype(np.int32)], i + 1)
        return text_kernel, ignore_tags
|
||||
123
backend/ppocr/data/imaug/make_shrink_map.py
Normal file
123
backend/ppocr/data/imaug/make_shrink_map.py
Normal file
@@ -0,0 +1,123 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
This code is refer from:
|
||||
https://github.com/WenmuZhou/DBNet.pytorch/blob/master/data_loader/modules/make_shrink_map.py
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import numpy as np
|
||||
import cv2
|
||||
from shapely.geometry import Polygon
|
||||
import pyclipper
|
||||
|
||||
__all__ = ['MakeShrinkMap']
|
||||
|
||||
|
||||
class MakeShrinkMap(object):
    r'''
    Making binary mask from detection data with ICDAR format.
    Typically following the process of class `MakeICDARData`.
    '''

    def __init__(self, min_text_size=8, shrink_ratio=0.4, **kwargs):
        # Instances whose short side is below this are masked out, not shrunk.
        self.min_text_size = min_text_size
        self.shrink_ratio = shrink_ratio

    def __call__(self, data):
        image = data['image']
        text_polys = data['polys']
        ignore_tags = data['ignore_tags']

        h, w = image.shape[:2]
        text_polys, ignore_tags = self.validate_polygons(text_polys,
                                                         ignore_tags, h, w)
        gt = np.zeros((h, w), dtype=np.float32)
        mask = np.ones((h, w), dtype=np.float32)
        for i in range(len(text_polys)):
            polygon = text_polys[i]
            height = max(polygon[:, 1]) - min(polygon[:, 1])
            width = max(polygon[:, 0]) - min(polygon[:, 0])
            if ignore_tags[i] or min(height, width) < self.min_text_size:
                cv2.fillPoly(mask,
                             polygon.astype(np.int32)[np.newaxis, :, :], 0)
                ignore_tags[i] = True
            else:
                polygon_shape = Polygon(polygon)
                subject = [tuple(l) for l in polygon]
                padding = pyclipper.PyclipperOffset()
                padding.AddPath(subject, pyclipper.JT_ROUND,
                                pyclipper.ET_CLOSEDPOLYGON)
                shrinked = []

                # Increase the shrink ratio every time we get multiple polygon returned back
                possible_ratios = np.arange(self.shrink_ratio, 1,
                                            self.shrink_ratio)
                # BUGFIX: np.append returns a new array; the original call
                # discarded its result, so ratio 1 (no shrink) was never
                # actually tried as the last fallback.
                possible_ratios = np.append(possible_ratios, 1)
                for ratio in possible_ratios:
                    # Vatti-offset distance for this candidate ratio.
                    distance = polygon_shape.area * (
                        1 - np.power(ratio, 2)) / polygon_shape.length
                    shrinked = padding.Execute(-distance)
                    if len(shrinked) == 1:
                        break

                # No valid single shrunken polygon: mask the instance out.
                if shrinked == []:
                    cv2.fillPoly(mask,
                                 polygon.astype(np.int32)[np.newaxis, :, :], 0)
                    ignore_tags[i] = True
                    continue

                for each_shirnk in shrinked:
                    shirnk = np.array(each_shirnk).reshape(-1, 2)
                    cv2.fillPoly(gt, [shirnk.astype(np.int32)], 1)

        data['shrink_map'] = gt
        data['shrink_mask'] = mask
        return data

    def validate_polygons(self, polygons, ignore_tags, h, w):
        '''
        polygons (numpy.array, required): of shape (num_instances, num_points, 2)

        Clips all points into the image, marks near-degenerate polygons as
        ignored, and reverses positive-area polygons to normalize orientation.
        '''
        if len(polygons) == 0:
            return polygons, ignore_tags
        assert len(polygons) == len(ignore_tags)
        for polygon in polygons:
            polygon[:, 0] = np.clip(polygon[:, 0], 0, w - 1)
            polygon[:, 1] = np.clip(polygon[:, 1], 0, h - 1)

        for i in range(len(polygons)):
            area = self.polygon_area(polygons[i])
            if abs(area) < 1:
                ignore_tags[i] = True
            if area > 0:
                polygons[i] = polygons[i][::-1, :]
        return polygons, ignore_tags

    def polygon_area(self, polygon):
        """
        compute polygon area (signed; the sign encodes vertex orientation)
        """
        area = 0
        q = polygon[-1]
        for p in polygon:
            area += p[0] * q[1] - p[1] * q[0]
            q = p
        return area / 2.0
|
||||
468
backend/ppocr/data/imaug/operators.py
Normal file
468
backend/ppocr/data/imaug/operators.py
Normal file
@@ -0,0 +1,468 @@
|
||||
"""
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import sys
|
||||
import six
|
||||
import cv2
|
||||
import numpy as np
|
||||
import math
|
||||
|
||||
|
||||
class DecodeImage(object):
    """ decode image

    Decodes the raw byte string in ``data['image']`` into an ndarray.
    img_mode selects the channel layout ('RGB', 'GRAY', or BGR default);
    channel_first transposes HWC -> CHW; ignore_orientation skips EXIF
    orientation handling during decoding.
    """

    def __init__(self,
                 img_mode='RGB',
                 channel_first=False,
                 ignore_orientation=False,
                 **kwargs):
        self.img_mode = img_mode
        self.channel_first = channel_first
        self.ignore_orientation = ignore_orientation

    def __call__(self, data):
        img = data['image']
        # Input must be a non-empty byte string (str on Python 2).
        if six.PY2:
            assert type(img) is str and len(
                img) > 0, "invalid input 'img' in DecodeImage"
        else:
            assert type(img) is bytes and len(
                img) > 0, "invalid input 'img' in DecodeImage"
        img = np.frombuffer(img, dtype='uint8')
        if self.ignore_orientation:
            img = cv2.imdecode(img, cv2.IMREAD_IGNORE_ORIENTATION |
                               cv2.IMREAD_COLOR)
        else:
            img = cv2.imdecode(img, 1)
        # Undecodable bytes: signal failure to the caller with None.
        if img is None:
            return None
        if self.img_mode == 'GRAY':
            # NOTE(review): the image was just decoded as 3-channel BGR, so
            # COLOR_GRAY2BGR here looks inverted -- confirm intent.
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
        elif self.img_mode == 'RGB':
            assert img.shape[2] == 3, 'invalid shape of image[%s]' % (img.shape)
            # BGR -> RGB by reversing the channel axis.
            img = img[:, :, ::-1]

        if self.channel_first:
            img = img.transpose((2, 0, 1))

        data['image'] = img
        return data
|
||||
|
||||
|
||||
class NRTRDecodeImage(object):
    """ decode image

    Like DecodeImage, but always finishes with a BGR -> grayscale conversion
    (used by the NRTR recognition pipeline).
    """

    def __init__(self, img_mode='RGB', channel_first=False, **kwargs):
        self.img_mode = img_mode
        self.channel_first = channel_first

    def __call__(self, data):
        img = data['image']
        # Input must be a non-empty byte string (str on Python 2).
        if six.PY2:
            assert type(img) is str and len(
                img) > 0, "invalid input 'img' in DecodeImage"
        else:
            assert type(img) is bytes and len(
                img) > 0, "invalid input 'img' in DecodeImage"
        img = np.frombuffer(img, dtype='uint8')

        img = cv2.imdecode(img, 1)

        if img is None:
            return None
        if self.img_mode == 'GRAY':
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
        elif self.img_mode == 'RGB':
            assert img.shape[2] == 3, 'invalid shape of image[%s]' % (img.shape)
            img = img[:, :, ::-1]
        # Final conversion to single-channel grayscale regardless of mode.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        if self.channel_first:
            # NOTE(review): img is 2-D after BGR2GRAY, so this 3-axis
            # transpose would raise -- channel_first appears unusable here;
            # confirm it is never set by callers.
            img = img.transpose((2, 0, 1))
        data['image'] = img
        return data
|
||||
|
||||
|
||||
class NormalizeImage(object):
    """ normalize image such as substract mean, divide std

    Operates on numpy arrays; PIL images are converted first when Pillow is
    available (Pillow is now optional for ndarray inputs).
    """

    def __init__(self, scale=None, mean=None, std=None, order='chw', **kwargs):
        if isinstance(scale, str):
            # NOTE: eval allows config strings such as "1./255."; config
            # values are assumed trusted (do not feed untrusted input here).
            scale = eval(scale)
        self.scale = np.float32(scale if scale is not None else 1.0 / 255.0)
        # ImageNet defaults.
        mean = mean if mean is not None else [0.485, 0.456, 0.406]
        std = std if std is not None else [0.229, 0.224, 0.225]

        # Broadcast shape matching the image layout ('chw' or 'hwc').
        shape = (3, 1, 1) if order == 'chw' else (1, 1, 3)
        self.mean = np.array(mean).reshape(shape).astype('float32')
        self.std = np.array(std).reshape(shape).astype('float32')

    def __call__(self, data):
        img = data['image']
        # ROBUSTNESS: Pillow is only needed for PIL inputs; previously a
        # missing Pillow install broke ndarray inputs too.
        try:
            from PIL import Image
        except ImportError:
            Image = None
        if Image is not None and isinstance(img, Image.Image):
            img = np.array(img)
        assert isinstance(img,
                          np.ndarray), "invalid input 'img' in NormalizeImage"
        data['image'] = (
            img.astype('float32') * self.scale - self.mean) / self.std
        return data
|
||||
|
||||
|
||||
class ToCHWImage(object):
    """ convert hwc image to chw image
    """

    def __init__(self, **kwargs):
        pass

    def __call__(self, data):
        img = data['image']
        # ROBUSTNESS: Pillow is only needed for PIL inputs; previously a
        # missing Pillow install broke ndarray inputs too.
        try:
            from PIL import Image
        except ImportError:
            Image = None
        if Image is not None and isinstance(img, Image.Image):
            img = np.array(img)
        data['image'] = img.transpose((2, 0, 1))
        return data
|
||||
|
||||
|
||||
class Fasttext(object):
    """Attach a fastText embedding of the sample's label under 'fast_label'."""

    def __init__(self, path="None", **kwargs):
        # NOTE(review): default path is the string "None", not None --
        # loading will fail unless a real model path is configured.
        import fasttext
        self.fast_model = fasttext.load_model(path)

    def __call__(self, data):
        label = data['label']
        # Indexing the fastText model returns the embedding vector.
        fast_label = self.fast_model[label]
        data['fast_label'] = fast_label
        return data
|
||||
|
||||
|
||||
class KeepKeys(object):
    """Reduce the data dict to an ordered list of the selected values."""

    def __init__(self, keep_keys, **kwargs):
        # Keys to extract, in output order.
        self.keep_keys = keep_keys

    def __call__(self, data):
        # Preserve the configured key order in the returned list.
        return [data[key] for key in self.keep_keys]
|
||||
|
||||
|
||||
class Pad(object):
    """Zero-pad the image at the bottom/right, either to a fixed (h, w)
    target or up to the next multiple of ``size_div``.
    """

    def __init__(self, size=None, size_div=32, **kwargs):
        if size is not None and not isinstance(size, (int, list, tuple)):
            raise TypeError("Type of target_size is invalid. Now is {}".format(
                type(size)))
        if isinstance(size, int):
            size = [size, size]
        # Explicit (h, w) target, or None to round up to a size_div multiple.
        self.size = size
        self.size_div = size_div

    def __call__(self, data):

        img = data['image']
        img_h, img_w = img.shape[0], img.shape[1]
        if self.size:
            resize_h2, resize_w2 = self.size
            # NOTE(review): strict '<' rejects images already at the target
            # size -- confirm whether '<=' was intended.
            assert (
                img_h < resize_h2 and img_w < resize_w2
            ), '(h, w) of target size should be greater than (img_h, img_w)'
        else:
            # Round each side up to the nearest multiple of size_div.
            resize_h2 = max(
                int(math.ceil(img.shape[0] / self.size_div) * self.size_div),
                self.size_div)
            resize_w2 = max(
                int(math.ceil(img.shape[1] / self.size_div) * self.size_div),
                self.size_div)
        # Constant (zero) padding applied only on bottom and right edges.
        img = cv2.copyMakeBorder(
            img,
            0,
            resize_h2 - img_h,
            0,
            resize_w2 - img_w,
            cv2.BORDER_CONSTANT,
            value=0)
        data['image'] = img
        return data
|
||||
|
||||
|
||||
class Resize(object):
    """Resize the image to a fixed (h, w) and rescale any polygons with it."""

    def __init__(self, size=(640, 640), **kwargs):
        self.size = size

    def resize_image(self, img):
        # Target (h, w) and per-axis scale factors relative to the input.
        target_h, target_w = self.size
        src_h, src_w = img.shape[:2]  # (h, w, c)
        scale_h = float(target_h) / src_h
        scale_w = float(target_w) / src_w
        resized = cv2.resize(img, (int(target_w), int(target_h)))
        return resized, [scale_h, scale_w]

    def __call__(self, data):
        img = data['image']
        img_resize, (ratio_h, ratio_w) = self.resize_image(img)
        if 'polys' in data:
            # Rescale every polygon point by the same factors as the image.
            scaled_boxes = [[[pt[0] * ratio_w, pt[1] * ratio_h] for pt in box]
                            for box in data['polys']]
            data['polys'] = np.array(scaled_boxes, dtype=np.float32)
        data['image'] = img_resize
        return data
|
||||
|
||||
|
||||
class DetResizeForTest(object):
    """Resize images for detection inference.

    The strategy is selected from kwargs:
      * ``image_shape``    -> type 1: fixed (h, w).
      * ``limit_side_len`` -> type 0: scale so the side limit holds, then
                              round to a multiple of 32.
      * ``resize_long``    -> type 2: fix the longer side, then round up to
                              a multiple of 128.
      * none of the above  -> type 0 with limit 736 / 'min'.
    """

    def __init__(self, **kwargs):
        super(DetResizeForTest, self).__init__()
        self.resize_type = 0
        if 'image_shape' in kwargs:
            self.image_shape = kwargs['image_shape']
            self.resize_type = 1
        elif 'limit_side_len' in kwargs:
            self.limit_side_len = kwargs['limit_side_len']
            self.limit_type = kwargs.get('limit_type', 'min')
        elif 'resize_long' in kwargs:
            self.resize_type = 2
            self.resize_long = kwargs.get('resize_long', 960)
        else:
            self.limit_side_len = 736
            self.limit_type = 'min'

    def __call__(self, data):
        img = data['image']
        src_h, src_w, _ = img.shape

        if self.resize_type == 0:
            # img, shape = self.resize_image_type0(img)
            img, [ratio_h, ratio_w] = self.resize_image_type0(img)
        elif self.resize_type == 2:
            img, [ratio_h, ratio_w] = self.resize_image_type2(img)
        else:
            # img, shape = self.resize_image_type1(img)
            img, [ratio_h, ratio_w] = self.resize_image_type1(img)
        data['image'] = img
        # Original size plus ratios, used to map boxes back to source scale.
        data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w])
        return data

    def resize_image_type1(self, img):
        """Resize to the fixed (h, w) given by ``image_shape``."""
        resize_h, resize_w = self.image_shape
        ori_h, ori_w = img.shape[:2]  # (h, w, c)
        ratio_h = float(resize_h) / ori_h
        ratio_w = float(resize_w) / ori_w
        img = cv2.resize(img, (int(resize_w), int(resize_h)))
        # return img, np.array([ori_h, ori_w])
        return img, [ratio_h, ratio_w]

    def resize_image_type0(self, img):
        """
        resize image to a size multiple of 32 which is required by the network
        args:
            img(array): array with shape [h, w, c]
        return(tuple):
            img, (ratio_h, ratio_w)
        """
        limit_side_len = self.limit_side_len
        h, w, c = img.shape

        # limit the max side
        if self.limit_type == 'max':
            # Downscale only when the longer side exceeds the limit.
            if max(h, w) > limit_side_len:
                if h > w:
                    ratio = float(limit_side_len) / h
                else:
                    ratio = float(limit_side_len) / w
            else:
                ratio = 1.
        elif self.limit_type == 'min':
            # Upscale so the shorter side reaches the limit.
            if min(h, w) < limit_side_len:
                if h < w:
                    ratio = float(limit_side_len) / h
                else:
                    ratio = float(limit_side_len) / w
            else:
                ratio = 1.
        elif self.limit_type == 'resize_long':
            ratio = float(limit_side_len) / max(h, w)
        else:
            raise Exception('not support limit type, image ')
        resize_h = int(h * ratio)
        resize_w = int(w * ratio)

        # Snap both sides to the nearest multiple of 32 (minimum 32).
        resize_h = max(int(round(resize_h / 32) * 32), 32)
        resize_w = max(int(round(resize_w / 32) * 32), 32)

        try:
            if int(resize_w) <= 0 or int(resize_h) <= 0:
                return None, (None, None)
            img = cv2.resize(img, (int(resize_w), int(resize_h)))
        except:
            # NOTE(review): bare except hides the real error and exits the
            # whole process -- consider narrowing and re-raising.
            print(img.shape, resize_w, resize_h)
            sys.exit(0)
        ratio_h = resize_h / float(h)
        ratio_w = resize_w / float(w)
        return img, [ratio_h, ratio_w]

    def resize_image_type2(self, img):
        """Fix the longer side to ``resize_long``, then round both sides up
        to a multiple of 128."""
        h, w, _ = img.shape

        resize_w = w
        resize_h = h

        if resize_h > resize_w:
            ratio = float(self.resize_long) / resize_h
        else:
            ratio = float(self.resize_long) / resize_w

        resize_h = int(resize_h * ratio)
        resize_w = int(resize_w * ratio)

        max_stride = 128
        resize_h = (resize_h + max_stride - 1) // max_stride * max_stride
        resize_w = (resize_w + max_stride - 1) // max_stride * max_stride
        img = cv2.resize(img, (int(resize_w), int(resize_h)))
        ratio_h = resize_h / float(h)
        ratio_w = resize_w / float(w)

        return img, [ratio_h, ratio_w]
|
||||
|
||||
|
||||
class E2EResizeForTest(object):
    """Resize images for end-to-end (detection + recognition) inference.

    Required kwargs:
        max_side_len: cap on the image side length before stride rounding.
        valid_set: dataset name; 'totaltext' selects a dedicated resize rule.
    """

    def __init__(self, **kwargs):
        super(E2EResizeForTest, self).__init__()
        self.max_side_len = kwargs['max_side_len']
        self.valid_set = kwargs['valid_set']

    def __call__(self, data):
        img = data['image']
        src_h, src_w, _ = img.shape
        if self.valid_set == 'totaltext':
            im_resized, [ratio_h, ratio_w] = self.resize_image_for_totaltext(
                img, max_side_len=self.max_side_len)
        else:
            im_resized, (ratio_h, ratio_w) = self.resize_image(
                img, max_side_len=self.max_side_len)
        data['image'] = im_resized
        # Original size plus ratios, needed to map results back.
        data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w])
        return data

    def resize_image_for_totaltext(self, im, max_side_len=512):
        """Upscale by 1.25x (capped so the height stays within max_side_len),
        then round both sides up to a multiple of 128."""
        h, w, _ = im.shape
        resize_w = w
        resize_h = h
        ratio = 1.25
        if h * ratio > max_side_len:
            ratio = float(max_side_len) / resize_h
        resize_h = int(resize_h * ratio)
        resize_w = int(resize_w * ratio)

        max_stride = 128
        resize_h = (resize_h + max_stride - 1) // max_stride * max_stride
        resize_w = (resize_w + max_stride - 1) // max_stride * max_stride
        im = cv2.resize(im, (int(resize_w), int(resize_h)))
        ratio_h = resize_h / float(h)
        ratio_w = resize_w / float(w)
        return im, (ratio_h, ratio_w)

    def resize_image(self, im, max_side_len=512):
        """
        resize image to a size multiple of max_stride which is required by the network
        :param im: the resized image
        :param max_side_len: limit of max image size to avoid out of memory in gpu
        :return: the resized image and the resize ratio
        """
        h, w, _ = im.shape

        resize_w = w
        resize_h = h

        # Fix the longer side
        if resize_h > resize_w:
            ratio = float(max_side_len) / resize_h
        else:
            ratio = float(max_side_len) / resize_w

        resize_h = int(resize_h * ratio)
        resize_w = int(resize_w * ratio)

        # Round both sides up to a multiple of the network stride.
        max_stride = 128
        resize_h = (resize_h + max_stride - 1) // max_stride * max_stride
        resize_w = (resize_w + max_stride - 1) // max_stride * max_stride
        im = cv2.resize(im, (int(resize_w), int(resize_h)))
        ratio_h = resize_h / float(h)
        ratio_w = resize_w / float(w)

        return im, (ratio_h, ratio_w)
|
||||
|
||||
|
||||
class KieResize(object):
    """Resize for KIE (key information extraction): scale the image into a
    fixed 1024x1024 zero-padded canvas and rescale the box points with it.
    """

    def __init__(self, **kwargs):
        super(KieResize, self).__init__()
        # NOTE(review): these are stored but resize_image hardcodes
        # scale=[512, 1024] and a 1024x1024 canvas -- confirm whether
        # img_scale was meant to drive them.
        self.max_side, self.min_side = kwargs['img_scale'][0], kwargs[
            'img_scale'][1]

    def __call__(self, data):
        img = data['image']
        points = data['points']
        src_h, src_w, _ = img.shape
        im_resized, scale_factor, [ratio_h, ratio_w
                                   ], [new_h, new_w] = self.resize_image(img)
        resize_points = self.resize_boxes(img, points, scale_factor)
        # Keep the originals alongside the resized versions.
        data['ori_image'] = img
        data['ori_boxes'] = points
        data['points'] = resize_points
        data['image'] = im_resized
        data['shape'] = np.array([new_h, new_w])
        return data

    def resize_image(self, img):
        """Scale the image to fit within (512, 1024) keeping aspect ratio,
        round the result up to a multiple of 32, and paste it into the
        top-left corner of a 1024x1024 zero canvas."""
        norm_img = np.zeros([1024, 1024, 3], dtype='float32')
        scale = [512, 1024]
        h, w = img.shape[:2]
        max_long_edge = max(scale)
        max_short_edge = min(scale)
        # Factor keeping both the long and short edges within bounds.
        scale_factor = min(max_long_edge / max(h, w),
                           max_short_edge / min(h, w))
        resize_w, resize_h = int(w * float(scale_factor) + 0.5), int(h * float(
            scale_factor) + 0.5)
        max_stride = 32
        resize_h = (resize_h + max_stride - 1) // max_stride * max_stride
        resize_w = (resize_w + max_stride - 1) // max_stride * max_stride
        im = cv2.resize(img, (resize_w, resize_h))
        new_h, new_w = im.shape[:2]
        w_scale = new_w / w
        h_scale = new_h / h
        # Per-coordinate scale, ordered (x, y, x, y) to match point layout.
        scale_factor = np.array(
            [w_scale, h_scale, w_scale, h_scale], dtype=np.float32)
        norm_img[:new_h, :new_w, :] = im
        return norm_img, scale_factor, [h_scale, w_scale], [new_h, new_w]

    def resize_boxes(self, im, points, scale_factor):
        # Scale the points, then clamp them inside the image bounds.
        # NOTE(review): clipping uses the pre-resize image shape even though
        # the points were just scaled up -- confirm this is intended.
        points = points * scale_factor
        img_shape = im.shape[:2]
        points[:, 0::2] = np.clip(points[:, 0::2], 0, img_shape[1])
        points[:, 1::2] = np.clip(points[:, 1::2], 0, img_shape[0])
        return points
|
||||
906
backend/ppocr/data/imaug/pg_process.py
Normal file
906
backend/ppocr/data/imaug/pg_process.py
Normal file
@@ -0,0 +1,906 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
__all__ = ['PGProcessTrain']
|
||||
|
||||
|
||||
class PGProcessTrain(object):
|
||||
    def __init__(self,
                 character_dict_path,
                 max_text_length,
                 max_text_nums,
                 tcl_len,
                 batch_size=14,
                 min_crop_size=24,
                 min_text_size=4,
                 max_text_size=512,
                 **kwargs):
        # presumably the number of text-center-line points sampled per
        # instance -- confirm against callers.
        self.tcl_len = tcl_len
        self.max_text_length = max_text_length
        self.max_text_nums = max_text_nums
        self.batch_size = batch_size
        # Minimum side length of a random crop, in pixels.
        self.min_crop_size = min_crop_size
        # Text instances outside [min_text_size, max_text_size] are treated
        # as size outliers by downstream processing.
        self.min_text_size = min_text_size
        self.max_text_size = max_text_size
        # Character lexicon loaded from the dict file; its length is used as
        # the padding class id (pad_num).
        self.Lexicon_Table = self.get_dict(character_dict_path)
        self.pad_num = len(self.Lexicon_Table)
        self.img_id = 0
|
||||
|
||||
def get_dict(self, character_dict_path):
|
||||
character_str = ""
|
||||
with open(character_dict_path, "rb") as fin:
|
||||
lines = fin.readlines()
|
||||
for line in lines:
|
||||
line = line.decode('utf-8').strip("\n").strip("\r\n")
|
||||
character_str += line
|
||||
dict_character = list(character_str)
|
||||
return dict_character
|
||||
|
||||
def quad_area(self, poly):
|
||||
"""
|
||||
compute area of a polygon
|
||||
:param poly:
|
||||
:return:
|
||||
"""
|
||||
edge = [(poly[1][0] - poly[0][0]) * (poly[1][1] + poly[0][1]),
|
||||
(poly[2][0] - poly[1][0]) * (poly[2][1] + poly[1][1]),
|
||||
(poly[3][0] - poly[2][0]) * (poly[3][1] + poly[2][1]),
|
||||
(poly[0][0] - poly[3][0]) * (poly[0][1] + poly[3][1])]
|
||||
return np.sum(edge) / 2.
|
||||
|
||||
    def gen_quad_from_poly(self, poly):
        """
        Generate min area quad from poly.

        The four corners of cv2.minAreaRect are cyclically rotated so that
        vertex 0 best matches the poly's reading order (closest in summed
        distance to the poly's first point, two midpoints, and last point).
        """
        point_num = poly.shape[0]
        min_area_quad = np.zeros((4, 2), dtype=np.float32)
        rect = cv2.minAreaRect(poly.astype(
            np.int32))  # (center (x,y), (width, height), angle of rotation)
        box = np.array(cv2.boxPoints(rect))

        first_point_idx = 0
        min_dist = 1e4
        for i in range(4):
            # Cost of this cyclic rotation of the box corners against the
            # poly's anchor points.
            dist = np.linalg.norm(box[(i + 0) % 4] - poly[0]) + \
                   np.linalg.norm(box[(i + 1) % 4] - poly[point_num // 2 - 1]) + \
                   np.linalg.norm(box[(i + 2) % 4] - poly[point_num // 2]) + \
                   np.linalg.norm(box[(i + 3) % 4] - poly[-1])
            if dist < min_dist:
                min_dist = dist
                first_point_idx = i
        for i in range(4):
            min_area_quad[i] = box[(first_point_idx + i) % 4]

        return min_area_quad
|
||||
|
||||
def check_and_validate_polys(self, polys, tags, im_size):
|
||||
"""
|
||||
check so that the text poly is in the same direction,
|
||||
and also filter some invalid polygons
|
||||
:param polys:
|
||||
:param tags:
|
||||
:return:
|
||||
"""
|
||||
(h, w) = im_size
|
||||
if polys.shape[0] == 0:
|
||||
return polys, np.array([]), np.array([])
|
||||
polys[:, :, 0] = np.clip(polys[:, :, 0], 0, w - 1)
|
||||
polys[:, :, 1] = np.clip(polys[:, :, 1], 0, h - 1)
|
||||
|
||||
validated_polys = []
|
||||
validated_tags = []
|
||||
hv_tags = []
|
||||
for poly, tag in zip(polys, tags):
|
||||
quad = self.gen_quad_from_poly(poly)
|
||||
p_area = self.quad_area(quad)
|
||||
if abs(p_area) < 1:
|
||||
print('invalid poly')
|
||||
continue
|
||||
if p_area > 0:
|
||||
if tag == False:
|
||||
print('poly in wrong direction')
|
||||
tag = True # reversed cases should be ignore
|
||||
poly = poly[(0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
|
||||
1), :]
|
||||
quad = quad[(0, 3, 2, 1), :]
|
||||
|
||||
len_w = np.linalg.norm(quad[0] - quad[1]) + np.linalg.norm(quad[3] -
|
||||
quad[2])
|
||||
len_h = np.linalg.norm(quad[0] - quad[3]) + np.linalg.norm(quad[1] -
|
||||
quad[2])
|
||||
hv_tag = 1
|
||||
|
||||
if len_w * 2.0 < len_h:
|
||||
hv_tag = 0
|
||||
|
||||
validated_polys.append(poly)
|
||||
validated_tags.append(tag)
|
||||
hv_tags.append(hv_tag)
|
||||
return np.array(validated_polys), np.array(validated_tags), np.array(
|
||||
hv_tags)
|
||||
|
||||
    def crop_area(self,
                  im,
                  polys,
                  tags,
                  hv_tags,
                  txts,
                  crop_background=False,
                  max_tries=25):
        """
        make random crop from the input image
        :param im:
        :param polys: [b,4,2]
        :param tags:
        :param crop_background: if True, return a crop containing no text
        :param max_tries: 50 -> 25
        :return: cropped im, polys, tags, hv_tags, txts
                 (inputs returned unchanged when no valid crop is found)
        """
        h, w, _ = im.shape
        pad_h = h // 10
        pad_w = w // 10
        # Padded 1-D occupancy masks marking rows/cols covered by any text,
        # so crop bounds are only sampled from text-free rows/cols.
        h_array = np.zeros((h + pad_h * 2), dtype=np.int32)
        w_array = np.zeros((w + pad_w * 2), dtype=np.int32)
        for poly in polys:
            poly = np.round(poly, decimals=0).astype(np.int32)
            minx = np.min(poly[:, 0])
            maxx = np.max(poly[:, 0])
            w_array[minx + pad_w:maxx + pad_w] = 1
            miny = np.min(poly[:, 1])
            maxy = np.max(poly[:, 1])
            h_array[miny + pad_h:maxy + pad_h] = 1
        # ensure the cropped area not across a text
        h_axis = np.where(h_array == 0)[0]
        w_axis = np.where(w_array == 0)[0]
        if len(h_axis) == 0 or len(w_axis) == 0:
            # Text spans every row or column: give up, return unchanged.
            return im, polys, tags, hv_tags, txts
        for i in range(max_tries):
            # Sample two text-free columns / rows as candidate crop bounds.
            xx = np.random.choice(w_axis, size=2)
            xmin = np.min(xx) - pad_w
            xmax = np.max(xx) - pad_w
            xmin = np.clip(xmin, 0, w - 1)
            xmax = np.clip(xmax, 0, w - 1)
            yy = np.random.choice(h_axis, size=2)
            ymin = np.min(yy) - pad_h
            ymax = np.max(yy) - pad_h
            ymin = np.clip(ymin, 0, h - 1)
            ymax = np.clip(ymax, 0, h - 1)
            if xmax - xmin < self.min_crop_size or \
                    ymax - ymin < self.min_crop_size:
                continue
            if polys.shape[0] != 0:
                # A polygon is kept only if all 4 corners fall in the crop.
                poly_axis_in_area = (polys[:, :, 0] >= xmin) & (polys[:, :, 0] <= xmax) \
                                    & (polys[:, :, 1] >= ymin) & (polys[:, :, 1] <= ymax)
                selected_polys = np.where(
                    np.sum(poly_axis_in_area, axis=1) == 4)[0]
            else:
                selected_polys = []
            if len(selected_polys) == 0:
                # no text in this area
                if crop_background:
                    txts_tmp = []
                    for selected_poly in selected_polys:
                        txts_tmp.append(txts[selected_poly])
                    txts = txts_tmp
                    return im[ymin: ymax + 1, xmin: xmax + 1, :], \
                           polys[selected_polys], tags[selected_polys], hv_tags[selected_polys], txts
                else:
                    continue
            im = im[ymin:ymax + 1, xmin:xmax + 1, :]
            polys = polys[selected_polys]
            tags = tags[selected_polys]
            hv_tags = hv_tags[selected_polys]
            txts_tmp = []
            for selected_poly in selected_polys:
                txts_tmp.append(txts[selected_poly])
            txts = txts_tmp
            # Shift polygon coordinates into the crop's frame.
            polys[:, :, 0] -= xmin
            polys[:, :, 1] -= ymin
            return im, polys, tags, hv_tags, txts

        return im, polys, tags, hv_tags, txts
|
||||
|
||||
def fit_and_gather_tcl_points_v2(self,
                                 min_area_quad,
                                 poly,
                                 max_h,
                                 max_w,
                                 fixed_point_num=64,
                                 img_id=0,
                                 reference_height=3):
    """
    Find the center point of poly as key_points, then fit and gather.

    Rasterizes the polygon's center line, orders the sampled pixels along
    the quad's left-to-right direction, sub-samples to at most
    ``fixed_point_num`` points, optionally jitters them vertically, and
    pads them to ``self.tcl_len`` rows.

    Returns:
        pos_l: (self.tcl_len, 3) int32 array of (img_id, y, x) rows.
        pos_m: (self.tcl_len, 1) float32 mask, 1.0 for valid rows.
    """
    # Midpoints of opposite boundary points form the text center line.
    key_point_xys = []
    point_num = poly.shape[0]
    for idx in range(point_num // 2):
        center_point = (poly[idx] + poly[point_num - 1 - idx]) / 2.0
        key_point_xys.append(center_point)

    # Draw the center polyline on a blank canvas and collect its pixels.
    tmp_image = np.zeros(
        shape=(
            max_h,
            max_w, ), dtype='float32')
    cv2.polylines(tmp_image, [np.array(key_point_xys).astype('int32')],
                  False, 1.0)
    ys, xs = np.where(tmp_image > 0)
    xy_text = np.array(list(zip(xs, ys)), dtype='float32')

    # Project every pixel onto the quad's left->right axis and sort,
    # so points follow reading order.
    left_center_pt = (
        (min_area_quad[0] - min_area_quad[1]) / 2.0).reshape(1, 2)
    right_center_pt = (
        (min_area_quad[1] - min_area_quad[2]) / 2.0).reshape(1, 2)
    proj_unit_vec = (right_center_pt - left_center_pt) / (
        np.linalg.norm(right_center_pt - left_center_pt) + 1e-6)
    proj_unit_vec_tile = np.tile(proj_unit_vec,
                                 (xy_text.shape[0], 1))  # (n, 2)
    left_center_pt_tile = np.tile(left_center_pt,
                                  (xy_text.shape[0], 1))  # (n, 2)
    xy_text_to_left_center = xy_text - left_center_pt_tile
    proj_value = np.sum(xy_text_to_left_center * proj_unit_vec_tile, axis=1)
    xy_text = xy_text[np.argsort(proj_value)]

    # convert to np and keep the num of point not greater then fixed_point_num
    pos_info = np.array(xy_text).reshape(-1, 2)[:, ::-1]  # xy-> yx
    point_num = len(pos_info)
    if point_num > fixed_point_num:
        # Evenly-strided sub-sampling down to fixed_point_num points.
        keep_ids = [
            int((point_num * 1.0 / fixed_point_num) * x)
            for x in range(fixed_point_num)
        ]
        pos_info = pos_info[keep_ids, :]

    keep = int(min(len(pos_info), fixed_point_num))
    # Augmentation: with 20% probability jitter the y (row) coordinate by
    # up to +/-15% of reference_height; the [1, 0] mask leaves x untouched.
    if np.random.rand() < 0.2 and reference_height >= 3:
        dl = (np.random.rand(keep) - 0.5) * reference_height * 0.3
        random_float = np.array([1, 0]).reshape([1, 2]) * dl.reshape(
            [keep, 1])
        pos_info += random_float
        pos_info[:, 0] = np.clip(pos_info[:, 0], 0, max_h - 1)
        pos_info[:, 1] = np.clip(pos_info[:, 1], 0, max_w - 1)

    # padding to fixed length
    pos_l = np.zeros((self.tcl_len, 3), dtype=np.int32)
    pos_l[:, 0] = np.ones((self.tcl_len, )) * img_id
    pos_m = np.zeros((self.tcl_len, 1), dtype=np.float32)
    pos_l[:keep, 1:] = np.round(pos_info).astype(np.int32)
    pos_m[:keep] = 1.0
    return pos_l, pos_m
|
||||
|
||||
def generate_direction_map(self, poly_quads, n_char, direction_map):
    """
    Fill per-pixel direction labels (dx, dy, 1/avg_height) into direction_map.

    For each quad of the polygon, the writing direction is the (normalized)
    vector from the left edge midpoint-sum to the right edge midpoint-sum,
    scaled by the average per-character width.
    """
    width_list = []
    height_list = []
    for quad in poly_quads:
        # Width/height averaged over the two opposite edges of the quad.
        quad_w = (np.linalg.norm(quad[0] - quad[1]) +
                  np.linalg.norm(quad[2] - quad[3])) / 2.0
        quad_h = (np.linalg.norm(quad[0] - quad[3]) +
                  np.linalg.norm(quad[2] - quad[1])) / 2.0
        width_list.append(quad_w)
        height_list.append(quad_h)
    # Average width per character and average quad height, floored at 1.0.
    norm_width = max(sum(width_list) / n_char, 1.0)
    average_height = max(sum(height_list) / len(height_list), 1.0)
    k = 1  # NOTE(review): k is incremented but never read — appears vestigial.
    for quad in poly_quads:
        # Direction: (right-edge sum) - (left-edge sum), halved then normalized.
        direct_vector_full = (
            (quad[1] + quad[2]) - (quad[0] + quad[3])) / 2.0
        direct_vector = direct_vector_full / (
            np.linalg.norm(direct_vector_full) + 1e-6) * norm_width
        direction_label = tuple(
            map(float,
                [direct_vector[0], direct_vector[1], 1.0 / average_height]))
        cv2.fillPoly(direction_map,
                     quad.round().astype(np.int32)[np.newaxis, :, :],
                     direction_label)
        k += 1
    return direction_map
|
||||
|
||||
def calculate_average_height(self, poly_quads):
    """Return the mean quad height over all quads, floored at 1.0.

    Each quad's height is the average length of its two vertical edges
    (p0-p3 and p2-p1).
    """
    heights = [
        (np.linalg.norm(quad[0] - quad[3]) +
         np.linalg.norm(quad[2] - quad[1])) / 2.0
        for quad in poly_quads
    ]
    return max(sum(heights) / len(heights), 1.0)
|
||||
|
||||
def generate_tcl_ctc_label(self,
                           h,
                           w,
                           polys,
                           tags,
                           text_strs,
                           ds_ratio,
                           tcl_ratio=0.3,
                           shrink_ratio_of_width=0.15):
    """
    Generate training label maps for the PGNet e2e head.

    Builds, at the downsampled resolution (h*ds_ratio, w*ds_ratio):
    score / score-label / border (tbo) / direction maps, a training mask,
    and per-text point lists + character-index labels.

    Args:
        h, w: full-resolution label-map size.
        polys: text polygons (scaled by ds_ratio internally).
        tags: per-polygon ignore flags (True -> masked out, not labeled).
        text_strs: transcription per polygon.
        ds_ratio: downsample ratio for the label maps.
        tcl_ratio: height fraction of the text center-line region.
        shrink_ratio_of_width: horizontal shrink applied to the tcl quads.
    """
    # Full-resolution score map; rasterizing here then resizing down gives
    # smoother center lines than drawing at low resolution directly.
    score_map_big = np.zeros(
        (
            h,
            w, ), dtype=np.float32)
    h, w = int(h * ds_ratio), int(w * ds_ratio)
    polys = polys * ds_ratio

    score_map = np.zeros(
        (
            h,
            w, ), dtype=np.float32)
    score_label_map = np.zeros(
        (
            h,
            w, ), dtype=np.float32)
    tbo_map = np.zeros((h, w, 5), dtype=np.float32)
    training_mask = np.ones(
        (
            h,
            w, ), dtype=np.float32)
    # Default direction label is (0, 0, 1).
    direction_map = np.ones((h, w, 3)) * np.array([0, 0, 1]).reshape(
        [1, 1, 3]).astype(np.float32)

    label_idx = 0
    score_label_map_text_label_list = []
    pos_list, pos_mask, label_list = [], [], []
    for poly_idx, poly_tag in enumerate(zip(polys, tags)):
        poly = poly_tag[0]
        tag = poly_tag[1]

        # generate min_area_quad
        min_area_quad, center_point = self.gen_min_area_quad_from_poly(poly)
        min_area_quad_h = 0.5 * (
            np.linalg.norm(min_area_quad[0] - min_area_quad[3]) +
            np.linalg.norm(min_area_quad[1] - min_area_quad[2]))
        min_area_quad_w = 0.5 * (
            np.linalg.norm(min_area_quad[0] - min_area_quad[1]) +
            np.linalg.norm(min_area_quad[2] - min_area_quad[3]))

        # Skip text instances that are too small or too large at this scale.
        if min(min_area_quad_h, min_area_quad_w) < self.min_text_size * ds_ratio \
                or min(min_area_quad_h, min_area_quad_w) > self.max_text_size * ds_ratio:
            continue

        if tag:
            # Ignored text: down-weight its region in the training mask.
            cv2.fillPoly(training_mask,
                         poly.astype(np.int32)[np.newaxis, :, :], 0.15)
        else:
            text_label = text_strs[poly_idx]
            text_label = self.prepare_text_label(text_label,
                                                 self.Lexicon_Table)

            # Map characters to lexicon indices, dropping out-of-lexicon chars.
            text_label_index_list = [[self.Lexicon_Table.index(c_)]
                                     for c_ in text_label
                                     if c_ in self.Lexicon_Table]
            if len(text_label_index_list) < 1:
                continue

            # Center-line polygon, split into quads; also quads of the poly.
            tcl_poly = self.poly2tcl(poly, tcl_ratio)
            tcl_quads = self.poly2quads(tcl_poly)
            poly_quads = self.poly2quads(poly)

            stcl_quads, quad_index = self.shrink_poly_along_width(
                tcl_quads,
                shrink_ratio_of_width=shrink_ratio_of_width,
                expand_height_ratio=1.0 / tcl_ratio)

            cv2.fillPoly(score_map,
                         np.round(stcl_quads).astype(np.int32), 1.0)
            cv2.fillPoly(score_map_big,
                         np.round(stcl_quads / ds_ratio).astype(np.int32),
                         1.0)

            # Border-offset (tbo) map: one quad mask at a time.
            for idx, quad in enumerate(stcl_quads):
                quad_mask = np.zeros((h, w), dtype=np.float32)
                quad_mask = cv2.fillPoly(
                    quad_mask,
                    np.round(quad[np.newaxis, :, :]).astype(np.int32), 1.0)
                tbo_map = self.gen_quad_tbo(poly_quads[quad_index[idx]],
                                            quad_mask, tbo_map)

            # score label map and score_label_map_text_label_list for refine
            if label_idx == 0:
                # Index 0 is reserved for background (label = lexicon size).
                text_pos_list_ = [[len(self.Lexicon_Table)], ]
                score_label_map_text_label_list.append(text_pos_list_)

            label_idx += 1
            cv2.fillPoly(score_label_map,
                         np.round(poly_quads).astype(np.int32), label_idx)
            score_label_map_text_label_list.append(text_label_index_list)

            # direction info, fix-me
            n_char = len(text_label_index_list)
            direction_map = self.generate_direction_map(poly_quads, n_char,
                                                        direction_map)

            # pos info
            average_shrink_height = self.calculate_average_height(
                stcl_quads)
            pos_l, pos_m = self.fit_and_gather_tcl_points_v2(
                min_area_quad,
                poly,
                max_h=h,
                max_w=w,
                fixed_point_num=64,
                img_id=self.img_id,
                reference_height=average_shrink_height)

            label_l = text_label_index_list
            # Texts shorter than 2 kept chars get maps but no CTC entry.
            if len(text_label_index_list) < 2:
                continue

            pos_list.append(pos_l)
            pos_mask.append(pos_m)
            label_list.append(label_l)

    # use big score_map for smooth tcl lines
    score_map_big_resized = cv2.resize(
        score_map_big, dsize=None, fx=ds_ratio, fy=ds_ratio)
    score_map = np.array(score_map_big_resized > 1e-3, dtype='float32')

    return score_map, score_label_map, tbo_map, direction_map, training_mask, \
        pos_list, pos_mask, label_list, score_label_map_text_label_list
|
||||
|
||||
def adjust_point(self, poly):
    """Rotate a polygon's point order so the long text direction comes first.

    For a 4-point quad: if the vertical edges are clearly (1.5x) longer
    than the horizontal ones, roll the points by one position.
    For longer polygons: if the first corner turns by more than 70 degrees,
    roll the points by one position.
    """
    n = poly.shape[0]
    if n == 4:
        top = np.linalg.norm(poly[0] - poly[1])
        right = np.linalg.norm(poly[1] - poly[2])
        bottom = np.linalg.norm(poly[2] - poly[3])
        left = np.linalg.norm(poly[3] - poly[0])
        if (top + bottom) * 1.5 < (right + left):
            poly = poly[[1, 2, 3, 0], :]
    elif n > 4:
        edge_a = poly[0] - poly[1]
        edge_b = poly[1] - poly[2]
        denom = np.linalg.norm(edge_a) * np.linalg.norm(edge_b) + 1e-6
        cos_theta = np.dot(edge_a, edge_b) / denom
        turn = np.arccos(np.round(cos_theta, decimals=4))
        if abs(turn) > (70 / 180 * math.pi):
            order = list(range(1, n)) + [0]
            poly = poly[np.array(order), :]
    return poly
|
||||
|
||||
def gen_min_area_quad_from_poly(self, poly):
    """
    Generate min area quad from poly.

    For a 4-point poly the quad is the poly itself.  Otherwise the minimum
    area rotated rectangle is computed and its corners are cyclically
    aligned to the poly's own corner order (first / middle / last points).

    Returns:
        (min_area_quad, center_point): a (4, 2) float32 quad and the
        rect/poly center.
    """
    point_num = poly.shape[0]
    min_area_quad = np.zeros((4, 2), dtype=np.float32)
    if point_num == 4:
        min_area_quad = poly
        center_point = np.sum(poly, axis=0) / 4
    else:
        rect = cv2.minAreaRect(poly.astype(
            np.int32))  # (center (x,y), (width, height), angle of rotation)
        center_point = rect[0]
        box = np.array(cv2.boxPoints(rect))

        # Find the rotation of the box corners that best matches the
        # poly's first/mid/last corners (minimum total distance).
        first_point_idx = 0
        min_dist = 1e4
        for i in range(4):
            dist = np.linalg.norm(box[(i + 0) % 4] - poly[0]) + \
                   np.linalg.norm(box[(i + 1) % 4] - poly[point_num // 2 - 1]) + \
                   np.linalg.norm(box[(i + 2) % 4] - poly[point_num // 2]) + \
                   np.linalg.norm(box[(i + 3) % 4] - poly[-1])
            if dist < min_dist:
                min_dist = dist
                first_point_idx = i

        for i in range(4):
            min_area_quad[i] = box[(first_point_idx + i) % 4]

    return min_area_quad, center_point
|
||||
|
||||
def shrink_quad_along_width(self,
                            quad,
                            begin_width_ratio=0.,
                            end_width_ratio=1.):
    """Cut a horizontal slice out of a quad.

    Interpolates along the top edge (p0->p1) and bottom edge (p3->p2) at
    the two given width ratios and returns the resulting sub-quad in
    clockwise order.
    """
    ratios = np.array(
        [[begin_width_ratio], [end_width_ratio]], dtype=np.float32)
    top = quad[0] + (quad[1] - quad[0]) * ratios
    bottom = quad[3] + (quad[2] - quad[3]) * ratios
    return np.array([top[0], top[1], bottom[1], bottom[0]])
|
||||
|
||||
def shrink_poly_along_width(self,
                            quads,
                            shrink_ratio_of_width,
                            expand_height_ratio=1.0):
    """
    shrink poly with given length.

    Shrinks a quad chain horizontally from both ends by
    ``shrink_ratio_of_width`` of the limiting dimension (side heights,
    expanded by ``expand_height_ratio``, or total top-edge length).

    Returns:
        (quads, indices): the shrunk quad chain and the indices of the
        surviving quads in the original chain.
    """
    upper_edge_list = []

    def get_cut_info(edge_len_list, cut_len):
        # Walk the edges until cut_len is consumed; return the edge index
        # and the fractional position within that edge.
        # NOTE(review): returns None implicitly if cut_len exceeds the total
        # edge length — callers appear to rely on shrink_length being bounded
        # by sum(upper_edge_list); confirm before reusing elsewhere.
        for idx, edge_len in enumerate(edge_len_list):
            cut_len -= edge_len
            if cut_len <= 0.000001:
                ratio = (cut_len + edge_len_list[idx]) / edge_len_list[idx]
                return idx, ratio

    for quad in quads:
        upper_edge_len = np.linalg.norm(quad[0] - quad[1])
        upper_edge_list.append(upper_edge_len)

    # length of left edge and right edge.
    left_length = np.linalg.norm(quads[0][0] - quads[0][
        3]) * expand_height_ratio
    right_length = np.linalg.norm(quads[-1][1] - quads[-1][
        2]) * expand_height_ratio

    shrink_length = min(left_length, right_length,
                        sum(upper_edge_list)) * shrink_ratio_of_width
    # shrinking length
    upper_len_left = shrink_length
    upper_len_right = sum(upper_edge_list) - shrink_length

    # Locate the cut positions and slice the boundary quads accordingly.
    left_idx, left_ratio = get_cut_info(upper_edge_list, upper_len_left)
    left_quad = self.shrink_quad_along_width(
        quads[left_idx], begin_width_ratio=left_ratio, end_width_ratio=1)
    right_idx, right_ratio = get_cut_info(upper_edge_list, upper_len_right)
    right_quad = self.shrink_quad_along_width(
        quads[right_idx], begin_width_ratio=0, end_width_ratio=right_ratio)

    out_quad_list = []
    if left_idx == right_idx:
        # Both cuts fall in the same quad: merge into a single quad.
        out_quad_list.append(
            [left_quad[0], right_quad[1], right_quad[2], left_quad[3]])
    else:
        out_quad_list.append(left_quad)
        for idx in range(left_idx + 1, right_idx):
            out_quad_list.append(quads[idx])
        out_quad_list.append(right_quad)

    return np.array(out_quad_list), list(range(left_idx, right_idx + 1))
|
||||
|
||||
def prepare_text_label(self, label_str, Lexicon_Table):
    """Normalize a transcription for the given lexicon.

    A 36-entry lexicon is the case-insensitive alphanumeric one, so the
    label is lower-cased; any other lexicon leaves the label untouched.
    """
    return label_str.lower() if len(Lexicon_Table) == 36 else label_str
|
||||
|
||||
def vector_angle(self, A, B):
    """Angle of the vector from A to B relative to the x-axis.

    Note the arctan2(dy, dx) argument order: the y-difference comes first.
    """
    dy = B[1] - A[1]
    dx = B[0] - A[0]
    return np.arctan2(dy, dx)
|
||||
|
||||
def theta_line_cross_point(self, theta, point):
    """Line through `point` with direction angle `theta`, as [a, b, c]
    such that a*x + b*y + c = 0.
    """
    x, y = point
    c, s = np.cos(theta), np.sin(theta)
    return [s, -c, c * y - s * x]
|
||||
|
||||
def line_cross_two_point(self, A, B):
    """Line through points A and B, as [a, b, c] with a*x + b*y + c = 0.

    Delegates to the angle-based construction so both code paths agree.
    """
    return self.theta_line_cross_point(self.vector_angle(A, B), A)
|
||||
|
||||
def average_angle(self, poly):
    """Mean of the left-edge (p3->p0) and right-edge (p2->p1) angles of
    the given quad.
    """
    p0, p1, p2, p3 = poly
    left = self.vector_angle(p3, p0)
    right = self.vector_angle(p2, p1)
    return (left + right) / 2
|
||||
|
||||
def line_cross_point(self, line1, line2):
    """Intersection of two lines given in a*x + b*y + c = 0 form.

    Returns (0, 0) — and prints a warning — when the lines are parallel
    (zero determinant).
    """
    a1, b1, c1 = line1
    a2, b2, c2 = line2
    det = a1 * b2 - a2 * b1
    if det == 0:
        print('Cross point does not exist')
        return np.array([0, 0], dtype=np.float32)
    x = (b1 * c2 - b2 * c1) / det
    y = (a2 * c1 - a1 * c2) / det
    return np.array([x, y], dtype=np.float32)
|
||||
|
||||
def quad2tcl(self, poly, ratio):
    """Center-line quad of a clockwise 4-point quad.

    Interpolates along the left edge (p0->p3) and right edge (p1->p2) at
    0.5 +/- ratio/2, producing a thin quad centered on the text line.
    """
    band = np.array(
        [[0.5 - ratio / 2], [0.5 + ratio / 2]], dtype=np.float32)
    left = poly[0] + (poly[3] - poly[0]) * band
    right = poly[1] + (poly[2] - poly[1]) * band
    return np.array([left[0], right[0], right[1], left[1]])
|
||||
|
||||
def poly2tcl(self, poly, ratio):
    """Center-line polygon of a clockwise polygon.

    For each pair of opposite points (idx, n-1-idx), interpolate at
    0.5 +/- ratio/2 between them; the results keep the original clockwise
    ordering of the polygon.
    """
    band = np.array(
        [[0.5 - ratio / 2], [0.5 + ratio / 2]], dtype=np.float32)
    tcl_poly = np.zeros_like(poly)
    n = poly.shape[0]
    for idx in range(n // 2):
        top, bottom = poly[idx], poly[n - 1 - idx]
        pair = top + (bottom - top) * band
        tcl_poly[idx] = pair[0]
        tcl_poly[n - 1 - idx] = pair[1]
    return tcl_poly
|
||||
|
||||
def gen_quad_tbo(self, quad, tcl_mask, tbo_map):
    """
    Generate tbo_map for give quad.

    For every pixel inside ``tcl_mask``, stores the (y, x) offsets to the
    quad's upper and lower boundary along the quad's average side-angle,
    plus a size-normalization term, into the 5 channels of ``tbo_map``.
    """
    # upper and lower line function: ax + by + c = 0;
    up_line = self.line_cross_two_point(quad[0], quad[1])
    lower_line = self.line_cross_two_point(quad[3], quad[2])

    quad_h = 0.5 * (np.linalg.norm(quad[0] - quad[3]) +
                    np.linalg.norm(quad[1] - quad[2]))
    quad_w = 0.5 * (np.linalg.norm(quad[0] - quad[1]) +
                    np.linalg.norm(quad[2] - quad[3]))

    # average angle of left and right line.
    angle = self.average_angle(quad)

    xy_in_poly = np.argwhere(tcl_mask == 1)
    for y, x in xy_in_poly:
        point = (x, y)
        # Line through this pixel parallel to the quad's side direction;
        # its intersections with the top/bottom edges give the offsets.
        line = self.theta_line_cross_point(angle, point)
        cross_point_upper = self.line_cross_point(up_line, line)
        cross_point_lower = self.line_cross_point(lower_line, line)
        ##FIX, offset reverse
        upper_offset_x, upper_offset_y = cross_point_upper - point
        lower_offset_x, lower_offset_y = cross_point_lower - point
        tbo_map[y, x, 0] = upper_offset_y
        tbo_map[y, x, 1] = upper_offset_x
        tbo_map[y, x, 2] = lower_offset_y
        tbo_map[y, x, 3] = lower_offset_x
        # Channel 4: inverse of the quad's short side (floored at 1), x2.
        tbo_map[y, x, 4] = 1.0 / max(min(quad_h, quad_w), 1.0) * 2
    return tbo_map
|
||||
|
||||
def poly2quads(self, poly):
    """Split a clockwise 2n-point polygon into (n-1) clockwise quads.

    Opposite points (idx, 2n-1-idx) form vertical point pairs; each pair
    of adjacent point-pairs yields one quad.
    """
    n = poly.shape[0]

    # Pair each top point with its opposite bottom point.
    pairs = [[poly[idx], poly[n - 1 - idx]] for idx in range(n // 2)]

    quads = []
    for idx in range(n // 2 - 1):
        # Two adjacent pairs -> 4 points; reorder to clockwise.
        four = np.array(pairs)[[idx, idx + 1]].reshape(4, 2)
        quads.append(four[[0, 2, 3, 1]])

    return np.array(quads)
|
||||
|
||||
def rotate_im_poly(self, im, text_polys):
    """
    rotate image with 90 / 180 / 270 degre

    Rotates the image by 90 degrees once (50%) or three times (50%) using
    np.rot90, then maps every polygon vertex with the matching rotation
    about the image center.

    Returns:
        (rotated_image, rotated_polys) with polys as float32 (n, 4, 2).
    """
    im_w, im_h = im.shape[1], im.shape[0]
    dst_im = im.copy()
    dst_polys = []
    rand_degree_ratio = np.random.rand()
    rand_degree_cnt = 1
    if rand_degree_ratio > 0.5:
        rand_degree_cnt = 3
    for i in range(rand_degree_cnt):
        dst_im = np.rot90(dst_im)
    # np.rot90 is counter-clockwise; the vertex transform uses the
    # corresponding negative angle.
    rot_degree = -90 * rand_degree_cnt
    rot_angle = rot_degree * math.pi / 180.0
    n_poly = text_polys.shape[0]
    # Old and new image centers (the canvas swaps w/h on odd rotations).
    cx, cy = 0.5 * im_w, 0.5 * im_h
    ncx, ncy = 0.5 * dst_im.shape[1], 0.5 * dst_im.shape[0]
    for i in range(n_poly):
        wordBB = text_polys[i]
        poly = []
        for j in range(4):  # 16->4
            sx, sy = wordBB[j][0], wordBB[j][1]
            # Standard 2D rotation about (cx, cy), re-centered at (ncx, ncy).
            dx = math.cos(rot_angle) * (sx - cx) - math.sin(rot_angle) * (
                sy - cy) + ncx
            dy = math.sin(rot_angle) * (sx - cx) + math.cos(rot_angle) * (
                sy - cy) + ncy
            poly.append([dx, dy])
        dst_polys.append(poly)
    return dst_im, np.array(dst_polys, dtype=np.float32)
|
||||
|
||||
def __call__(self, data):
    """
    Full PGNet training-sample pipeline.

    Validates polygons, applies random aspect/scale/crop/photometric
    augmentation, pads to a 512x512 canvas, generates all label maps via
    generate_tcl_ctc_label, pads the per-text lists to fixed sizes, and
    normalizes the image.  Returns None whenever the sample should be
    skipped (no valid text, too small, all-ignored, etc.).
    """
    input_size = 512
    im = data['image']
    text_polys = data['polys']
    text_tags = data['ignore_tags']
    text_strs = data['texts']
    h, w, _ = im.shape
    text_polys, text_tags, hv_tags = self.check_and_validate_polys(
        text_polys, text_tags, (h, w))
    if text_polys.shape[0] <= 0:
        return None
    # set aspect ratio and keep area fix
    asp_scales = np.arange(1.0, 1.55, 0.1)
    asp_scale = np.random.choice(asp_scales)
    if np.random.rand() < 0.5:
        asp_scale = 1.0 / asp_scale
    # sqrt so that fx * fy keeps the image area roughly constant.
    asp_scale = math.sqrt(asp_scale)

    asp_wx = asp_scale
    asp_hy = 1.0 / asp_scale
    im = cv2.resize(im, dsize=None, fx=asp_wx, fy=asp_hy)
    text_polys[:, :, 0] *= asp_wx
    text_polys[:, :, 1] *= asp_hy

    # Cap the longest side at 2048 px.
    h, w, _ = im.shape
    if max(h, w) > 2048:
        rd_scale = 2048.0 / max(h, w)
        im = cv2.resize(im, dsize=None, fx=rd_scale, fy=rd_scale)
        text_polys *= rd_scale
        h, w, _ = im.shape
    if min(h, w) < 16:
        return None

    # no background
    im, text_polys, text_tags, hv_tags, text_strs = self.crop_area(
        im,
        text_polys,
        text_tags,
        hv_tags,
        text_strs,
        crop_background=False)

    if text_polys.shape[0] == 0:
        return None
    # # continue for all ignore case
    if np.sum((text_tags * 1.0)) >= text_tags.size:
        return None
    new_h, new_w, _ = im.shape
    if (new_h is None) or (new_w is None):
        return None
    # resize image
    std_ratio = float(input_size) / max(new_w, new_h)
    # Random extra downscale, biased toward keeping the full size.
    rand_scales = np.array(
        [0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0, 1.0, 1.0, 1.0, 1.0])
    rz_scale = std_ratio * np.random.choice(rand_scales)
    im = cv2.resize(im, dsize=None, fx=rz_scale, fy=rz_scale)
    text_polys[:, :, 0] *= rz_scale
    text_polys[:, :, 1] *= rz_scale

    # add gaussian blur
    if np.random.rand() < 0.1 * 0.5:
        ks = np.random.permutation(5)[0] + 1
        ks = int(ks / 2) * 2 + 1  # force an odd kernel size
        im = cv2.GaussianBlur(im, ksize=(ks, ks), sigmaX=0, sigmaY=0)
    # add brighter
    if np.random.rand() < 0.1 * 0.5:
        im = im * (1.0 + np.random.rand() * 0.5)
        im = np.clip(im, 0.0, 255.0)
    # add darker
    if np.random.rand() < 0.1 * 0.5:
        im = im * (1.0 - np.random.rand() * 0.5)
        im = np.clip(im, 0.0, 255.0)

    # Padding the im to [input_size, input_size]
    new_h, new_w, _ = im.shape
    if min(new_w, new_h) < input_size * 0.5:
        return None
    # Canvas pre-filled with the ImageNet channel means (BGR layout here).
    im_padded = np.ones((input_size, input_size, 3), dtype=np.float32)
    im_padded[:, :, 2] = 0.485 * 255
    im_padded[:, :, 1] = 0.456 * 255
    im_padded[:, :, 0] = 0.406 * 255

    # Random the start position
    del_h = input_size - new_h
    del_w = input_size - new_w
    sh, sw = 0, 0
    if del_h > 1:
        sh = int(np.random.rand() * del_h)
    if del_w > 1:
        sw = int(np.random.rand() * del_w)

    # Padding
    im_padded[sh:sh + new_h, sw:sw + new_w, :] = im.copy()
    text_polys[:, :, 0] += sw
    text_polys[:, :, 1] += sh

    score_map, score_label_map, border_map, direction_map, training_mask, \
        pos_list, pos_mask, label_list, score_label_map_text_label = self.generate_tcl_ctc_label(input_size,
                                                                                                 input_size,
                                                                                                 text_polys,
                                                                                                 text_tags,
                                                                                                 text_strs, 0.25)
    if len(label_list) <= 0:  # eliminate negative samples
        return None
    pos_list_temp = np.zeros([64, 3])
    pos_mask_temp = np.zeros([64, 1])
    label_list_temp = np.zeros([self.max_text_length, 1]) + self.pad_num

    # Pad (or truncate) every text label to max_text_length.
    for i, label in enumerate(label_list):
        n = len(label)
        if n > self.max_text_length:
            label_list[i] = label[:self.max_text_length]
            continue
        while n < self.max_text_length:
            label.append([self.pad_num])
            n += 1

    for i in range(len(label_list)):
        label_list[i] = np.array(label_list[i])

    if len(pos_list) <= 0 or len(pos_list) > self.max_text_nums:
        return None
    # Pad the per-text lists to exactly max_text_nums entries.
    for __ in range(self.max_text_nums - len(pos_list), 0, -1):
        pos_list.append(pos_list_temp)
        pos_mask.append(pos_mask_temp)
        label_list.append(label_list_temp)

    # Cycle the per-batch image id.
    if self.img_id == self.batch_size - 1:
        self.img_id = 0
    else:
        self.img_id += 1

    # ImageNet mean/std normalization, then HWC -> CHW and channel flip.
    im_padded[:, :, 2] -= 0.485 * 255
    im_padded[:, :, 1] -= 0.456 * 255
    im_padded[:, :, 0] -= 0.406 * 255
    im_padded[:, :, 2] /= (255.0 * 0.229)
    im_padded[:, :, 1] /= (255.0 * 0.224)
    im_padded[:, :, 0] /= (255.0 * 0.225)
    im_padded = im_padded.transpose((2, 0, 1))
    images = im_padded[::-1, :, :]
    tcl_maps = score_map[np.newaxis, :, :]
    tcl_label_maps = score_label_map[np.newaxis, :, :]
    border_maps = border_map.transpose((2, 0, 1))
    direction_maps = direction_map.transpose((2, 0, 1))
    training_masks = training_mask[np.newaxis, :, :]
    pos_list = np.array(pos_list)
    pos_mask = np.array(pos_mask)
    label_list = np.array(label_list)
    data['images'] = images
    data['tcl_maps'] = tcl_maps
    data['tcl_label_maps'] = tcl_label_maps
    data['border_maps'] = border_maps
    data['direction_maps'] = direction_maps
    data['training_masks'] = training_masks
    data['label_list'] = label_list
    data['pos_list'] = pos_list
    data['pos_mask'] = pos_mask
    return data
|
||||
143
backend/ppocr/data/imaug/randaugment.py
Normal file
143
backend/ppocr/data/imaug/randaugment.py
Normal file
@@ -0,0 +1,143 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from PIL import Image, ImageEnhance, ImageOps
|
||||
import numpy as np
|
||||
import random
|
||||
import six
|
||||
|
||||
|
||||
class RawRandAugment(object):
    """RandAugment on PIL images: applies `num_layers` randomly chosen ops,
    each at a strength derived from `magnitude` (0..max_level).
    """

    def __init__(self,
                 num_layers=2,
                 magnitude=5,
                 fillcolor=(128, 128, 128),
                 **kwargs):
        self.num_layers = num_layers
        self.magnitude = magnitude
        self.max_level = 10

        # Per-op strength, scaled linearly by magnitude / max_level.
        abso_level = self.magnitude / self.max_level
        self.level_map = {
            "shearX": 0.3 * abso_level,
            "shearY": 0.3 * abso_level,
            "translateX": 150.0 / 331 * abso_level,
            "translateY": 150.0 / 331 * abso_level,
            "rotate": 30 * abso_level,
            "color": 0.9 * abso_level,
            "posterize": int(4.0 * abso_level),
            "solarize": 256.0 * abso_level,
            "contrast": 0.9 * abso_level,
            "sharpness": 0.9 * abso_level,
            "brightness": 0.9 * abso_level,
            "autocontrast": 0,
            "equalize": 0,
            "invert": 0
        }

        # from https://stackoverflow.com/questions/5252170/
        # specify-image-filling-color-when-rotating-in-python-with-pil-and-setting-expand
        def rotate_with_fill(img, magnitude):
            # Rotate with a gray fill instead of black corners.
            rot = img.convert("RGBA").rotate(magnitude)
            return Image.composite(rot,
                                   Image.new("RGBA", rot.size, (128, ) * 4),
                                   rot).convert(img.mode)

        rnd_ch_op = random.choice

        # Dispatch table: op name -> callable(img, level) -> img.
        # Affine ops randomize the sign of the displacement each call.
        self.func = {
            "shearX": lambda img, magnitude: img.transform(
                img.size,
                Image.AFFINE,
                (1, magnitude * rnd_ch_op([-1, 1]), 0, 0, 1, 0),
                Image.BICUBIC,
                fillcolor=fillcolor),
            "shearY": lambda img, magnitude: img.transform(
                img.size,
                Image.AFFINE,
                (1, 0, 0, magnitude * rnd_ch_op([-1, 1]), 1, 0),
                Image.BICUBIC,
                fillcolor=fillcolor),
            "translateX": lambda img, magnitude: img.transform(
                img.size,
                Image.AFFINE,
                (1, 0, magnitude * img.size[0] * rnd_ch_op([-1, 1]), 0, 1, 0),
                fillcolor=fillcolor),
            "translateY": lambda img, magnitude: img.transform(
                img.size,
                Image.AFFINE,
                (1, 0, 0, 0, 1, magnitude * img.size[1] * rnd_ch_op([-1, 1])),
                fillcolor=fillcolor),
            "rotate": lambda img, magnitude: rotate_with_fill(img, magnitude),
            "color": lambda img, magnitude: ImageEnhance.Color(img).enhance(
                1 + magnitude * rnd_ch_op([-1, 1])),
            "posterize": lambda img, magnitude:
            ImageOps.posterize(img, magnitude),
            "solarize": lambda img, magnitude:
            ImageOps.solarize(img, magnitude),
            "contrast": lambda img, magnitude:
            ImageEnhance.Contrast(img).enhance(
                1 + magnitude * rnd_ch_op([-1, 1])),
            "sharpness": lambda img, magnitude:
            ImageEnhance.Sharpness(img).enhance(
                1 + magnitude * rnd_ch_op([-1, 1])),
            "brightness": lambda img, magnitude:
            ImageEnhance.Brightness(img).enhance(
                1 + magnitude * rnd_ch_op([-1, 1])),
            "autocontrast": lambda img, magnitude:
            ImageOps.autocontrast(img),
            "equalize": lambda img, magnitude: ImageOps.equalize(img),
            "invert": lambda img, magnitude: ImageOps.invert(img)
        }

    def __call__(self, img):
        """Apply `num_layers` uniformly random ops to a PIL image."""
        avaiable_op_names = list(self.level_map.keys())
        for layer_num in range(self.num_layers):
            op_name = np.random.choice(avaiable_op_names)
            img = self.func[op_name](img, self.level_map[op_name])
        return img
|
||||
|
||||
|
||||
class RandAugment(RawRandAugment):
    """ RandAugment wrapper to auto fit different img types """

    def __init__(self, prob=0.5, *args, **kwargs):
        # Probability of applying the augmentation at all.
        self.prob = prob
        if six.PY2:
            super(RandAugment, self).__init__(*args, **kwargs)
        else:
            super().__init__(*args, **kwargs)

    def __call__(self, data):
        # Skip augmentation with probability (1 - prob).
        if np.random.rand() > self.prob:
            return data

        img = data['image']
        # Accept numpy arrays by converting them to PIL first.
        if not isinstance(img, Image.Image):
            img = Image.fromarray(np.ascontiguousarray(img))

        if six.PY2:
            img = super(RandAugment, self).__call__(img)
        else:
            img = super().__call__(img)

        # Hand back a numpy array regardless of what the base class returned.
        if isinstance(img, Image.Image):
            img = np.asarray(img)
        data['image'] = img
        return data
|
||||
234
backend/ppocr/data/imaug/random_crop_data.py
Normal file
234
backend/ppocr/data/imaug/random_crop_data.py
Normal file
@@ -0,0 +1,234 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
This code is refer from:
|
||||
https://github.com/WenmuZhou/DBNet.pytorch/blob/master/data_loader/modules/random_crop_data.py
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import numpy as np
|
||||
import cv2
|
||||
import random
|
||||
|
||||
|
||||
def is_poly_in_rect(poly, x, y, w, h):
    """True iff every vertex of `poly` lies inside the rect
    [x, x+w] x [y, y+h] (boundary inclusive).
    """
    pts = np.array(poly)
    xs, ys = pts[:, 0], pts[:, 1]
    return bool(xs.min() >= x and xs.max() <= x + w and
                ys.min() >= y and ys.max() <= y + h)
|
||||
|
||||
|
||||
def is_poly_outside_rect(poly, x, y, w, h):
    """True iff `poly` lies entirely outside the rect [x, x+w] x [y, y+h]
    along at least one axis (no overlap at all on that axis).
    """
    pts = np.array(poly)
    xs, ys = pts[:, 0], pts[:, 1]
    return bool(xs.max() < x or xs.min() > x + w or
                ys.max() < y or ys.min() > y + h)
|
||||
|
||||
|
||||
def split_regions(axis):
    """Split a sorted index array into runs of consecutive values.

    Note: the trailing run is deliberately NOT appended, matching the
    upstream DBNet implementation (callers only use the result when more
    than one region exists).
    """
    regions = []
    start = 0
    for i in range(1, axis.shape[0]):
        if axis[i] != axis[i - 1] + 1:
            regions.append(axis[start:i])
            start = i
    return regions
|
||||
|
||||
|
||||
def random_select(axis, max_size):
    """Pick two random coordinates from `axis`; return them ordered and
    clipped to [0, max_size - 1].
    """
    picks = np.random.choice(axis, size=2)
    lo, hi = np.min(picks), np.max(picks)
    lo = np.clip(lo, 0, max_size - 1)
    hi = np.clip(hi, 0, max_size - 1)
    return lo, hi
|
||||
|
||||
|
||||
def region_wise_random_select(regions, max_size):
    """Pick one coordinate from each of two (possibly identical) randomly
    chosen regions and return them ordered as (min, max).

    `max_size` is unused; it is kept for signature symmetry with
    `random_select`.
    """
    chosen = []
    for region_idx in list(np.random.choice(len(regions), 2)):
        region = regions[region_idx]
        chosen.append(int(np.random.choice(region, size=1)))
    return min(chosen), max(chosen)
|
||||
|
||||
|
||||
def crop_area(im, text_polys, min_crop_side_ratio, max_tries):
    """Randomly choose a crop window of `im` that does not cut through text.

    Rows/columns covered by any polygon's bounding box are marked as
    occupied; crop edges are then sampled only from the free rows/columns,
    so the window never slices a text region. Up to `max_tries` samples are
    drawn; a sample is accepted if both sides are at least
    `min_crop_side_ratio` of the full size and at least one polygon
    intersects the window. Falls back to the full image (0, 0, w, h).

    Returns (crop_x, crop_y, crop_w, crop_h).
    """
    h, w, _ = im.shape
    h_array = np.zeros(h, dtype=np.int32)
    w_array = np.zeros(w, dtype=np.int32)
    # mark rows/columns spanned by any text polygon as occupied (1)
    for points in text_polys:
        points = np.round(points, decimals=0).astype(np.int32)
        minx = np.min(points[:, 0])
        maxx = np.max(points[:, 0])
        w_array[minx:maxx] = 1
        miny = np.min(points[:, 1])
        maxy = np.max(points[:, 1])
        h_array[miny:maxy] = 1
    # ensure the cropped area not across a text
    h_axis = np.where(h_array == 0)[0]
    w_axis = np.where(w_array == 0)[0]

    # no free row or column at all -> cannot crop, return the full image
    if len(h_axis) == 0 or len(w_axis) == 0:
        return 0, 0, w, h

    h_regions = split_regions(h_axis)
    w_regions = split_regions(w_axis)

    for i in range(max_tries):
        # with multiple free runs, sample each edge from a random run;
        # otherwise sample directly from the free coordinates
        if len(w_regions) > 1:
            xmin, xmax = region_wise_random_select(w_regions, w)
        else:
            xmin, xmax = random_select(w_axis, w)
        if len(h_regions) > 1:
            ymin, ymax = region_wise_random_select(h_regions, h)
        else:
            ymin, ymax = random_select(h_axis, h)

        if xmax - xmin < min_crop_side_ratio * w or ymax - ymin < min_crop_side_ratio * h:
            # area too small
            continue
        num_poly_in_rect = 0
        # accept as soon as one polygon intersects the candidate window
        for poly in text_polys:
            if not is_poly_outside_rect(poly, xmin, ymin, xmax - xmin,
                                        ymax - ymin):
                num_poly_in_rect += 1
                break

        if num_poly_in_rect > 0:
            return xmin, ymin, xmax - xmin, ymax - ymin

    # all tries failed -> keep the full image
    return 0, 0, w, h
|
||||
|
||||
|
||||
class EastRandomCropData(object):
    """Random-crop transform for detection training (EAST/DB style).

    Crops a text-safe region via `crop_area`, resizes it to `size`
    (optionally padding to preserve aspect ratio), and remaps/filters the
    text polygons, labels and ignore tags accordingly.
    """

    def __init__(self,
                 size=(640, 640),
                 max_tries=10,
                 min_crop_side_ratio=0.1,
                 keep_ratio=True,
                 **kwargs):
        self.size = size  # output (width, height)
        self.max_tries = max_tries
        self.min_crop_side_ratio = min_crop_side_ratio
        self.keep_ratio = keep_ratio  # pad instead of stretching

    def __call__(self, data):
        img = data['image']
        text_polys = data['polys']
        ignore_tags = data['ignore_tags']
        texts = data['texts']
        # only non-ignored polygons constrain the crop
        all_care_polys = [
            text_polys[i] for i, tag in enumerate(ignore_tags) if not tag
        ]
        # compute the crop region
        crop_x, crop_y, crop_w, crop_h = crop_area(
            img, all_care_polys, self.min_crop_side_ratio, self.max_tries)
        # crop the image, keeping aspect ratio via padding if requested
        scale_w = self.size[0] / crop_w
        scale_h = self.size[1] / crop_h
        scale = min(scale_w, scale_h)
        h = int(crop_h * scale)
        w = int(crop_w * scale)
        if self.keep_ratio:
            # paste the scaled crop into the top-left of a zero canvas
            padimg = np.zeros((self.size[1], self.size[0], img.shape[2]),
                              img.dtype)
            padimg[:h, :w] = cv2.resize(
                img[crop_y:crop_y + crop_h, crop_x:crop_x + crop_w], (w, h))
            img = padimg
        else:
            img = cv2.resize(
                img[crop_y:crop_y + crop_h, crop_x:crop_x + crop_w],
                tuple(self.size))
        # remap the text boxes into crop coordinates and drop the ones
        # that fall completely outside the crop
        text_polys_crop = []
        ignore_tags_crop = []
        texts_crop = []
        for poly, text, tag in zip(text_polys, texts, ignore_tags):
            poly = ((poly - (crop_x, crop_y)) * scale).tolist()
            if not is_poly_outside_rect(poly, 0, 0, w, h):
                text_polys_crop.append(poly)
                ignore_tags_crop.append(tag)
                texts_crop.append(text)
        data['image'] = img
        data['polys'] = np.array(text_polys_crop)
        data['ignore_tags'] = ignore_tags_crop
        data['texts'] = texts_crop
        return data
|
||||
|
||||
|
||||
class RandomCropImgMask(object):
    """Randomly crop a fixed-size window out of the image and its masks.

    With probability (1 - p), if the main mask contains positives, the crop
    window is biased so that it covers part of the text region; otherwise a
    uniformly random window is taken. Every array listed in `crop_keys` is
    cropped with the same (i, j) offset.
    """

    def __init__(self, size, main_key, crop_keys, p=3 / 8, **kwargs):
        self.size = size          # target (th, tw)
        self.main_key = main_key  # key of the mask that guides the crop
        self.crop_keys = crop_keys  # which entries of `data` to crop
        self.p = p                # probability of a purely random crop

    def __call__(self, data):
        image = data['image']

        h, w = image.shape[0:2]
        th, tw = self.size
        if w == tw and h == th:
            return data

        mask = data[self.main_key]
        if np.max(mask) > 0 and random.random() > self.p:
            # make sure to crop the text region
            # NOTE(review): `br` subtracts (th, tw) from the mask's max
            # coordinate, mirroring upstream PaddleOCR; looks intentional as
            # a "window fits" bound but worth confirming against upstream.
            tl = np.min(np.where(mask > 0), axis=1) - (th, tw)
            tl[tl < 0] = 0
            br = np.max(np.where(mask > 0), axis=1) - (th, tw)
            br[br < 0] = 0

            # clamp so the window stays inside the image
            br[0] = min(br[0], h - th)
            br[1] = min(br[1], w - tw)

            i = random.randint(tl[0], br[0]) if tl[0] < br[0] else 0
            j = random.randint(tl[1], br[1]) if tl[1] < br[1] else 0
        else:
            i = random.randint(0, h - th) if h - th > 0 else 0
            j = random.randint(0, w - tw) if w - tw > 0 else 0

        for k in data:
            if k in self.crop_keys:
                if len(data[k].shape) == 3:
                    # channel-first vs channel-last is inferred from the
                    # smallest axis; ambiguous shapes are left untouched
                    if np.argmin(data[k].shape) == 0:
                        img = data[k][:, i:i + th, j:j + tw]
                    elif np.argmin(data[k].shape) == 2:
                        img = data[k][i:i + th, j:j + tw, :]
                    else:
                        img = data[k]
                else:
                    img = data[k][i:i + th, j:j + tw]
                data[k] = img
        return data
|
||||
601
backend/ppocr/data/imaug/rec_img_aug.py
Normal file
601
backend/ppocr/data/imaug/rec_img_aug.py
Normal file
@@ -0,0 +1,601 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
import cv2
|
||||
import numpy as np
|
||||
import random
|
||||
import copy
|
||||
from PIL import Image
|
||||
from .text_image_aug import tia_perspective, tia_stretch, tia_distort
|
||||
|
||||
|
||||
class RecAug(object):
    """Classic recognition-time augmentation: runs the `warp` pipeline
    (distort/stretch/perspective/crop/blur/color/jitter/noise/reverse)
    over the input image with per-stage probability `aug_prob`."""

    def __init__(self, use_tia=True, aug_prob=0.4, **kwargs):
        self.use_tia = use_tia
        self.aug_prob = aug_prob

    def __call__(self, data):
        data['image'] = warp(data['image'], 10, self.use_tia, self.aug_prob)
        return data
|
||||
|
||||
|
||||
class RecConAug(object):
    """Concatenation augmentation for recognition training.

    With probability `prob`, horizontally concatenates extra samples from
    data['ext_data'] onto the current image (and appends their labels),
    stopping once the merged label would exceed `max_text_length` or the
    merged image would be wider than the model input ratio allows.
    """

    def __init__(self,
                 prob=0.5,
                 image_shape=(32, 320, 3),
                 max_text_length=25,
                 ext_data_num=1,
                 **kwargs):
        self.ext_data_num = ext_data_num
        self.prob = prob
        self.max_text_length = max_text_length
        self.image_shape = image_shape  # (H, W, C) of the model input
        # widest width/height ratio the model input can represent
        self.max_wh_ratio = self.image_shape[1] / self.image_shape[0]

    def merge_ext_data(self, data, ext_data):
        """Resize both images to the target height and concatenate them
        (and their labels) side by side."""
        ori_w = round(data['image'].shape[1] / data['image'].shape[0] *
                      self.image_shape[0])
        ext_w = round(ext_data['image'].shape[1] / ext_data['image'].shape[0] *
                      self.image_shape[0])
        data['image'] = cv2.resize(data['image'], (ori_w, self.image_shape[0]))
        ext_data['image'] = cv2.resize(ext_data['image'],
                                       (ext_w, self.image_shape[0]))
        data['image'] = np.concatenate(
            [data['image'], ext_data['image']], axis=1)
        data["label"] += ext_data["label"]
        return data

    def __call__(self, data):
        rnd_num = random.random()
        if rnd_num > self.prob:
            return data
        for idx, ext_data in enumerate(data["ext_data"]):
            # stop once the merged label would become too long
            if len(data["label"]) + len(ext_data[
                    "label"]) > self.max_text_length:
                break
            # stop once the merged image would become too wide
            concat_ratio = data['image'].shape[1] / data['image'].shape[
                0] + ext_data['image'].shape[1] / ext_data['image'].shape[0]
            if concat_ratio > self.max_wh_ratio:
                break
            data = self.merge_ext_data(data, ext_data)
        data.pop("ext_data")
        return data
|
||||
|
||||
|
||||
class ClsResizeImg(object):
    """Resize + normalize an image to `image_shape` for the text-direction
    classifier (delegates to `resize_norm_img`, discarding valid_ratio)."""

    def __init__(self, image_shape, **kwargs):
        self.image_shape = image_shape

    def __call__(self, data):
        resized, _ = resize_norm_img(data['image'], self.image_shape)
        data['image'] = resized
        return data
|
||||
|
||||
|
||||
class NRTRRecResizeImg(object):
    """Resize (and optionally pad) a grayscale image for NRTR recognition.

    With `padding`, the image is resized keeping aspect ratio at the target
    height and right-padded with zeros; otherwise it is hard-resized via
    PIL or OpenCV according to `resize_type`. Output is float32 CHW scaled
    to [-1, 1).
    """

    def __init__(self, image_shape, resize_type, padding=False, **kwargs):
        self.image_shape = image_shape
        self.resize_type = resize_type  # 'PIL' or 'OpenCV'
        self.padding = padding

    def __call__(self, data):
        img = data['image']
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        image_shape = self.image_shape
        if self.padding:
            imgC, imgH, imgW = image_shape
            # todo: change to 0 and modified image shape
            h = img.shape[0]
            w = img.shape[1]
            ratio = w / float(h)
            if math.ceil(imgH * ratio) > imgW:
                resized_w = imgW
            else:
                resized_w = int(math.ceil(imgH * ratio))
            resized_image = cv2.resize(img, (resized_w, imgH))
            norm_img = np.expand_dims(resized_image, -1)
            norm_img = norm_img.transpose((2, 0, 1))
            resized_image = norm_img.astype(np.float32) / 128. - 1.
            padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
            padding_im[:, :, 0:resized_w] = resized_image
            data['image'] = padding_im
            return data
        if self.resize_type == 'PIL':
            image_pil = Image.fromarray(np.uint8(img))
            # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS has
            # been the alias of the same filter since Pillow 2.7.
            img = image_pil.resize(self.image_shape, Image.LANCZOS)
            img = np.array(img)
        if self.resize_type == 'OpenCV':
            img = cv2.resize(img, self.image_shape)
        norm_img = np.expand_dims(img, -1)
        norm_img = norm_img.transpose((2, 0, 1))
        data['image'] = norm_img.astype(np.float32) / 128. - 1.
        return data
|
||||
|
||||
|
||||
class RecResizeImg(object):
    """Resize + normalize an image for the CTC/CRNN-style recognizer.

    In inference mode with a character dict configured, uses the
    width-adaptive `resize_norm_img_chinese`; otherwise the fixed-shape
    `resize_norm_img` (with optional right padding). Also stores the
    width `valid_ratio` in the sample dict.
    """

    def __init__(self,
                 image_shape,
                 infer_mode=False,
                 character_dict_path='./ppocr/utils/ppocr_keys_v1.txt',
                 padding=True,
                 **kwargs):
        self.image_shape = image_shape  # (C, H, W)
        self.infer_mode = infer_mode
        self.character_dict_path = character_dict_path
        self.padding = padding

    def __call__(self, data):
        img = data['image']
        if self.infer_mode and self.character_dict_path is not None:
            norm_img, valid_ratio = resize_norm_img_chinese(img,
                                                            self.image_shape)
        else:
            norm_img, valid_ratio = resize_norm_img(img, self.image_shape,
                                                    self.padding)
        data['image'] = norm_img
        data['valid_ratio'] = valid_ratio
        return data
|
||||
|
||||
|
||||
class SRNRecResizeImg(object):
    """Resize an image for SRN and attach the auxiliary position and
    self-attention bias inputs produced by `srn_other_inputs`."""

    def __init__(self, image_shape, num_heads, max_text_length, **kwargs):
        self.image_shape = image_shape  # (C, H, W)
        self.num_heads = num_heads
        self.max_text_length = max_text_length

    def __call__(self, data):
        img = data['image']
        norm_img = resize_norm_img_srn(img, self.image_shape)
        data['image'] = norm_img
        # auxiliary tensors required by the SRN head
        [encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] = \
            srn_other_inputs(self.image_shape, self.num_heads, self.max_text_length)

        data['encoder_word_pos'] = encoder_word_pos
        data['gsrm_word_pos'] = gsrm_word_pos
        data['gsrm_slf_attn_bias1'] = gsrm_slf_attn_bias1
        data['gsrm_slf_attn_bias2'] = gsrm_slf_attn_bias2
        return data
|
||||
|
||||
|
||||
class SARRecResizeImg(object):
    """Resize an image for SAR recognition via `resize_norm_img_sar`,
    storing the resized/padded shapes and the width valid_ratio."""

    def __init__(self, image_shape, width_downsample_ratio=0.25, **kwargs):
        self.image_shape = image_shape  # (C, H, W_min, W_max)
        self.width_downsample_ratio = width_downsample_ratio

    def __call__(self, data):
        img = data['image']
        norm_img, resize_shape, pad_shape, valid_ratio = resize_norm_img_sar(
            img, self.image_shape, self.width_downsample_ratio)
        data['image'] = norm_img
        data['resized_shape'] = resize_shape
        data['pad_shape'] = pad_shape
        data['valid_ratio'] = valid_ratio
        return data
|
||||
|
||||
|
||||
class PRENResizeImg(object):
    def __init__(self, image_shape, **kwargs):
        """
        According to the original paper's implementation this is a hard
        resize; consider optimizing it to fit your own task better.
        """
        self.dst_h, self.dst_w = image_shape

    def __call__(self, data):
        # hard resize, then scale to [-1, 1] in CHW layout
        img = cv2.resize(
            data['image'], (self.dst_w, self.dst_h),
            interpolation=cv2.INTER_LINEAR)
        img = img.transpose((2, 0, 1)) / 255
        img -= 0.5
        img /= 0.5
        data['image'] = img.astype(np.float32)
        return data
|
||||
|
||||
|
||||
def resize_norm_img_sar(img, image_shape, width_downsample_ratio=0.25):
    """SAR-style resize: keep the aspect ratio at height imgH, snap the
    width to a multiple of 1/width_downsample_ratio, clamp it into
    [imgW_min, imgW_max], normalize to [-1, 1] (CHW) and right-pad with -1.

    Returns (padded_image, resize_shape, pad_shape, valid_ratio).
    """
    imgC, imgH, imgW_min, imgW_max = image_shape
    h = img.shape[0]
    w = img.shape[1]
    valid_ratio = 1.0
    # make sure new_width is an integral multiple of width_divisor.
    width_divisor = int(1 / width_downsample_ratio)
    # resize
    ratio = w / float(h)
    resize_w = math.ceil(imgH * ratio)
    if resize_w % width_divisor != 0:
        resize_w = round(resize_w / width_divisor) * width_divisor
    if imgW_min is not None:
        resize_w = max(imgW_min, resize_w)
    if imgW_max is not None:
        # fraction of the padded width that contains real content
        valid_ratio = min(1.0, 1.0 * resize_w / imgW_max)
        resize_w = min(imgW_max, resize_w)
    resized_image = cv2.resize(img, (resize_w, imgH))
    resized_image = resized_image.astype('float32')
    # norm
    if image_shape[0] == 1:
        # grayscale: add the channel axis manually
        resized_image = resized_image / 255
        resized_image = resized_image[np.newaxis, :]
    else:
        resized_image = resized_image.transpose((2, 0, 1)) / 255
    resized_image -= 0.5
    resized_image /= 0.5
    resize_shape = resized_image.shape
    # pad value -1 matches the normalized black level
    padding_im = -1.0 * np.ones((imgC, imgH, imgW_max), dtype=np.float32)
    padding_im[:, :, 0:resize_w] = resized_image
    pad_shape = padding_im.shape

    return padding_im, resize_shape, pad_shape, valid_ratio
|
||||
|
||||
|
||||
def resize_norm_img(img, image_shape, padding=True):
    """Resize `img` to (imgC, imgH, imgW) and normalize to [-1, 1] (CHW).

    With `padding`, the aspect ratio is preserved at height imgH and the
    result is right-padded with zeros; otherwise the image is hard-resized.
    Returns (padded_image, valid_ratio) with valid_ratio = resized_w / imgW.
    """
    imgC, imgH, imgW = image_shape
    h = img.shape[0]
    w = img.shape[1]
    if not padding:
        resized_image = cv2.resize(
            img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
        resized_w = imgW
    else:
        ratio = w / float(h)
        if math.ceil(imgH * ratio) > imgW:
            resized_w = imgW
        else:
            resized_w = int(math.ceil(imgH * ratio))
        resized_image = cv2.resize(img, (resized_w, imgH))
    resized_image = resized_image.astype('float32')
    if image_shape[0] == 1:
        # grayscale: add the channel axis manually
        resized_image = resized_image / 255
        resized_image = resized_image[np.newaxis, :]
    else:
        resized_image = resized_image.transpose((2, 0, 1)) / 255
    resized_image -= 0.5
    resized_image /= 0.5
    padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
    padding_im[:, :, 0:resized_w] = resized_image
    valid_ratio = min(1.0, float(resized_w / imgW))
    return padding_im, valid_ratio
|
||||
|
||||
|
||||
def resize_norm_img_chinese(img, image_shape):
    """Width-adaptive resize used for Chinese recognition at inference.

    The target width grows with the image's own width/height ratio (at
    least imgW), keeping the aspect ratio at height imgH; the result is
    normalized to [-1, 1] (CHW) and right-padded with zeros.
    Returns (padded_image, valid_ratio).
    """
    imgC, imgH, imgW = image_shape
    # todo: change to 0 and modified image shape
    max_wh_ratio = imgW * 1.0 / imgH
    h, w = img.shape[0], img.shape[1]
    ratio = w * 1.0 / h
    # widen the canvas if the image is wider than the default ratio
    max_wh_ratio = max(max_wh_ratio, ratio)
    imgW = int(imgH * max_wh_ratio)
    if math.ceil(imgH * ratio) > imgW:
        resized_w = imgW
    else:
        resized_w = int(math.ceil(imgH * ratio))
    resized_image = cv2.resize(img, (resized_w, imgH))
    resized_image = resized_image.astype('float32')
    if image_shape[0] == 1:
        # grayscale: add the channel axis manually
        resized_image = resized_image / 255
        resized_image = resized_image[np.newaxis, :]
    else:
        resized_image = resized_image.transpose((2, 0, 1)) / 255
    resized_image -= 0.5
    resized_image /= 0.5
    padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
    padding_im[:, :, 0:resized_w] = resized_image
    valid_ratio = min(1.0, float(resized_w / imgW))
    return padding_im, valid_ratio
|
||||
|
||||
|
||||
def resize_norm_img_srn(img, image_shape):
    """SRN resize: paste a grayscale, aspect-bucketed resize of `img` onto
    a black (imgH, imgW) canvas and return it as float32 with shape
    (1, imgH, imgW). Pixel values are NOT rescaled here.
    """
    imgC, imgH, imgW = image_shape

    img_black = np.zeros((imgH, imgW))
    im_hei = img.shape[0]
    im_wid = img.shape[1]

    # bucket the target width to 1x / 2x / 3x the height, else full width
    if im_wid <= im_hei * 1:
        img_new = cv2.resize(img, (imgH * 1, imgH))
    elif im_wid <= im_hei * 2:
        img_new = cv2.resize(img, (imgH * 2, imgH))
    elif im_wid <= im_hei * 3:
        img_new = cv2.resize(img, (imgH * 3, imgH))
    else:
        img_new = cv2.resize(img, (imgW, imgH))

    img_np = np.asarray(img_new)
    img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
    img_black[:, 0:img_np.shape[1]] = img_np
    img_black = img_black[:, :, np.newaxis]

    row, col, c = img_black.shape
    c = 1  # output is always single-channel

    return np.reshape(img_black, (c, row, col)).astype(np.float32)
|
||||
|
||||
|
||||
def srn_other_inputs(image_shape, num_heads, max_text_length):
    """Build the auxiliary inputs required by the SRN head.

    Returns [encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
    gsrm_slf_attn_bias2]: position indices for the visual features and the
    text, plus forward/backward self-attention masks (-1e9 above / below
    the diagonal respectively), tiled across `num_heads`.
    """
    _, img_height, img_width = image_shape
    feature_dim = int((img_height / 8) * (img_width / 8))

    encoder_word_pos = np.arange(
        feature_dim, dtype='int64').reshape((feature_dim, 1))
    gsrm_word_pos = np.arange(
        max_text_length, dtype='int64').reshape((max_text_length, 1))

    bias_base = np.ones((1, max_text_length, max_text_length))
    # mask out the future (strict upper triangle) for the forward pass
    gsrm_slf_attn_bias1 = np.triu(bias_base, 1).reshape(
        [1, max_text_length, max_text_length])
    gsrm_slf_attn_bias1 = np.tile(gsrm_slf_attn_bias1,
                                  [num_heads, 1, 1]) * [-1e9]

    # mask out the past (strict lower triangle) for the backward pass
    gsrm_slf_attn_bias2 = np.tril(bias_base, -1).reshape(
        [1, max_text_length, max_text_length])
    gsrm_slf_attn_bias2 = np.tile(gsrm_slf_attn_bias2,
                                  [num_heads, 1, 1]) * [-1e9]

    return [
        encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
        gsrm_slf_attn_bias2
    ]
|
||||
|
||||
|
||||
def flag():
    """Return a random sign: -1 roughly half of the time, +1 otherwise."""
    return -1 if random.random() <= 0.5000001 else 1
|
||||
|
||||
|
||||
def cvtColor(img):
    """
    cvtColor: randomly perturb the image brightness by a tiny factor
    (|delta| < 0.001, random sign) in HSV space and convert back to BGR.
    """
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    delta = 0.001 * random.random() * flag()
    # scale the V (brightness) channel only
    hsv[:, :, 2] = hsv[:, :, 2] * (1 + delta)
    new_img = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)
    return new_img
|
||||
|
||||
|
||||
def blur(img):
    """Gaussian-blur (5x5, sigma 1) images larger than 10x10; return
    smaller images unchanged."""
    h, w, _ = img.shape
    if h <= 10 or w <= 10:
        return img
    return cv2.GaussianBlur(img, (5, 5), 1)
|
||||
|
||||
|
||||
def jitter(img):
    """Apply a small random diagonal jitter by repeatedly shifting the
    image content toward the top-left. Mutates and returns `img`.

    Note: the original unpacked img.shape as (w, h, _); here the axes are
    named (rows, cols) for clarity — behavior is identical.
    """
    rows, cols, _ = img.shape
    if rows <= 10 or cols <= 10:
        return img
    shift_max = min(rows, cols)
    shift = int(random.random() * shift_max * 0.01)
    src_img = img.copy()
    for i in range(shift):
        img[i:, i:, :] = src_img[:rows - i, :cols - i, :]
    return img
|
||||
|
||||
|
||||
def add_gasuss_noise(image, mean=0, var=0.1):
    """Add half-strength Gaussian noise (mean `mean`, variance `var`) to
    the image and clip the result back into uint8 range [0, 255]."""
    sigma = var**0.5
    noise = np.random.normal(mean, sigma, image.shape)
    noisy = np.clip(image + 0.5 * noise, 0, 255)
    return np.uint8(noisy)
|
||||
|
||||
|
||||
def get_crop(image):
    """Randomly crop a strip of 1-8 rows off either the top or the bottom
    of the image (never more than h-1 rows). Returns a copy."""
    h, w, _ = image.shape
    top_crop = int(random.randint(1, 8))
    top_crop = min(top_crop, h - 1)
    crop_img = image.copy()
    if random.randint(0, 1):
        crop_img = crop_img[top_crop:h, :, :]   # drop rows from the top
    else:
        crop_img = crop_img[0:h - top_crop, :, :]  # drop rows from the bottom
    return crop_img
|
||||
|
||||
|
||||
class Config:
    """
    Config: random parameter container for the `warp` augmentation pipeline.
    """

    def __init__(self, use_tia):
        # rotation angles in degrees around the x / y / z axes
        self.anglex = random.random() * 30
        self.angley = random.random() * 15
        self.anglez = random.random() * 10
        self.fov = 42  # field of view used by the perspective projection
        self.r = 0
        self.shearx = random.random() * 0.3
        self.sheary = random.random() * 0.05
        self.borderMode = cv2.BORDER_REPLICATE
        self.use_tia = use_tia

    def make(self, w, h, ang):
        """
        make: re-draw the random parameters for an image of size (w, h)
        with maximum z-rotation `ang` degrees, and enable/disable the
        individual augmentation stages consumed by `warp`.
        """
        self.anglex = random.random() * 5 * flag()
        self.angley = random.random() * 5 * flag()
        self.anglez = -1 * random.random() * int(ang) * flag()
        self.fov = 42
        self.r = 0
        self.shearx = 0
        self.sheary = 0
        self.borderMode = cv2.BORDER_REPLICATE
        self.w = w
        self.h = h

        # TIA (text image augmentation) stages follow the use_tia switch
        self.perspective = self.use_tia
        self.stretch = self.use_tia
        self.distort = self.use_tia

        # classic stages (affine is disabled upstream)
        self.crop = True
        self.affine = False
        self.reverse = True
        self.noise = True
        self.jitter = True
        self.blur = True
        self.color = True
|
||||
|
||||
def rad(x):
    """Convert an angle from degrees to radians."""
    degrees = x
    return degrees * np.pi / 180
|
||||
|
||||
|
||||
def get_warpR(config):
    """
    get_warpR: build a perspective-warp matrix from the random rotation
    angles in `config`.

    Rotates the four image corners in 3D (around x, y and z), projects them
    back onto the image plane at distance z, and returns
    (warp_matrix, (-r1, -c1), ratio, dst) where `dst` are the projected
    corner points and `ratio` rescales the result to fit the original size.
    """
    anglex, angley, anglez, fov, w, h, r = \
        config.anglex, config.angley, config.anglez, config.fov, config.w, config.h, config.r
    # empirically boost the x rotation for mid-width text images
    if w > 69 and w < 112:
        anglex = anglex * 1.5

    # camera distance so the image diagonal fits the field of view
    z = np.sqrt(w**2 + h**2) / 2 / np.tan(rad(fov / 2))
    # Homogeneous coordinate transformation matrix
    # NOTE(review): rx uses -sin for both off-diagonal entries (not a
    # standard rotation matrix); preserved as-is from upstream PaddleOCR.
    rx = np.array([[1, 0, 0, 0],
                   [0, np.cos(rad(anglex)), -np.sin(rad(anglex)), 0], [
                       0,
                       -np.sin(rad(anglex)),
                       np.cos(rad(anglex)),
                       0,
                   ], [0, 0, 0, 1]], np.float32)
    ry = np.array([[np.cos(rad(angley)), 0, np.sin(rad(angley)), 0],
                   [0, 1, 0, 0], [
                       -np.sin(rad(angley)),
                       0,
                       np.cos(rad(angley)),
                       0,
                   ], [0, 0, 0, 1]], np.float32)
    rz = np.array([[np.cos(rad(anglez)), np.sin(rad(anglez)), 0, 0],
                   [-np.sin(rad(anglez)), np.cos(rad(anglez)), 0, 0],
                   [0, 0, 1, 0], [0, 0, 0, 1]], np.float32)
    r = rx.dot(ry).dot(rz)
    # generate 4 points
    pcenter = np.array([h / 2, w / 2, 0, 0], np.float32)
    p1 = np.array([0, 0, 0, 0], np.float32) - pcenter
    p2 = np.array([w, 0, 0, 0], np.float32) - pcenter
    p3 = np.array([0, h, 0, 0], np.float32) - pcenter
    p4 = np.array([w, h, 0, 0], np.float32) - pcenter
    dst1 = r.dot(p1)
    dst2 = r.dot(p2)
    dst3 = r.dot(p3)
    dst4 = r.dot(p4)
    list_dst = np.array([dst1, dst2, dst3, dst4])
    org = np.array([[0, 0], [w, 0], [0, h], [w, h]], np.float32)
    dst = np.zeros((4, 2), np.float32)
    # Project onto the image plane
    dst[:, 0] = list_dst[:, 0] * z / (z - list_dst[:, 2]) + pcenter[0]
    dst[:, 1] = list_dst[:, 1] * z / (z - list_dst[:, 2]) + pcenter[1]

    warpR = cv2.getPerspectiveTransform(org, dst)

    dst1, dst2, dst3, dst4 = dst
    # bounding rows/cols of the projected quadrilateral
    r1 = int(min(dst1[1], dst2[1]))
    r2 = int(max(dst3[1], dst4[1]))
    c1 = int(min(dst1[0], dst3[0]))
    c2 = int(max(dst2[0], dst4[0]))

    # NOTE(review): bare except guards against division by zero when the
    # projected quad degenerates (r2 == r1 or c2 == c1); narrowing it to
    # ZeroDivisionError would be safer but is kept as-is to match upstream.
    try:
        ratio = min(1.0 * h / (r2 - r1), 1.0 * w / (c2 - c1))

        dx = -c1
        dy = -r1
        T1 = np.float32([[1., 0, dx], [0, 1., dy], [0, 0, 1.0 / ratio]])
        ret = T1.dot(warpR)
    except:
        ratio = 1.0
        T1 = np.float32([[1., 0, 0], [0, 1., 0], [0, 0, 1.]])
        ret = T1
    return ret, (-r1, -c1), ratio, dst
|
||||
|
||||
|
||||
def get_warpAffine(config):
    """Build a 2x3 affine matrix rotating by config.anglez degrees
    (clockwise in image coordinates, no translation)."""
    theta = rad(config.anglez)
    cos_t = np.cos(theta)
    sin_t = np.sin(theta)
    return np.array([[cos_t, sin_t, 0], [-sin_t, cos_t, 0]], np.float32)
|
||||
|
||||
|
||||
def warp(img, ang, use_tia=True, prob=0.4):
    """
    warp: run the full recognition augmentation pipeline over `img`.

    Each enabled stage (TIA distort/stretch/perspective, crop, blur, color,
    jitter, noise, reverse) fires independently with probability `prob`
    (jitter always runs). `ang` bounds the random z-rotation drawn by
    Config.make. Returns the augmented image.
    """
    h, w, _ = img.shape
    config = Config(use_tia=use_tia)
    config.make(w, h, ang)
    new_img = img

    # TIA stages need a minimum image size to be meaningful
    if config.distort:
        img_height, img_width = img.shape[0:2]
        if random.random() <= prob and img_height >= 20 and img_width >= 20:
            new_img = tia_distort(new_img, random.randint(3, 6))

    if config.stretch:
        img_height, img_width = img.shape[0:2]
        if random.random() <= prob and img_height >= 20 and img_width >= 20:
            new_img = tia_stretch(new_img, random.randint(3, 6))

    if config.perspective:
        if random.random() <= prob:
            new_img = tia_perspective(new_img)

    if config.crop:
        img_height, img_width = img.shape[0:2]
        if random.random() <= prob and img_height >= 20 and img_width >= 20:
            new_img = get_crop(new_img)

    if config.blur:
        if random.random() <= prob:
            new_img = blur(new_img)
    if config.color:
        if random.random() <= prob:
            new_img = cvtColor(new_img)
    if config.jitter:
        # jitter is applied unconditionally when enabled
        new_img = jitter(new_img)
    if config.noise:
        if random.random() <= prob:
            new_img = add_gasuss_noise(new_img)
    if config.reverse:
        if random.random() <= prob:
            # invert colors (white-on-black <-> black-on-white)
            new_img = 255 - new_img
    return new_img
|
||||
777
backend/ppocr/data/imaug/sast_process.py
Normal file
777
backend/ppocr/data/imaug/sast_process.py
Normal file
@@ -0,0 +1,777 @@
|
||||
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
"""
|
||||
This part code is refered from:
|
||||
https://github.com/songdejia/EAST/blob/master/data_utils.py
|
||||
"""
|
||||
import math
|
||||
import cv2
|
||||
import numpy as np
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
|
||||
__all__ = ['SASTProcessTrain']
|
||||
|
||||
|
||||
class SASTProcessTrain(object):
|
||||
    def __init__(self,
                 image_shape=[512, 512],
                 min_crop_size=24,
                 min_crop_side_ratio=0.3,
                 min_text_size=10,
                 max_text_size=512,
                 **kwargs):
        """Training-time preprocessing config for SAST.

        image_shape: (h, w); only the width is used as the square input size.
        min_crop_size / min_crop_side_ratio: random-crop constraints.
        min_text_size / max_text_size: text instances outside this range
        are handled as ignored during label generation.
        """
        self.input_size = image_shape[1]
        self.min_crop_size = min_crop_size
        self.min_crop_side_ratio = min_crop_side_ratio
        self.min_text_size = min_text_size
        self.max_text_size = max_text_size
|
||||
|
||||
def quad_area(self, poly):
|
||||
"""
|
||||
compute area of a polygon
|
||||
:param poly:
|
||||
:return:
|
||||
"""
|
||||
edge = [(poly[1][0] - poly[0][0]) * (poly[1][1] + poly[0][1]),
|
||||
(poly[2][0] - poly[1][0]) * (poly[2][1] + poly[1][1]),
|
||||
(poly[3][0] - poly[2][0]) * (poly[3][1] + poly[2][1]),
|
||||
(poly[0][0] - poly[3][0]) * (poly[0][1] + poly[3][1])]
|
||||
return np.sum(edge) / 2.
|
||||
|
||||
    def gen_quad_from_poly(self, poly):
        """
        Generate min area quad from poly.

        Fits cv2.minAreaRect around the polygon and rotates the resulting
        box so its first corner is the one that best aligns with the
        polygon's four "anchor" points (first, mid-left, mid-right, last).
        Returns a (4, 2) float32 array.
        """
        point_num = poly.shape[0]
        min_area_quad = np.zeros((4, 2), dtype=np.float32)
        if True:
            rect = cv2.minAreaRect(poly.astype(
                np.int32))  # (center (x,y), (width, height), angle of rotation)
            center_point = rect[0]
            box = np.array(cv2.boxPoints(rect))

            # choose the box rotation whose corners are closest to the
            # polygon's first / middle / last points
            first_point_idx = 0
            min_dist = 1e4
            for i in range(4):
                dist = np.linalg.norm(box[(i + 0) % 4] - poly[0]) + \
                       np.linalg.norm(box[(i + 1) % 4] - poly[point_num // 2 - 1]) + \
                       np.linalg.norm(box[(i + 2) % 4] - poly[point_num // 2]) + \
                       np.linalg.norm(box[(i + 3) % 4] - poly[-1])
                if dist < min_dist:
                    min_dist = dist
                    first_point_idx = i
            for i in range(4):
                min_area_quad[i] = box[(first_point_idx + i) % 4]

        return min_area_quad
|
||||
|
||||
    def check_and_validate_polys(self, polys, tags, xxx_todo_changeme):
        """
        check so that the text poly is in the same direction,
        and also filter some invalid polygons

        Clips polygons to the (h, w) image bounds, drops near-degenerate
        ones, reverses polygons found in the wrong winding direction
        (marking them ignored), and classifies each as horizontal (1) or
        vertical (0).
        :param polys: (N, 16, 2) array of text polygons
        :param tags: per-polygon ignore flags
        :param xxx_todo_changeme: (h, w) image size tuple
        :return: (validated_polys, validated_tags, hv_tags)
        """
        (h, w) = xxx_todo_changeme
        if polys.shape[0] == 0:
            return polys, np.array([]), np.array([])
        # clip every vertex into the image
        polys[:, :, 0] = np.clip(polys[:, :, 0], 0, w - 1)
        polys[:, :, 1] = np.clip(polys[:, :, 1], 0, h - 1)

        validated_polys = []
        validated_tags = []
        hv_tags = []
        for poly, tag in zip(polys, tags):
            quad = self.gen_quad_from_poly(poly)
            p_area = self.quad_area(quad)
            if abs(p_area) < 1:
                # nearly zero area -> degenerate polygon
                print('invalid poly')
                continue
            if p_area > 0:
                if tag == False:
                    print('poly in wrong direction')
                    tag = True  # reversed cases should be ignore
                # reverse the 16-point polygon (keeping point 0 first)
                poly = poly[(0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
                             1), :]
                quad = quad[(0, 3, 2, 1), :]

            len_w = np.linalg.norm(quad[0] - quad[1]) + np.linalg.norm(quad[3] -
                                                                       quad[2])
            len_h = np.linalg.norm(quad[0] - quad[3]) + np.linalg.norm(quad[1] -
                                                                       quad[2])
            hv_tag = 1

            # clearly taller than wide -> treat as vertical text
            if len_w * 2.0 < len_h:
                hv_tag = 0

            validated_polys.append(poly)
            validated_tags.append(tag)
            hv_tags.append(hv_tag)
        return np.array(validated_polys), np.array(validated_tags), np.array(
            hv_tags)
|
||||
|
||||
    def crop_area(self,
                  im,
                  polys,
                  tags,
                  hv_tags,
                  crop_background=False,
                  max_tries=25):
        """
        make random crop from the input image

        Samples crop edges only from rows/columns not covered by any text
        polygon (with a 10% padding margin), keeping polygons that fall
        fully inside the window. With crop_background=True a window with
        no text at all is returned instead.
        :param im:
        :param polys:
        :param tags:
        :param crop_background:
        :param max_tries: 50 -> 25
        :return: (im, polys, tags, hv_tags), unchanged if no crop is found
        """
        h, w, _ = im.shape
        pad_h = h // 10
        pad_w = w // 10
        h_array = np.zeros((h + pad_h * 2), dtype=np.int32)
        w_array = np.zeros((w + pad_w * 2), dtype=np.int32)
        # mark rows/columns spanned by any polygon as occupied
        for poly in polys:
            poly = np.round(poly, decimals=0).astype(np.int32)
            minx = np.min(poly[:, 0])
            maxx = np.max(poly[:, 0])
            w_array[minx + pad_w:maxx + pad_w] = 1
            miny = np.min(poly[:, 1])
            maxy = np.max(poly[:, 1])
            h_array[miny + pad_h:maxy + pad_h] = 1
        # ensure the cropped area not across a text
        h_axis = np.where(h_array == 0)[0]
        w_axis = np.where(w_array == 0)[0]
        if len(h_axis) == 0 or len(w_axis) == 0:
            # no free row/column at all -> give up, return input unchanged
            return im, polys, tags, hv_tags
        for i in range(max_tries):
            xx = np.random.choice(w_axis, size=2)
            xmin = np.min(xx) - pad_w
            xmax = np.max(xx) - pad_w
            xmin = np.clip(xmin, 0, w - 1)
            xmax = np.clip(xmax, 0, w - 1)
            yy = np.random.choice(h_axis, size=2)
            ymin = np.min(yy) - pad_h
            ymax = np.max(yy) - pad_h
            ymin = np.clip(ymin, 0, h - 1)
            ymax = np.clip(ymax, 0, h - 1)
            # if xmax - xmin < ARGS.min_crop_side_ratio * w or \
            #     ymax - ymin < ARGS.min_crop_side_ratio * h:
            if xmax - xmin < self.min_crop_size or \
                    ymax - ymin < self.min_crop_size:
                # area too small
                continue
            if polys.shape[0] != 0:
                # a polygon is kept only if all 4 of its coordinate tests pass
                poly_axis_in_area = (polys[:, :, 0] >= xmin) & (polys[:, :, 0] <= xmax) \
                                    & (polys[:, :, 1] >= ymin) & (polys[:, :, 1] <= ymax)
                selected_polys = np.where(
                    np.sum(poly_axis_in_area, axis=1) == 4)[0]
            else:
                selected_polys = []
            if len(selected_polys) == 0:
                # no text in this area
                if crop_background:
                    return im[ymin : ymax + 1, xmin : xmax + 1, :], \
                        polys[selected_polys], tags[selected_polys], hv_tags[selected_polys]
                else:
                    continue
            im = im[ymin:ymax + 1, xmin:xmax + 1, :]
            polys = polys[selected_polys]
            tags = tags[selected_polys]
            hv_tags = hv_tags[selected_polys]
            # shift the surviving polygons into crop coordinates
            polys[:, :, 0] -= xmin
            polys[:, :, 1] -= ymin
            return im, polys, tags, hv_tags

        return im, polys, tags, hv_tags
|
||||
|
||||
def generate_direction_map(self, poly_quads, direction_map):
    """
    Paint per-pixel text-direction labels for one polygon into `direction_map`.

    For every quad of the polygon a 3-channel label
    (dx, dy, 1 / average_height) is filled into the map, where (dx, dy) is
    the reading-direction vector scaled to the polygon's average quad width.

    Args:
        poly_quads: iterable of (4, 2) quads covering one text polygon.
        direction_map: (H, W, 3) float map, modified in place.

    Returns:
        The same `direction_map` (also mutated in place by cv2.fillPoly).
    """
    width_list = []
    height_list = []
    for quad in poly_quads:
        # Quad width/height: mean of the two opposite edges.
        quad_w = (np.linalg.norm(quad[0] - quad[1]) +
                  np.linalg.norm(quad[2] - quad[3])) / 2.0
        quad_h = (np.linalg.norm(quad[0] - quad[3]) +
                  np.linalg.norm(quad[2] - quad[1])) / 2.0
        width_list.append(quad_w)
        height_list.append(quad_h)
    # +1e-6 guards against an empty quad list; floor at 1.0 pixel.
    norm_width = max(sum(width_list) / (len(width_list) + 1e-6), 1.0)
    average_height = max(sum(height_list) / (len(height_list) + 1e-6), 1.0)

    for quad in poly_quads:
        # Direction: from left-edge midpoint towards right-edge midpoint,
        # normalized and rescaled to the average quad width.
        direct_vector_full = (
            (quad[1] + quad[2]) - (quad[0] + quad[3])) / 2.0
        direct_vector = direct_vector_full / (
            np.linalg.norm(direct_vector_full) + 1e-6) * norm_width
        direction_label = tuple(
            map(float, [
                direct_vector[0], direct_vector[1], 1.0 / (average_height +
                                                           1e-6)
            ]))
        cv2.fillPoly(direction_map,
                     quad.round().astype(np.int32)[np.newaxis, :, :],
                     direction_label)
    return direction_map
|
||||
|
||||
def calculate_average_height(self, poly_quads):
    """
    Return the average quad height of `poly_quads`, floored at 1.0.

    The height of one quad is the mean length of its two vertical edges
    (p0-p3 and p2-p1).

    Args:
        poly_quads: iterable of (4, 2) quads.

    Returns:
        float average height, never smaller than 1.0. Returns 1.0 for an
        empty input (the original code raised ZeroDivisionError there;
        sibling `generate_direction_map` guards the same division with
        +1e-6, so the 1.0 floor is the consistent fallback).
    """
    height_list = []
    for quad in poly_quads:
        quad_h = (np.linalg.norm(quad[0] - quad[3]) +
                  np.linalg.norm(quad[2] - quad[1])) / 2.0
        height_list.append(quad_h)
    if not height_list:
        # Empty input: fall back to the 1.0 floor instead of dividing by 0.
        return 1.0
    average_height = max(sum(height_list) / len(height_list), 1.0)
    return average_height
|
||||
|
||||
def generate_tcl_label(self,
                       hw,
                       polys,
                       tags,
                       ds_ratio,
                       tcl_ratio=0.3,
                       shrink_ratio_of_width=0.15):
    """
    Generate the text-center-line (TCL) score map, border-offset (TBO)
    map and training mask at a downsampled resolution.

    Args:
        hw: (height, width) of the full-resolution image.
        polys: (N, P, 2) text polygons in full-resolution coordinates.
        tags: per-polygon ignore flags; True means "ignore this text".
        ds_ratio: downsample ratio applied to both the maps and `polys`.
        tcl_ratio: relative height of the center-line band inside a poly.
        shrink_ratio_of_width: how much the TCL band is shrunk at its ends.

    Returns:
        (score_map, tbo_map, training_mask). Note: `direction_map` is
        built here but never used or returned in this implementation.
    """
    h, w = hw
    h, w = int(h * ds_ratio), int(w * ds_ratio)
    polys = polys * ds_ratio

    score_map = np.zeros(
        (
            h,
            w, ), dtype=np.float32)
    tbo_map = np.zeros((h, w, 5), dtype=np.float32)
    training_mask = np.ones(
        (
            h,
            w, ), dtype=np.float32)
    # Default direction label (0, 0, 1) everywhere; unused below.
    direction_map = np.ones((h, w, 3)) * np.array([0, 0, 1]).reshape(
        [1, 1, 3]).astype(np.float32)

    for poly_idx, poly_tag in enumerate(zip(polys, tags)):
        poly = poly_tag[0]
        tag = poly_tag[1]

        # generate min_area_quad
        min_area_quad, center_point = self.gen_min_area_quad_from_poly(poly)
        min_area_quad_h = 0.5 * (
            np.linalg.norm(min_area_quad[0] - min_area_quad[3]) +
            np.linalg.norm(min_area_quad[1] - min_area_quad[2]))
        min_area_quad_w = 0.5 * (
            np.linalg.norm(min_area_quad[0] - min_area_quad[1]) +
            np.linalg.norm(min_area_quad[2] - min_area_quad[3]))

        # Skip text whose shorter side is out of the trainable size range
        # (thresholds are scaled because the maps are downsampled).
        if min(min_area_quad_h, min_area_quad_w) < self.min_text_size * ds_ratio \
                or min(min_area_quad_h, min_area_quad_w) > self.max_text_size * ds_ratio:
            continue

        if tag:
            # Ignored text: soften the training mask instead of labeling.
            # continue
            cv2.fillPoly(training_mask,
                         poly.astype(np.int32)[np.newaxis, :, :], 0.15)
        else:
            tcl_poly = self.poly2tcl(poly, tcl_ratio)
            tcl_quads = self.poly2quads(tcl_poly)
            poly_quads = self.poly2quads(poly)
            # stcl map: shrink the TCL band along its width so adjacent
            # text instances do not touch.
            stcl_quads, quad_index = self.shrink_poly_along_width(
                tcl_quads,
                shrink_ratio_of_width=shrink_ratio_of_width,
                expand_height_ratio=1.0 / tcl_ratio)
            # generate tcl map
            cv2.fillPoly(score_map,
                         np.round(stcl_quads).astype(np.int32), 1.0)

            # generate tbo map: per-quad border offsets for TCL pixels.
            for idx, quad in enumerate(stcl_quads):
                quad_mask = np.zeros((h, w), dtype=np.float32)
                quad_mask = cv2.fillPoly(
                    quad_mask,
                    np.round(quad[np.newaxis, :, :]).astype(np.int32), 1.0)
                tbo_map = self.gen_quad_tbo(poly_quads[quad_index[idx]],
                                            quad_mask, tbo_map)
    return score_map, tbo_map, training_mask
|
||||
|
||||
def generate_tvo_and_tco(self,
                         hw,
                         polys,
                         tags,
                         tcl_ratio=0.3,
                         ds_ratio=0.25):
    """
    Generate the TVO (text-vertex-offset) and TCO (text-center-offset)
    maps for the SAST head at a downsampled resolution.

    TVO holds, per TCL pixel, the offsets to the 4 vertices of the
    text's minimum-area quad plus a short-edge normalizer; TCO holds
    the offset to the text center plus the same normalizer.

    Args:
        hw: (height, width) of the full-resolution image.
        polys: (N, P, 2) text polygons in full-resolution coordinates.
        tags: per-polygon ignore flags; True means "ignore".

    Returns:
        (tvo_map, tco_map) as HxWxC float arrays.
    """
    h, w = hw
    h, w = int(h * ds_ratio), int(w * ds_ratio)
    polys = polys * ds_ratio
    poly_mask = np.zeros((h, w), dtype=np.float32)

    # tvo map: channels 0..7 hold pixel (x, y) coordinate grids that the
    # per-poly vertex coordinates are later subtracted from.
    tvo_map = np.ones((9, h, w), dtype=np.float32)
    tvo_map[0:-1:2] = np.tile(np.arange(0, w), (h, 1))
    # NOTE(review): the y channels are filled with arange(w) transposed —
    # this only lines up when h == w (square input_size); confirm.
    tvo_map[1:-1:2] = np.tile(np.arange(0, w), (h, 1)).T
    poly_tv_xy_map = np.zeros((8, h, w), dtype=np.float32)

    # tco map
    tco_map = np.ones((3, h, w), dtype=np.float32)
    tco_map[0] = np.tile(np.arange(0, w), (h, 1))
    tco_map[1] = np.tile(np.arange(0, w), (h, 1)).T
    poly_tc_xy_map = np.zeros((2, h, w), dtype=np.float32)

    # Per-pixel normalizer: shorter side of the owning text's quad.
    poly_short_edge_map = np.ones((h, w), dtype=np.float32)

    for poly, poly_tag in zip(polys, tags):

        if poly_tag == True:
            continue

        # adjust point order for vertical poly
        poly = self.adjust_point(poly)

        # generate min_area_quad
        min_area_quad, center_point = self.gen_min_area_quad_from_poly(poly)
        min_area_quad_h = 0.5 * (
            np.linalg.norm(min_area_quad[0] - min_area_quad[3]) +
            np.linalg.norm(min_area_quad[1] - min_area_quad[2]))
        min_area_quad_w = 0.5 * (
            np.linalg.norm(min_area_quad[0] - min_area_quad[1]) +
            np.linalg.norm(min_area_quad[2] - min_area_quad[3]))

        # generate tcl map and text, 128 * 128
        tcl_poly = self.poly2tcl(poly, tcl_ratio)

        # generate poly_tv_xy_map: paint each vertex coordinate (clamped
        # to the map bounds) over the TCL region.
        for idx in range(4):
            cv2.fillPoly(
                poly_tv_xy_map[2 * idx],
                np.round(tcl_poly[np.newaxis, :, :]).astype(np.int32),
                float(min(max(min_area_quad[idx, 0], 0), w)))
            cv2.fillPoly(
                poly_tv_xy_map[2 * idx + 1],
                np.round(tcl_poly[np.newaxis, :, :]).astype(np.int32),
                float(min(max(min_area_quad[idx, 1], 0), h)))

        # generate poly_tc_xy_map: paint the text center coordinates.
        for idx in range(2):
            cv2.fillPoly(
                poly_tc_xy_map[idx],
                np.round(tcl_poly[np.newaxis, :, :]).astype(np.int32),
                float(center_point[idx]))

        # generate poly_short_edge_map (floored at 1.0 to avoid /0 later).
        cv2.fillPoly(
            poly_short_edge_map,
            np.round(tcl_poly[np.newaxis, :, :]).astype(np.int32),
            float(max(min(min_area_quad_h, min_area_quad_w), 1.0)))

        # generate poly_mask and training_mask
        cv2.fillPoly(poly_mask,
                     np.round(tcl_poly[np.newaxis, :, :]).astype(np.int32),
                     1)

    # Offsets = pixel coordinate grid - painted target coordinate,
    # restricted to TCL pixels and normalized by the short edge.
    tvo_map *= poly_mask
    tvo_map[:8] -= poly_tv_xy_map
    tvo_map[-1] /= poly_short_edge_map
    tvo_map = tvo_map.transpose((1, 2, 0))

    tco_map *= poly_mask
    tco_map[:2] -= poly_tc_xy_map
    tco_map[-1] /= poly_short_edge_map
    tco_map = tco_map.transpose((1, 2, 0))

    return tvo_map, tco_map
|
||||
|
||||
def adjust_point(self, poly):
    """
    Normalize the point order of a text polygon.

    For a 4-point quad whose "vertical" edges clearly dominate, the
    points are rotated one position so the long reading direction comes
    first.  For longer polygons, a sharp (> 70 degree) turn at the first
    corner likewise triggers a one-position rotation of the ring.
    """
    point_num = poly.shape[0]
    if point_num == 4:
        # Edge lengths in clockwise order: 0-1, 1-2, 2-3, 3-0.
        edge_lens = [
            np.linalg.norm(poly[i] - poly[(i + 1) % 4]) for i in range(4)
        ]
        # Vertical quad: edges 1-2 / 3-0 dominate edges 0-1 / 2-3.
        if (edge_lens[0] + edge_lens[2]) * 1.5 < (edge_lens[1] +
                                                  edge_lens[3]):
            poly = poly[[1, 2, 3, 0], :]
    elif point_num > 4:
        vec_a = poly[0] - poly[1]
        vec_b = poly[1] - poly[2]
        # Angle at the first corner; +1e-6 avoids division by zero.
        cos_theta = np.dot(vec_a, vec_b) / (
            np.linalg.norm(vec_a) * np.linalg.norm(vec_b) + 1e-6)
        theta = np.arccos(np.round(cos_theta, decimals=4))
        if abs(theta) > (70 / 180 * math.pi):
            # Rotate the ring of points by one position.
            rotated = list(range(1, point_num)) + [0]
            poly = poly[np.array(rotated), :]
    return poly
|
||||
|
||||
def gen_min_area_quad_from_poly(self, poly):
    """
    Generate the minimum-area quad enclosing `poly`, with its points
    reordered to best match the polygon's own point order.

    Args:
        poly: (P, 2) polygon points.

    Returns:
        (min_area_quad, center_point) where min_area_quad is (4, 2)
        float32 and center_point is the quad/rect center.
    """
    point_num = poly.shape[0]
    min_area_quad = np.zeros((4, 2), dtype=np.float32)
    if point_num == 4:
        # Already a quad: use it directly; center is the point mean.
        min_area_quad = poly
        center_point = np.sum(poly, axis=0) / 4
    else:
        rect = cv2.minAreaRect(poly.astype(
            np.int32))  # (center (x,y), (width, height), angle of rotation)
        center_point = rect[0]
        box = np.array(cv2.boxPoints(rect))

        # Choose the box rotation whose corners are closest to the
        # polygon's first/middle/last points, so point 0 of the quad
        # corresponds to point 0 of the polygon.
        first_point_idx = 0
        min_dist = 1e4
        for i in range(4):
            dist = np.linalg.norm(box[(i + 0) % 4] - poly[0]) + \
                   np.linalg.norm(box[(i + 1) % 4] - poly[point_num // 2 - 1]) + \
                   np.linalg.norm(box[(i + 2) % 4] - poly[point_num // 2]) + \
                   np.linalg.norm(box[(i + 3) % 4] - poly[-1])
            if dist < min_dist:
                min_dist = dist
                first_point_idx = i

        for i in range(4):
            min_area_quad[i] = box[(first_point_idx + i) % 4]

    return min_area_quad, center_point
|
||||
|
||||
def shrink_quad_along_width(self,
                            quad,
                            begin_width_ratio=0.,
                            end_width_ratio=1.):
    """
    Cut a clockwise quad horizontally, keeping the slice between the
    two width ratios (0.0 = left edge, 1.0 = right edge).
    """
    ratios = np.array(
        [[begin_width_ratio], [end_width_ratio]], dtype=np.float32)
    # Interpolate along the upper (p0->p1) and lower (p3->p2) edges.
    upper = quad[0] + (quad[1] - quad[0]) * ratios
    lower = quad[3] + (quad[2] - quad[3]) * ratios
    # Reassemble clockwise: UL, UR, LR, LL.
    return np.array([upper[0], upper[1], lower[1], lower[0]])
|
||||
|
||||
def shrink_poly_along_width(self,
                            quads,
                            shrink_ratio_of_width,
                            expand_height_ratio=1.0):
    """
    Shrink a quad-strip polygon along its width by a fixed length,
    cutting material off both the left and the right end.

    Args:
        quads: (Q, 4, 2) ordered strip of quads covering one text line.
        shrink_ratio_of_width: fraction of the reference length to cut.
        expand_height_ratio: scales the side-edge lengths used in the
            reference-length computation.

    Returns:
        (shrunk_quads, kept_indices) where kept_indices lists which of
        the original quads survive (possibly trimmed at the ends).
    """
    upper_edge_list = []

    def get_cut_info(edge_len_list, cut_len):
        # Walk the upper edges until `cut_len` is consumed; return the
        # quad index and the fractional position inside that quad.
        # NOTE(review): returns None if cut_len exceeds the total edge
        # length; callers rely on shrink_length <= sum(edges) — confirm.
        for idx, edge_len in enumerate(edge_len_list):
            cut_len -= edge_len
            if cut_len <= 0.000001:
                ratio = (cut_len + edge_len_list[idx]) / edge_len_list[idx]
                return idx, ratio

    for quad in quads:
        upper_edge_len = np.linalg.norm(quad[0] - quad[1])
        upper_edge_list.append(upper_edge_len)

    # length of left edge and right edge.
    left_length = np.linalg.norm(quads[0][0] - quads[0][
        3]) * expand_height_ratio
    right_length = np.linalg.norm(quads[-1][1] - quads[-1][
        2]) * expand_height_ratio

    # Shrink by a fraction of the smallest of: either side edge or the
    # total width, so short/tall text is never cut away entirely.
    shrink_length = min(left_length, right_length,
                        sum(upper_edge_list)) * shrink_ratio_of_width
    # shrinking length
    upper_len_left = shrink_length
    upper_len_right = sum(upper_edge_list) - shrink_length

    left_idx, left_ratio = get_cut_info(upper_edge_list, upper_len_left)
    left_quad = self.shrink_quad_along_width(
        quads[left_idx], begin_width_ratio=left_ratio, end_width_ratio=1)
    right_idx, right_ratio = get_cut_info(upper_edge_list, upper_len_right)
    right_quad = self.shrink_quad_along_width(
        quads[right_idx], begin_width_ratio=0, end_width_ratio=right_ratio)

    out_quad_list = []
    if left_idx == right_idx:
        # Both cuts land in the same quad: merge into a single quad.
        out_quad_list.append(
            [left_quad[0], right_quad[1], right_quad[2], left_quad[3]])
    else:
        out_quad_list.append(left_quad)
        for idx in range(left_idx + 1, right_idx):
            out_quad_list.append(quads[idx])
        out_quad_list.append(right_quad)

    return np.array(out_quad_list), list(range(left_idx, right_idx + 1))
|
||||
|
||||
def vector_angle(self, A, B):
    """
    Angle (radians) of vector AB relative to the positive x-axis.
    """
    dy = B[1] - A[1]
    dx = B[0] - A[0]
    return np.arctan2(dy, dx)
|
||||
|
||||
def theta_line_cross_point(self, theta, point):
    """
    Line through `point` with direction angle `theta`, returned as
    coefficients [a, b, c] of ax + by + c = 0.
    """
    px, py = point
    sin_t = np.sin(theta)
    cos_t = np.cos(theta)
    # Direction (cos, sin) -> normal (sin, -cos); c pins the line to
    # pass through (px, py).
    return [sin_t, -cos_t, cos_t * py - sin_t * px]
|
||||
|
||||
def line_cross_two_point(self, A, B):
    """
    Line through points A and B as coefficients [a, b, c] of
    ax + by + c = 0.
    """
    # Reduce to the angle/point form already handled elsewhere.
    direction = self.vector_angle(A, B)
    return self.theta_line_cross_point(direction, A)
|
||||
|
||||
def average_angle(self, poly):
    """
    Mean direction angle of the left (p3 -> p0) and right (p2 -> p1)
    edges of a clockwise quad.
    """
    p0, p1, p2, p3 = poly
    left_edge_angle = self.vector_angle(p3, p0)
    right_edge_angle = self.vector_angle(p2, p1)
    return (left_edge_angle + right_edge_angle) / 2
|
||||
|
||||
def line_cross_point(self, line1, line2):
    """
    Intersection of two lines, each given as [a, b, c] of
    ax + by + c = 0.  Returns (0, 0) when the lines are parallel
    (zero determinant), after printing a diagnostic.
    """
    a1, b1, c1 = line1
    a2, b2, c2 = line2
    det = a1 * b2 - a2 * b1

    if det == 0:
        print('Cross point does not exist')
        return np.array([0, 0], dtype=np.float32)

    # Cramer's rule.
    x = (b1 * c2 - b2 * c1) / det
    y = (a2 * c1 - a1 * c2) / det
    return np.array([x, y], dtype=np.float32)
|
||||
|
||||
def quad2tcl(self, poly, ratio):
    """
    Center band of a clockwise quad: keep the horizontal slice of
    relative height `ratio`, centered between the top and bottom edges.
    """
    band = np.array(
        [[0.5 - ratio / 2], [0.5 + ratio / 2]], dtype=np.float32)
    # Interpolate down the left (p0->p3) and right (p1->p2) edges.
    left = poly[0] + (poly[3] - poly[0]) * band
    right = poly[1] + (poly[2] - poly[1]) * band
    return np.array([left[0], right[0], right[1], left[1]])
|
||||
|
||||
def poly2tcl(self, poly, ratio):
    """
    Center-line region of a clockwise polygon: squeeze each mirrored
    point pair (poly[i], poly[n-1-i]) towards its midpoint, keeping a
    band of relative height `ratio`.
    """
    band = np.array(
        [[0.5 - ratio / 2], [0.5 + ratio / 2]], dtype=np.float32)
    tcl_poly = np.zeros_like(poly)
    n = poly.shape[0]

    for i in range(n // 2):
        top, bottom = poly[i], poly[n - 1 - i]
        squeezed = top + (bottom - top) * band
        tcl_poly[i] = squeezed[0]
        tcl_poly[n - 1 - i] = squeezed[1]
    return tcl_poly
|
||||
|
||||
def gen_quad_tbo(self, quad, tcl_mask, tbo_map):
    """
    Fill border-offset labels for all TCL pixels inside one quad.

    For each pixel of `tcl_mask`, a line through the pixel at the
    quad's average side-edge angle is intersected with the quad's upper
    and lower edges; the (y, x) offsets to both intersections plus a
    size normalizer are written into `tbo_map` channels 0..4.

    Args:
        quad: (4, 2) clockwise quad in map coordinates.
        tcl_mask: (H, W) binary mask; 1 marks pixels to be labeled.
        tbo_map: (H, W, 5) float map, modified in place and returned.
    """
    # upper and lower line function: ax + by + c = 0;
    up_line = self.line_cross_two_point(quad[0], quad[1])
    lower_line = self.line_cross_two_point(quad[3], quad[2])

    quad_h = 0.5 * (np.linalg.norm(quad[0] - quad[3]) +
                    np.linalg.norm(quad[1] - quad[2]))
    quad_w = 0.5 * (np.linalg.norm(quad[0] - quad[1]) +
                    np.linalg.norm(quad[2] - quad[3]))

    # average angle of left and right line.
    angle = self.average_angle(quad)

    xy_in_poly = np.argwhere(tcl_mask == 1)
    for y, x in xy_in_poly:
        point = (x, y)
        line = self.theta_line_cross_point(angle, point)
        cross_point_upper = self.line_cross_point(up_line, line)
        cross_point_lower = self.line_cross_point(lower_line, line)
        ##FIX, offset reverse
        upper_offset_x, upper_offset_y = cross_point_upper - point
        lower_offset_x, lower_offset_y = cross_point_lower - point
        # Channels are (y, x) ordered; channel 4 normalizes by the
        # quad's shorter side (floored at 1.0), times 2.
        tbo_map[y, x, 0] = upper_offset_y
        tbo_map[y, x, 1] = upper_offset_x
        tbo_map[y, x, 2] = lower_offset_y
        tbo_map[y, x, 3] = lower_offset_x
        tbo_map[y, x, 4] = 1.0 / max(min(quad_h, quad_w), 1.0) * 2
    return tbo_map
|
||||
|
||||
def poly2quads(self, poly):
    """
    Split a 2N-point polygon into N-1 quads.

    The polygon's first half runs along the top edge and the second
    half mirrors it along the bottom; each pair of adjacent top/bottom
    point pairs forms one clockwise quad.
    """
    n = poly.shape[0]

    # Pair every top point with its mirrored bottom point.
    pairs = np.array([[poly[i], poly[n - 1 - i]] for i in range(n // 2)])

    quads = []
    for i in range(n // 2 - 1):
        # Two adjacent pairs -> 4 points; reindex into clockwise order.
        quads.append(pairs[[i, i + 1]].reshape(4, 2)[[0, 2, 3, 1]])

    return np.array(quads)
|
||||
|
||||
def __call__(self, data):
    """
    SAST training-label pipeline: augment the image, crop a text
    region, pad to a square `input_size`, then build the TCL/TBO/TVO/
    TCO label maps.

    Args:
        data: dict with 'image' (HxWx3 BGR), 'polys' (N, P, 2) and
            'ignore_tags' (N,) entries.

    Returns:
        The enriched `data` dict, or None when the sample must be
        dropped (no valid text, too small, all-ignored, etc.).
    """
    im = data['image']
    text_polys = data['polys']
    text_tags = data['ignore_tags']
    if im is None:
        return None
    if text_polys.shape[0] == 0:
        return None

    h, w, _ = im.shape
    text_polys, text_tags, hv_tags = self.check_and_validate_polys(
        text_polys, text_tags, (h, w))

    if text_polys.shape[0] == 0:
        return None

    #set aspect ratio and keep area fix
    asp_scales = np.arange(1.0, 1.55, 0.1)
    asp_scale = np.random.choice(asp_scales)

    # Half the time squash instead of stretch; sqrt splits the scale
    # evenly between x and y so the area stays fixed.
    if np.random.rand() < 0.5:
        asp_scale = 1.0 / asp_scale
    asp_scale = math.sqrt(asp_scale)

    asp_wx = asp_scale
    asp_hy = 1.0 / asp_scale
    im = cv2.resize(im, dsize=None, fx=asp_wx, fy=asp_hy)
    text_polys[:, :, 0] *= asp_wx
    text_polys[:, :, 1] *= asp_hy

    # Cap the longer side at 2048 px, drop images shorter than 16 px.
    h, w, _ = im.shape
    if max(h, w) > 2048:
        rd_scale = 2048.0 / max(h, w)
        im = cv2.resize(im, dsize=None, fx=rd_scale, fy=rd_scale)
        text_polys *= rd_scale
    h, w, _ = im.shape
    if min(h, w) < 16:
        return None

    #no background
    im, text_polys, text_tags, hv_tags = self.crop_area(im, \
        text_polys, text_tags, hv_tags, crop_background=False)

    if text_polys.shape[0] == 0:
        return None
    #continue for all ignore case
    if np.sum((text_tags * 1.0)) >= text_tags.size:
        return None
    new_h, new_w, _ = im.shape
    if (new_h is None) or (new_w is None):
        return None
    #resize image
    std_ratio = float(self.input_size) / max(new_w, new_h)
    rand_scales = np.array(
        [0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0, 1.0, 1.0, 1.0, 1.0])
    rz_scale = std_ratio * np.random.choice(rand_scales)
    im = cv2.resize(im, dsize=None, fx=rz_scale, fy=rz_scale)
    text_polys[:, :, 0] *= rz_scale
    text_polys[:, :, 1] *= rz_scale

    #add gaussian blur (5% chance; odd kernel size 1..5)
    if np.random.rand() < 0.1 * 0.5:
        ks = np.random.permutation(5)[0] + 1
        ks = int(ks / 2) * 2 + 1
        im = cv2.GaussianBlur(im, ksize=(ks, ks), sigmaX=0, sigmaY=0)
    #add brighter (5% chance)
    if np.random.rand() < 0.1 * 0.5:
        im = im * (1.0 + np.random.rand() * 0.5)
        im = np.clip(im, 0.0, 255.0)
    #add darker (5% chance)
    if np.random.rand() < 0.1 * 0.5:
        im = im * (1.0 - np.random.rand() * 0.5)
        im = np.clip(im, 0.0, 255.0)

    # Padding the im to [input_size, input_size]
    new_h, new_w, _ = im.shape
    if min(new_w, new_h) < self.input_size * 0.5:
        return None

    # Background filled with the ImageNet channel means (BGR layout,
    # hence reversed channel indices).
    im_padded = np.ones(
        (self.input_size, self.input_size, 3), dtype=np.float32)
    im_padded[:, :, 2] = 0.485 * 255
    im_padded[:, :, 1] = 0.456 * 255
    im_padded[:, :, 0] = 0.406 * 255

    # Random the start position
    del_h = self.input_size - new_h
    del_w = self.input_size - new_w
    sh, sw = 0, 0
    if del_h > 1:
        sh = int(np.random.rand() * del_h)
    if del_w > 1:
        sw = int(np.random.rand() * del_w)

    # Padding
    im_padded[sh:sh + new_h, sw:sw + new_w, :] = im.copy()
    text_polys[:, :, 0] += sw
    text_polys[:, :, 1] += sh

    score_map, border_map, training_mask = self.generate_tcl_label(
        (self.input_size, self.input_size), text_polys, text_tags, 0.25)

    # SAST head
    tvo_map, tco_map = self.generate_tvo_and_tco(
        (self.input_size, self.input_size),
        text_polys,
        text_tags,
        tcl_ratio=0.3,
        ds_ratio=0.25)
    # print("test--------tvo_map shape:", tvo_map.shape)

    # Normalize with ImageNet mean/std (per BGR channel).
    im_padded[:, :, 2] -= 0.485 * 255
    im_padded[:, :, 1] -= 0.456 * 255
    im_padded[:, :, 0] -= 0.406 * 255
    im_padded[:, :, 2] /= (255.0 * 0.229)
    im_padded[:, :, 1] /= (255.0 * 0.224)
    im_padded[:, :, 0] /= (255.0 * 0.225)
    im_padded = im_padded.transpose((2, 0, 1))

    # Channel flip (BGR -> RGB) and CHW layout for the network.
    data['image'] = im_padded[::-1, :, :]
    data['score_map'] = score_map[np.newaxis, :, :]
    data['border_map'] = border_map.transpose((2, 0, 1))
    data['training_mask'] = training_mask[np.newaxis, :, :]
    data['tvo_map'] = tvo_map.transpose((2, 0, 1))
    data['tco_map'] = tco_map.transpose((2, 0, 1))
    return data
|
||||
60
backend/ppocr/data/imaug/ssl_img_aug.py
Normal file
60
backend/ppocr/data/imaug/ssl_img_aug.py
Normal file
@@ -0,0 +1,60 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
import cv2
|
||||
import numpy as np
|
||||
import random
|
||||
from PIL import Image
|
||||
|
||||
from .rec_img_aug import resize_norm_img
|
||||
|
||||
|
||||
class SSLRotateResize(object):
    """
    Self-supervised rotation augmentation: produce the input image at
    0/90/180/270 degrees with rotation-index labels, resized/normalized
    to `image_shape`.
    """

    def __init__(self,
                 image_shape,
                 padding=False,
                 select_all=True,
                 mode="train",
                 **kwargs):
        # Target (C, H, W) shape passed to resize_norm_img.
        self.image_shape = image_shape
        # Whether resize_norm_img pads instead of stretching.
        self.padding = padding
        # False -> keep only the 0/180 rotations (2-class task).
        self.select_all = select_all
        # "test" -> return a single (un-rotated) image/label pair.
        self.mode = mode

    def __call__(self, data):
        img = data["image"]

        # Build the three extra rotations by chaining 90-degree turns.
        data["image_r90"] = cv2.rotate(img, cv2.ROTATE_90_CLOCKWISE)
        data["image_r180"] = cv2.rotate(data["image_r90"],
                                        cv2.ROTATE_90_CLOCKWISE)
        data["image_r270"] = cv2.rotate(data["image_r180"],
                                        cv2.ROTATE_90_CLOCKWISE)

        images = []
        for key in ["image", "image_r90", "image_r180", "image_r270"]:
            # pop() removes the temporary rotated entries from `data`.
            images.append(
                resize_norm_img(
                    data.pop(key),
                    image_shape=self.image_shape,
                    padding=self.padding)[0])
        data["image"] = np.stack(images, axis=0)
        data["label"] = np.array(list(range(4)))
        if not self.select_all:
            data["image"] = data["image"][0::2]  # just choose 0 and 180
            data["label"] = data["label"][0:2]  # label needs to be continuous
        if self.mode == "test":
            data["image"] = data["image"][0]
            data["label"] = data["label"][0]
        return data
|
||||
17
backend/ppocr/data/imaug/text_image_aug/__init__.py
Normal file
17
backend/ppocr/data/imaug/text_image_aug/__init__.py
Normal file
@@ -0,0 +1,17 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .augment import tia_perspective, tia_distort, tia_stretch
|
||||
|
||||
__all__ = ['tia_distort', 'tia_stretch', 'tia_perspective']
|
||||
120
backend/ppocr/data/imaug/text_image_aug/augment.py
Normal file
120
backend/ppocr/data/imaug/text_image_aug/augment.py
Normal file
@@ -0,0 +1,120 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
This code is refer from:
|
||||
https://github.com/RubanSeven/Text-Image-Augmentation-python/blob/master/augment.py
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from .warp_mls import WarpMLS
|
||||
|
||||
|
||||
def tia_distort(src, segment=4):
    """
    Randomly distort `src` by jittering its four corners and the
    vertical cut lines, then warping with moving least squares.

    Args:
        src: input image (H, W[, C]).
        segment: number of horizontal segments; jitter amplitude is
            derived from the segment width.

    Returns:
        The warped image, same size as `src`.
    """
    img_h, img_w = src.shape[:2]

    cut = img_w // segment
    thresh = cut // 3

    # Fixed source anchors: the four corners.
    src_pts = [[0, 0], [img_w, 0], [img_w, img_h], [0, img_h]]

    # Destination corners jittered inwards by up to `thresh` pixels.
    dst_pts = [[np.random.randint(thresh), np.random.randint(thresh)]]
    dst_pts.append(
        [img_w - np.random.randint(thresh), np.random.randint(thresh)])
    dst_pts.append(
        [img_w - np.random.randint(thresh), img_h - np.random.randint(thresh)])
    dst_pts.append(
        [np.random.randint(thresh), img_h - np.random.randint(thresh)])

    half_thresh = thresh * 0.5

    # Jitter each interior vertical cut line in both x and y.
    for cut_idx in np.arange(1, segment, 1):
        x = cut * cut_idx
        src_pts.append([x, 0])
        src_pts.append([x, img_h])
        dst_pts.append([
            x + np.random.randint(thresh) - half_thresh,
            np.random.randint(thresh) - half_thresh
        ])
        dst_pts.append([
            x + np.random.randint(thresh) - half_thresh,
            img_h + np.random.randint(thresh) - half_thresh
        ])

    warper = WarpMLS(src, src_pts, dst_pts, img_w, img_h)
    return warper.generate()
|
||||
|
||||
|
||||
def tia_stretch(src, segment=4):
    """
    Randomly stretch `src` horizontally: each interior vertical cut
    line is shifted left/right by a random amount while the outer
    border stays pinned in place.

    Args:
        src: input image (H, W[, C]).
        segment: number of horizontal segments.

    Returns:
        The warped image, same size as `src`.
    """
    img_h, img_w = src.shape[:2]

    cut = img_w // segment
    thresh = cut * 4 // 5

    # Border corners are identical in source and destination (pinned).
    src_pts = [[0, 0], [img_w, 0], [img_w, img_h], [0, img_h]]
    dst_pts = [[0, 0], [img_w, 0], [img_w, img_h], [0, img_h]]

    half_thresh = thresh * 0.5

    for cut_idx in np.arange(1, segment, 1):
        # One horizontal shift per cut line, applied top and bottom.
        move = np.random.randint(thresh) - half_thresh
        x = cut * cut_idx
        src_pts.append([x, 0])
        src_pts.append([x, img_h])
        dst_pts.append([x + move, 0])
        dst_pts.append([x + move, img_h])

    warper = WarpMLS(src, src_pts, dst_pts, img_w, img_h)
    return warper.generate()
|
||||
|
||||
|
||||
def tia_perspective(src):
    """
    Random perspective-like warp: the top and bottom edges are moved
    vertically by up to half the image height, corners independently.

    Args:
        src: input image (H, W[, C]).

    Returns:
        The warped image, same size as `src`.
    """
    img_h, img_w = src.shape[:2]

    thresh = img_h // 2

    src_pts = [[0, 0], [img_w, 0], [img_w, img_h], [0, img_h]]

    dst_pts = [[0, np.random.randint(thresh)]]
    dst_pts.append([img_w, np.random.randint(thresh)])
    dst_pts.append([img_w, img_h - np.random.randint(thresh)])
    dst_pts.append([0, img_h - np.random.randint(thresh)])

    warper = WarpMLS(src, src_pts, dst_pts, img_w, img_h)
    return warper.generate()
|
||||
168
backend/ppocr/data/imaug/text_image_aug/warp_mls.py
Normal file
168
backend/ppocr/data/imaug/text_image_aug/warp_mls.py
Normal file
@@ -0,0 +1,168 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
This code is refer from:
|
||||
https://github.com/RubanSeven/Text-Image-Augmentation-python/blob/master/warp_mls.py
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
class WarpMLS:
|
||||
def __init__(self, src, src_pts, dst_pts, dst_w, dst_h, trans_ratio=1.):
|
||||
self.src = src
|
||||
self.src_pts = src_pts
|
||||
self.dst_pts = dst_pts
|
||||
self.pt_count = len(self.dst_pts)
|
||||
self.dst_w = dst_w
|
||||
self.dst_h = dst_h
|
||||
self.trans_ratio = trans_ratio
|
||||
self.grid_size = 100
|
||||
self.rdx = np.zeros((self.dst_h, self.dst_w))
|
||||
self.rdy = np.zeros((self.dst_h, self.dst_w))
|
||||
|
||||
@staticmethod
|
||||
def __bilinear_interp(x, y, v11, v12, v21, v22):
|
||||
return (v11 * (1 - y) + v12 * y) * (1 - x) + (v21 *
|
||||
(1 - y) + v22 * y) * x
|
||||
|
||||
def generate(self):
|
||||
self.calc_delta()
|
||||
return self.gen_img()
|
||||
|
||||
def calc_delta(self):
|
||||
w = np.zeros(self.pt_count, dtype=np.float32)
|
||||
|
||||
if self.pt_count < 2:
|
||||
return
|
||||
|
||||
i = 0
|
||||
while 1:
|
||||
if self.dst_w <= i < self.dst_w + self.grid_size - 1:
|
||||
i = self.dst_w - 1
|
||||
elif i >= self.dst_w:
|
||||
break
|
||||
|
||||
j = 0
|
||||
while 1:
|
||||
if self.dst_h <= j < self.dst_h + self.grid_size - 1:
|
||||
j = self.dst_h - 1
|
||||
elif j >= self.dst_h:
|
||||
break
|
||||
|
||||
sw = 0
|
||||
swp = np.zeros(2, dtype=np.float32)
|
||||
swq = np.zeros(2, dtype=np.float32)
|
||||
new_pt = np.zeros(2, dtype=np.float32)
|
||||
cur_pt = np.array([i, j], dtype=np.float32)
|
||||
|
||||
k = 0
|
||||
for k in range(self.pt_count):
|
||||
if i == self.dst_pts[k][0] and j == self.dst_pts[k][1]:
|
||||
break
|
||||
|
||||
w[k] = 1. / (
|
||||
(i - self.dst_pts[k][0]) * (i - self.dst_pts[k][0]) +
|
||||
(j - self.dst_pts[k][1]) * (j - self.dst_pts[k][1]))
|
||||
|
||||
sw += w[k]
|
||||
swp = swp + w[k] * np.array(self.dst_pts[k])
|
||||
swq = swq + w[k] * np.array(self.src_pts[k])
|
||||
|
||||
if k == self.pt_count - 1:
|
||||
pstar = 1 / sw * swp
|
||||
qstar = 1 / sw * swq
|
||||
|
||||
miu_s = 0
|
||||
for k in range(self.pt_count):
|
||||
if i == self.dst_pts[k][0] and j == self.dst_pts[k][1]:
|
||||
continue
|
||||
pt_i = self.dst_pts[k] - pstar
|
||||
miu_s += w[k] * np.sum(pt_i * pt_i)
|
||||
|
||||
cur_pt -= pstar
|
||||
cur_pt_j = np.array([-cur_pt[1], cur_pt[0]])
|
||||
|
||||
for k in range(self.pt_count):
|
||||
if i == self.dst_pts[k][0] and j == self.dst_pts[k][1]:
|
||||
continue
|
||||
|
||||
pt_i = self.dst_pts[k] - pstar
|
||||
pt_j = np.array([-pt_i[1], pt_i[0]])
|
||||
|
||||
tmp_pt = np.zeros(2, dtype=np.float32)
|
||||
tmp_pt[0] = np.sum(pt_i * cur_pt) * self.src_pts[k][0] - \
|
||||
np.sum(pt_j * cur_pt) * self.src_pts[k][1]
|
||||
tmp_pt[1] = -np.sum(pt_i * cur_pt_j) * self.src_pts[k][0] + \
|
||||
np.sum(pt_j * cur_pt_j) * self.src_pts[k][1]
|
||||
tmp_pt *= (w[k] / miu_s)
|
||||
new_pt += tmp_pt
|
||||
|
||||
new_pt += qstar
|
||||
else:
|
||||
new_pt = self.src_pts[k]
|
||||
|
||||
self.rdx[j, i] = new_pt[0] - i
|
||||
self.rdy[j, i] = new_pt[1] - j
|
||||
|
||||
j += self.grid_size
|
||||
i += self.grid_size
|
||||
|
||||
def gen_img(self):
    """Warp ``self.src`` using the precomputed per-grid-node offset fields.

    For each grid cell of the destination image, the node offsets stored in
    ``self.rdx`` / ``self.rdy`` are bilinearly interpolated to every pixel,
    each destination pixel is mapped back into the source image, and the
    source is sampled bilinearly at that location.

    Returns:
        The warped image as a uint8 array with the same shape as ``self.src``.
    """
    src_h, src_w = self.src.shape[:2]
    dst = np.zeros_like(self.src, dtype=np.float32)

    # Process the destination image one grid cell at a time.
    for i in np.arange(0, self.dst_h, self.grid_size):
        for j in np.arange(0, self.dst_w, self.grid_size):
            # (ni, nj) is the opposite corner of the cell; the last
            # (partial) row/column cell is clamped to the image border.
            ni = i + self.grid_size
            nj = j + self.grid_size
            w = h = self.grid_size
            if ni >= self.dst_h:
                ni = self.dst_h - 1
                h = ni - i + 1
            if nj >= self.dst_w:
                nj = self.dst_w - 1
                w = nj - j + 1

            # Fractional position of every pixel inside the cell.
            di = np.reshape(np.arange(h), (-1, 1))
            dj = np.reshape(np.arange(w), (1, -1))
            # Interpolate the x/y offsets stored at the 4 cell corners.
            delta_x = self.__bilinear_interp(
                di / h, dj / w, self.rdx[i, j], self.rdx[i, nj],
                self.rdx[ni, j], self.rdx[ni, nj])
            delta_y = self.__bilinear_interp(
                di / h, dj / w, self.rdy[i, j], self.rdy[i, nj],
                self.rdy[ni, j], self.rdy[ni, nj])
            # Source coordinates for each destination pixel, scaled by the
            # transform strength and clamped into the source bounds.
            nx = j + dj + delta_x * self.trans_ratio
            ny = i + di + delta_y * self.trans_ratio
            nx = np.clip(nx, 0, src_w - 1)
            ny = np.clip(ny, 0, src_h - 1)
            # Integer neighbours used for bilinear sampling of the source.
            nxi = np.array(np.floor(nx), dtype=np.int32)
            nyi = np.array(np.floor(ny), dtype=np.int32)
            nxi1 = np.array(np.ceil(nx), dtype=np.int32)
            nyi1 = np.array(np.ceil(ny), dtype=np.int32)

            if len(self.src.shape) == 3:
                # Color image: broadcast the fractional weights over the
                # 3 channels so they line up with the sampled pixels.
                x = np.tile(np.expand_dims(ny - nyi, axis=-1), (1, 1, 3))
                y = np.tile(np.expand_dims(nx - nxi, axis=-1), (1, 1, 3))
            else:
                x = ny - nyi
                y = nx - nxi
            # Sample the source at the 4 neighbours and blend.
            dst[i:i + h, j:j + w] = self.__bilinear_interp(
                x, y, self.src[nyi, nxi], self.src[nyi, nxi1],
                self.src[nyi1, nxi], self.src[nyi1, nxi1])

    dst = np.clip(dst, 0, 255)
    dst = np.array(dst, dtype=np.uint8)

    return dst
|
||||
19
backend/ppocr/data/imaug/vqa/__init__.py
Normal file
19
backend/ppocr/data/imaug/vqa/__init__.py
Normal file
@@ -0,0 +1,19 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .token import VQATokenPad, VQASerTokenChunk, VQAReTokenChunk, VQAReTokenRelation
|
||||
|
||||
__all__ = [
|
||||
'VQATokenPad', 'VQASerTokenChunk', 'VQAReTokenChunk', 'VQAReTokenRelation'
|
||||
]
|
||||
17
backend/ppocr/data/imaug/vqa/token/__init__.py
Normal file
17
backend/ppocr/data/imaug/vqa/token/__init__.py
Normal file
@@ -0,0 +1,17 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .vqa_token_chunk import VQASerTokenChunk, VQAReTokenChunk
|
||||
from .vqa_token_pad import VQATokenPad
|
||||
from .vqa_token_relation import VQAReTokenRelation
|
||||
122
backend/ppocr/data/imaug/vqa/token/vqa_token_chunk.py
Normal file
122
backend/ppocr/data/imaug/vqa/token/vqa_token_chunk.py
Normal file
@@ -0,0 +1,122 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
class VQASerTokenChunk(object):
    """Window over-long SER samples down to ``max_seq_len`` tokens.

    Per-token sequence fields are sliced into ``max_seq_len`` windows while
    every other field is copied through unchanged.  Only the first window is
    returned; an empty input sequence yields ``None``.
    """

    # Fields whose values are per-token sequences and therefore get sliced.
    _SEQUENCE_KEYS = ('label', 'input_ids', 'labels', 'token_type_ids',
                      'bbox', 'attention_mask')

    def __init__(self, max_seq_len=512, infer_mode=False, **kwargs):
        self.max_seq_len = max_seq_len
        self.infer_mode = infer_mode

    def __call__(self, data):
        """Return the first ``max_seq_len`` chunk of `data`, or None."""
        chunks = []
        total_len = len(data['input_ids'])
        for start in range(0, total_len, self.max_seq_len):
            stop = min(start + self.max_seq_len, total_len)
            piece = {}
            for key, value in data.items():
                if key not in self._SEQUENCE_KEYS:
                    piece[key] = value
                elif self.infer_mode and key == 'labels':
                    # At inference time 'labels' is passed through whole.
                    piece[key] = value
                else:
                    piece[key] = value[start:stop]
            chunks.append(piece)
        if not chunks:
            return None
        return chunks[0]
|
||||
|
||||
|
||||
class VQAReTokenChunk(object):
    """Chunk over-long relation-extraction samples to ``max_seq_len``.

    Sequence fields are windowed; entities and relations are filtered to
    those fully contained in the window and re-indexed relative to it.
    Only the first chunk that still contains entities is returned
    (``None`` when no chunk qualifies).
    """

    def __init__(self,
                 max_seq_len=512,
                 entities_labels=None,
                 infer_mode=False,
                 **kwargs):
        self.max_seq_len = max_seq_len
        # Default label ids for the three standard entity classes.
        self.entities_labels = {
            'HEADER': 0,
            'QUESTION': 1,
            'ANSWER': 2
        } if entities_labels is None else entities_labels
        self.infer_mode = infer_mode

    def __call__(self, data):
        # prepare data
        entities = data.pop('entities')
        relations = data.pop('relations')
        encoded_inputs_all = []
        for index in range(0, len(data["input_ids"]), self.max_seq_len):
            item = {}
            for key in data:
                if key in [
                        'label', 'input_ids', 'labels', 'token_type_ids',
                        'bbox', 'attention_mask'
                ]:
                    if self.infer_mode and key == 'labels':
                        # At inference time 'labels' is passed through whole.
                        item[key] = data[key]
                    else:
                        item[key] = data[key][index:index + self.max_seq_len]
                else:
                    item[key] = data[key]
            # select entity in current chunk
            entities_in_this_span = []
            global_to_local_map = {}  # global entity id -> index inside chunk
            for entity_id, entity in enumerate(entities):
                if (index <= entity["start"] < index + self.max_seq_len and
                        index <= entity["end"] < index + self.max_seq_len):
                    # NOTE: start/end are shifted in place, so the caller's
                    # entity dicts are mutated across chunk iterations.
                    entity["start"] = entity["start"] - index
                    entity["end"] = entity["end"] - index
                    global_to_local_map[entity_id] = len(entities_in_this_span)
                    entities_in_this_span.append(entity)

            # select relations in current chunk (both endpoints in-window)
            relations_in_this_span = []
            for relation in relations:
                if (index <= relation["start_index"] < index + self.max_seq_len
                        and index <= relation["end_index"] <
                        index + self.max_seq_len):
                    relations_in_this_span.append({
                        "head": global_to_local_map[relation["head"]],
                        "tail": global_to_local_map[relation["tail"]],
                        "start_index": relation["start_index"] - index,
                        "end_index": relation["end_index"] - index,
                    })
            item.update({
                "entities": self.reformat(entities_in_this_span),
                "relations": self.reformat(relations_in_this_span),
            })
            if len(item['entities']) > 0:
                # Map entity label names to their integer class ids.
                item['entities']['label'] = [
                    self.entities_labels[x] for x in item['entities']['label']
                ]
                encoded_inputs_all.append(item)
        if len(encoded_inputs_all) == 0:
            return None
        return encoded_inputs_all[0]

    def reformat(self, data):
        """Convert a list of dicts into a dict of lists (column-wise)."""
        new_data = defaultdict(list)
        for item in data:
            for k, v in item.items():
                new_data[k].append(v)
        return new_data
|
||||
104
backend/ppocr/data/imaug/vqa/token/vqa_token_pad.py
Normal file
104
backend/ppocr/data/imaug/vqa/token/vqa_token_pad.py
Normal file
@@ -0,0 +1,104 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import paddle
|
||||
import numpy as np
|
||||
|
||||
|
||||
class VQATokenPad(object):
    """Pad VQA token-level fields to a fixed length and cast to int64.

    Pads ``input_ids``, ``token_type_ids``, ``labels``, ``bbox``,
    ``attention_mask`` (and optionally ``special_tokens_mask``) up to
    ``max_seq_len``, then converts the sequence fields to int64 numpy
    arrays.  Padding side and pad ids are taken from
    ``data['tokenizer_params']`` when present, else from built-in defaults.

    Args:
        max_seq_len: target sequence length.
        pad_to_max_seq_len: whether short sequences are padded up to
            ``max_seq_len``.
        return_attention_mask: build/refresh ``attention_mask``.
        return_token_type_ids: pad ``token_type_ids``.
        truncation_strategy: accepted for API compatibility; unused here.
        return_overflowing_tokens: accepted for API compatibility; unused
            here.
        return_special_tokens_mask: pad ``special_tokens_mask``.
        infer_mode: when True, ``labels`` is neither padded nor cast.
    """

    def __init__(self,
                 max_seq_len=512,
                 pad_to_max_seq_len=True,
                 return_attention_mask=True,
                 return_token_type_ids=True,
                 truncation_strategy="longest_first",
                 return_overflowing_tokens=False,
                 return_special_tokens_mask=False,
                 infer_mode=False,
                 **kwargs):
        self.max_seq_len = max_seq_len
        # BUGFIX: this previously read `= max_seq_len`, which made the flag
        # always truthy and silently ignored `pad_to_max_seq_len=False`.
        self.pad_to_max_seq_len = pad_to_max_seq_len
        self.return_attention_mask = return_attention_mask
        self.return_token_type_ids = return_token_type_ids
        self.truncation_strategy = truncation_strategy
        self.return_overflowing_tokens = return_overflowing_tokens
        self.return_special_tokens_mask = return_special_tokens_mask
        # Padded label positions use the loss's ignore index so they do not
        # contribute to training loss.
        self.pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index
        self.infer_mode = infer_mode

    def __call__(self, data):
        needs_to_be_padded = self.pad_to_max_seq_len and len(data[
            "input_ids"]) < self.max_seq_len

        if needs_to_be_padded:
            if 'tokenizer_params' in data:
                tokenizer_params = data.pop('tokenizer_params')
            else:
                tokenizer_params = dict(
                    padding_side='right', pad_token_type_id=0, pad_token_id=1)

            difference = self.max_seq_len - len(data["input_ids"])
            if tokenizer_params['padding_side'] == 'right':
                # attention_mask is derived from the *unpadded* length, so
                # it must be built before input_ids is extended below.
                if self.return_attention_mask:
                    data["attention_mask"] = [1] * len(data[
                        "input_ids"]) + [0] * difference
                if self.return_token_type_ids:
                    data["token_type_ids"] = (
                        data["token_type_ids"] +
                        [tokenizer_params['pad_token_type_id']] * difference)
                if self.return_special_tokens_mask:
                    data["special_tokens_mask"] = data[
                        "special_tokens_mask"] + [1] * difference
                data["input_ids"] = data["input_ids"] + [
                    tokenizer_params['pad_token_id']
                ] * difference
                if not self.infer_mode:
                    data["labels"] = data[
                        "labels"] + [self.pad_token_label_id] * difference
                data["bbox"] = data["bbox"] + [[0, 0, 0, 0]] * difference
            elif tokenizer_params['padding_side'] == 'left':
                if self.return_attention_mask:
                    data["attention_mask"] = [0] * difference + [
                        1
                    ] * len(data["input_ids"])
                if self.return_token_type_ids:
                    data["token_type_ids"] = (
                        [tokenizer_params['pad_token_type_id']] * difference +
                        data["token_type_ids"])
                if self.return_special_tokens_mask:
                    data["special_tokens_mask"] = [
                        1
                    ] * difference + data["special_tokens_mask"]
                data["input_ids"] = [tokenizer_params['pad_token_id']
                                     ] * difference + data["input_ids"]
                if not self.infer_mode:
                    data["labels"] = [self.pad_token_label_id
                                      ] * difference + data["labels"]
                data["bbox"] = [[0, 0, 0, 0]] * difference + data["bbox"]
        else:
            # No padding requested/needed: still refresh attention_mask.
            if self.return_attention_mask:
                data["attention_mask"] = [1] * len(data["input_ids"])

        for key in data:
            if key in [
                    'input_ids', 'labels', 'token_type_ids', 'bbox',
                    'attention_mask'
            ]:
                if self.infer_mode:
                    if key != 'labels':
                        # Truncate over-long sequences at inference time.
                        length = min(len(data[key]), self.max_seq_len)
                        data[key] = data[key][:length]
                    else:
                        continue
                data[key] = np.array(data[key], dtype='int64')
        return data
|
||||
67
backend/ppocr/data/imaug/vqa/token/vqa_token_relation.py
Normal file
67
backend/ppocr/data/imaug/vqa/token/vqa_token_relation.py
Normal file
@@ -0,0 +1,67 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
class VQAReTokenRelation(object):
    """Derive directed question->answer relation records for RE training."""

    def __init__(self, **kwargs):
        pass

    def __call__(self, data):
        """
        build relations
        """
        entities = data['entities']
        raw_links = data['relations']
        id2label = data.pop('id2label')
        empty_entity = data.pop('empty_entity')
        entity_id_to_index_map = data.pop('entity_id_to_index_map')

        # De-duplicate links and drop any touching an empty entity.
        candidate_links = [
            link for link in set(raw_links)
            if link[0] not in empty_entity and link[1] not in empty_entity
        ]

        # Orient each surviving link as head=question, tail=answer; links
        # between any other label pair are discarded.
        kv_relations = []
        for link in candidate_links:
            label_pair = [id2label[link[0]], id2label[link[1]]]
            if label_pair == ["question", "answer"]:
                question_id, answer_id = link[0], link[1]
            elif label_pair == ["answer", "question"]:
                question_id, answer_id = link[1], link[0]
            else:
                continue
            kv_relations.append({
                "head": entity_id_to_index_map[question_id],
                "tail": entity_id_to_index_map[answer_id]
            })

        # Attach the covered token span and order by head index.
        enriched = []
        for rel in kv_relations:
            start_index, end_index = self.get_relation_span(rel, entities)
            enriched.append({
                "head": rel["head"],
                "tail": rel["tail"],
                "start_index": start_index,
                "end_index": end_index,
            })
        data['relations'] = sorted(enriched, key=lambda rel: rel["head"])
        return data

    def get_relation_span(self, rel, entities):
        """Return the (min start, max end) token span of head+tail."""
        bounds = []
        for entity_index in (rel["head"], rel["tail"]):
            bounds.append(entities[entity_index]["start"])
            bounds.append(entities[entity_index]["end"])
        return min(bounds), max(bounds)
|
||||
118
backend/ppocr/data/lmdb_dataset.py
Normal file
118
backend/ppocr/data/lmdb_dataset.py
Normal file
@@ -0,0 +1,118 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import numpy as np
|
||||
import os
|
||||
from paddle.io import Dataset
|
||||
import lmdb
|
||||
import cv2
|
||||
|
||||
from .imaug import transform, create_operators
|
||||
|
||||
|
||||
class LMDBDataSet(Dataset):
    """Recognition dataset backed by one or more LMDB databases.

    Walks ``data_dir`` for leaf directories, opens each as a read-only LMDB
    environment, and exposes all samples of all environments through one
    flat, optionally shuffled, index space.
    """

    def __init__(self, config, mode, logger, seed=None):
        super(LMDBDataSet, self).__init__()

        global_config = config['Global']
        dataset_config = config[mode]['dataset']
        loader_config = config[mode]['loader']
        # NOTE(review): batch_size is read but never used in this class.
        batch_size = loader_config['batch_size_per_card']
        data_dir = dataset_config['data_dir']
        self.do_shuffle = loader_config['shuffle']

        self.lmdb_sets = self.load_hierarchical_lmdb_dataset(data_dir)
        logger.info("Initialize indexs of datasets:%s" % data_dir)
        # (N, 2) array of (lmdb set index, sample index) covering all samples.
        self.data_idx_order_list = self.dataset_traversal()
        if self.do_shuffle:
            np.random.shuffle(self.data_idx_order_list)
        self.ops = create_operators(dataset_config['transforms'], global_config)

        ratio_list = dataset_config.get("ratio_list", [1.0])
        # Any sub-sampled source (ratio < 1) requires re-sampling per epoch.
        self.need_reset = True in [x < 1 for x in ratio_list]

    def load_hierarchical_lmdb_dataset(self, data_dir):
        """Open every leaf directory under `data_dir` as a read-only LMDB env.

        Returns a dict: dataset index -> {dirpath, env, txn, num_samples}.
        """
        lmdb_sets = {}
        dataset_idx = 0
        for dirpath, dirnames, filenames in os.walk(data_dir + '/'):
            # A leaf directory (no subdirectories) is treated as one LMDB.
            if not dirnames:
                env = lmdb.open(
                    dirpath,
                    max_readers=32,
                    readonly=True,
                    lock=False,
                    readahead=False,
                    meminit=False)
                txn = env.begin(write=False)
                # Each LMDB stores its sample count under the key
                # 'num-samples'.
                num_samples = int(txn.get('num-samples'.encode()))
                lmdb_sets[dataset_idx] = {"dirpath":dirpath, "env":env, \
                    "txn":txn, "num_samples":num_samples}
                dataset_idx += 1
        return lmdb_sets

    def dataset_traversal(self):
        """Build an (N, 2) array of (lmdb set index, 1-based sample index)."""
        lmdb_num = len(self.lmdb_sets)
        total_sample_num = 0
        for lno in range(lmdb_num):
            total_sample_num += self.lmdb_sets[lno]['num_samples']
        data_idx_order_list = np.zeros((total_sample_num, 2))
        beg_idx = 0
        for lno in range(lmdb_num):
            tmp_sample_num = self.lmdb_sets[lno]['num_samples']
            end_idx = beg_idx + tmp_sample_num
            data_idx_order_list[beg_idx:end_idx, 0] = lno
            data_idx_order_list[beg_idx:end_idx, 1] \
                = list(range(tmp_sample_num))
            # LMDB sample keys are 1-based ('image-000000001', ...).
            data_idx_order_list[beg_idx:end_idx, 1] += 1
            beg_idx = beg_idx + tmp_sample_num
        return data_idx_order_list

    def get_img_data(self, value):
        """get_img_data"""
        # Decode a raw image buffer; return None on any failure so the
        # caller can resample a different item.
        if not value:
            return None
        imgdata = np.frombuffer(value, dtype='uint8')
        if imgdata is None:
            return None
        imgori = cv2.imdecode(imgdata, 1)
        if imgori is None:
            return None
        return imgori

    def get_lmdb_sample_info(self, txn, index):
        # Keys follow the common OCR-LMDB layout: label-%09d / image-%09d.
        label_key = 'label-%09d'.encode() % index
        label = txn.get(label_key)
        if label is None:
            return None
        label = label.decode('utf-8')
        img_key = 'image-%09d'.encode() % index
        imgbuf = txn.get(img_key)
        return imgbuf, label

    def __getitem__(self, idx):
        lmdb_idx, file_idx = self.data_idx_order_list[idx]
        lmdb_idx = int(lmdb_idx)
        file_idx = int(file_idx)
        sample_info = self.get_lmdb_sample_info(self.lmdb_sets[lmdb_idx]['txn'],
                                                file_idx)
        if sample_info is None:
            # Missing sample: substitute a random other sample.
            return self.__getitem__(np.random.randint(self.__len__()))
        img, label = sample_info
        data = {'image': img, 'label': label}
        outs = transform(data, self.ops)
        if outs is None:
            # Transform rejected the sample: substitute a random one.
            return self.__getitem__(np.random.randint(self.__len__()))
        return outs

    def __len__(self):
        return self.data_idx_order_list.shape[0]
|
||||
106
backend/ppocr/data/pgnet_dataset.py
Normal file
106
backend/ppocr/data/pgnet_dataset.py
Normal file
@@ -0,0 +1,106 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import numpy as np
|
||||
import os
|
||||
from paddle.io import Dataset
|
||||
from .imaug import transform, create_operators
|
||||
import random
|
||||
|
||||
|
||||
class PGDataSet(Dataset):
    """PGNet dataset: one (image path, label) pair per label-file line."""

    def __init__(self, config, mode, logger, seed=None):
        super(PGDataSet, self).__init__()

        self.logger = logger
        self.seed = seed
        self.mode = mode
        global_config = config['Global']
        dataset_config = config[mode]['dataset']
        loader_config = config[mode]['loader']

        self.delimiter = dataset_config.get('delimiter', '\t')
        label_file_list = dataset_config.pop('label_file_list')
        data_source_num = len(label_file_list)
        ratio_list = dataset_config.get("ratio_list", [1.0])
        if isinstance(ratio_list, (float, int)):
            # A scalar ratio applies uniformly to every label file.
            ratio_list = [float(ratio_list)] * int(data_source_num)
        assert len(
            ratio_list
        ) == data_source_num, "The length of ratio_list should be the same as the file_list."
        self.data_dir = dataset_config['data_dir']
        self.do_shuffle = loader_config['shuffle']

        logger.info("Initialize indexs of datasets:%s" % label_file_list)
        self.data_lines = self.get_image_info_list(label_file_list, ratio_list)
        self.data_idx_order_list = list(range(len(self.data_lines)))
        if mode.lower() == "train":
            self.shuffle_data_random()

        self.ops = create_operators(dataset_config['transforms'], global_config)

        # Sub-sampled sources (ratio < 1) need re-sampling every epoch.
        self.need_reset = True in [x < 1 for x in ratio_list]

    def shuffle_data_random(self):
        # Seeded so every worker process shuffles identically.
        if self.do_shuffle:
            random.seed(self.seed)
            random.shuffle(self.data_lines)
        return

    def get_image_info_list(self, file_list, ratio_list):
        """Read all label files, sub-sampling each by its ratio."""
        if isinstance(file_list, str):
            file_list = [file_list]
        data_lines = []
        for idx, file in enumerate(file_list):
            with open(file, "rb") as f:
                lines = f.readlines()
                if self.mode == "train" or ratio_list[idx] < 1.0:
                    random.seed(self.seed)
                    lines = random.sample(lines,
                                          round(len(lines) * ratio_list[idx]))
                data_lines.extend(lines)
        return data_lines

    def __getitem__(self, idx):
        file_idx = self.data_idx_order_list[idx]
        data_line = self.data_lines[file_idx]
        img_id = 0
        try:
            data_line = data_line.decode('utf-8')
            substr = data_line.strip("\n").split(self.delimiter)
            file_name = substr[0]
            label = substr[1]
            img_path = os.path.join(self.data_dir, file_name)
            if self.mode.lower() == 'eval':
                try:
                    # NOTE(review): assumes eval file names encode a numeric
                    # image id after the first 7 characters — confirm against
                    # the dataset's naming convention.
                    img_id = int(data_line.split(".")[0][7:])
                except:
                    img_id = 0
            data = {'img_path': img_path, 'label': label, 'img_id': img_id}
            if not os.path.exists(img_path):
                raise Exception("{} does not exist!".format(img_path))
            with open(data['img_path'], 'rb') as f:
                img = f.read()
                data['image'] = img
            outs = transform(data, self.ops)
        except Exception as e:
            self.logger.error(
                "When parsing line {}, error happened with msg: {}".format(
                    self.data_idx_order_list[idx], e))
            outs = None
        if outs is None:
            # On any failure, substitute a random other sample.
            return self.__getitem__(np.random.randint(self.__len__()))
        return outs

    def __len__(self):
        return len(self.data_idx_order_list)
|
||||
114
backend/ppocr/data/pubtab_dataset.py
Normal file
114
backend/ppocr/data/pubtab_dataset.py
Normal file
@@ -0,0 +1,114 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import numpy as np
|
||||
import os
|
||||
import random
|
||||
from paddle.io import Dataset
|
||||
import json
|
||||
|
||||
from .imaug import transform, create_operators
|
||||
|
||||
|
||||
class PubTabDataSet(Dataset):
    """Table-recognition dataset with one JSON record per label-file line.

    Each record carries a filename plus HTML structure/cell annotations;
    optional hard-sample and complex-table sub-sampling can drop items
    probabilistically during training.
    """

    def __init__(self, config, mode, logger, seed=None):
        super(PubTabDataSet, self).__init__()
        self.logger = logger

        global_config = config['Global']
        dataset_config = config[mode]['dataset']
        loader_config = config[mode]['loader']

        label_file_path = dataset_config.pop('label_file_path')

        self.data_dir = dataset_config['data_dir']
        self.do_shuffle = loader_config['shuffle']
        self.do_hard_select = False
        if 'hard_select' in loader_config:
            self.do_hard_select = loader_config['hard_select']
            self.hard_prob = loader_config['hard_prob']
        if self.do_hard_select:
            # NOTE(review): load_hard_select_prob is not defined in this
            # class — verify it exists (e.g. in a subclass or patched in)
            # before enabling hard_select.
            self.img_select_prob = self.load_hard_select_prob()
        self.table_select_type = None
        if 'table_select_type' in loader_config:
            self.table_select_type = loader_config['table_select_type']
            self.table_select_prob = loader_config['table_select_prob']

        self.seed = seed
        logger.info("Initialize indexs of datasets:%s" % label_file_path)
        with open(label_file_path, "rb") as f:
            self.data_lines = f.readlines()
        self.data_idx_order_list = list(range(len(self.data_lines)))
        if mode.lower() == "train":
            self.shuffle_data_random()
        self.ops = create_operators(dataset_config['transforms'], global_config)

        ratio_list = dataset_config.get("ratio_list", [1.0])
        # Sub-sampled sources (ratio < 1) need re-sampling every epoch.
        self.need_reset = True in [x < 1 for x in ratio_list]

    def shuffle_data_random(self):
        # Seeded so every worker process shuffles identically.
        if self.do_shuffle:
            random.seed(self.seed)
            random.shuffle(self.data_lines)
        return

    def __getitem__(self, idx):
        try:
            data_line = self.data_lines[idx]
            data_line = data_line.decode('utf-8').strip("\n")
            info = json.loads(data_line)
            file_name = info['filename']
            select_flag = True
            if self.do_hard_select:
                # Keep hard samples with their per-image probability.
                prob = self.img_select_prob[file_name]
                if prob < random.uniform(0, 1):
                    select_flag = False

            if self.table_select_type:
                # Tables with row/col spans are "complex" and can be
                # down-sampled via table_select_prob.
                structure = info['html']['structure']['tokens'].copy()
                structure_str = ''.join(structure)
                table_type = "simple"
                if 'colspan' in structure_str or 'rowspan' in structure_str:
                    table_type = "complex"
                if table_type == "complex":
                    if self.table_select_prob < random.uniform(0, 1):
                        select_flag = False

            if select_flag:
                cells = info['html']['cells'].copy()
                structure = info['html']['structure'].copy()
                img_path = os.path.join(self.data_dir, file_name)
                data = {
                    'img_path': img_path,
                    'cells': cells,
                    'structure': structure
                }
                if not os.path.exists(img_path):
                    raise Exception("{} does not exist!".format(img_path))
                with open(data['img_path'], 'rb') as f:
                    img = f.read()
                    data['image'] = img
                outs = transform(data, self.ops)
            else:
                outs = None
        except Exception as e:
            self.logger.error(
                "When parsing line {}, error happened with msg: {}".format(
                    data_line, e))
            outs = None
        if outs is None:
            # Dropped or failed samples are replaced by a random one.
            return self.__getitem__(np.random.randint(self.__len__()))
        return outs

    def __len__(self):
        return len(self.data_idx_order_list)
|
||||
151
backend/ppocr/data/simple_dataset.py
Normal file
151
backend/ppocr/data/simple_dataset.py
Normal file
@@ -0,0 +1,151 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import numpy as np
|
||||
import os
|
||||
import json
|
||||
import random
|
||||
import traceback
|
||||
from paddle.io import Dataset
|
||||
from .imaug import transform, create_operators
|
||||
|
||||
|
||||
class SimpleDataSet(Dataset):
|
||||
def __init__(self, config, mode, logger, seed=None):
|
||||
super(SimpleDataSet, self).__init__()
|
||||
self.logger = logger
|
||||
self.mode = mode.lower()
|
||||
|
||||
global_config = config['Global']
|
||||
dataset_config = config[mode]['dataset']
|
||||
loader_config = config[mode]['loader']
|
||||
|
||||
self.delimiter = dataset_config.get('delimiter', '\t')
|
||||
label_file_list = dataset_config.pop('label_file_list')
|
||||
data_source_num = len(label_file_list)
|
||||
ratio_list = dataset_config.get("ratio_list", [1.0])
|
||||
if isinstance(ratio_list, (float, int)):
|
||||
ratio_list = [float(ratio_list)] * int(data_source_num)
|
||||
|
||||
assert len(
|
||||
ratio_list
|
||||
) == data_source_num, "The length of ratio_list should be the same as the file_list."
|
||||
self.data_dir = dataset_config['data_dir']
|
||||
self.do_shuffle = loader_config['shuffle']
|
||||
self.seed = seed
|
||||
logger.info("Initialize indexs of datasets:%s" % label_file_list)
|
||||
self.data_lines = self.get_image_info_list(label_file_list, ratio_list)
|
||||
self.data_idx_order_list = list(range(len(self.data_lines)))
|
||||
if self.mode == "train" and self.do_shuffle:
|
||||
self.shuffle_data_random()
|
||||
self.ops = create_operators(dataset_config['transforms'], global_config)
|
||||
self.ext_op_transform_idx = dataset_config.get("ext_op_transform_idx",
|
||||
2)
|
||||
self.need_reset = True in [x < 1 for x in ratio_list]
|
||||
|
||||
def get_image_info_list(self, file_list, ratio_list):
|
||||
if isinstance(file_list, str):
|
||||
file_list = [file_list]
|
||||
data_lines = []
|
||||
for idx, file in enumerate(file_list):
|
||||
with open(file, "rb") as f:
|
||||
lines = f.readlines()
|
||||
if self.mode == "train" or ratio_list[idx] < 1.0:
|
||||
random.seed(self.seed)
|
||||
lines = random.sample(lines,
|
||||
round(len(lines) * ratio_list[idx]))
|
||||
data_lines.extend(lines)
|
||||
return data_lines
|
||||
|
||||
def shuffle_data_random(self):
|
||||
random.seed(self.seed)
|
||||
random.shuffle(self.data_lines)
|
||||
return
|
||||
|
||||
def _try_parse_filename_list(self, file_name):
|
||||
# multiple images -> one gt label
|
||||
if len(file_name) > 0 and file_name[0] == "[":
|
||||
try:
|
||||
info = json.loads(file_name)
|
||||
file_name = random.choice(info)
|
||||
except:
|
||||
pass
|
||||
return file_name
|
||||
|
||||
def get_ext_data(self):
|
||||
ext_data_num = 0
|
||||
for op in self.ops:
|
||||
if hasattr(op, 'ext_data_num'):
|
||||
ext_data_num = getattr(op, 'ext_data_num')
|
||||
break
|
||||
load_data_ops = self.ops[:self.ext_op_transform_idx]
|
||||
ext_data = []
|
||||
|
||||
while len(ext_data) < ext_data_num:
|
||||
file_idx = self.data_idx_order_list[np.random.randint(self.__len__(
|
||||
))]
|
||||
data_line = self.data_lines[file_idx]
|
||||
data_line = data_line.decode('utf-8')
|
||||
substr = data_line.strip("\n").split(self.delimiter)
|
||||
file_name = substr[0]
|
||||
file_name = self._try_parse_filename_list(file_name)
|
||||
label = substr[1]
|
||||
img_path = os.path.join(self.data_dir, file_name)
|
||||
data = {'img_path': img_path, 'label': label}
|
||||
if not os.path.exists(img_path):
|
||||
continue
|
||||
with open(data['img_path'], 'rb') as f:
|
||||
img = f.read()
|
||||
data['image'] = img
|
||||
data = transform(data, load_data_ops)
|
||||
|
||||
if data is None:
|
||||
continue
|
||||
if 'polys' in data.keys():
|
||||
if data['polys'].shape[1] != 4:
|
||||
continue
|
||||
ext_data.append(data)
|
||||
return ext_data
|
||||
|
||||
def __getitem__(self, idx):
|
||||
file_idx = self.data_idx_order_list[idx]
|
||||
data_line = self.data_lines[file_idx]
|
||||
try:
|
||||
data_line = data_line.decode('utf-8')
|
||||
substr = data_line.strip("\n").split(self.delimiter)
|
||||
file_name = substr[0]
|
||||
file_name = self._try_parse_filename_list(file_name)
|
||||
label = substr[1]
|
||||
img_path = os.path.join(self.data_dir, file_name)
|
||||
data = {'img_path': img_path, 'label': label}
|
||||
if not os.path.exists(img_path):
|
||||
raise Exception("{} does not exist!".format(img_path))
|
||||
with open(data['img_path'], 'rb') as f:
|
||||
img = f.read()
|
||||
data['image'] = img
|
||||
data['ext_data'] = self.get_ext_data()
|
||||
outs = transform(data, self.ops)
|
||||
except:
|
||||
self.logger.error(
|
||||
"When parsing line {}, error happened with msg: {}".format(
|
||||
data_line, traceback.format_exc()))
|
||||
outs = None
|
||||
if outs is None:
|
||||
# during evaluation, we should fix the idx to get same results for many times of evaluation.
|
||||
rnd_idx = np.random.randint(self.__len__(
|
||||
)) if self.mode == "train" else (idx + 1) % self.__len__()
|
||||
return self.__getitem__(rnd_idx)
|
||||
return outs
|
||||
|
||||
def __len__(self):
|
||||
return len(self.data_idx_order_list)
|
||||
Reference in New Issue
Block a user