This commit is contained in:
YaoFANGUK
2023-10-25 16:38:16 +08:00
commit 2b9360c299
602 changed files with 152490 additions and 0 deletions

View File

@@ -0,0 +1,62 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import copy
import paddle
__all__ = ['build_optimizer']
def build_lr_scheduler(lr_config, epochs, step_each_epoch):
from . import learning_rate
lr_config.update({'epochs': epochs, 'step_each_epoch': step_each_epoch})
lr_name = lr_config.pop('name', 'Const')
lr = getattr(learning_rate, lr_name)(**lr_config)()
return lr
def build_optimizer(config, epochs, step_each_epoch, model):
from . import regularizer, optimizer
config = copy.deepcopy(config)
# step1 build lr
lr = build_lr_scheduler(config.pop('lr'), epochs, step_each_epoch)
# step2 build regularization
if 'regularizer' in config and config['regularizer'] is not None:
reg_config = config.pop('regularizer')
reg_name = reg_config.pop('name')
if not hasattr(regularizer, reg_name):
reg_name += 'Decay'
reg = getattr(regularizer, reg_name)(**reg_config)()
elif 'weight_decay' in config:
reg = config.pop('weight_decay')
else:
reg = None
# step3 build optimizer
optim_name = config.pop('name')
if 'clip_norm' in config:
clip_norm = config.pop('clip_norm')
grad_clip = paddle.nn.ClipGradByNorm(clip_norm=clip_norm)
else:
grad_clip = None
optim = getattr(optimizer, optim_name)(learning_rate=lr,
weight_decay=reg,
grad_clip=grad_clip,
**config)
return optim(model), lr

View File

@@ -0,0 +1,310 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from paddle.optimizer import lr
from .lr_scheduler import CyclicalCosineDecay, OneCycleDecay
class Linear(object):
"""
Linear learning rate decay
Args:
lr (float): The initial learning rate. It is a python float number.
epochs(int): The decay step size. It determines the decay cycle.
end_lr(float, optional): The minimum final learning rate. Default: 0.0001.
power(float, optional): Power of polynomial. Default: 1.0.
last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
"""
def __init__(self,
learning_rate,
epochs,
step_each_epoch,
end_lr=0.0,
power=1.0,
warmup_epoch=0,
last_epoch=-1,
**kwargs):
super(Linear, self).__init__()
self.learning_rate = learning_rate
self.epochs = epochs * step_each_epoch
self.end_lr = end_lr
self.power = power
self.last_epoch = last_epoch
self.warmup_epoch = round(warmup_epoch * step_each_epoch)
def __call__(self):
learning_rate = lr.PolynomialDecay(
learning_rate=self.learning_rate,
decay_steps=self.epochs,
end_lr=self.end_lr,
power=self.power,
last_epoch=self.last_epoch)
if self.warmup_epoch > 0:
learning_rate = lr.LinearWarmup(
learning_rate=learning_rate,
warmup_steps=self.warmup_epoch,
start_lr=0.0,
end_lr=self.learning_rate,
last_epoch=self.last_epoch)
return learning_rate
class Cosine(object):
"""
Cosine learning rate decay
lr = 0.05 * (math.cos(epoch * (math.pi / epochs)) + 1)
Args:
lr(float): initial learning rate
step_each_epoch(int): steps each epoch
epochs(int): total training epochs
last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
"""
def __init__(self,
learning_rate,
step_each_epoch,
epochs,
warmup_epoch=0,
last_epoch=-1,
**kwargs):
super(Cosine, self).__init__()
self.learning_rate = learning_rate
self.T_max = step_each_epoch * epochs
self.last_epoch = last_epoch
self.warmup_epoch = round(warmup_epoch * step_each_epoch)
def __call__(self):
learning_rate = lr.CosineAnnealingDecay(
learning_rate=self.learning_rate,
T_max=self.T_max,
last_epoch=self.last_epoch)
if self.warmup_epoch > 0:
learning_rate = lr.LinearWarmup(
learning_rate=learning_rate,
warmup_steps=self.warmup_epoch,
start_lr=0.0,
end_lr=self.learning_rate,
last_epoch=self.last_epoch)
return learning_rate
class Step(object):
"""
Piecewise learning rate decay
Args:
step_each_epoch(int): steps each epoch
learning_rate (float): The initial learning rate. It is a python float number.
step_size (int): the interval to update.
gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
It should be less than 1.0. Default: 0.1.
last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
"""
def __init__(self,
learning_rate,
step_size,
step_each_epoch,
gamma,
warmup_epoch=0,
last_epoch=-1,
**kwargs):
super(Step, self).__init__()
self.step_size = step_each_epoch * step_size
self.learning_rate = learning_rate
self.gamma = gamma
self.last_epoch = last_epoch
self.warmup_epoch = round(warmup_epoch * step_each_epoch)
def __call__(self):
learning_rate = lr.StepDecay(
learning_rate=self.learning_rate,
step_size=self.step_size,
gamma=self.gamma,
last_epoch=self.last_epoch)
if self.warmup_epoch > 0:
learning_rate = lr.LinearWarmup(
learning_rate=learning_rate,
warmup_steps=self.warmup_epoch,
start_lr=0.0,
end_lr=self.learning_rate,
last_epoch=self.last_epoch)
return learning_rate
class Piecewise(object):
"""
Piecewise learning rate decay
Args:
boundaries(list): A list of steps numbers. The type of element in the list is python int.
values(list): A list of learning rate values that will be picked during different epoch boundaries.
The type of element in the list is python float.
last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
"""
def __init__(self,
step_each_epoch,
decay_epochs,
values,
warmup_epoch=0,
last_epoch=-1,
**kwargs):
super(Piecewise, self).__init__()
self.boundaries = [step_each_epoch * e for e in decay_epochs]
self.values = values
self.last_epoch = last_epoch
self.warmup_epoch = round(warmup_epoch * step_each_epoch)
def __call__(self):
learning_rate = lr.PiecewiseDecay(
boundaries=self.boundaries,
values=self.values,
last_epoch=self.last_epoch)
if self.warmup_epoch > 0:
learning_rate = lr.LinearWarmup(
learning_rate=learning_rate,
warmup_steps=self.warmup_epoch,
start_lr=0.0,
end_lr=self.values[0],
last_epoch=self.last_epoch)
return learning_rate
class CyclicalCosine(object):
"""
Cyclical cosine learning rate decay
Args:
learning_rate(float): initial learning rate
step_each_epoch(int): steps each epoch
epochs(int): total training epochs
cycle(int): period of the cosine learning rate
last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
"""
def __init__(self,
learning_rate,
step_each_epoch,
epochs,
cycle,
warmup_epoch=0,
last_epoch=-1,
**kwargs):
super(CyclicalCosine, self).__init__()
self.learning_rate = learning_rate
self.T_max = step_each_epoch * epochs
self.last_epoch = last_epoch
self.warmup_epoch = round(warmup_epoch * step_each_epoch)
self.cycle = round(cycle * step_each_epoch)
def __call__(self):
learning_rate = CyclicalCosineDecay(
learning_rate=self.learning_rate,
T_max=self.T_max,
cycle=self.cycle,
last_epoch=self.last_epoch)
if self.warmup_epoch > 0:
learning_rate = lr.LinearWarmup(
learning_rate=learning_rate,
warmup_steps=self.warmup_epoch,
start_lr=0.0,
end_lr=self.learning_rate,
last_epoch=self.last_epoch)
return learning_rate
class OneCycle(object):
"""
One Cycle learning rate decay
Args:
max_lr(float): Upper learning rate boundaries
epochs(int): total training epochs
step_each_epoch(int): steps each epoch
anneal_strategy(str): {cos, linear} Specifies the annealing strategy: “cos” for cosine annealing, “linear” for linear annealing.
Default: cos
three_phase(bool): If True, use a third phase of the schedule to annihilate the learning rate according to final_div_factor
instead of modifying the second phase (the first two phases will be symmetrical about the step indicated by pct_start).
last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
"""
def __init__(self,
max_lr,
epochs,
step_each_epoch,
anneal_strategy='cos',
three_phase=False,
warmup_epoch=0,
last_epoch=-1,
**kwargs):
super(OneCycle, self).__init__()
self.max_lr = max_lr
self.epochs = epochs
self.steps_per_epoch = step_each_epoch
self.anneal_strategy = anneal_strategy
self.three_phase = three_phase
self.last_epoch = last_epoch
self.warmup_epoch = round(warmup_epoch * step_each_epoch)
def __call__(self):
learning_rate = OneCycleDecay(
max_lr=self.max_lr,
epochs=self.epochs,
steps_per_epoch=self.steps_per_epoch,
anneal_strategy=self.anneal_strategy,
three_phase=self.three_phase,
last_epoch=self.last_epoch)
if self.warmup_epoch > 0:
learning_rate = lr.LinearWarmup(
learning_rate=learning_rate,
warmup_steps=self.warmup_epoch,
start_lr=0.0,
end_lr=self.max_lr,
last_epoch=self.last_epoch)
return learning_rate
class Const(object):
"""
Const learning rate decay
Args:
learning_rate(float): initial learning rate
step_each_epoch(int): steps each epoch
last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
"""
def __init__(self,
learning_rate,
step_each_epoch,
warmup_epoch=0,
last_epoch=-1,
**kwargs):
super(Const, self).__init__()
self.learning_rate = learning_rate
self.last_epoch = last_epoch
self.warmup_epoch = round(warmup_epoch * step_each_epoch)
def __call__(self):
learning_rate = self.learning_rate
if self.warmup_epoch > 0:
learning_rate = lr.LinearWarmup(
learning_rate=learning_rate,
warmup_steps=self.warmup_epoch,
start_lr=0.0,
end_lr=self.learning_rate,
last_epoch=self.last_epoch)
return learning_rate

View File

@@ -0,0 +1,162 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from paddle.optimizer.lr import LRScheduler
class CyclicalCosineDecay(LRScheduler):
def __init__(self,
learning_rate,
T_max,
cycle=1,
last_epoch=-1,
eta_min=0.0,
verbose=False):
"""
Cyclical cosine learning rate decay
A learning rate which can be referred in https://arxiv.org/pdf/2012.12645.pdf
Args:
learning rate(float): learning rate
T_max(int): maximum epoch num
cycle(int): period of the cosine decay
last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
eta_min(float): minimum learning rate during training
verbose(bool): whether to print learning rate for each epoch
"""
super(CyclicalCosineDecay, self).__init__(learning_rate, last_epoch,
verbose)
self.cycle = cycle
self.eta_min = eta_min
def get_lr(self):
if self.last_epoch == 0:
return self.base_lr
reletive_epoch = self.last_epoch % self.cycle
lr = self.eta_min + 0.5 * (self.base_lr - self.eta_min) * \
(1 + math.cos(math.pi * reletive_epoch / self.cycle))
return lr
class OneCycleDecay(LRScheduler):
"""
One Cycle learning rate decay
A learning rate which can be referred in https://arxiv.org/abs/1708.07120
Code refered in https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
"""
def __init__(self,
max_lr,
epochs=None,
steps_per_epoch=None,
pct_start=0.3,
anneal_strategy='cos',
div_factor=25.,
final_div_factor=1e4,
three_phase=False,
last_epoch=-1,
verbose=False):
# Validate total_steps
if epochs <= 0 or not isinstance(epochs, int):
raise ValueError(
"Expected positive integer epochs, but got {}".format(epochs))
if steps_per_epoch <= 0 or not isinstance(steps_per_epoch, int):
raise ValueError(
"Expected positive integer steps_per_epoch, but got {}".format(
steps_per_epoch))
self.total_steps = epochs * steps_per_epoch
self.max_lr = max_lr
self.initial_lr = self.max_lr / div_factor
self.min_lr = self.initial_lr / final_div_factor
if three_phase:
self._schedule_phases = [
{
'end_step': float(pct_start * self.total_steps) - 1,
'start_lr': self.initial_lr,
'end_lr': self.max_lr,
},
{
'end_step': float(2 * pct_start * self.total_steps) - 2,
'start_lr': self.max_lr,
'end_lr': self.initial_lr,
},
{
'end_step': self.total_steps - 1,
'start_lr': self.initial_lr,
'end_lr': self.min_lr,
},
]
else:
self._schedule_phases = [
{
'end_step': float(pct_start * self.total_steps) - 1,
'start_lr': self.initial_lr,
'end_lr': self.max_lr,
},
{
'end_step': self.total_steps - 1,
'start_lr': self.max_lr,
'end_lr': self.min_lr,
},
]
# Validate pct_start
if pct_start < 0 or pct_start > 1 or not isinstance(pct_start, float):
raise ValueError(
"Expected float between 0 and 1 pct_start, but got {}".format(
pct_start))
# Validate anneal_strategy
if anneal_strategy not in ['cos', 'linear']:
raise ValueError(
"anneal_strategy must by one of 'cos' or 'linear', instead got {}".
format(anneal_strategy))
elif anneal_strategy == 'cos':
self.anneal_func = self._annealing_cos
elif anneal_strategy == 'linear':
self.anneal_func = self._annealing_linear
super(OneCycleDecay, self).__init__(max_lr, last_epoch, verbose)
def _annealing_cos(self, start, end, pct):
"Cosine anneal from `start` to `end` as pct goes from 0.0 to 1.0."
cos_out = math.cos(math.pi * pct) + 1
return end + (start - end) / 2.0 * cos_out
def _annealing_linear(self, start, end, pct):
"Linearly anneal from `start` to `end` as pct goes from 0.0 to 1.0."
return (end - start) * pct + start
def get_lr(self):
computed_lr = 0.0
step_num = self.last_epoch
if step_num > self.total_steps:
raise ValueError(
"Tried to step {} times. The specified number of total steps is {}"
.format(step_num + 1, self.total_steps))
start_step = 0
for i, phase in enumerate(self._schedule_phases):
end_step = phase['end_step']
if step_num <= end_step or i == len(self._schedule_phases) - 1:
pct = (step_num - start_step) / (end_step - start_step)
computed_lr = self.anneal_func(phase['start_lr'],
phase['end_lr'], pct)
break
start_step = phase['end_step']
return computed_lr

View File

@@ -0,0 +1,234 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from paddle import optimizer as optim
class Momentum(object):
"""
Simple Momentum optimizer with velocity state.
Args:
learning_rate (float|Variable) - The learning rate used to update parameters.
Can be a float value or a Variable with one float value as data element.
momentum (float) - Momentum factor.
regularization (WeightDecayRegularizer, optional) - The strategy of regularization.
"""
def __init__(self,
learning_rate,
momentum,
weight_decay=None,
grad_clip=None,
**args):
super(Momentum, self).__init__()
self.learning_rate = learning_rate
self.momentum = momentum
self.weight_decay = weight_decay
self.grad_clip = grad_clip
def __call__(self, model):
train_params = [
param for param in model.parameters() if param.trainable is True
]
opt = optim.Momentum(
learning_rate=self.learning_rate,
momentum=self.momentum,
weight_decay=self.weight_decay,
grad_clip=self.grad_clip,
parameters=train_params)
return opt
class Adam(object):
def __init__(self,
learning_rate=0.001,
beta1=0.9,
beta2=0.999,
epsilon=1e-08,
parameter_list=None,
weight_decay=None,
grad_clip=None,
name=None,
lazy_mode=False,
**kwargs):
self.learning_rate = learning_rate
self.beta1 = beta1
self.beta2 = beta2
self.epsilon = epsilon
self.parameter_list = parameter_list
self.learning_rate = learning_rate
self.weight_decay = weight_decay
self.grad_clip = grad_clip
self.name = name
self.lazy_mode = lazy_mode
def __call__(self, model):
train_params = [
param for param in model.parameters() if param.trainable is True
]
opt = optim.Adam(
learning_rate=self.learning_rate,
beta1=self.beta1,
beta2=self.beta2,
epsilon=self.epsilon,
weight_decay=self.weight_decay,
grad_clip=self.grad_clip,
name=self.name,
lazy_mode=self.lazy_mode,
parameters=train_params)
return opt
class RMSProp(object):
"""
Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning rate method.
Args:
learning_rate (float|Variable) - The learning rate used to update parameters.
Can be a float value or a Variable with one float value as data element.
momentum (float) - Momentum factor.
rho (float) - rho value in equation.
epsilon (float) - avoid division by zero, default is 1e-6.
regularization (WeightDecayRegularizer, optional) - The strategy of regularization.
"""
def __init__(self,
learning_rate,
momentum=0.0,
rho=0.95,
epsilon=1e-6,
weight_decay=None,
grad_clip=None,
**args):
super(RMSProp, self).__init__()
self.learning_rate = learning_rate
self.momentum = momentum
self.rho = rho
self.epsilon = epsilon
self.weight_decay = weight_decay
self.grad_clip = grad_clip
def __call__(self, model):
train_params = [
param for param in model.parameters() if param.trainable is True
]
opt = optim.RMSProp(
learning_rate=self.learning_rate,
momentum=self.momentum,
rho=self.rho,
epsilon=self.epsilon,
weight_decay=self.weight_decay,
grad_clip=self.grad_clip,
parameters=train_params)
return opt
class Adadelta(object):
def __init__(self,
learning_rate=0.001,
epsilon=1e-08,
rho=0.95,
parameter_list=None,
weight_decay=None,
grad_clip=None,
name=None,
**kwargs):
self.learning_rate = learning_rate
self.epsilon = epsilon
self.rho = rho
self.parameter_list = parameter_list
self.learning_rate = learning_rate
self.weight_decay = weight_decay
self.grad_clip = grad_clip
self.name = name
def __call__(self, model):
train_params = [
param for param in model.parameters() if param.trainable is True
]
opt = optim.Adadelta(
learning_rate=self.learning_rate,
epsilon=self.epsilon,
rho=self.rho,
weight_decay=self.weight_decay,
grad_clip=self.grad_clip,
name=self.name,
parameters=train_params)
return opt
class AdamW(object):
def __init__(self,
learning_rate=0.001,
beta1=0.9,
beta2=0.999,
epsilon=1e-8,
weight_decay=0.01,
multi_precision=False,
grad_clip=None,
no_weight_decay_name=None,
one_dim_param_no_weight_decay=False,
name=None,
lazy_mode=False,
**args):
super().__init__()
self.learning_rate = learning_rate
self.beta1 = beta1
self.beta2 = beta2
self.epsilon = epsilon
self.grad_clip = grad_clip
self.weight_decay = 0.01 if weight_decay is None else weight_decay
self.grad_clip = grad_clip
self.name = name
self.lazy_mode = lazy_mode
self.multi_precision = multi_precision
self.no_weight_decay_name_list = no_weight_decay_name.split(
) if no_weight_decay_name else []
self.one_dim_param_no_weight_decay = one_dim_param_no_weight_decay
def __call__(self, model):
parameters = [
param for param in model.parameters() if param.trainable is True
]
self.no_weight_decay_param_name_list = [
p.name for n, p in model.named_parameters()
if any(nd in n for nd in self.no_weight_decay_name_list)
]
if self.one_dim_param_no_weight_decay:
self.no_weight_decay_param_name_list += [
p.name for n, p in model.named_parameters() if len(p.shape) == 1
]
opt = optim.AdamW(
learning_rate=self.learning_rate,
beta1=self.beta1,
beta2=self.beta2,
epsilon=self.epsilon,
parameters=parameters,
weight_decay=self.weight_decay,
multi_precision=self.multi_precision,
grad_clip=self.grad_clip,
name=self.name,
lazy_mode=self.lazy_mode,
apply_decay_param_fun=self._apply_decay_param_fun)
return opt
def _apply_decay_param_fun(self, name):
return name not in self.no_weight_decay_param_name_list

View File

@@ -0,0 +1,51 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import paddle
class L1Decay(object):
"""
L1 Weight Decay Regularization, which encourages the weights to be sparse.
Args:
factor(float): regularization coeff. Default:0.0.
"""
def __init__(self, factor=0.0):
super(L1Decay, self).__init__()
self.coeff = factor
def __call__(self):
reg = paddle.regularizer.L1Decay(self.coeff)
return reg
class L2Decay(object):
"""
L2 Weight Decay Regularization, which helps to prevent the model over-fitting.
Args:
factor(float): regularization coeff. Default:0.0.
"""
def __init__(self, factor=0.0):
super(L2Decay, self).__init__()
self.coeff = float(factor)
def __call__(self):
return self.coeff