The Annotated Transformer (Harvard NLP)

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from torch.autograd import Variable
import matplotlib.pyplot as plt
import seaborn
seaborn.set_context(context="talk")

%matplotlib inline

Background

Reducing the amount of sequential computation is an important goal, and it is also the motivation behind the Extended Neural GPU, ByteNet, and ConvS2S. All of these networks use CNNs as basic building blocks and compute hidden representations in parallel for all input and output positions.

In these models, the number of operations required to relate signals from two arbitrary input or output positions grows with the distance between the positions: linearly for ConvS2S and logarithmically for ByteNet. This makes it harder to learn dependencies between distant positions. In the Transformer, the number of operations is reduced to a constant.

Self-attention, sometimes called intra-attention, is an attention mechanism that relates different positions of a single sequence in order to compute a representation of that sequence. Self-attention has been applied successfully to a variety of tasks, including reading comprehension, abstractive summarization, textual entailment, and learning task-independent sentence representations.

End-to-end memory networks are based on a recurrent attention mechanism instead of sequence-aligned recurrence, and have been shown to perform well on simple-language question answering and language-modeling tasks.

To the best of our knowledge, however, the Transformer is the first transduction model that relies entirely on self-attention to compute representations of its input and output, without using sequence-aligned RNNs or convolutions.

Model Architecture

Most competitive neural sequence transduction models have an encoder-decoder structure. The encoder maps an input sequence $(x_1,\dots,x_n)$ to a sequence of continuous representations $z=(z_1,\dots,z_n)$.

Given $z$, the decoder then generates an output sequence $(y_1,\dots,y_m)$ one symbol at a time. At each step the model is auto-regressive: it consumes the previously generated symbols as additional input when generating the next.
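Written out, the auto-regressive factorization the decoder implements is

$p(y_1,\dots,y_m \mid z) = \prod_{t=1}^{m} p(y_t \mid y_1,\dots,y_{t-1}, z)$

so each decoding step conditions on the encoder output $z$ and on everything generated so far.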

class EncoderDecoder(nn.Module):
    """
    A standard Encoder-Decoder architecture. Base for this and many
    other models.
    """
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):
        "Take in and process masked src and target sequences."
        return self.decode(self.encode(src, src_mask), src_mask,
                           tgt, tgt_mask)

    def encode(self, src, src_mask):
        return self.encoder(self.src_embed(src), src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        # "memory" is the encoder output; it supplies the keys and values
        # for the decoder's second (encoder-decoder attention) sub-layer.
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)
class Generator(nn.Module):
    "Define standard linear + softmax generation step."
    def __init__(self, d_model, vocab):
        super(Generator, self).__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        return F.log_softmax(self.proj(x), dim=-1)

The overall structure of the Transformer is shown in the figure below. Both the encoder and the decoder are built from stacked self-attention and position-wise, fully connected layers; their structure is shown in the left and right halves of the figure, respectively.

[Figure 1: the Transformer model architecture]

Encoder and Decoder

Encoder

The encoder is composed of a stack of N=6 identical layers.

def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
class Encoder(nn.Module):
    "Core encoder is a stack of N layers"
    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        "Pass the input (and mask) through each layer in turn."
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

Each layer has two sub-layers: a multi-head self-attention mechanism and a position-wise fully connected feed-forward network. A residual connection is applied around each of the two sub-layers, followed by layer normalization.

class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2
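As an aside (not part of the original notebook), recent PyTorch versions ship a built-in nn.LayerNorm that could stand in for this class. The built-in uses the biased variance and adds eps inside the square root, whereas the class above uses the unbiased std and adds eps to it, so the two agree only approximately:

# Rough comparison sketch (illustrative); the loose tolerance absorbs the
# unbiased-vs-biased std difference and the different placement of eps.
x = torch.rand(2, 10, 512)
custom = LayerNorm(512)
builtin = nn.LayerNorm(512, eps=1e-6)
print(torch.allclose(custom(x), builtin(x), atol=1e-2))  # True (approximately equal)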

The output of each sub-layer is LayerNorm(x + Sublayer(x)), where Sublayer(x) is the function implemented by the sub-layer itself. We apply dropout to the output of each sub-layer before it is added to the sub-layer input and normalized, and the result is fed to the next sub-layer.

class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))

Each layer consists of two sub-layers: the first implements multi-head self-attention, and the second is a simple position-wise fully connected feed-forward network.

class EncoderLayer(nn.Module):
    "Encoder is made up of self-attn and feed forward (defined below)"
    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        "Follow Figure 1 (left) for connections."
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)

Decoder

The decoder is also composed of a stack of N=6 identical layers.

class Decoder(nn.Module):
    "Generic N layer decoder with masking."
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)

In addition to the two sub-layers found in each encoder layer, the decoder inserts a third sub-layer, which performs multi-head attention over the output of the encoder stack. As in the encoder, residual connections are used around each sub-layer, followed by layer normalization.

class DecoderLayer(nn.Module):
    "Decoder is made of self-attn, src-attn, and feed forward (defined below)"
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        "Follow Figure 1 (right) for connections."
        m = memory
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
        return self.sublayer[2](x, self.feed_forward)

We also modify the self-attention sub-layer in the decoder to prevent the current position from attending to subsequent positions. This masking, combined with the fact that the output embeddings are offset by one position, ensures that the prediction for position i depends only on the known outputs at positions less than i; in other words, information that should not yet be visible is masked out.

def subsequent_mask(size):
    "Mask out subsequent positions."
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(subsequent_mask) == 0

The attention-mask figure below shows the positions (columns) that each target word (row) is allowed to look at. During training, a word at the current decoding position cannot attend to words at later positions.

plt.figure(figsize=(5,5))
plt.imshow(subsequent_mask(20)[0])
[plot: the subsequent-position attention mask, a lower-triangular pattern]

Attention

An attention function maps a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is a weighted sum of the values, where the weight assigned to each value is computed by a compatibility function of the query with the corresponding key.

We call this particular attention mechanism "Scaled Dot-Product Attention". The input consists of queries and keys of dimension $d_k$, and values of dimension $d_v$.

We compute the dot products of the query with all keys, divide each by $\sqrt{d_k}$, and apply a softmax function to obtain the weights on the values.

[Figure 2 (left): Scaled Dot-Product Attention]

In practice, we compute the attention function on a set of queries simultaneously, packed together into a matrix $Q$; the keys and values are likewise packed into matrices $K$ and $V$. The matrix of outputs is then:

$\mathrm{Attention}(Q,K,V) = \mathrm{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$

def attention(query, key, value, mask=None, dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) \
             / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn
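As a quick illustration (not part of the original notebook), calling the function with a subsequent-position mask drives the attention weights above the diagonal to (numerically) zero:

# Illustrative shapes: (batch=1, positions=4, d_k=8); subsequent_mask(4)
# hides the positions to the right of each query position.
q = k = v = torch.rand(1, 4, 8)
out, p_attn = attention(q, k, v, mask=subsequent_mask(4))
print(out.shape)    # torch.Size([1, 4, 8])
print(p_attn[0])    # each row sums to 1; entries above the diagonal are ~0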

The two most commonly used attention functions are additive attention and dot-product (multiplicative) attention. Dot-product attention is identical to our algorithm, except for the scaling factor of $\frac{1}{\sqrt{d_k}}$. Additive attention computes the compatibility function using a feed-forward network with a single hidden layer. While the two are similar in theoretical complexity, dot-product attention is much faster and more space-efficient in practice, because it can be implemented using highly optimized matrix multiplication routines.

While the two mechanisms perform similarly for small values of $d_k$, additive attention outperforms dot-product attention without scaling for larger values of $d_k$. We suspect that for large $d_k$ the dot products grow large in magnitude, pushing the softmax into regions where its gradients are extremely small. (To see why the dot products get large, assume the components of $q$ and $k$ are independent random variables with mean 0 and variance 1; then their dot product $q \cdot k = \sum_{i=1}^{d_k} q_i k_i$ has mean 0 and variance $d_k$.) To counteract this effect, we scale the dot products by $\frac{1}{\sqrt{d_k}}$.
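A small numerical check of this argument (illustrative, not from the paper): for random $q$ and $k$ with zero mean and unit variance, the standard deviation of the dot product grows like $\sqrt{d_k}$, and dividing by $\sqrt{d_k}$ brings it back to roughly 1.

# The dot-product std grows like sqrt(d_k); scaling restores it to ~1.
for d_k in [16, 64, 256]:
    q = torch.randn(10000, d_k)
    k = torch.randn(10000, d_k)
    dots = (q * k).sum(-1)
    print(d_k, dots.std().item(), (dots / math.sqrt(d_k)).std().item())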

[Figure 2 (right): Multi-Head Attention]

Multi-head attention allows the model to jointly attend to information from different representation subspaces at different positions; a single attention head cannot do this as well, because averaging inhibits it.

$\mathrm{MultiHead}(Q,K,V) = \mathrm{Concat}(\mathrm{head}_1,\dots,\mathrm{head}_h)W^O, \quad \mathrm{head}_i = \mathrm{Attention}(QW_i^Q, KW_i^K, VW_i^V)$

where the projections are parameter matrices $W_i^Q \in \mathbb{R}^{d_{model} \times d_k}$, $W_i^K \in \mathbb{R}^{d_{model} \times d_k}$, $W_i^V \in \mathbb{R}^{d_{model} \times d_v}$, and $W^O \in \mathbb{R}^{hd_v \times d_{model}}$.

In this work we employ $h=8$ parallel attention heads. For each of these we use $d_k = d_v = d_{model}/h = 64$. Because each head works on a reduced dimension, the total computational cost is similar to that of single-head attention with full dimensionality.

class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        "Implements Figure 2"
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)

        # 1) Do all the linear projections in batch from d_model => h x d_k
        query, key, value = \
            [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
             for l, x in zip(self.linears, (query, key, value))]

        # 2) Apply attention on all the projected vectors in batch.
        x, self.attn = attention(query, key, value, mask=mask,
                                 dropout=self.dropout)

        # 3) "Concat" using a view and apply a final linear.
        x = x.transpose(1, 2).contiguous() \
             .view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](x)
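A quick shape check (illustrative values, not from the paper's training setup): with $h=8$ and $d_{model}=512$, each head operates on $d_k = 512/8 = 64$ dimensions, and the concatenated output recovers the full model width.

mha = MultiHeadedAttention(h=8, d_model=512)
x = torch.zeros(2, 10, 512)   # (batch, seq_len, d_model)
out = mha(x, x, x)            # self-attention: query = key = value
print(out.shape)              # torch.Size([2, 10, 512])
print(mha.attn.shape)         # torch.Size([2, 8, 10, 10]), the per-head weights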

Applications of Attention in the Model

The Transformer uses multi-head attention in three different ways:

  1. In the "encoder-decoder attention" layers, the queries come from the previous decoder layer, and the keys and values come from the output of the encoder. This allows every position in the decoder to attend over all positions in the input sequence, mimicking the typical encoder-decoder attention mechanism in sequence-to-sequence models.

  2. The encoder contains self-attention layers. In a self-attention layer, all of the keys, values, and queries come from the same place, in this case the output of the previous encoder layer. Each position in the current encoder layer can attend to all positions in the previous layer.

  3. Similarly, self-attention layers in the decoder allow each position to attend to the current decoding position and all positions before it. We need to prevent leftward information flow in the decoder in order to preserve the auto-regressive property. This is implemented inside scaled dot-product attention by masking out (setting to $-\infty$) all values in the softmax input that correspond to illegal connections.

Position-wise Feed-Forward Networks

In addition to the attention sub-layers, each layer of the encoder and decoder contains a fully connected feed-forward network, which is applied to each position separately and identically. It consists of two linear transformations with a ReLU activation in between:

$\mathrm{FFN}(x) = \max(0, xW_1 + b_1)W_2 + b_2$

While the linear transformations are the same across different positions, they use different parameters from layer to layer. Another way of describing this is as two convolutions with kernel size 1. The dimensionality of the input and output is $d_{model}=512$, and the inner layer has dimensionality $d_{ff}=2048$.
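To make the "two 1x1 convolutions" reading concrete, here is a small equivalence sketch (illustrative, not part of the original notebook): a position-wise nn.Linear applied to (batch, len, d_model) matches an nn.Conv1d with kernel_size=1 applied to (batch, d_model, len), once the weights are copied across.

lin = nn.Linear(512, 2048)
conv = nn.Conv1d(512, 2048, kernel_size=1)
conv.weight.data.copy_(lin.weight.data.unsqueeze(-1))   # (2048, 512, 1)
conv.bias.data.copy_(lin.bias.data)
x = torch.rand(2, 10, 512)                              # (batch, len, d_model)
print(torch.allclose(lin(x), conv(x.transpose(1, 2)).transpose(1, 2), atol=1e-5))  # True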

class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))

Embeddings and Softmax

Similarly to other sequence transduction models, we use learned embeddings to convert the input tokens and output tokens to vectors of dimension $d_{model}$. We also use the usual learned linear transformation and softmax function to convert the decoder output to predicted next-token probabilities.

In our model, we share the same weight matrix between the two embedding layers and the pre-softmax linear transformation. In the embedding layers, we multiply those weights by $\sqrt{d_{model}}$.
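The make_model function later in this notebook does not actually tie these weights; as a minimal sketch of what the sharing would look like (hypothetical, and it assumes source and target use one shared vocabulary), note that the embedding table and the pre-softmax projection both have shape (vocab, d_model):

emb = nn.Embedding(11, 512)   # embedding table: (vocab, d_model)
proj = nn.Linear(512, 11)     # pre-softmax projection: weight is also (vocab, d_model)
proj.weight = emb.weight      # same shape, so one Parameter can serve both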

class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)

Positional Encoding

Since the model contains no recurrence and no convolution, we must inject some information about the relative or absolute position of the tokens so that the model can make use of the order of the sequence. To this end, we add positional encodings to the input embeddings at the bottoms of the encoder and decoder stacks.

The positional encodings have the same dimension $d_{model}$ as the embeddings, so that the two can simply be summed. There are many possible choices of positional encodings, learned and fixed. [17]

In this work, we use sine and cosine functions of different frequencies:

$PE_{(pos,2i)} = \sin(pos / 10000^{2i/d_{model}})$

$PE_{(pos,2i+1)} = \cos(pos / 10000^{2i/d_{model}})$

where $pos$ is the position and $i$ is the dimension. That is, each dimension of the positional encoding corresponds to a sinusoid, with wavelengths forming a geometric progression from $2\pi$ to $10000 \cdot 2\pi$.

We chose this function because we hypothesized it would allow the model to easily learn to attend by relative positions, since for any fixed offset $k$, $PE_{pos+k}$ can be represented as a linear function of $PE_{pos}$.
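A small numerical check of that claim (illustrative, not from the original notebook): per sin/cos frequency pair, shifting the position by $k$ amounts to a rotation by $k\,\omega_i$, so $PE_{pos+k}$ is a fixed linear transform of $PE_{pos}$.

# Build a small sinusoidal table directly from the formulas above, then verify
# that PE[pos + k] equals a per-pair rotation of PE[pos] by k * w_i.
d_model, max_len, pos, k = 20, 50, 7, 3
w = torch.exp(torch.arange(0., d_model, 2) * -(math.log(10000.0) / d_model))
position = torch.arange(0., max_len).unsqueeze(1)
pe = torch.zeros(max_len, d_model)
pe[:, 0::2] = torch.sin(position * w)
pe[:, 1::2] = torch.cos(position * w)
rotated = torch.zeros(d_model)
rotated[0::2] = pe[pos, 0::2] * torch.cos(k * w) + pe[pos, 1::2] * torch.sin(k * w)
rotated[1::2] = -pe[pos, 0::2] * torch.sin(k * w) + pe[pos, 1::2] * torch.cos(k * w)
print(torch.allclose(rotated, pe[pos + k], atol=1e-5))   # True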

In addition, we apply dropout to the sums of the embeddings and the positional encodings in both the encoder and decoder stacks. For the base model, we use a rate of $P_{drop}=0.1$.

class PositionalEncoding(nn.Module):
    "Implement the PE function."
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0., max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0., d_model, 2) *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + Variable(self.pe[:, :x.size(1)],
                         requires_grad=False)
        return self.dropout(x)
# As shown below, the positional encoding adds a sinusoid for each dimension.
# The frequency and offset of the wave differ from dimension to dimension.
plt.figure(figsize=(15, 5))
pe = PositionalEncoding(20, 0)
y = pe.forward(Variable(torch.zeros(1, 100, 20)))
plt.plot(np.arange(100), y[0, :, 4:8].data.numpy())
plt.legend(["dim %d" % p for p in [4, 5, 6, 7]])
[plot: sinusoidal positional-encoding curves for dimensions 4 to 7]

The authors also experimented with learned positional embeddings and found that the two versions produced nearly identical results. We use the sinusoidal version here because it may allow the model to extrapolate to sequence lengths longer than any seen in the training corpus.
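As a small illustration of that last point (not in the original notebook): a learned position table is capped at the number of positions it was created with, while the sinusoidal module above covers any position up to its precomputed max_len.

pe = PositionalEncoding(20, 0)
longer = pe(Variable(torch.zeros(1, 2000, 20)))   # 2000 positions: fine, max_len is 5000
print(longer.shape)                               # torch.Size([1, 2000, 20])
learned = nn.Embedding(100, 20)                   # a learned table for positions 0..99 only
# learned(torch.arange(100, 2000)) would fail: those positions have no embedding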

Full Model

Here we define a function that constructs the full model from a set of hyperparameters.

def make_model(src_vocab, tgt_vocab, N=6,
               d_model=512, d_ff=2048, h=8, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn),
                             c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform(p)
    return model
tmp_model = make_model(10,10,2)
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:20: UserWarning: nn.init.xavier_uniform is now deprecated in favor of nn.init.xavier_uniform_.

Training

This section describes how the model is trained.

We stop for a quick interlude to introduce some of the tools needed to train a standard encoder-decoder model. First we define a batch object that holds the source and target sentences for training, and that also constructs the masks.

Batches and Masking

class Batch:
    "Object for holding a batch of data with mask during training."
    def __init__(self, src, trg=None, pad=0):
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if trg is not None:
            self.trg = trg[:, :-1]    # drop the last column: decoder input
            self.trg_y = trg[:, 1:]   # drop the first column: labels
            self.trg_mask = \
                self.make_std_mask(self.trg, pad)
            self.ntokens = (self.trg_y != pad).data.sum()

    @staticmethod
    def make_std_mask(tgt, pad):
        "Create a mask to hide padding and future words."
        tgt_mask = (tgt != pad).unsqueeze(-2)
        tgt_mask = tgt_mask & Variable(
            subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data))
        return tgt_mask

Training Loop

def run_epoch(data_iter, model, loss_compute):
    "Standard Training and Logging Function"
    start = time.time()
    total_tokens = 0
    total_loss = 0
    tokens = 0
    for i, batch in enumerate(data_iter):
        out = model.forward(batch.src, batch.trg,
                            batch.src_mask, batch.trg_mask)
        loss = loss_compute(out, batch.trg_y, batch.ntokens)
        total_loss += loss
        total_tokens += batch.ntokens
        tokens += batch.ntokens
        if i % 50 == 1:
            elapsed = time.time() - start
            print("Epoch Step: %d Loss: %f Tokens per Sec: %f" %
                  (i, loss / batch.ntokens, tokens / elapsed))
            start = time.time()
            tokens = 0
    return total_loss / total_tokens

Training Data and Batching

We trained on the standard WMT 2014 English-German dataset, which consists of about 4.5 million sentence pairs. Sentences were encoded using byte-pair encoding, with a shared source-target vocabulary of about 37000 tokens.

For English-French, we used the significantly larger WMT 2014 English-French dataset, consisting of 36 million sentence pairs, and split tokens into a 32000 word-piece vocabulary.

During training, each batch contained a set of sentence pairs with approximately 25000 source-language tokens and 25000 target-language tokens.

global max_src_in_batch, max_tgt_in_batch
def batch_size_fn(new, count, sofar):
    "Keep augmenting batch and calculate total number of tokens + padding."
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch = 0
        max_tgt_in_batch = 0
    max_src_in_batch = max(max_src_in_batch, len(new.src))
    max_tgt_in_batch = max(max_tgt_in_batch, len(new.trg) + 2)
    src_elements = count * max_src_in_batch
    tgt_elements = count * max_tgt_in_batch
    return max(src_elements, tgt_elements)

Hardware and Schedule

We trained our models on one machine with 8 NVIDIA P100 GPUs. For the base model, using the hyperparameters described in the paper, each training step took about 0.4 seconds; the base model was trained for 100,000 steps, roughly 12 hours. For the big model, each step took 1.0 seconds, and it was trained for 300,000 steps (3.5 days).

Optimizer

We used the Adam optimizer with $\beta_1 = 0.9$, $\beta_2 = 0.98$ and $\epsilon = 10^{-9}$, and varied the learning rate over the course of training according to the formula

$lrate = d_{model}^{-0.5} \cdot \min(step\_num^{-0.5},\; step\_num \cdot warmup\_steps^{-1.5})$

This corresponds to increasing the learning rate linearly for the first $warmup\_steps$ training steps, and decreasing it thereafter proportionally to the inverse square root of the step number. We used $warmup\_steps = 4000$.
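As a worked example with the base settings ($d_{model}=512$, $warmup\_steps=4000$, factor 1), the two terms inside the $\min$ are equal at $step\_num = warmup\_steps$, so the schedule peaks there at $lrate_{max} = 512^{-0.5} \cdot 4000^{-0.5} \approx 7.0 \times 10^{-4}$, after which it decays as $step\_num^{-0.5}$.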

class NoamOpt:
    "Optim wrapper that implements rate."
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0

    def step(self):
        "Update parameters and rate"
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()

    def rate(self, step=None):
        "Implement `lrate` above"
        if step is None:
            step = self._step
        return self.factor * \
            (self.model_size ** (-0.5) *
             min(step ** (-0.5), step * self.warmup ** (-1.5)))
def get_std_opt(model):
    return NoamOpt(model.src_embed[0].d_model, 2, 4000,
                   torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
# Three settings of the lrate hyperparameters.
opts = [NoamOpt(512, 1, 4000, None),
        NoamOpt(512, 1, 8000, None),
        NoamOpt(256, 1, 4000, None)]
plt.plot(np.arange(1, 20000), [[opt.rate(i) for opt in opts] for i in range(1, 20000)])
plt.legend(["512:4000", "512:8000", "256:4000"])
[plot: learning-rate schedules for the three hyperparameter settings]

Regularization

Label Smoothing

During training, we employed label smoothing of value $\epsilon_{ls} = 0.1$. This hurts perplexity, because the model learns to be more unsure, but it improves accuracy and BLEU score.

We implement label smoothing with the KL-divergence loss. Instead of a one-hot target distribution, we create a distribution that places the chosen confidence on the correct word and spreads the remaining smoothing mass over the rest of the vocabulary.

class LabelSmoothing(nn.Module):
    "Implement label smoothing."
    def __init__(self, size, padding_idx, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        self.criterion = nn.KLDivLoss(size_average=False)
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None

    def forward(self, x, target):
        assert x.size(1) == self.size
        true_dist = x.data.clone()
        true_dist.fill_(self.smoothing / (self.size - 2))
        true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0
        mask = torch.nonzero(target.data == self.padding_idx)
        if mask.dim() > 0:
            true_dist.index_fill_(0, mask.squeeze(), 0.0)
        self.true_dist = true_dist
        return self.criterion(x, Variable(true_dist, requires_grad=False))

Here is an example of how the smoothed mass is distributed over the words, depending on the confidence value.

# Example of label smoothing.
crit = LabelSmoothing(5, 0, 0.4)
predict = torch.FloatTensor([[0, 0.2, 0.7, 0.1, 0],
                             [0, 0.2, 0.7, 0.1, 0],
                             [0, 0.2, 0.7, 0.1, 0]])
v = crit(Variable(predict.log()),
         Variable(torch.LongTensor([2, 1, 0])))

# Show the target distributions expected by the system.
plt.imshow(crit.true_dist)
/usr/local/lib/python3.6/dist-packages/torch/nn/_reduction.py:43: UserWarning: size_average and reduce args will be deprecated, please use reduction='sum' instead.
  warnings.warn(warning.format(ret))





[plot: the label-smoothed target distribution expected by the system]

Label smoothing actually starts to penalize the model if it becomes very confident about a given choice, which helps prevent overfitting.

crit = LabelSmoothing(5, 0, 0.1)
def loss(x):
    d = x + 3 * 1
    predict = torch.FloatTensor([[0, x / d, 1 / d, 1 / d, 1 / d],])
    return crit(Variable(predict.log()),
                Variable(torch.LongTensor([1]))).data.item()

plt.plot(np.arange(1, 100), [loss(x) for x in range(1, 100)])
/usr/local/lib/python3.6/dist-packages/torch/nn/_reduction.py:43: UserWarning: size_average and reduce args will be deprecated, please use reduction='sum' instead.
  warnings.warn(warning.format(ret))





[plot: loss as a function of the model's confidence in the correct word]

A First Example

We can start with a simple copy task: given a random set of symbols from a small vocabulary, the goal is to generate back those same symbols.

Synthetic Data

def data_gen(V, batch, nbatches):
    "Generate random data for a src-tgt copy task."
    for i in range(nbatches):
        data = torch.from_numpy(np.random.randint(1, V, size=(batch, 10)))
        data[:, 0] = 1
        src = Variable(data, requires_grad=False)
        tgt = Variable(data, requires_grad=False)
        yield Batch(src, tgt, 0)

Loss Computation

class SimpleLossCompute:
    "A simple loss compute and train function."
    def __init__(self, generator, criterion, opt=None):
        self.generator = generator
        self.criterion = criterion
        self.opt = opt

    def __call__(self, x, y, norm):
        x = self.generator(x)
        loss = self.criterion(x.contiguous().view(-1, x.size(-1)),
                              y.contiguous().view(-1)) / norm
        loss.backward()
        if self.opt is not None:
            self.opt.step()
            self.opt.optimizer.zero_grad()
        return loss.data * norm

Greedy Decoding

# Train the simple copy task.
V = 11
criterion = LabelSmoothing(size=V, padding_idx=0, smoothing=0.0)
model = make_model(V, V, N=2)
model_opt = NoamOpt(model.src_embed[0].d_model, 1, 400,
                    torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

for epoch in range(10):
    model.train()
    run_epoch(data_gen(V, 30, 20), model,
              SimpleLossCompute(model.generator, criterion, model_opt))
    model.eval()
    print(run_epoch(data_gen(V, 30, 5), model,
                    SimpleLossCompute(model.generator, criterion, None)))
/usr/local/lib/python3.6/dist-packages/torch/nn/_reduction.py:43: UserWarning: size_average and reduce args will be deprecated, please use reduction='sum' instead.
  warnings.warn(warning.format(ret))
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:20: UserWarning: nn.init.xavier_uniform is now deprecated in favor of nn.init.xavier_uniform_.


Epoch Step: 1 Loss: 2.958416 Tokens per Sec: 540.000000
Epoch Step: 1 Loss: 1.895539 Tokens per Sec: 540.000000
tensor(1.8822)
Epoch Step: 1 Loss: 2.033992 Tokens per Sec: 540.000000
Epoch Step: 1 Loss: 1.702803 Tokens per Sec: 540.000000
tensor(1.7028)
Epoch Step: 1 Loss: 1.865366 Tokens per Sec: 540.000000
Epoch Step: 1 Loss: 1.458180 Tokens per Sec: 540.000000
tensor(1.4864)
Epoch Step: 1 Loss: 1.597016 Tokens per Sec: 540.000000
Epoch Step: 1 Loss: 1.191519 Tokens per Sec: 540.000000
tensor(1.3019)
Epoch Step: 1 Loss: 1.362523 Tokens per Sec: 540.000000
Epoch Step: 1 Loss: 1.144701 Tokens per Sec: 540.000000
tensor(1.1669)
Epoch Step: 1 Loss: 1.323817 Tokens per Sec: 540.000000
Epoch Step: 1 Loss: 0.642481 Tokens per Sec: 540.000000
tensor(0.6497)
Epoch Step: 1 Loss: 0.880322 Tokens per Sec: 540.000000
Epoch Step: 1 Loss: 0.335012 Tokens per Sec: 540.000000
tensor(0.3691)
Epoch Step: 1 Loss: 0.633148 Tokens per Sec: 540.000000
Epoch Step: 1 Loss: 0.275560 Tokens per Sec: 540.000000
tensor(0.3101)
Epoch Step: 1 Loss: 0.628232 Tokens per Sec: 540.000000
Epoch Step: 1 Loss: 0.242260 Tokens per Sec: 540.000000
tensor(0.2314)
Epoch Step: 1 Loss: 0.257507 Tokens per Sec: 540.000000
Epoch Step: 1 Loss: 0.186647 Tokens per Sec: 540.000000
tensor(0.1799)

We use greedy decoding to generate predictions.

def greedy_decode(model, src, src_mask, max_len, start_symbol):
    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type_as(src.data)
    for i in range(max_len - 1):
        out = model.decode(memory, src_mask,
                           Variable(ys),
                           Variable(subsequent_mask(ys.size(1))
                                    .type_as(src.data)))
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.data[0]
        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
    return ys
model.eval()
src = Variable(torch.LongTensor([[1,2,3,4,5,6,7,8,9,10]]) )
src_mask = Variable(torch.ones(1, 1, 10) )
print(greedy_decode(model, src, src_mask, max_len=10, start_symbol=1))
tensor([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]])

A Real-World Example

We now train and evaluate on the dataset of the IWSLT German-English translation task, and also implement a multi-GPU version of the model for this task.

!pip install torchtext spacy
!python -m spacy download en
!python -m spacy download de

Data Loading

from torchtext import data,datasets
if True:
    import spacy
    spacy_de = spacy.load('de')
    spacy_en = spacy.load('en')

    def tokenize_de(text):
        return [tok.text for tok in spacy_de.tokenizer(text)]

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]

    BOS_WORD = '<s>'
    EOS_WORD = '</s>'
    BLANK_WORD = "<blank>"
    SRC = data.Field(tokenize=tokenize_de, pad_token=BLANK_WORD)
    TGT = data.Field(tokenize=tokenize_en, init_token=BOS_WORD,
                     eos_token=EOS_WORD, pad_token=BLANK_WORD)

    MAX_LEN = 100
    train, val, test = datasets.IWSLT.splits(
        exts=('.de', '.en'), fields=(SRC, TGT),
        filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and
            len(vars(x)['trg']) <= MAX_LEN)
    MIN_FREQ = 2
    SRC.build_vocab(train.src, min_freq=MIN_FREQ)
    TGT.build_vocab(train.trg, min_freq=MIN_FREQ)

Batching matters a great deal for speed. We want batches that are divided very evenly, with as little padding as possible. To achieve this we have to modify the default batching so that each batch is packed tightly with sentences of similar length, keeping padding to a minimum.

Iterators

class MyIterator(data.Iterator):
    def create_batches(self):
        if self.train:
            def pool(d, random_shuffler):
                for p in data.batch(d, self.batch_size * 100):
                    p_batch = data.batch(
                        sorted(p, key=self.sort_key),
                        self.batch_size, self.batch_size_fn)
                    for b in random_shuffler(list(p_batch)):
                        yield b
            self.batches = pool(self.data(), self.random_shuffler)

        else:
            self.batches = []
            for b in data.batch(self.data(), self.batch_size,
                                self.batch_size_fn):
                self.batches.append(sorted(b, key=self.sort_key))
def rebatch(pad_idx, batch):
    "Fix order in torchtext to match ours"
    src, trg = batch.src.transpose(0, 1), batch.trg.transpose(0, 1)
    return Batch(src, trg, pad_idx)

Multi-GPU Training

To make training really fast we will use multiple GPUs. This code implements multi-GPU word generation. It is not specific to the Transformer, so the details are not covered in depth here. The idea is to split word generation at training time into chunks and process them in parallel on different GPUs, using the PyTorch parallel primitives:

  • replicate: copy the module onto each of the GPUs.
  • scatter: split the batch data across the GPUs.
  • parallel_apply: apply the module replicas to their corresponding chunks of the batch on each GPU.
  • gather: collect the per-GPU results back onto one GPU.
  • nn.DataParallel: a special module wrapper that calls these all before evaluating.
# Skip if not interested in multigpu.
class MultiGPULossCompute:
    "A multi-gpu loss compute and train function."
    def __init__(self, generator, criterion, devices, opt=None, chunk_size=5):
        # Send out to different gpus.
        self.generator = generator
        self.criterion = nn.parallel.replicate(criterion,
                                               devices=devices)
        self.opt = opt
        self.devices = devices
        self.chunk_size = chunk_size

    def __call__(self, out, targets, normalize):
        total = 0.0
        generator = nn.parallel.replicate(self.generator,
                                          devices=self.devices)
        out_scatter = nn.parallel.scatter(out,
                                          target_gpus=self.devices)
        out_grad = [[] for _ in out_scatter]
        targets = nn.parallel.scatter(targets,
                                      target_gpus=self.devices)

        # Divide generating into chunks.
        chunk_size = self.chunk_size
        for i in range(0, out_scatter[0].size(1), chunk_size):
            # Predict distributions
            out_column = [[Variable(o[:, i:i+chunk_size].data,
                                    requires_grad=self.opt is not None)]
                          for o in out_scatter]
            gen = nn.parallel.parallel_apply(generator, out_column)

            # Compute loss.
            y = [(g.contiguous().view(-1, g.size(-1)),
                  t[:, i:i+chunk_size].contiguous().view(-1))
                 for g, t in zip(gen, targets)]
            loss = nn.parallel.parallel_apply(self.criterion, y)

            # Sum and normalize loss
            l = nn.parallel.gather(loss,
                                   target_device=self.devices[0])
            l = l.sum()[0] / normalize
            total += l.data[0]

            # Backprop loss to output of transformer
            if self.opt is not None:
                l.backward()
                for j, l in enumerate(loss):
                    out_grad[j].append(out_column[j][0].grad.data.clone())

        # Backprop all loss through transformer.
        if self.opt is not None:
            out_grad = [Variable(torch.cat(og, dim=1)) for og in out_grad]
            o1 = out
            o2 = nn.parallel.gather(out_grad,
                                    target_device=self.devices[0])
            o1.backward(gradient=o2)
            self.opt.step()
            self.opt.optimizer.zero_grad()
        return total * normalize

At this point the model, criterion, optimizer, data iterators, and parallelization have all been created.

# GPUs to use
# 5) nn.DataParallel
# devices = [0, 1, 2, 3]
devices = [0]
if True:
    pad_idx = TGT.vocab.stoi['<blank>']
    model = make_model(len(SRC.vocab), len(TGT.vocab), N=6)
    model.cuda()  # move the model parameters to the GPU
    criterion = LabelSmoothing(size=len(TGT.vocab), padding_idx=pad_idx, smoothing=0.1)
    criterion.cuda()
    BATCH_SIZE = 12000
    train_iter = MyIterator(train, batch_size=BATCH_SIZE, device=0,
                            repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=True)
    valid_iter = MyIterator(val, batch_size=BATCH_SIZE, device=0,
                            repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=False)
    model_par = nn.DataParallel(model, device_ids=devices)
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:20: UserWarning: nn.init.xavier_uniform is now deprecated in favor of nn.init.xavier_uniform_.
/usr/local/lib/python3.6/dist-packages/torch/nn/_reduction.py:43: UserWarning: size_average and reduce args will be deprecated, please use reduction='sum' instead.
  warnings.warn(warning.format(ret))
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.

We set warmup_steps and use default values for the other parameters. Hardware: an AWS p3.8xlarge with 4 Tesla V100s, running at about 27,000 tokens per second with a batch size of 12,000.

Training the System

!wget https://s3.amazonaws.com/opennmt-models/iwslt.pt
--2019-10-11 13:59:00--  https://s3.amazonaws.com/opennmt-models/iwslt.pt
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.162.157
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.162.157|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 467317581 (446M) [application/x-www-form-urlencoded]
Saving to: ‘iwslt.pt.2’

iwslt.pt.2          100%[===================>] 445.67M  35.3MB/s    in 14s     

2019-10-11 13:59:14 (32.2 MB/s) - ‘iwslt.pt.2’ saved [467317581/467317581]
if False:
    model_opt = NoamOpt(model.src_embed[0].d_model, 1, 2000,
                        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
    for epoch in range(10):
        model_par.train()
        run_epoch((rebatch(pad_idx, b) for b in train_iter),
                  model_par,
                  MultiGPULossCompute(model.generator, criterion,
                                      devices=devices, opt=model_opt))
        model_par.eval()
        loss = run_epoch((rebatch(pad_idx, b) for b in valid_iter),
                         model_par,
                         MultiGPULossCompute(model.generator, criterion,
                                             devices=devices, opt=None))
        print(loss)
else:
    model = torch.load("iwslt.pt")

Now we can use the trained model to produce translations. Here we simply translate the first sentence in the validation set. This dataset is quite small, so greedy search already gives reasonably accurate translations.

for i, batch in enumerate(valid_iter):
    src = batch.src.transpose(0, 1)[:1]
    src_mask = (src != SRC.vocab.stoi["<blank>"]).unsqueeze(-2)
    out = greedy_decode(model.cuda(), src.cuda(), src_mask.cuda(), max_len=60, start_symbol=TGT.vocab.stoi['<s>'])
    print("Translation:", end='\t')
    for i in range(1, out.size(1)):
        sym = TGT.vocab.itos[out[0, i]]
        if sym == '</s>':
            break
        print(sym, end=" ")
    print()
    print('Target:', end='\t')
    for i in range(1, batch.trg.size(0)):
        sym = TGT.vocab.itos[batch.trg.data[i, 0]]
        if sym == '</s>': break
        print(sym, end=' ')
    print()
    break
Translation:    And in 1914 , they 're looking at the rat technology that 's going to turn life on the screen . 
Target:    So 1860 , they are seeing this dirty technology that is going to choke the life out of New York . 

Additional Components: BPE, Search, Averaging

There are four more aspects of the Transformer that have not been covered in detail in this post. All four are already implemented in OpenNMT-py.

  1. BPE/Word-piece: we can use a third-party library to preprocess the data into subword units (see subword-nmt for implementation details). The data then ends up looking like this:
    ▁Die ▁Protokoll datei ▁kann ▁ heimlich ▁per ▁E - Mail ▁oder ▁FTP ▁an ▁einen ▁bestimmte n ▁Empfänger ▁gesendet ▁werden

  2. Shared Embeddings: when using BPE with a shared vocabulary, we can share the same weight matrix between the source embeddings, target embeddings, and the generator. See cite for details. Adding this to the model is as simple as:

if False:
    model.src_embed[0].lut.weight = model.tgt_embed[0].lut.weight
    model.generator.proj.weight = model.tgt_embed[0].lut.weight
  3. Beam Search: see OpenNMT-py for more details.

  4. Model Averaging: the paper averages the last k checkpoints to create an ensembling effect. We can do this after the fact if we have a set of models:

def average(model, models):
    "Average the parameters of models into model."
    for ps in zip(*[m.parameters() for m in [model] + models]):
        ps[0].data.copy_(torch.stack([p.data for p in ps[1:]]).mean(0))

Result

!wget https://s3.amazonaws.com/opennmt-models/en-de-model.pt
--2019-10-11 14:06:11--  https://s3.amazonaws.com/opennmt-models/en-de-model.pt
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.105.173
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.105.173|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 393031182 (375M) [application/x-www-form-urlencoded]
Saving to: ‘en-de-model.pt’

en-de-model.pt      100%[===================>] 374.82M  76.5MB/s    in 4.8s    

2019-10-11 14:06:16 (78.3 MB/s) - ‘en-de-model.pt’ saved [393031182/393031182]
model, SRC, TGT = torch.load("en-de-model.pt")
/usr/local/lib/python3.6/dist-packages/torch/serialization.py:453: SourceChangeWarning: source code of class 'torch.nn.modules.container.ModuleList' has changed. you can retrieve the original source code by accessing the object's source attribute or set `torch.nn.Module.dump_patches = True` and use the patch tool to revert the changes.
  warnings.warn(msg, SourceChangeWarning)
/usr/local/lib/python3.6/dist-packages/torch/serialization.py:453: SourceChangeWarning: source code of class 'torch.nn.modules.linear.Linear' has changed. you can retrieve the original source code by accessing the object's source attribute or set `torch.nn.Module.dump_patches = True` and use the patch tool to revert the changes.
  warnings.warn(msg, SourceChangeWarning)
/usr/local/lib/python3.6/dist-packages/torch/serialization.py:453: SourceChangeWarning: source code of class 'torch.nn.modules.dropout.Dropout' has changed. you can retrieve the original source code by accessing the object's source attribute or set `torch.nn.Module.dump_patches = True` and use the patch tool to revert the changes.
  warnings.warn(msg, SourceChangeWarning)
/usr/local/lib/python3.6/dist-packages/torch/serialization.py:453: SourceChangeWarning: source code of class 'torch.nn.modules.container.Sequential' has changed. you can retrieve the original source code by accessing the object's source attribute or set `torch.nn.Module.dump_patches = True` and use the patch tool to revert the changes.
  warnings.warn(msg, SourceChangeWarning)
/usr/local/lib/python3.6/dist-packages/torch/serialization.py:453: SourceChangeWarning: source code of class 'torch.nn.modules.sparse.Embedding' has changed. you can retrieve the original source code by accessing the object's source attribute or set `torch.nn.Module.dump_patches = True` and use the patch tool to revert the changes.
  warnings.warn(msg, SourceChangeWarning)
model.eval()
sent = "▁The ▁log ▁file ▁can ▁be ▁sent ▁secret ly ▁with ▁email ▁or ▁FTP ▁to ▁a ▁specified ▁receiver".split()
src = torch.LongTensor([[SRC.stoi[w] for w in sent]])
src = Variable(src)
src_mask = (src != SRC.stoi["<blank>"]).unsqueeze(-2)
out = greedy_decode(model, src, src_mask,
                    max_len=60, start_symbol=TGT.stoi["<s>"])
print("Translation:", end="\t")
trans = "<s> "
for i in range(1, out.size(1)):
    sym = TGT.itos[out[0, i]]
    if sym == "</s>": break
    trans += sym + " "
print(trans)
Translation:    <s> ▁Die ▁Protokoll datei ▁kann ▁ heimlich ▁per ▁E - Mail ▁oder ▁FTP ▁an ▁einen ▁bestimmte n ▁Empfänger ▁gesendet ▁werden . 

Attention Visualization

tgt_sent = trans.split()
def draw(data, x, y, ax):
    seaborn.heatmap(data,
                    xticklabels=x, square=True, yticklabels=y, vmin=0.0, vmax=1.0,
                    cbar=False, ax=ax)

for layer in range(1, 6, 2):
    fig, axs = plt.subplots(1, 4, figsize=(20, 10))
    print("Encoder Layer", layer + 1)
    for h in range(4):
        draw(model.encoder.layers[layer].self_attn.attn[0, h].data,
             sent, sent if h == 0 else [], ax=axs[h])
    plt.show()

for layer in range(1, 6, 2):
    fig, axs = plt.subplots(1, 4, figsize=(20, 10))
    print("Decoder Self Layer", layer + 1)
    for h in range(4):
        draw(model.decoder.layers[layer].self_attn.attn[0, h].data[:len(tgt_sent), :len(tgt_sent)],
             tgt_sent, tgt_sent if h == 0 else [], ax=axs[h])
    plt.show()
    print("Decoder Src Layer", layer + 1)
    fig, axs = plt.subplots(1, 4, figsize=(20, 10))
    for h in range(4):
        # src_attn (not self_attn): these panels show attention over the source sentence
        draw(model.decoder.layers[layer].src_attn.attn[0, h].data[:len(tgt_sent), :len(sent)],
             sent, tgt_sent if h == 0 else [], ax=axs[h])
    plt.show()
Encoder Layer 2
[self-attention heatmaps]

Encoder Layer 4
[self-attention heatmaps]

Encoder Layer 6
[self-attention heatmaps]

Decoder Self Layer 2
[self-attention heatmaps]

Decoder Src Layer 2
[source-attention heatmaps]

Decoder Self Layer 4
[self-attention heatmaps]

Decoder Src Layer 4
[source-attention heatmaps]

Decoder Self Layer 6
[self-attention heatmaps]

Decoder Src Layer 6
[source-attention heatmaps]

References