Implementing a Transformer Model with TensorFlow

1. The Transformer Model

import tensorflow as tf 
from official.transformer.model import attention_layer
from official.transformer.model import beam_search
from official.transformer.model import embedding_layer
from official.transformer.model import ffn_layer
from official.transformer.model import model_utils
from official.transformer.utils.tokenizer import EOS_ID
 
class Transformer(object):
    """
    Transformer model, built from an encoder and a decoder. The input is an int sequence;
    the encoder produces a continuous representation, and the decoder uses the encoder
    output to produce probabilities for the output sequence.
    """
    def __init__(self, params, train):
        """
        Initialize the Transformer model.
        :param params: hyperparameters, e.g. layer size, dropout rate, etc.
        :param train: whether the model is in training mode (dropout is only applied in training)
        """
        self.train = train
        self.params = params

        # Create the embedding layer (shared input/output embedding) and positional encoding.
        # matmul is faster on TPU; gather is faster on CPU/GPU.
        self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights(
            params['vocab_size'], params['hidden_size'],
            method='matmul' if params['tpu'] else 'gather'
        )

        # Encoder and decoder stacks (6 layers each by default), defined below.
        self.encoder_stack = EncoderStack(params, train)
        self.decoder_stack = DecoderStack(params, train)
    def __call__(self, inputs, targets=None):
        """
        Compute the model output for training or prediction.
        :param inputs: int tensor, shape [batch_size, input_length]
        :param targets: None, or int tensor of shape [batch_size, target_length]
        :return: in training mode, logits of shape [batch_size, target_length, vocab_size];
        in prediction mode, a dict:
        {
            outputs: [batch_size, decoded_length]
            scores: [batch_size] sequence scores (log probabilities from beam search)
        }
        """
        # Variance-scaling initializer
        initializer = tf.variance_scaling_initializer(
            self.params['initializer_gain'], mode='fan_avg',
            distribution='uniform'
        )

        with tf.variable_scope('transformer', initializer=initializer):
            # Compute the attention bias used by the encoder and decoder
            attention_bias = model_utils.get_padding_bias(inputs)

            # Get the encoder output
            encoder_outputs = self.encode(inputs, attention_bias)

            # Training and prediction modes return different outputs
            if targets is None:
                return self.predict(encoder_outputs, attention_bias)
            else:
                logits = self.decode(targets, encoder_outputs, attention_bias)
                return logits
    def encode(self, inputs, attention_bias):
        """
        :param inputs: int tensor, shape [batch_size, input_length]
        :param attention_bias: float tensor, shape [batch_size, 1, 1, input_length]
        :return: float tensor, shape [batch_size, input_length, hidden_size]
        """
        with tf.name_scope('encode'):
            # The encoder input is the sum of the input embedding and the positional encoding,
            # followed by dropout. Note that the two tensors must have matching dimensions
            # so that they can be added.

            embedding_inputs = self.embedding_softmax_layer(inputs)
            inputs_padding = model_utils.get_padding(inputs)

            with tf.name_scope('add_pos_encoding'):
                length = tf.shape(embedding_inputs)[1]
                pos_encoding = model_utils.get_position_encoding(
                    length, self.params['hidden_size']
                )
                encoder_inputs = embedding_inputs + pos_encoding

            # Apply dropout in training mode
            if self.train:
                encoder_inputs = tf.nn.dropout(
                    encoder_inputs, 1 - self.params['layer_postprocess_dropout']
                )

            # Both the encoder and decoder stacks default to 6 layers
            return self.encoder_stack(encoder_inputs, attention_bias, inputs_padding)
 
    def decode(self, targets, encoder_outputs, attention_bias):
        """
        :param targets: int tensor, shape [batch_size, target_length]
        :param encoder_outputs: float tensor, shape [batch_size, input_length, hidden_size]
        :param attention_bias: float tensor, shape [batch_size, 1, 1, input_length]
        :return: float tensor, shape [batch_size, target_length, vocab_size]
        """
        with tf.name_scope('decode'):
            # Shift the decoder input one position to the right (prepend the start symbol and
            # drop the last position), so the model learns to predict the full targets.
            # Then add the positional encoding and apply dropout.
            decoder_inputs = self.embedding_softmax_layer(targets)

            # Shift right by one position and drop the last position
            with tf.name_scope('shift_targets'):
                decoder_inputs = tf.pad(
                    decoder_inputs, [[0, 0], [1, 0], [0, 0]]
                )[:, :-1, :]

            # Add the positional encoding
            with tf.name_scope('add_pos_encoding'):
                length = tf.shape(decoder_inputs)[1]
                decoder_inputs += model_utils.get_position_encoding(
                    length, self.params['hidden_size']
                )

            # Apply dropout in training mode
            if self.train:
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs, 1 - self.params['layer_postprocess_dropout']
                )

            # Bias that prevents a position from attending to later positions (decoder self-attention)
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(length)

            outputs = self.decoder_stack(
                decoder_inputs, encoder_outputs, decoder_self_attention_bias, attention_bias
            )

            logits = self.embedding_softmax_layer.linear(outputs)

            return logits
    def _get_symbols_to_logits_fn(self, max_decode_length):
        """
        Return a function that computes the logits for the next token.
        :param max_decode_length: maximum number of decoding steps
        :return: symbols_to_logits_fn
        """
        timing_signal = model_utils.get_position_encoding(
            max_decode_length + 1, self.params['hidden_size']
        )
        decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(max_decode_length)

        def symbols_to_logits_fn(ids, i, cache):
            """
            Generate the logits for the next token ID.
            :param ids: the sequence decoded so far
            :param i: loop index (current decoding step)
            :param cache: dict holding the encoder output, the encoder-decoder attention bias,
            and the previously computed decoder attention key/value tensors
            :return: ([batch_size * beam_size, vocab_size], updated cache values)
            """

            # Use only the last generated ID as the decoder input
            decoder_input = ids[:, -1:]

            # Embed the decoder input and add the timing signal for the current position
            decoder_input = self.embedding_softmax_layer(decoder_input)
            decoder_input += timing_signal[i:i + 1]

            self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
            decoder_outputs = self.decoder_stack(
                decoder_input, cache.get('encoder_outputs'), self_attention_bias,
                cache.get('encoder_decoder_attention_bias'), cache
            )

            # The model ends with a linear projection followed by softmax
            logits = self.embedding_softmax_layer.linear(decoder_outputs)
            logits = tf.squeeze(logits, axis=[1])

            return logits, cache
        return symbols_to_logits_fn
 
    def predict(self, encoder_outputs, encoder_decoder_attention_bias):
        """
        Predict the output sequence with beam search.
        :param encoder_outputs: float tensor, shape [batch_size, input_length, hidden_size]
        :param encoder_decoder_attention_bias: float tensor, shape [batch_size, 1, 1, input_length]
        :return: dict with the decoded IDs and their scores
        """
        batch_size = tf.shape(encoder_outputs)[0]
        input_length = tf.shape(encoder_outputs)[1]
        max_decode_length = input_length + self.params['extra_decode_length']
        symbols_to_logits_fn = self._get_symbols_to_logits_fn(max_decode_length)

        # Initial IDs fed to symbols_to_logits_fn
        initial_ids = tf.zeros(shape=[batch_size], dtype=tf.int32)

        # Cache storing the decoder attention key/value tensors for each layer
        cache = {
            'layer_%d' % layer: {
                'k': tf.zeros([batch_size, 0, self.params['hidden_size']]),
                'v': tf.zeros([batch_size, 0, self.params['hidden_size']])
            } for layer in range(self.params['num_hidden_layers'])
        }

        cache['encoder_outputs'] = encoder_outputs
        cache['encoder_decoder_attention_bias'] = encoder_decoder_attention_bias

        # Search for the output sequence with beam search
        decoded_ids, scores = beam_search.sequence_beam_search(
            symbols_to_logits_fn=symbols_to_logits_fn,
            initial_ids=initial_ids,
            initial_cache=cache,
            vocab_size=self.params["vocab_size"],
            beam_size=self.params["beam_size"],
            alpha=self.params["alpha"],
            max_decode_length=max_decode_length,
            eos_id=EOS_ID
        )

        # Keep only the top sequence for each batch element (dropping the initial ID)
        top_decoded_ids = decoded_ids[:, 0, 1:]
        top_scores = scores[:, 0]

        return {'outputs': top_decoded_ids, "scores": top_scores}
class LayerNormalization(tf.keras.layers.Layer):
    """Layer normalization."""
    def __init__(self, hidden_size):
        super(LayerNormalization, self).__init__()
        self.hidden_size = hidden_size

    def build(self, _):
        self.scale = tf.get_variable('layer_norm_scale', [self.hidden_size], initializer=tf.ones_initializer())
        self.bias = tf.get_variable('layer_norm_bias', [self.hidden_size], initializer=tf.zeros_initializer())
        self.built = True
 
    def call(self, x, epsilon=1e-6):
        mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
        variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True)
        norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
        return norm_x * self.scale + self.bias
class PrePostProcessingWrapper(object):
    """
    Wrapper applied to each sub-layer (the attention layers and the feed-forward network):
    layer normalization before the layer, dropout and a residual connection after it.
    """
    def __init__(self, layer, params, train):
        self.layer = layer
        # Dropout is applied after every sub-layer
        self.postprocess_dropout = params['layer_postprocess_dropout']
        self.train = train
        self.layer_norm = LayerNormalization(params['hidden_size'])

    def __call__(self, x, *args, **kwargs):
        # Layer normalization
        y = self.layer_norm(x)

        y = self.layer(y, *args, **kwargs)

        # Apply dropout in training mode, then add the residual connection
        if self.train:
            y = tf.nn.dropout(y, 1 - self.postprocess_dropout)
        return x + y
class EncoderStack(tf.keras.layers.Layer):
    """
    The encoder stack defaults to 6 layers. Each layer has two sub-layers:
    1. a self-attention layer, and 2. a feed-forward network (itself made of two dense layers).
    """
    def __init__(self, params, train):
        super(EncoderStack, self).__init__()
        self.layers = []

        for _ in range(params['num_hidden_layers']):
            # Create the sub-layers
            # Multi-head attention uses 8 heads by default
            self_attention_layer = attention_layer.SelfAttention(
                params['hidden_size'], params['num_heads'],
                params['attention_dropout'], train
            )
            feed_forward_network = ffn_layer.FeedFowardNetwork(
                params['hidden_size'], params['filter_size'],
                params['relu_dropout'], train, params['allow_ffn_pad']
            )

            self.layers.append([
                PrePostProcessingWrapper(self_attention_layer, params, train),
                PrePostProcessingWrapper(feed_forward_network, params, train)
            ])

        # Final layer normalization, applied to the output of the stack
        self.output_normalization = LayerNormalization(params['hidden_size'])

    def call(self, encoder_inputs, attention_bias, inputs_padding):
        """
        Return the output of the stacked encoder layers.
        :param encoder_inputs: float tensor, shape [batch_size, input_length, hidden_size]
        :param attention_bias: shape [batch_size, 1, 1, input_length]
        :param inputs_padding: shape [batch_size, input_length]
        :return: float tensor, shape [batch_size, input_length, hidden_size]
        """
        for n, layer in enumerate(self.layers):
            self_attention_layer = layer[0]
            feed_forward_network = layer[1]

            with tf.variable_scope('layer_%d' % n):
                with tf.variable_scope('self_attention'):
                    encoder_inputs = self_attention_layer(encoder_inputs, attention_bias)
                with tf.variable_scope('ffn'):
                    encoder_inputs = feed_forward_network(encoder_inputs, inputs_padding)
        return self.output_normalization(encoder_inputs)
 
class DecoderStack(tf.keras.layers.Layer):
    """
    Same number of layers as the encoder, but each decoder layer has three sub-layers:
    1. a self-attention layer,
    2. a multi-head attention layer that attends over the encoder output,
    3. a feed-forward network (itself made of two dense layers).
    """
    def __init__(self, params, train):
        super(DecoderStack, self).__init__()
        self.layers = []

        for _ in range(params['num_hidden_layers']):

            # Self-attention layer
            self_attention_layer = attention_layer.SelfAttention(
                params['hidden_size'], params['num_heads'],
                params['attention_dropout'], train
            )
            # Multi-head attention over the encoder output and the previous attention layer's output
            enc_dec_attention_layer = attention_layer.Attention(
                params['hidden_size'], params['num_heads'],
                params['attention_dropout'], train
            )
            # Feed-forward network (two dense layers)
            feed_forward_network = ffn_layer.FeedFowardNetwork(
                params['hidden_size'], params['filter_size'],
                params['relu_dropout'], train, params['allow_ffn_pad']
            )

            self.layers.append([
                PrePostProcessingWrapper(self_attention_layer, params, train),
                PrePostProcessingWrapper(enc_dec_attention_layer, params, train),
                PrePostProcessingWrapper(feed_forward_network, params, train)
            ])

        # Finally, add layer normalization
        self.output_normalization = LayerNormalization(params['hidden_size'])

    def call(self, decoder_inputs, encoder_outputs, decoder_self_attention_bias,
             attention_bias, cache=None):
        """
        :param decoder_inputs: shape [batch_size, target_length, hidden_size]
        :param encoder_outputs: shape [batch_size, input_length, hidden_size]
        :param decoder_self_attention_bias: shape [1, 1, target_length, target_length]
        :param attention_bias: shape [batch_size, 1, 1, input_length]
        :param cache: key/value cache used during prediction
        :return: float tensor, shape [batch_size, target_length, hidden_size]
        """

        for n, layer in enumerate(self.layers):

            # The three sub-layers of each decoder layer
            self_attention_layer = layer[0]
            enc_dec_attention_layer = layer[1]
            feed_forward_network = layer[2]

            layer_name = "layer_%d" % n
            layer_cache = cache[layer_name] if cache is not None else None

            # Feed the input through the sub-layers
            with tf.variable_scope(layer_name):
                with tf.variable_scope('self_attention'):
                    decoder_inputs = self_attention_layer(
                        decoder_inputs, decoder_self_attention_bias, cache=layer_cache
                    )
                with tf.variable_scope('encdec_attention'):
                    decoder_inputs = enc_dec_attention_layer(
                        decoder_inputs, encoder_outputs, attention_bias
                    )
                with tf.variable_scope('ffn'):
                    decoder_inputs = feed_forward_network(decoder_inputs)
        # Final layer normalization
        return self.output_normalization(decoder_inputs)
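
To make the call flow above concrete, here is a minimal usage sketch. It is an illustration rather than part of the original code: it assumes TF 1.x graph mode and that `params` carries every key the model reads, including `tpu` and `alpha`, which are not listed in the BASE_PARAMS of section 5.

import tensorflow as tf

# Hypothetical params: start from BASE_PARAMS (section 5) and add the extra keys read above.
params = BASE_PARAMS.copy()
params.update(tpu=False, alpha=0.6)

inputs = tf.placeholder(tf.int32, [None, None])   # [batch_size, input_length]
targets = tf.placeholder(tf.int32, [None, None])  # [batch_size, target_length]

model = Transformer(params, train=True)

# Training: targets are given, so __call__ returns logits used to build the loss.
logits = model(inputs, targets)                   # [batch_size, target_length, vocab_size]

# Prediction: no targets, so __call__ runs beam search and returns IDs and scores.
# (In practice the training and inference graphs are usually built separately.)
predictions = model(inputs)                       # {'outputs': ..., 'scores': ...}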

2. Attention

import tensorflow as tf
 
class Attention(tf.keras.layers.Layer):
    """
    Multi-head attention layer.
    """
    def __init__(self, hidden_size, num_heads, attention_dropout, train):
        # hidden_size must be divisible by num_heads
        if hidden_size % num_heads != 0:
            raise ValueError('Hidden size must be evenly divisible by the number of heads.')
        super(Attention, self).__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.attention_dropout = attention_dropout
        self.train = train

        # Linear projections that compute 'q', 'k', 'v'
        self.q_dense_layer = tf.keras.layers.Dense(hidden_size, use_bias=False, name='q')
        self.k_dense_layer = tf.keras.layers.Dense(hidden_size, use_bias=False, name='k')
        self.v_dense_layer = tf.keras.layers.Dense(hidden_size, use_bias=False, name='v')

        # Output projection of the attention layer
        self.output_dense_layer = tf.keras.layers.Dense(hidden_size, use_bias=False, name='output_transform')

    def split_heads(self, x):
        """
        Split x into the different attention heads and transpose the result
        (the transpose makes the dimensions line up for the matrix multiplications).
        :param x: shape [batch_size, length, hidden_size]
        :return: shape [batch_size, num_heads, length, hidden_size/num_heads]
        """

        with tf.name_scope('split_heads'):
            batch_size = tf.shape(x)[0]
            length = tf.shape(x)[1]

            # Depth of the last dimension after the split
            depth = (self.hidden_size // self.num_heads)

            # Split the last dimension
            x = tf.reshape(x, [batch_size, length, self.num_heads, depth])

            # Transpose the result to [batch_size, num_heads, length, depth]
            return tf.transpose(x, [0, 2, 1, 3])

    def combine_heads(self, x):
        """
        Concatenate the split tensors again (the inverse of split_heads); the input is the
        output of split_heads.
        :param x: shape [batch_size, num_heads, length, hidden_size/num_heads]
        :return: shape [batch_size, length, hidden_size]
        """
        with tf.name_scope('combine_heads'):
            batch_size = tf.shape(x)[0]
            length = tf.shape(x)[2]

            # [batch_size, length, num_heads, depth]
            x = tf.transpose(x, [0, 2, 1, 3])
            return tf.reshape(x, [batch_size, length, self.hidden_size])
 
 
    def call(self, x, y, bias, cache=None):
        """
        :param x: shape [batch_size, length_x, hidden_size]
        :param y: shape [batch_size, length_y, hidden_size]
        :param bias: added to the scaled dot-product result
        :param cache: used in prediction mode; a dict of the form:
        {
            'k': shape [batch_size, i, key_channels],
            'v': shape [batch_size, i, value_channels]
        }
        where i is the current decoded length
        :return: shape [batch_size, length_x, hidden_size]
        """
        # Compute 'q', 'k', 'v'
        q = self.q_dense_layer(x)
        k = self.k_dense_layer(y)
        v = self.v_dense_layer(y)

        # Prediction mode
        if cache is not None:
            # Concatenate with the cached k and v values
            k = tf.concat([cache['k'], k], axis=1)
            v = tf.concat([cache['v'], v], axis=1)

            # Update the cache
            cache['k'] = k
            cache['v'] = v

        # Split q, k, v into heads
        q = self.split_heads(q)
        k = self.split_heads(k)
        v = self.split_heads(v)

        # Scale q to prevent the dot product between q and k from growing too large
        depth = (self.hidden_size // self.num_heads)
        q *= depth ** -0.5

        # Dot product, with k transposed
        logits = tf.matmul(q, k, transpose_b=True)
        logits += bias
        weights = tf.nn.softmax(logits, name='attention_weights')

        # Apply dropout in training mode
        if self.train:
            weights = tf.nn.dropout(weights, 1.0 - self.attention_dropout)
        attention_output = tf.matmul(weights, v)

        # Recombine the per-head results
        attention_output = self.combine_heads(attention_output)

        # Final output projection
        attention_output = self.output_dense_layer(attention_output)

        return attention_output
 
class SelfAttention(Attention):
    """Multi-head self-attention layer (queries, keys and values all come from x)."""

    def call(self, x, bias, cache=None):
        return super(SelfAttention, self).call(x, x, bias, cache)
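
Per head, the layer above computes scaled dot-product attention, softmax(QK^T / sqrt(d_k) + bias) V. The standalone NumPy sketch below (an illustration only; the bias term is omitted) walks through the same split_heads / combine_heads shape bookkeeping on a toy tensor:

import numpy as np

batch_size, length, hidden_size, num_heads = 2, 5, 16, 4
depth = hidden_size // num_heads
x = np.random.randn(batch_size, length, hidden_size).astype(np.float32)

# split_heads: [batch, length, hidden] -> [batch, heads, length, depth]
def split_heads(t):
    return t.reshape(batch_size, -1, num_heads, depth).transpose(0, 2, 1, 3)

# combine_heads: the inverse operation
def combine_heads(t):
    return t.transpose(0, 2, 1, 3).reshape(batch_size, -1, hidden_size)

q = k = v = split_heads(x)                                # self-attention: q, k, v come from x
logits = (q * depth ** -0.5) @ k.transpose(0, 1, 3, 2)    # [batch, heads, length, length]
logits = logits - logits.max(axis=-1, keepdims=True)      # numerical stability
weights = np.exp(logits) / np.exp(logits).sum(axis=-1, keepdims=True)  # softmax over keys
out = combine_heads(weights @ v)                          # back to [batch, length, hidden]

print(out.shape)  # (2, 5, 16)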

3. Embedding

import tensorflow as tf
from official.transformer.model import model_utils
from official.utils.accelerator import tpu as tpu_utils
 
class EmbeddingSharedWeights(tf.keras.layers.Layer):
    """
    Input embedding for the encoder and decoder; the weights are also shared with the
    pre-softmax linear projection.
    """
    def __init__(self, vocab_size, hidden_size, method='gather'):
        """
        :param vocab_size: number of tokens, usually fewer than about 32,000
        :param hidden_size: embedding size, usually 512 or 1024
        :param method: 'gather' is better suited to CPU/GPU; 'matmul' is faster on TPU
        """
        super(EmbeddingSharedWeights, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

        if method not in ('gather', 'matmul'):
            raise ValueError("method {} must be 'gather' or 'matmul'".format(method))
        self.method = method

    def build(self, _):
        with tf.variable_scope('embedding_and_softmax', reuse=tf.AUTO_REUSE):
            # Create and initialize the shared weights
            self.shared_weights = tf.get_variable(
                'weights', shape=[self.vocab_size, self.hidden_size],
                initializer=tf.random_normal_initializer(
                    0., self.hidden_size ** -0.5
                )
            )
        self.built = True
    def call(self, x):
        """
        Embed x.
        :param x: int tensor, shape [batch_size, length]
        :return: embeddings, shape [batch_size, length, hidden_size]; padding positions are zeroed out.

        The model expects fixed-length (padded) inputs; later variants such as Transformer-XL
        relax this and support variable lengths.
        """
        with tf.name_scope('embedding'):
            # Binary mask: 0 at padding positions, 1 elsewhere
            mask = tf.to_float(tf.not_equal(x, 0))

            if self.method == 'gather':
                embeddings = tf.gather(self.shared_weights, x)
                embeddings *= tf.expand_dims(mask, -1)
            else:
                embeddings = tpu_utils.embedding_matmul(
                    embedding_table=self.shared_weights,
                    values=tf.cast(x, dtype=tf.int32),
                    mask=mask
                )

            # Scale the embeddings
            embeddings *= self.hidden_size ** 0.5
        return embeddings
 
    def linear(self, x):
        """
        Compute the pre-softmax logits by reusing the embedding weights.
        :param x: shape [batch_size, length, hidden_size]
        :return: shape [batch_size, length, vocab_size]
        """
        with tf.name_scope('presoftmax_linear'):
            batch_size = tf.shape(x)[0]
            length = tf.shape(x)[1]

            x = tf.reshape(x, [-1, self.hidden_size])

            # Multiply by the transposed shared_weights
            logits = tf.matmul(x, self.shared_weights, transpose_b=True)

            return tf.reshape(logits, [batch_size, length, self.vocab_size])
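
Weight sharing means one [vocab_size, hidden_size] matrix serves both as the lookup table on the way in and as the pre-softmax projection on the way out. A toy NumPy sketch of that idea (illustration only; the sizes are made up):

import numpy as np

vocab_size, hidden_size = 10, 4
shared_weights = np.random.randn(vocab_size, hidden_size).astype(np.float32)

ids = np.array([[1, 3, 0]])                    # [batch=1, length=3]; 0 is the padding ID

# "call": gather rows, zero out padding, scale by sqrt(hidden_size)
mask = (ids != 0).astype(np.float32)           # [1, 3]
embeddings = shared_weights[ids] * mask[..., None] * hidden_size ** 0.5   # [1, 3, 4]

# "linear": project hidden states back to vocabulary logits with the same matrix, transposed
hidden = np.random.randn(1, 3, hidden_size).astype(np.float32)
logits = hidden.reshape(-1, hidden_size) @ shared_weights.T               # [3, vocab_size]
logits = logits.reshape(1, 3, vocab_size)

print(embeddings.shape, logits.shape)          # (1, 3, 4) (1, 3, 10)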

4. FFN Layer

import tensorflow as tf
 
class FeedFowardNetwork(tf.keras.layers.Layer):
    """
    Fully connected feed-forward network, made of two dense layers.
    """
    def __init__(self, hidden_size, filter_size, relu_dropout, train, allow_pad):
        super(FeedFowardNetwork, self).__init__()

        self.hidden_size = hidden_size
        self.filter_size = filter_size
        self.relu_dropout = relu_dropout
        self.train = train

        # The model expects fixed-length (padded) inputs; allow_pad enables the
        # padding-removal optimization below.
        self.allow_pad = allow_pad

        self.filter_dense_layer = tf.keras.layers.Dense(
            filter_size, use_bias=True, activation=tf.nn.relu,
            name='filter_layer'
        )
        self.output_dense_layer = tf.keras.layers.Dense(
            hidden_size, use_bias=True, name='output_layer'
        )

    def call(self, x, padding=None):
        """
        Return the output of the feed-forward network.
        :param x: shape [batch_size, length, hidden_size]
        :param padding: shape [batch_size, length]
        :return: shape [batch_size, length, hidden_size]
        """
        padding = None if not self.allow_pad else padding

        # Known dynamic shapes
        batch_size = tf.shape(x)[0]
        length = tf.shape(x)[1]

        if padding is not None:
            with tf.name_scope('remove_padding'):
                pad_mask = tf.reshape(padding, [-1])
                nonpad_ids = tf.to_int32(tf.where(pad_mask < 1e-9))

                # Flatten x to [batch_size * length, hidden_size] and keep only the
                # non-padding positions
                x = tf.reshape(x, [-1, self.hidden_size])
                x = tf.gather_nd(x, indices=nonpad_ids)

                # Add back a leading dimension of size 1
                x.set_shape([None, self.hidden_size])
                x = tf.expand_dims(x, axis=0)

        output = self.filter_dense_layer(x)

        # Apply dropout in training mode
        if self.train:
            output = tf.nn.dropout(output, 1.0 - self.relu_dropout)
        output = self.output_dense_layer(output)

        if padding is not None:
            with tf.name_scope('re_add_padding'):
                # Remove the leading dimension of size 1
                output = tf.squeeze(output, axis=0)
                # Scatter the non-padding results back to their original positions
                output = tf.scatter_nd(
                    indices=nonpad_ids,
                    updates=output,
                    shape=[batch_size * length, self.hidden_size]
                )
                output = tf.reshape(output, [batch_size, length, self.hidden_size])

        return output
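
The padding-removal path above runs the dense layers only on real tokens: non-padding positions are gathered into a smaller matrix, transformed, and then scattered back into a zero tensor of the original shape. A small NumPy sketch of the same gather/scatter pattern (illustration only; the transform below is a made-up stand-in for the two dense layers):

import numpy as np

batch_size, length, hidden_size = 2, 4, 3
x = np.random.randn(batch_size, length, hidden_size).astype(np.float32)
padding = np.array([[0, 0, 1, 1],          # 1 marks padding positions
                    [0, 1, 1, 1]], dtype=np.float32)

# Gather only the non-padding rows
flat_x = x.reshape(-1, hidden_size)                    # [batch*length, hidden]
nonpad_ids = np.where(padding.reshape(-1) < 1e-9)[0]   # indices of real tokens
compact = flat_x[nonpad_ids]                           # [num_real_tokens, hidden]

# Hypothetical per-token transform standing in for the two dense layers
transformed = np.maximum(compact, 0.0) * 2.0

# Scatter the results back to their original positions; padding rows stay zero
output = np.zeros_like(flat_x)
output[nonpad_ids] = transformed
output = output.reshape(batch_size, length, hidden_size)

print(compact.shape, output.shape)   # (3, 3) (2, 4, 3)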

5. Model Parameters

from collections import defaultdict

"""
Base model parameter configuration.
"""
BASE_PARAMS = defaultdict(
    lambda: None,

    # Input parameters; the batch size should be chosen with available memory in mind
    default_batch_size=2048,       # batch size on CPU/GPU
    default_batch_size_tpu=32768,
    max_length=256,                # maximum length of a single example

    # Model parameters
    initializer_gain=1.0,          # gain used to initialize trainable variables
    vocab_size=33708,              # vocabulary size
    hidden_size=512,               # hidden size (output width of the feed-forward network's second layer)
    num_hidden_layers=6,           # number of encoder/decoder layers
    num_heads=8,                   # number of attention heads
    filter_size=2048,              # inner (filter) size of the feed-forward network

    # Dropout parameters
    layer_postprocess_dropout=0.1, # dropout applied around the residual connections
    attention_dropout=0.1,         # dropout inside multi-head attention
    relu_dropout=0.1,              # dropout inside the feed-forward network

    # Training parameters
    label_smoothing=0.1,           # label smoothing, helps prevent overconfidence/overfitting
    learning_rate=2.0,             # learning rate
    learning_rate_decay_rate=1.0,  # learning rate decay factor
    learning_rate_warmup_steps=16000,  # number of warmup steps

    # Adam optimizer parameters
    optimizer_adam_beta1=0.9,
    optimizer_adam_beta2=0.997,
    optimizer_adam_epsilon=1e-09,

    # Prediction-mode parameters
    extra_decode_length=50,
    beam_size=4,

    # TPU-related parameters
    use_tpu=False,
    static_batch=False,
    allow_ffn_pad=True,
)
 
# Big model configuration (suited to TPU-scale training)
BIG_PARAMS = BASE_PARAMS.copy()
BIG_PARAMS.update(
     
    default_batch_size=4096,
    default_batch_size_tpu=16384,
 
    hidden_size=1024,
    filter_size=4096,
    num_heads=16,
)
 
# Parameters for multi-GPU training
BASE_MULTI_GPU_PARAMS = BASE_PARAMS.copy()
BASE_MULTI_GPU_PARAMS.update(
    learning_rate_warmup_steps=8000
)
 
# Parameters for training the big model on multiple GPUs
BIG_MULTI_GPU_PARAMS = BIG_PARAMS.copy()
BIG_MULTI_GPU_PARAMS.update(
    layer_postprocess_dropout=0.3,
    learning_rate_warmup_steps=8000
)
 
# Tiny parameter set for testing
TINY_PARAMS = BASE_PARAMS.copy()
TINY_PARAMS.update(
    default_batch_size=1024,
    default_batch_size_tpu=1024,
    hidden_size=32,
    num_heads=4,
    filter_size=256,
)
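
The learning_rate, learning_rate_warmup_steps and hidden_size entries together define a warmup-then-decay schedule. The sketch below follows the formula from "Attention Is All You Need" (rate proportional to hidden_size^-0.5 * min(step^-0.5, step * warmup^-1.5)); the exact scaling applied by the official training script may differ slightly, so treat it as an approximation of how these parameters interact.

# Hedged sketch (plain Python) of the warmup/decay schedule implied by the parameters above.
def transformer_learning_rate(step, hidden_size=512, warmup_steps=16000, base_rate=2.0):
    step = max(step, 1)
    return base_rate * hidden_size ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

for step in (1, 4000, 16000, 100000):
    print(step, transformer_learning_rate(step))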
