Hands-on Transformer (2)


The Decoder is similar to the Encoder: it is a stack of N decoder layers. It takes three inputs: the token embeddings, the positional encoding, and the output of the last Encoder layer.

class Decoder(tf.keras.layers.Layer):
    '''
    Decoder: input embedding; positional encoding; stack of Decoder layers
    '''
    def __init__(self, num_layers, dim_model, num_heads, dim_ff,
                 target_vocab_size, rate=0.1):
        super(Decoder, self).__init__()
        self.dim_model = dim_model
        self.num_layers = num_layers
        self.embedding = tf.keras.layers.Embedding(target_vocab_size, self.dim_model)
        self.pos_encoding = positional_encoding(target_vocab_size, self.dim_model)
        # build the stack of Decoder layers
        self.dec_layers = [DecoderLayer(dim_model, num_heads, dim_ff, rate)
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        x = self.embedding(x)  # (batch_size, target_seq_len, dim_model)
        x += self.pos_encoding[:, :seq_len, :]
        x = self.dropout(x, training=training)  # apply dropout to the summed embeddings

        for i, layer in enumerate(self.dec_layers):
            x, block1, block2 = layer(x, enc_output, training,
                                      look_ahead_mask, padding_mask)
            attention_weights['decoder_layer{}_block1'.format(i + 1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i + 1)] = block2

        # x.shape == (batch_size, target_seq_len, dim_model)
        return x, attention_weights
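The `positional_encoding` helper referenced above is assumed to be defined in the previous part of this series and is not repeated here. For reference, a minimal sketch of the standard sinusoidal encoding (names and signature are assumptions, not necessarily identical to part 1) could look like this:

import numpy as np
import tensorflow as tf

def get_angles(pos, i, d_model):
    # Each embedding dimension i gets its own frequency.
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates

def positional_encoding(position, d_model):
    # Angle matrix of shape (position, d_model).
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :],
                            d_model)
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])  # sin on even indices
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])  # cos on odd indices
    pos_encoding = angle_rads[np.newaxis, ...]         # (1, position, d_model)
    return tf.cast(pos_encoding, dtype=tf.float32)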

A single decoder layer has three sub-layers: a masked attention layer, an encoder-decoder attention layer, and a point-wise feed-forward network. The masked attention layer masks out the positions in the sequence that have not been seen yet (future tokens); the second attention layer uses the target tokens as the query and the Encoder output as the key and value.

class DecoderLayer(tf.keras.layers.Layer):
    '''
    Decoder layer: masked multi-head attention; add & norm;
    multi-head attention; add & norm; feed-forward; add & norm
    '''
    def __init__(self, dim_model, num_heads, dim_ff, rate=0.1):
        super(DecoderLayer, self).__init__()
        self.mask_mha = MultiHeadAttention(dim_model, num_heads)
        self.mha = MultiHeadAttention(dim_model, num_heads)
        self.ffn = point_wise_feed_forward_network(dim_model, dim_ff)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        # masked multi-head self-attention (query = key = value = decoder input)
        mask_attn_output, attn_weights_block1 = self.mask_mha(x, x, x, look_ahead_mask)
        mask_attn_output = self.dropout1(mask_attn_output, training=training)
        out1 = self.layernorm1(x + mask_attn_output)  # (batch_size, target_seq_len, dim_model)

        # encoder-decoder multi-head attention (query = decoder state, key/value = encoder output)
        attn_output, attn_weights_block2 = self.mha(out1, enc_output, enc_output, padding_mask)
        attn_output = self.dropout2(attn_output, training=training)
        out2 = self.layernorm2(attn_output + out1)  # (batch_size, target_seq_len, dim_model)

        # point-wise feed-forward network
        ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, dim_model)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(out2 + ffn_output)  # (batch_size, target_seq_len, dim_model)

        return out3, attn_weights_block1, attn_weights_block2
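The `look_ahead_mask` and `padding_mask` passed into `call` are built outside the decoder. Assuming the convention where 1 marks a position to be blocked (the same convention used by the scaled dot-product attention sketch later in this post), minimal sketches of the two mask helpers (names are illustrative) are:

def create_look_ahead_mask(size):
    # Upper-triangular matrix of ones: position i cannot attend to positions j > i.
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask  # (seq_len, seq_len)

def create_padding_mask(seq):
    # 1 wherever the token id is 0 (padding); shaped to broadcast over attention logits.
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

With these helpers and the `MultiHeadAttention` / `point_wise_feed_forward_network` implementations from the previous part, the decoder stack can be smoke-tested on random data; passing `padding_mask=None` assumes the part-1 attention code skips masking when the mask is None:

sample_decoder = Decoder(num_layers=2, dim_model=512, num_heads=8,
                         dim_ff=2048, target_vocab_size=8000)
target = tf.random.uniform((64, 26), maxval=8000, dtype=tf.int64)  # dummy target token ids
enc_output = tf.random.uniform((64, 62, 512))                      # dummy encoder output
look_ahead_mask = create_look_ahead_mask(26)
out, attn = sample_decoder(target, enc_output, training=False,
                           look_ahead_mask=look_ahead_mask, padding_mask=None)
print(out.shape)  # expected: (64, 26, 512)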

Attention


Scaled dot-product attention

\[ \mathrm{Attention}(Q,K,V)=\mathrm{softmax}\left(\frac{QK^{T}}{\sqrt{d_k}}\right)V \]
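As a concrete illustration of this formula, here is a minimal sketch of scaled dot-product attention in the same TensorFlow style as the code above (the exact function used inside the `MultiHeadAttention` from the previous part may differ; the mask convention assumed here is 1 = blocked position):

def scaled_dot_product_attention(q, k, v, mask=None):
    # q: (..., seq_len_q, d_k), k: (..., seq_len_k, d_k), v: (..., seq_len_k, d_v)
    matmul_qk = tf.matmul(q, k, transpose_b=True)            # Q K^T, (..., seq_len_q, seq_len_k)
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)   # divide by sqrt(d_k)
    if mask is not None:
        # Blocked positions get a large negative logit, hence ~0 weight after softmax.
        scaled_attention_logits += (mask * -1e9)
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
    output = tf.matmul(attention_weights, v)                 # (..., seq_len_q, d_v)
    return output, attention_weights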
