# The encoder output is fed into the encoder-decoder attention sublayer of every Decoder. To build the
# chain of Decoders in PyTorch so that the full stack can be placed inside a Sequential block, we pass the
# encoder output to the first Decoder and have every Decoder return it (together with the actual Decoder
# output) as part of its own output, so that the next Decoder in the stack can consume both the Encoder
# and Decoder outputs. A sketch of assembling and calling such a stack follows the forward() method below.
def forward(self, input):
encoder_output, previous_stage_output = input
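# Masked multi-headed self-attention: Q, K and V are all derived from the previous stage's output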
masked_mh_output = self.masked_multiheaded_attention_layer(
self.masked_qkv_source.forward(previous_stage_output))
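# Encoder-decoder (cross) attention: its Q/K/V are built from the encoder output and the masked attention output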
input_qkv = self.unmasked_qkv_source.forward((encoder_output, masked_mh_output))
mh_output = self.multiheaded_attention_layer(input_qkv)
# Residual connection from the block input around the attention sublayers, followed by layer normalization
layer_normed_multihead_output = self.layer_norm(mh_output + previous_stage_output)
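# Position-wise feed-forward network, applied to each position's attention vector independently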
ffnn_outputs = torch.stack(
list(map(lambda attention_vector: self.feedforward_layer(attention_vector), layer_normed_multihead_output)))
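# Second residual connection and layer normalization, around the feed-forward sublayer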
layer_normed_ffnn_output = self.layer_norm(ffnn_outputs + layer_normed_multihead_output)
return (encoder_output, layer_normed_ffnn_output)
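
# A minimal sketch (an assumption, not part of the original code) of how a stack of these Decoders could be
# chained: because every Decoder consumes and returns an (encoder_output, decoder_output) tuple, the whole
# stack fits inside torch.nn.Sequential. The constructor arguments of Decoder are elided, and names such as
# num_decoders and target_embeddings are illustrative placeholders.
#
#   decoder_stack = torch.nn.Sequential(*[Decoder(...) for _ in range(num_decoders)])
#   # The first Decoder receives the encoder output and the embedded, position-encoded target sequence;
#   # each Decoder passes the encoder output through unchanged, so the tuple flows end to end.
#   encoder_output, decoder_output = decoder_stack((encoder_output, target_embeddings))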