nn.Transformer() takes a 2D (unbatched) or 3D (batched) source tensor and target tensor and returns the 2D or 3D tensor computed by the Transformer model, as shown below:
# Build a small Transformer (d_model=4, nhead=2) and feed it an unbatched
# 2D (seq_len, d_model) source/target pair.
import torch
from torch import nn

tensor1 = torch.tensor([[8., -3., 0., 1.]])      # src: 1 token x 4 features
tensor2 = torch.tensor([[5., 9., -4., 8.],
                        [-2., 7., 3., 6.]])      # tgt: 2 tokens x 4 features
tensor1.requires_grad
tensor2.requires_grad
# False
torch.manual_seed(42)  # make the random weight initialization reproducible
tran1 = nn.Transformer(d_model=4, nhead=2)
tensor3 = tran1(src=tensor1, tgt=tensor2)
tensor3
# tensor([[1.5608, 0.1450, -0.6434, -1.0624],
# [0.8815, 1.0994, -1.1523, -0.8286]],
# grad_fn=<NativeLayerNormBackward0>)
# The output tracks gradients (model parameters require grad) even though
# the inputs do not:
tensor3.requires_grad
# True
tran1
# NOTE(fix): the feed-forward sizes below were transcribed as 6; with
# d_model=4 they are Linear(4 -> 2048) and Linear(2048 -> 4), matching the
# decoder layers and the LayerNorm((4,)) modules.
# Transformer(
# (encoder): TransformerEncoder(
# (layers): ModuleList(
# (0-5): 6 x TransformerEncoderLayer(
# (self_attn): MultiheadAttention(
# (out_proj): NonDynamicallyQuantizableLinear(
# in_features=4, out_features=4, bias=True
# )
# )
# (linear1): Linear(in_features=4, out_features=2048, bias=True)
# (dropout): Dropout(p=0.1, inplace=False)
# (linear2): Linear(in_features=2048, out_features=4, bias=True)
# (norm1): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
# (norm2): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
# (dropout1): Dropout(p=0.1, inplace=False)
# (dropout2): Dropout(p=0.1, inplace=False)
# )
# )
# (norm): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
# )
# (decoder): TransformerDecoder(
# (layers): ModuleList(
# (0-5): 6 x TransformerDecoderLayer(
# (self_attn): MultiheadAttention(
# (out_proj): NonDynamicallyQuantizableLinear(
# in_features=4, out_features=4, bias=True
# )
# )
# (multihead_attn): MultiheadAttention(
# (out_proj): NonDynamicallyQuantizableLinear(
# in_features=4, out_features=4, bias=True
# )
# )
# (linear1): Linear(in_features=4, out_features=2048, bias=True)
# (dropout): Dropout(p=0.1, inplace=False)
# (linear2): Linear(in_features=2048, out_features=4, bias=True)
# (norm1): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
# (norm2): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
# (norm3): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
# (dropout1): Dropout(p=0.1, inplace=False)
# (dropout2): Dropout(p=0.1, inplace=False)
# (dropout3): Dropout(p=0.1, inplace=False)
# )
# )
# (norm): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
# )
# )
# Inspect tran1's submodules and the attributes recorded from the constructor.
# NOTE(fix): `linear2` out_features was transcribed as 6 in both reprs below;
# with d_model=4 it is Linear(2048 -> 4), matching `linear1` and the
# LayerNorm((4,)) modules.
tran1.encoder
# TransformerEncoder(
# (layers): ModuleList(
# (0-5): 6 x TransformerEncoderLayer(
# (self_attn): MultiheadAttention(
# (out_proj): NonDynamicallyQuantizableLinear(
# in_features=4, out_features=4, bias=True
# )
# )
# (linear1): Linear(in_features=4, out_features=2048, bias=True)
# (dropout): Dropout(p=0.1, inplace=False)
# (linear2): Linear(in_features=2048, out_features=4, bias=True)
# (norm1): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
# (norm2): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
# (dropout1): Dropout(p=0.1, inplace=False)
# (dropout2): Dropout(p=0.1, inplace=False)
# )
# )
# (norm): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
# )
tran1.decoder
# TransformerDecoder(
# (layers): ModuleList(
# (0-5): 6 x TransformerDecoderLayer(
# (self_attn): MultiheadAttention(
# (out_proj): NonDynamicallyQuantizableLinear(
# in_features=4, out_features=4, bias=True
# )
# )
# (multihead_attn): MultiheadAttention(
# (out_proj): NonDynamicallyQuantizableLinear(
# in_features=4, out_features=4, bias=True
# )
# )
# (linear1): Linear(in_features=4, out_features=2048, bias=True)
# (dropout): Dropout(p=0.1, inplace=False)
# (linear2): Linear(in_features=2048, out_features=4, bias=True)
# (norm1): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
# (norm2): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
# (norm3): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
# (dropout1): Dropout(p=0.1, inplace=False)
# (dropout2): Dropout(p=0.1, inplace=False)
# (dropout3): Dropout(p=0.1, inplace=False)
# )
# )
# (norm): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
# )
tran1.d_model
# 4
tran1.nhead
# 2
tran1.batch_first
# False
# Re-seeding and re-constructing with the same arguments yields an identical
# model: tran2's weights (and the RNG state afterwards) match tran1's exactly.
torch.manual_seed(42)
tran2 = nn.Transformer(d_model=4, nhead=2)
# NOTE(fix): the original called tran1 here, leaving tran2 unused; tran2 has
# identical weights and sees the same RNG state, so the output is unchanged.
tran2(src=tensor2, tgt=tensor3)
# tensor([[-0.8631, 1.6747, -0.6517, -0.1599],
# [-0.0919, 1.6377, -0.5336, -1.0122]],
# grad_fn=<NativeLayerNormBackward0>)
torch.manual_seed(42)
# Every argument below (other than d_model and nhead) is the default value,
# so tran is equivalent to tran1 and reproduces its output exactly.
tran = nn.Transformer(d_model=4, nhead=2, num_encoder_layers=6,
                      num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,
                      activation='relu', custom_encoder=None, custom_decoder=None,
                      layer_norm_eps=1e-05, batch_first=False, norm_first=False,
                      bias=True, device=None, dtype=None)
tran(src=tensor1, tgt=tensor2, src_mask=None, tgt_mask=None,
     memory_mask=None, src_key_padding_mask=None,
     tgt_key_padding_mask=None, memory_key_padding_mask=None,
     src_is_causal=None, tgt_is_causal=None, memory_is_causal=False)
# tensor([[1.5608, 0.1450, -0.6434, -1.0624],
# [0.8815, 1.0994, -1.1523, -0.8286]],
# grad_fn=<NativeLayerNormBackward0>)
# With d_model=2, each row of the output has exactly 2 features; the +/-1
# values shown are consistent with the final LayerNorm((2,)) normalizing each
# 2-element feature vector (see the module reprs above).
tensor1 = torch.tensor([[8., -3.], [0., 1.]])
tensor2 = torch.tensor([[5., 9.], [-4., 8.],
[-2., 7.], [3., 6.]])
torch.manual_seed(42)
tran = nn.Transformer(d_model=2, nhead=2)
tran(src=tensor1, tgt=tensor2)
# tensor([[1.0000, -1.0000],
# [-1.0000, 1.0000],
# [-1.0000, 1.0000],
# [-1.0000, 1.0000]], grad_fn=<NativeLayerNormBackward0>)
# With d_model=1 the output is all zeros, as shown below — consistent with
# LayerNorm over a single feature centering every value to 0.
tensor1 = torch.tensor([[8.], [-3.], [0.], [1.]])
tensor2 = torch.tensor([[5.], [9.], [-4.], [8.],
[-2.], [7.], [3.], [6.]])
torch.manual_seed(42)
tran = nn.Transformer(d_model=1, nhead=1)
tran(src=tensor1, tgt=tensor2)
# tensor([[0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.]],
# grad_fn=<NativeLayerNormBackward0>)
# 3D (batched) input; with batch_first=False the layout is presumably
# (seq_len, batch, feature) — src (1, 4, 1), tgt (2, 4, 1). TODO confirm.
tensor1 = torch.tensor([[[8.], [-3.], [0.], [1.]]])
tensor2 = torch.tensor([[[5.], [9.], [-4.], [8.]],
[[-2.], [7.], [3.], [6.]]])
torch.manual_seed(42)
tran = nn.Transformer(d_model=1, nhead=1)
tran(src=tensor1, tgt=tensor2)
# tensor([[[0.], [0.], [0.], [0.]],
# [[0.], [0.], [0.], [0.]]], grad_fn=<NativeLayerNormBackward0>)
Transformer.generate_square_subsequent_mask() returns a 2D causal-mask tensor whose lower triangle (positions a token may attend to) holds 0. (default), 0.+0.j (complex dtype) or False (bool dtype), and whose upper triangle (masked future positions) holds -inf (default), -inf+0.j (complex dtype) or True (bool dtype), as shown below:
import torch
from torch import nn
tran = nn.Transformer()
tran.generate_square_subsequent_mask(sz=3)
tran.generate_square_subsequent_mask(sz=3, device=None, dtype=None)
# tensor([[0., -inf, -inf],
# [0., 0., -inf],
# [0., 0., 0.]])
tran1.generate_square_subsequent_mask(sz=5)
# tensor([[0., -inf, -inf, -inf, -inf],
# [0., 0., -inf, -inf, -inf],
# [0., 0., 0., -inf, -inf],
# [0., 0., 0., 0., -inf],
# [0., 0., 0., 0., 0.]])
tran1.generate_square_subsequent_mask(sz=5, dtype=torch.complex64)
# tensor([[0.+0.j, -inf+0.j, -inf+0.j, -inf+0.j, -inf+0.j],
# [0.+0.j, 0.+0.j, -inf+0.j, -inf+0.j, -inf+0.j],
# [0.+0.j, 0.+0.j, 0.+0.j, -inf+0.j, -inf+0.j],
# [0.+0.j, 0.+0.j, 0.+0.j, 0.+0.j, -inf+0.j],
# [0.+0.j, 0.+0.j, 0.+0.j, 0.+0.j, 0.+0.j]])
tran1.generate_square_subsequent_mask(sz=5, dtype=torch.bool)
# tensor([[False, True, True, True, True],
# [False, False, True, True, True],
# [False, False, False, True, True],
# [False, False, False, False, True],
# [False, False, False, False, False]])
Source link
lol