Transformer
Transformer 是一种基于自注意力机制的深度学习架构,是现代大语言模型的基础。
架构原理
整体架构
输入
↓
Embedding + Positional Encoding
↓
┌─────────────────────────────────────┐
│ Encoder (N层) │
│ ┌───────────────────────────────┐ │
│ │ Multi-Head Attention │ │
│ └───────────────────────────────┘ │
│ ┌───────────────────────────────┐ │
│ │ Feed Forward Network │ │
│ └───────────────────────────────┘ │
└─────────────────────────────────────┘
↓
┌─────────────────────────────────────┐
│ Decoder (N层) │
│ ┌───────────────────────────────┐ │
│ │ Masked Multi-Head Attention │ │
│ └───────────────────────────────┘ │
│ ┌───────────────────────────────┐ │
│ │ Encoder-Decoder Attention │ │
│ └───────────────────────────────┘ │
│ ┌───────────────────────────────┐ │
│ │ Feed Forward Network │ │
│ └───────────────────────────────┘ │
└─────────────────────────────────────┘
↓
Linear + Softmax
↓
输出注意力机制
Scaled Dot-Product Attention
python
import torch
import torch.nn.functional as F
def scaled_dot_product_attention(Q, K, V, mask=None):
d_k = Q.size(-1)
scores = torch.matmul(Q, K.transpose(-2, -1)) / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))
if mask is not None:
scores = scores.masked_fill(mask == 0, -1e9)
attention_weights = F.softmax(scores, dim=-1)
output = torch.matmul(attention_weights, V)
return output, attention_weightsMulti-Head Attention
python
class MultiHeadAttention(torch.nn.Module):
def __init__(self, d_model, num_heads):
super().__init__()
self.d_model = d_model
self.num_heads = num_heads
self.d_k = d_model // num_heads
self.W_q = torch.nn.Linear(d_model, d_model)
self.W_k = torch.nn.Linear(d_model, d_model)
self.W_v = torch.nn.Linear(d_model, d_model)
self.W_o = torch.nn.Linear(d_model, d_model)
def forward(self, Q, K, V, mask=None):
batch_size = Q.size(0)
# 线性变换
Q = self.W_q(Q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
K = self.W_k(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
V = self.W_v(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
# 注意力计算
output, _ = scaled_dot_product_attention(Q, K, V, mask)
# 拼接多头输出
output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
output = self.W_o(output)
return outputEncoder-Decoder
Encoder 层
python
class EncoderLayer(torch.nn.Module):
def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
super().__init__()
self.self_attn = MultiHeadAttention(d_model, num_heads)
self.feed_forward = torch.nn.Sequential(
torch.nn.Linear(d_model, d_ff),
torch.nn.ReLU(),
torch.nn.Linear(d_ff, d_model)
)
self.norm1 = torch.nn.LayerNorm(d_model)
self.norm2 = torch.nn.LayerNorm(d_model)
self.dropout = torch.nn.Dropout(dropout)
def forward(self, x, mask=None):
# 自注意力
attn_output = self.self_attn(x, x, x, mask)
x = self.norm1(x + self.dropout(attn_output))
# 前馈网络
ff_output = self.feed_forward(x)
x = self.norm2(x + self.dropout(ff_output))
return xDecoder 层
python
class DecoderLayer(torch.nn.Module):
def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
super().__init__()
self.masked_self_attn = MultiHeadAttention(d_model, num_heads)
self.cross_attn = MultiHeadAttention(d_model, num_heads)
self.feed_forward = torch.nn.Sequential(
torch.nn.Linear(d_model, d_ff),
torch.nn.ReLU(),
torch.nn.Linear(d_ff, d_model)
)
self.norm1 = torch.nn.LayerNorm(d_model)
self.norm2 = torch.nn.LayerNorm(d_model)
self.norm3 = torch.nn.LayerNorm(d_model)
self.dropout = torch.nn.Dropout(dropout)
def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
# 掩码自注意力
attn_output = self.masked_self_attn(x, x, x, tgt_mask)
x = self.norm1(x + self.dropout(attn_output))
# 编码器-解码器注意力
cross_output = self.cross_attn(x, enc_output, enc_output, src_mask)
x = self.norm2(x + self.dropout(cross_output))
# 前馈网络
ff_output = self.feed_forward(x)
x = self.norm3(x + self.dropout(ff_output))
return x