Transformer

Transformer 是一种基于自注意力机制的深度学习架构，是现代大语言模型的基础。

架构原理

整体架构

输入
  ↓
Embedding + Positional Encoding
  ↓
┌─────────────────────────────────────┐
│         Encoder (N层)              │
│  ┌───────────────────────────────┐  │
│  │ Multi-Head Attention          │  │
│  └───────────────────────────────┘  │
│  ┌───────────────────────────────┐  │
│  │ Feed Forward Network          │  │
│  └───────────────────────────────┘  │
└─────────────────────────────────────┘
  ↓
┌─────────────────────────────────────┐
│         Decoder (N层)              │
│  ┌───────────────────────────────┐  │
│  │ Masked Multi-Head Attention   │  │
│  └───────────────────────────────┘  │
│  ┌───────────────────────────────┐  │
│  │ Encoder-Decoder Attention     │  │
│  └───────────────────────────────┘  │
│  ┌───────────────────────────────┐  │
│  │ Feed Forward Network          │  │
│  └───────────────────────────────┘  │
└─────────────────────────────────────┘
  ↓
Linear + Softmax
  ↓
输出

注意力机制

Scaled Dot-Product Attention

python

import torch
import torch.nn.functional as F

def scaled_dot_product_attention(Q, K, V, mask=None):
    d_k = Q.size(-1)
    scores = torch.matmul(Q, K.transpose(-2, -1)) / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))
    
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    
    attention_weights = F.softmax(scores, dim=-1)
    output = torch.matmul(attention_weights, V)
    
    return output, attention_weights

Multi-Head Attention

python

class MultiHeadAttention(torch.nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        
        self.W_q = torch.nn.Linear(d_model, d_model)
        self.W_k = torch.nn.Linear(d_model, d_model)
        self.W_v = torch.nn.Linear(d_model, d_model)
        self.W_o = torch.nn.Linear(d_model, d_model)
    
    def forward(self, Q, K, V, mask=None):
        batch_size = Q.size(0)
        
        # 线性变换
        Q = self.W_q(Q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        
        # 注意力计算
        output, _ = scaled_dot_product_attention(Q, K, V, mask)
        
        # 拼接多头输出
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        output = self.W_o(output)
        
        return output

Encoder-Decoder

Encoder 层

python

class EncoderLayer(torch.nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = torch.nn.Sequential(
            torch.nn.Linear(d_model, d_ff),
            torch.nn.ReLU(),
            torch.nn.Linear(d_ff, d_model)
        )
        self.norm1 = torch.nn.LayerNorm(d_model)
        self.norm2 = torch.nn.LayerNorm(d_model)
        self.dropout = torch.nn.Dropout(dropout)
    
    def forward(self, x, mask=None):
        # 自注意力
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        
        # 前馈网络
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        
        return x

Decoder 层

python

class DecoderLayer(torch.nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.masked_self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = torch.nn.Sequential(
            torch.nn.Linear(d_model, d_ff),
            torch.nn.ReLU(),
            torch.nn.Linear(d_ff, d_model)
        )
        self.norm1 = torch.nn.LayerNorm(d_model)
        self.norm2 = torch.nn.LayerNorm(d_model)
        self.norm3 = torch.nn.LayerNorm(d_model)
        self.dropout = torch.nn.Dropout(dropout)
    
    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        # 掩码自注意力
        attn_output = self.masked_self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        
        # 编码器-解码器注意力
        cross_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(cross_output))
        
        # 前馈网络
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        
        return x

Transformer ​

架构原理 ​

整体架构 ​

注意力机制 ​

Scaled Dot-Product Attention ​

Multi-Head Attention ​

Encoder-Decoder ​

Encoder 层 ​

Decoder 层 ​

Transformer

架构原理

整体架构

注意力机制

Scaled Dot-Product Attention

Multi-Head Attention

Encoder-Decoder

Encoder 层

Decoder 层