ChatGPT is built on top of the GPT-3 family of base models, so what does a GPT model actually look like? nanoGPT is an open-source implementation that shows exactly that.
To reproduce GPT-2 (124M) you'll want at least an 8x A100 40GB node.
The training dataset, OpenWebText, is about 41.7 GB.
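Under the hood, prepare.py tokenizes OpenWebText with the GPT-2 BPE and dumps the token ids into flat train.bin / val.bin files, which train.py then memory-maps and samples random windows from. A minimal sketch of that batch-loading idea (loosely modeled on nanoGPT's get_batch; the exact signature here is adapted for illustration, not the verbatim repo code):

```python
import numpy as np
import torch

def get_batch(bin_path, batch_size, block_size, device='cpu'):
    # the prepared split is a flat array of uint16 token ids on disk
    data = np.memmap(bin_path, dtype=np.uint16, mode='r')
    # pick random starting offsets, then cut out (block_size)-long windows
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([torch.from_numpy(data[i:i+block_size].astype(np.int64)) for i in ix])
    # targets are the same windows shifted by one token (next-token prediction)
    y = torch.stack([torch.from_numpy(data[i+1:i+1+block_size].astype(np.int64)) for i in ix])
    return x.to(device), y.to(device)
```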
1.model
```python
class Block(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x
```
The core of GPT is the Block, instantiated from config. Each Block applies layerNorm1 -> attention -> layerNorm2 -> MLP, with a residual connection around both the attention and the MLP sub-layers.
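The two sub-layers referenced here are defined in CausalSelfAttention and MLP. As a reference point, the MLP is just a two-layer feed-forward network with a GELU and dropout; a minimal sketch consistent with the config fields used above (treat it as an illustration of the structure rather than the verbatim nanoGPT code):

```python
import torch.nn as nn

class MLP(nn.Module):
    # feed-forward sub-layer used inside each Block;
    # the 4x hidden expansion follows the original GPT-2 design
    def __init__(self, config):
        super().__init__()
        self.c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
        self.gelu    = nn.GELU()
        self.c_proj  = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        return self.dropout(self.c_proj(self.gelu(self.c_fc(x))))
```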
```python
class GPT(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = LayerNorm(config.n_embd, bias=config.bias),
        ))
```
The forward pass computes the logits and, when targets are provided, the cross-entropy loss:
```python
if targets is not None:
    # if we are given some desired targets also calculate the loss
    logits = self.lm_head(x)
    loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
else:
    # inference-time mini-optimization: only forward the lm_head on the very last position
    logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
    loss = None

return logits, loss
```
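A hedged usage sketch of those two paths, instantiating a tiny model and checking the returned shapes (the GPTConfig field names follow the config attributes used above; the tiny sizes are arbitrary and this assumes nanoGPT's model.py is importable):

```python
import torch
from model import GPT, GPTConfig   # assumes nanoGPT's model.py is on the path

config = GPTConfig(n_layer=2, n_head=2, n_embd=64, block_size=32,
                   vocab_size=50304, dropout=0.0, bias=True)
model = GPT(config)

idx = torch.randint(0, config.vocab_size, (2, 16))       # (batch, time) token ids
targets = torch.randint(0, config.vocab_size, (2, 16))

logits, loss = model(idx, targets)                        # training path
print(logits.shape, loss.item())                          # (2, 16, 50304) and a scalar loss

logits, loss = model(idx)                                 # inference path: last position only
print(logits.shape, loss)                                 # (2, 1, 50304) and None
```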
And the generation part:
```python
@torch.no_grad()
def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
    """
    Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
    the sequence max_new_tokens times, feeding the predictions back into the model each time.
    Most likely you'll want to make sure to be in model.eval() mode of operation for this.
    """
    for _ in range(max_new_tokens):
        # if the sequence context is growing too long we must crop it at block_size
        idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
        # forward the model to get the logits for the index in the sequence
        logits, _ = self(idx_cond)
        # pluck the logits at the final step and scale by desired temperature
        logits = logits[:, -1, :] / temperature
        # optionally crop the logits to only the top k options
        if top_k is not None:
            v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
            logits[logits < v[:, [-1]]] = -float('Inf')
        # apply softmax to convert logits to (normalized) probabilities
        probs = F.softmax(logits, dim=-1)
        # sample from the distribution
        idx_next = torch.multinomial(probs, num_samples=1)
        # append sampled index to the running sequence and continue
        idx = torch.cat((idx, idx_next), dim=1)

    return idx
```
- `max_new_tokens`: how many new tokens to generate
- `temperature`: scales the logits before sampling (lower is more deterministic)
- `top_k`: keep only the k most likely tokens when sampling
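As a usage sketch, reusing the `model` from the earlier example and the GPT-2 BPE via tiktoken (similar in spirit to nanoGPT's sample.py; the prompt and sampling settings here are arbitrary):

```python
import tiktoken
import torch

enc = tiktoken.get_encoding("gpt2")                 # GPT-2 BPE tokenizer
model.eval()                                        # generation should run in eval mode

prompt = "The meaning of life is"
idx = torch.tensor(enc.encode(prompt), dtype=torch.long)[None, ...]  # shape (1, t)

out = model.generate(idx, max_new_tokens=50, temperature=0.8, top_k=200)
print(enc.decode(out[0].tolist()))                  # decoded completion
```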
2.train
The train script implements, among other things:
- DDP: multi-GPU distributed data parallel training
- resume: resuming training from a checkpoint
1. ddp
```python
# various inits, derived attributes, I/O setup
ddp = int(os.environ.get('RANK', -1)) != -1 # is this a ddp run?
if ddp:
    init_process_group(backend=backend)
    ddp_rank = int(os.environ['RANK'])
    ddp_local_rank = int(os.environ['LOCAL_RANK'])
    ddp_world_size = int(os.environ['WORLD_SIZE'])
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)
    master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
    seed_offset = ddp_rank # each process gets a different seed
    assert gradient_accumulation_steps % torch.cuda.device_count() == 0
    gradient_accumulation_steps //= torch.cuda.device_count()
else:
    # if not ddp, we are running on a single gpu, and one process
    master_process = True
    seed_offset = 0
    ddp_world_size = 1

tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * block_size
print(f"tokens per iteration will be: {tokens_per_iter:,}")
```
2. resume
```python
if init_from == 'scratch':
    # init a new model from scratch
    print("Initializing a new model from scratch")
    # determine the vocab size we'll use for from-scratch training
    if meta_vocab_size is None:
        print("defaulting to vocab_size of GPT-2 to 50304 (50257 rounded up for efficiency)")
    model_args['vocab_size'] = meta_vocab_size if meta_vocab_size is not None else 50304
    gptconf = GPTConfig(**model_args)
    model = GPT(gptconf)
elif init_from == 'resume':
    print(f"Resuming training from {out_dir}")
    # resume training from a checkpoint.
    ckpt_path = os.path.join(out_dir, 'ckpt.pt')
    checkpoint = torch.load(ckpt_path, map_location=device)
    checkpoint_model_args = checkpoint['model_args']
    # force these config attributes to be equal otherwise we can't even resume training
    # the rest of the attributes (e.g. dropout) can stay as desired from command line
    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
        model_args[k] = checkpoint_model_args[k]
    # create the model
    gptconf = GPTConfig(**model_args)
    model = GPT(gptconf)
    state_dict = checkpoint['model']
    # fix the keys of the state dictionary :(
    # honestly no idea how checkpoints sometimes get this prefix, have to debug more
    unwanted_prefix = '_orig_mod.'
    for k,v in list(state_dict.items()):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
    iter_num = checkpoint['iter_num']
    best_val_loss = checkpoint['best_val_loss']
elif init_from.startswith('gpt2'):
    print(f"Initializing from OpenAI GPT-2 weights: {init_from}")
    # initialize from OpenAI GPT-2 weights
    override_args = dict(dropout=dropout)
    model = GPT.from_pretrained(init_from, override_args)
    # read off the created config params, so we can store them into checkpoint correctly
    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
        model_args[k] = getattr(model.config, k)
```
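For context, the ckpt.pt that the resume branch loads is written by the evaluation/checkpointing step elsewhere in train.py. A hedged sketch of what that save looks like, with keys mirroring what resume reads above (an illustration, not necessarily the verbatim code; raw_model, optimizer, config, out_dir are variables from train.py's scope):

```python
# sketch: the checkpoint keys mirror exactly what the 'resume' branch reads back
checkpoint = {
    'model': raw_model.state_dict(),        # model weights (raw_model = unwrapped DDP model)
    'optimizer': optimizer.state_dict(),    # optimizer state so training continues smoothly
    'model_args': model_args,               # architecture args that resume forces to match
    'iter_num': iter_num,                   # where to continue counting iterations
    'best_val_loss': best_val_loss,         # best validation loss seen so far
    'config': config,                       # full run config for bookkeeping
}
torch.save(checkpoint, os.path.join(out_dir, 'ckpt.pt'))
```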