diff --git a/README.md b/README.md new file mode 100644 index 00000000..1648bb39 --- /dev/null +++ b/README.md @@ -0,0 +1,78 @@ + +## llama2.c + +![llama2c](assets/llama_cute.jpg) + +Have you ever wanted to inference a baby [Llama 2](https://ai.meta.com/llama/) model in pure C? No? Well, now you can! + +Code in this repo first lets you train the Llama 2 architecture from scratch in PyTorch, then save the weights to a raw binary file, then load that into one ~simple 500-line C file that inferences the model, simply in fp32 for now. + +Of course, this is not super fast, but it's not too bad either. E.g. on my cloud Linux devbox a dim 288 6-layer 6-head model (~15M params) inferences at ~18 tok/s in fp32, and about the same on my M1 MacBook Air. + +Please note that this is just a weekend project where I took nanoGPT, gutted it to implement the Llama-2 architecture (instead of GPT-2), and then wrote the C inference engine for it in `run.c`. So this is not really meant to be a production-grade library right now. + +Hat tip to [llama.cpp](https://github.com/ggerganov/llama.cpp) for inspiring this project. I wanted something super minimal so I chose to hard-code the llama-2 architecture, stick to fp32, and just roll one inference file of pure C with no dependencies. + +## howto + +It should be possible to load the weights released by Meta but I haven't tried because the inference speed, even of the 7B model, would probably be not great with this baby single-threaded C program. So in this repo we focus on more narrow applications, and train the same architecture but from scratch, in this case on the TinyStories dataset for fun. + +First let's download and pretokenize the TinyStories dataset: + +```bash +python tinystories.py download +python tinystories.py pretokenize +``` + +Then train our model: + +```bash +python train.py +``` + +See the train.py script for more exotic launches and hyperparameter overrides. I didn't tune the hyperparameters, I expect simple hyperparameter exploration should give better models. Totally understand if you want to skip model training, for simple demo just download my pretrained model: + +```bash +wget TODOhoweasiesthmm +``` + +Once we have the model.bin file, we can inference in C. Compile the C code first: + +```bash +gcc -o run run.c -lm +``` + +You can now run it simply as + +```bash +./run +``` + +But note that this only emits the SentencePiece tokens. To decode the tokens into text too, run this script through a simple wrapper: + +```bash +python run_wrap.py +``` + +I hope to delete this script soon though. Anyway, watch the tokens stream by, fun! + +To verify correctness, we can also run the PyTorch inference script: + +```bash +python sample.py +``` + +Which gives the same results. I'd love to find some time to create actual tests, one day maybe. For now I just manually inspected activations and verified that they match, and that the samples are identical at temperature 0. If someone wishes to help me with tests I welcome PRs. + +## unsorted todos + +- why SentencePiece can't iteratively decode properly? +- would love to delete run_wrap.py and just directly use C code to string, help welcome +- todo multiquery support? doesn't seem as useful for smaller models that run on CPU +- todo support inferencing beyond max_seq_len steps, have to think through the kv cache +- why is MFU so low (~20%) on my A100 40GB for training? 
+- weird errors with torch.compile and wandb when using DDP +- make tests to decrease yolo + +## License +MIT diff --git a/assets/llama_cute.jpg b/assets/llama_cute.jpg new file mode 100644 index 00000000..c7fbda5e Binary files /dev/null and b/assets/llama_cute.jpg differ diff --git a/configurator.py b/configurator.py new file mode 100644 index 00000000..a8bba959 --- /dev/null +++ b/configurator.py @@ -0,0 +1,47 @@ +""" +Poor Man's Configurator. Probably a terrible idea. Example usage: +$ python train.py config/override_file.py --batch_size=32 +this will first run config/override_file.py, then override batch_size to 32 + +The code in this file will be run as follows from e.g. train.py: +>>> exec(open('configurator.py').read()) + +So it's not a Python module, it's just shuttling this code away from train.py +The code in this script then overrides the globals() + +I know people are not going to love this, I just really dislike configuration +complexity and having to prepend config. to every single variable. If someone +comes up with a better simple Python solution I am all ears. +""" + +import sys +from ast import literal_eval + +for arg in sys.argv[1:]: + if '=' not in arg: + # assume it's the name of a config file + assert not arg.startswith('--') + config_file = arg + print(f"Overriding config with {config_file}:") + with open(config_file) as f: + print(f.read()) + exec(open(config_file).read()) + else: + # assume it's a --key=value argument + assert arg.startswith('--') + key, val = arg.split('=') + key = key[2:] + if key in globals(): + try: + # attempt to eval it it (e.g. if bool, number, or etc) + attempt = literal_eval(val) + except (SyntaxError, ValueError): + # if that goes wrong, just use the string + attempt = val + # ensure the types match ok + assert type(attempt) == type(globals()[key]) + # cross fingers + print(f"Overriding: {key} = {attempt}") + globals()[key] = attempt + else: + raise ValueError(f"Unknown config key: {key}") diff --git a/model.py b/model.py new file mode 100644 index 00000000..8a310a8c --- /dev/null +++ b/model.py @@ -0,0 +1,360 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 
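+#
+# Llama 2 Transformer in PyTorch: RMSNorm, rotary position embeddings (RoPE),
+# a SwiGLU feed-forward, and weight-tied input/output embeddings. Trained from
+# scratch by train.py and exported as flat fp32 weights (model.bin) via export()
+# for the C inference code in run.c.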
+ +import math +import struct +import inspect +from dataclasses import dataclass +from typing import Any, Optional, Tuple + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + +@dataclass +class ModelArgs: + dim: int = 4096 + n_layers: int = 32 + n_heads: int = 32 + n_kv_heads: Optional[int] = None + vocab_size: int = -1 # defined later by tokenizer + multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 + norm_eps: float = 1e-5 + max_seq_len: int = 2048 + + +class RMSNorm(torch.nn.Module): + def __init__(self, dim: int, eps: float): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def _norm(self, x): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x): + output = self._norm(x.float()).type_as(x) + return output * self.weight + + +def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0): + freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) + t = torch.arange(end, device=freqs.device) # type: ignore + freqs = torch.outer(t, freqs).float() # type: ignore + freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64 + return freqs_cis + + +def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): + ndim = x.ndim + assert 0 <= 1 < ndim + assert freqs_cis.shape == (x.shape[1], x.shape[-1]) + shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] + return freqs_cis.view(*shape) + + +def apply_rotary_emb( + xq: torch.Tensor, + xk: torch.Tensor, + freqs_cis: torch.Tensor, +) -> Tuple[torch.Tensor, torch.Tensor]: + xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) + xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) + freqs_cis = reshape_for_broadcast(freqs_cis, xq_) + xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3) + xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3) + return xq_out.type_as(xq), xk_out.type_as(xk) + +def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor: + """torch.repeat_interleave(x, dim=2, repeats=n_rep)""" + bs, slen, n_kv_heads, head_dim = x.shape + if n_rep == 1: + return x + return ( + x[:, :, :, None, :] + .expand(bs, slen, n_kv_heads, n_rep, head_dim) + .reshape(bs, slen, n_kv_heads * n_rep, head_dim) + ) + +class Attention(nn.Module): + def __init__(self, args: ModelArgs): + super().__init__() + self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads + model_parallel_size = 1 + self.n_local_heads = args.n_heads // model_parallel_size + self.n_local_kv_heads = self.n_kv_heads // model_parallel_size + self.n_rep = self.n_local_heads // self.n_local_kv_heads + self.head_dim = args.dim // args.n_heads + self.wq = nn.Linear(args.dim, args.n_heads * self.head_dim, bias=False) + self.wk = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False) + self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False) + self.wo = nn.Linear(args.n_heads * self.head_dim, args.dim, bias=False) + + # use flash attention or a manual implementation? + self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention') + if not self.flash: + print("WARNING: using slow attention. 
Flash Attention requires PyTorch >= 2.0") + mask = torch.full((1, 1, args.max_seq_len, args.max_seq_len), float("-inf")) + mask = torch.triu(mask, diagonal=1) + self.register_buffer("mask", mask) + + def forward( + self, + x: torch.Tensor, + freqs_cis: torch.Tensor, + ): + bsz, seqlen, _ = x.shape + + # QKV + xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) + xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim) + xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) + xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) + + # RoPE relative positional embeddings + xq, xk = apply_rotary_emb(xq, xk, freqs_cis) + + # grouped multiquery attention: expand out keys and values + xk = repeat_kv(xk, self.n_rep) # (bs, seqlen, n_local_heads, head_dim) + xv = repeat_kv(xv, self.n_rep) # (bs, seqlen, n_local_heads, head_dim) + + # make heads into a batch dimension + xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) + xk = xk.transpose(1, 2) + xv = xv.transpose(1, 2) + + # flash implementation + if self.flash: + output = torch.nn.functional.scaled_dot_product_attention(xq, xk, xv, attn_mask=None, dropout_p=0.0, is_causal=True) + else: + # manual implementation + scores = torch.matmul(xq, xk.transpose(2, 3)) / math.sqrt(self.head_dim) + scores = scores + self.mask[:, :, :seqlen, :seqlen] # (bs, n_local_heads, seqlen, cache_len + seqlen) + scores = F.softmax(scores.float(), dim=-1).type_as(xq) + output = torch.matmul(scores, xv) # (bs, n_local_heads, seqlen, head_dim) + + # restore time as batch dimension and concat heads + output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1) + + # final projection into the residual stream + output = self.wo(output) + return output + + +class FeedForward(nn.Module): + def __init__(self, dim: int, hidden_dim: int, multiple_of: int): + super().__init__() + hidden_dim = int(2 * hidden_dim / 3) + hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + self.w1 = nn.Linear(dim, hidden_dim, bias=False) + self.w2 = nn.Linear(hidden_dim, dim, bias=False) + self.w3 = nn.Linear(dim, hidden_dim, bias=False) + + def forward(self, x): + return self.w2(F.silu(self.w1(x)) * self.w3(x)) + + +class TransformerBlock(nn.Module): + def __init__(self, layer_id: int, args: ModelArgs): + super().__init__() + self.n_heads = args.n_heads + self.dim = args.dim + self.head_dim = args.dim // args.n_heads + self.attention = Attention(args) + self.feed_forward = FeedForward( + dim=args.dim, + hidden_dim=4 * args.dim, + multiple_of=args.multiple_of, + ) + self.layer_id = layer_id + self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps) + self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps) + + def forward(self, x, freqs_cis): + h = x + self.attention.forward(self.attention_norm(x), freqs_cis) + out = h + self.feed_forward.forward(self.ffn_norm(h)) + return out + + +class Transformer(nn.Module): + def __init__(self, params: ModelArgs): + super().__init__() + self.params = params + self.vocab_size = params.vocab_size + self.n_layers = params.n_layers + + self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim) + self.layers = torch.nn.ModuleList() + for layer_id in range(params.n_layers): + self.layers.append(TransformerBlock(layer_id, params)) + self.norm = RMSNorm(params.dim, eps=params.norm_eps) + self.output = nn.Linear(params.dim, params.vocab_size, bias=False) + + # share the unembedding parameters with the embedding parameters + self.tok_embeddings.weight = self.output.weight # 
https://paperswithcode.com/method/weight-tying + + # some useful precompute for the RoPE relative positional embeddings. TODO why * 2 here? confuse + self.freqs_cis = precompute_freqs_cis(self.params.dim // self.params.n_heads, self.params.max_seq_len * 2) + + # init all weights + self.apply(self._init_weights) + # apply special scaled init to the residual projections, per GPT-2 paper + for pn, p in self.named_parameters(): + if pn.endswith('w3.weight') or pn.endswith('wo.weight'): + torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * params.n_layers)) + + def _init_weights(self, module): + if isinstance(module, nn.Linear): + torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) + if module.bias is not None: + torch.nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) + + def forward(self, tokens, targets=None): + _bsz, seqlen = tokens.shape + h = self.tok_embeddings(tokens) + self.freqs_cis = self.freqs_cis.to(h.device) + freqs_cis = self.freqs_cis[:seqlen] + + for layer in self.layers: + h = layer(h, freqs_cis) + h = self.norm(h) + + if targets is not None: + # if we are given some desired targets also calculate the loss + logits = self.output(h) + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1) + else: + # inference-time mini-optimization: only forward the output on the very last position + logits = self.output(h[:, [-1], :]) # note: using list [-1] to preserve the time dim + loss = None + + return logits, loss + + def configure_optimizers(self, weight_decay, learning_rate, betas, device_type): + # start with all of the candidate parameters + param_dict = {pn: p for pn, p in self.named_parameters()} + # filter out those that do not require grad + param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad} + # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no. + # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't. + decay_params = [p for n, p in param_dict.items() if p.dim() >= 2] + nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2] + optim_groups = [ + {'params': decay_params, 'weight_decay': weight_decay}, + {'params': nodecay_params, 'weight_decay': 0.0} + ] + num_decay_params = sum(p.numel() for p in decay_params) + num_nodecay_params = sum(p.numel() for p in nodecay_params) + print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters") + print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters") + # Create AdamW optimizer and use the fused version if it is available + fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters + use_fused = fused_available and device_type == 'cuda' + extra_args = dict(fused=True) if use_fused else dict() + optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args) + print(f"using fused AdamW: {use_fused}") + + return optimizer + + def estimate_mfu(self, fwdbwd_per_iter, dt): + """ estimate model flops utilization (MFU) in units of A100 bfloat16 peak FLOPS """ + # first estimate the number of flops we do per iteration. 
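+        # (roughly: 6*N covers the forward+backward matmuls against the N parameters,
+        #  and 12*L*H*Q*T adds the attention-score and attention-value work per token)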
+ # see PaLM paper Appendix B as ref: https://arxiv.org/abs/2204.02311 + N = sum(p.numel() for p in self.parameters()) + cfg = self.params + L, H, Q, T = cfg.n_layers, cfg.n_heads, cfg.dim//cfg.n_heads, cfg.max_seq_len + flops_per_token = 6*N + 12*L*H*Q*T + flops_per_fwdbwd = flops_per_token * T + flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter + # express our flops throughput as ratio of A100 bfloat16 peak flops + flops_achieved = flops_per_iter * (1.0/dt) # per second + flops_promised = 312e12 # A100 GPU bfloat16 peak flops is 312 TFLOPS + mfu = flops_achieved / flops_promised + return mfu + + @torch.inference_mode() + def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None): + """ + Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete + the sequence max_new_tokens times, feeding the predictions back into the model each time. + Most likely you'll want to make sure to be in model.eval() mode of operation for this. + Also note this is a super inefficient version of sampling with no key/value cache. + """ + for _ in range(max_new_tokens): + # if the sequence context is growing too long we must crop it at block_size + idx_cond = idx if idx.size(1) <= self.params.max_seq_len else idx[:, -self.params.max_seq_len:] + # forward the model to get the logits for the index in the sequence + logits, _ = self(idx_cond) + # pluck the logits at the final step and scale by desired temperature + logits = logits[:, -1, :] / temperature + # optionally crop the logits to only the top k options + if top_k is not None: + v, _ = torch.topk(logits, min(top_k, logits.size(-1))) + logits[logits < v[:, [-1]]] = -float('Inf') + # apply softmax to convert logits to (normalized) probabilities + probs = F.softmax(logits, dim=-1) + if temperature == 0.0: + # sample the most likely index + _, idx_next = torch.topk(probs, k=1, dim=-1) + else: + # sample from the distribution + idx_next = torch.multinomial(probs, num_samples=1) + # append sampled index to the running sequence and continue + idx = torch.cat((idx, idx_next), dim=1) + + return idx + + def export(self, filepath='model.bin'): + """export the model weights in fp32 into .bin file to be read from C""" + f = open(filepath, 'wb') + + def serialize(t): + d = t.detach().cpu().view(-1).numpy().astype(np.float32) + b = struct.pack(f'{len(d)}f', *d) + f.write(b) + + # first write out the header + hidden_dim = self.layers[0].feed_forward.w1.weight.shape[0] + p = self.params + n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads + header = struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads, + n_kv_heads, p.vocab_size, p.max_seq_len) + f.write(header) + + # next write out the embedding weights + serialize(self.tok_embeddings.weight) + + # now all the layers + # attention weights + for layer in self.layers: + serialize(layer.attention_norm.weight) + for layer in self.layers: + serialize(layer.attention.wq.weight) + for layer in self.layers: + serialize(layer.attention.wk.weight) + for layer in self.layers: + serialize(layer.attention.wv.weight) + for layer in self.layers: + serialize(layer.attention.wo.weight) + # ffn weights + for layer in self.layers: + serialize(layer.ffn_norm.weight) + for layer in self.layers: + serialize(layer.feed_forward.w1.weight) + for layer in self.layers: + serialize(layer.feed_forward.w2.weight) + for layer in self.layers: + serialize(layer.feed_forward.w3.weight) + # final rmsnorm + serialize(self.norm.weight) + # note: no need to write final classifier weights due to 
weight sharing + # freqs_cis + serialize(self.freqs_cis.real[:p.max_seq_len]) + serialize(self.freqs_cis.imag[:p.max_seq_len]) + + # write to binary file + f.close() + print(f"wrote {filepath}") diff --git a/run.c b/run.c new file mode 100644 index 00000000..a0d1ea84 --- /dev/null +++ b/run.c @@ -0,0 +1,492 @@ +/* +Inference for Llama-2 Transformer model in pure C. + +Compile simply with: +$ gcc -o run run.c +Or if that doesn't work then: +$ gcc -o run run.c -lm + +Then run with: +$ ./run +*/ + +#include +#include +#include +#include + +// ---------------------------------------------------------------------------- +// Transformer and RunState structs, and related memory management + +typedef struct { + int dim; // transformer dimension + int hidden_dim; // for ffn layers + int n_layers; // number of layers + int n_heads; // number of query heads + int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery) + int vocab_size; // vocabulary size, usually 256 (byte-level) + int seq_len; // max sequence length +} Config; + +typedef struct { + // token embedding table + float* token_embedding_table; // (vocab_size, dim) + // weights for rmsnorms + float* rms_att_weight; // (layer, dim) rmsnorm weights + float* rms_ffn_weight; // (layer, dim) + // weights for matmuls + float* wq; // (layer, dim, dim) + float* wk; // (layer, dim, dim) + float* wv; // (layer, dim, dim) + float* wo; // (layer, dim, dim) + // weights for ffn + float* w1; // (layer, hidden_dim, dim) + float* w2; // (layer, dim, hidden_dim) + float* w3; // (layer, hidden_dim, dim) + // final rmsnorm + float* rms_final_weight; // (dim,) + // freq_cis for RoPE relatively positional embeddings + float* freq_cis_real; // (seq_len, dim/2) + float* freq_cis_imag; // (seq_len, dim/2) +} TransformerWeights; + +typedef struct { + // current wave of activations + float *x; // activation at current time stamp (dim,) + float *xb; // same, but inside a residual branch (dim,) + float *xb2; // an additional buffer just for convenience (dim,) + float *hb; // buffer for hidden dimension in the ffn (hidden_dim,) + float *hb2; // buffer for hidden dimension in the ffn (hidden_dim,) + float *q; // query (dim,) + float *k; // key (dim,) + float *v; // value (dim,) + float *att; // buffer for scores/attention values (seq_len,) + float *logits; // output logits + // kv cache + float* key_cache; // (layer, seq_len, dim) + float* value_cache; // (layer, seq_len, dim) +} RunState; + +void malloc_run_state(RunState* s, Config* p) { + // we calloc instead of malloc to keep valgrind happy + s->x = calloc(p->dim, sizeof(float)); + s->xb = calloc(p->dim, sizeof(float)); + s->xb2 = calloc(p->dim, sizeof(float)); + s->hb = calloc(p->hidden_dim, sizeof(float)); + s->hb2 = calloc(p->hidden_dim, sizeof(float)); + s->q = calloc(p->dim, sizeof(float)); + s->k = calloc(p->dim, sizeof(float)); + s->v = calloc(p->dim, sizeof(float)); + s->att = calloc(p->seq_len, sizeof(float)); + s->logits = calloc(p->vocab_size, sizeof(float)); + s->key_cache = calloc(p->n_layers * p->seq_len * p->dim, sizeof(float)); + s->value_cache = calloc(p->n_layers * p->seq_len * p->dim, sizeof(float)); + // ensure all mallocs went fine + if (!s->x || !s->xb || !s->xb2 || !s->hb || !s->hb2 || !s->q + || !s->k || !s->v || !s->att || !s->logits || !s->key_cache + || !s->value_cache) { + printf("malloc failed!\n"); + exit(1); + } +} + +void free_run_state(RunState* s, Config* p) { + free(s->x); + free(s->xb); + free(s->xb2); + free(s->hb); + free(s->hb2); + free(s->q); + 
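+    // note: Config* p is unused here; it is kept so the signature mirrors malloc_run_state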
free(s->k); + free(s->v); + free(s->att); + free(s->logits); + free(s->key_cache); + free(s->value_cache); +} + +void malloc_weights(TransformerWeights* w, Config* p) { + // we calloc instead of malloc to keep valgrind happy + w->token_embedding_table = calloc(p->vocab_size * p->dim, sizeof(float)); + w->rms_att_weight = calloc(p->n_layers * p->dim, sizeof(float)); + w->rms_ffn_weight = calloc(p->n_layers * p->dim, sizeof(float)); + w->wq = calloc(p->n_layers * p->dim * p->dim, sizeof(float)); + w->wk = calloc(p->n_layers * p->dim * p->dim, sizeof(float)); + w->wv = calloc(p->n_layers * p->dim * p->dim, sizeof(float)); + w->wo = calloc(p->n_layers * p->dim * p->dim, sizeof(float)); + w->w1 = calloc(p->n_layers * p->hidden_dim * p->dim, sizeof(float)); + w->w2 = calloc(p->n_layers * p->dim * p->hidden_dim, sizeof(float)); + w->w3 = calloc(p->n_layers * p->hidden_dim * p->dim, sizeof(float)); + w->rms_final_weight = calloc(p->dim, sizeof(float)); + w->freq_cis_real = calloc(p->seq_len * p->dim / 2, sizeof(float)); + w->freq_cis_imag = calloc(p->seq_len * p->dim / 2, sizeof(float)); + // ensure all mallocs went fine + if (!w->token_embedding_table || !w->rms_att_weight || !w->rms_ffn_weight + || !w->wq || !w->wk || !w->wv || !w->wo || !w->w1 || !w->w2 || !w->w3 || + !w->rms_final_weight || !w->freq_cis_real || !w->freq_cis_imag) { + printf("malloc failed!\n"); + exit(1); + } +} + +void free_weights(TransformerWeights* w, Config* p) { + free(w->token_embedding_table); + free(w->rms_att_weight); + free(w->rms_ffn_weight); + free(w->wq); + free(w->wk); + free(w->wv); + free(w->wo); + free(w->w1); + free(w->w2); + free(w->w3); + free(w->rms_final_weight); + free(w->freq_cis_real); + free(w->freq_cis_imag); +} + +// ---------------------------------------------------------------------------- +// initialization: random init, or read from checkpoint + +// initializes weights to random numbers from -.5 to .5 +void init_rand(float* w, int size) { + for (int i = 0; i < size; i++) { + w[i] = ((float)rand()/(float)(RAND_MAX)) - 0.5f; + } +} + +// constant init +void init_const(float* w, int size, float val) { + for (int i = 0; i < size; i++) { + w[i] = val; + } +} + +void random_init_weights(TransformerWeights* w, Config* p) { + init_rand(w->token_embedding_table, p->vocab_size * p->dim); + init_const(w->rms_att_weight, p->n_layers * p->dim, 1.0f); + init_const(w->rms_ffn_weight, p->n_layers * p->dim, 1.0f); + init_rand(w->wq, p->n_layers * p->dim * p->dim); + init_rand(w->wk, p->n_layers * p->dim * p->dim); + init_rand(w->wv, p->n_layers * p->dim * p->dim); + init_rand(w->wo, p->n_layers * p->dim * p->dim); + init_rand(w->w1, p->n_layers * p->dim * p->hidden_dim); + init_rand(w->w2, p->n_layers * p->hidden_dim * p->dim); + init_rand(w->w3, p->n_layers * p->dim * p->hidden_dim); + init_const(w->rms_final_weight, p->dim, 1.0f); + init_rand(w->freq_cis_real, p->seq_len * p->dim / 2); + init_rand(w->freq_cis_imag, p->seq_len * p->dim / 2); +} + +void checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) { + fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f); + fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f); + fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f); + fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f); + fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f); + fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f); + fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f); + fread(w->w1, 
sizeof(float), p->n_layers * p->dim * p->hidden_dim, f); + fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f); + fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f); + fread(w->rms_final_weight, sizeof(float), p->dim, f); + int head_size = p->dim / p->n_heads; + fread(w->freq_cis_real, sizeof(float), p->seq_len * head_size / 2, f); + fread(w->freq_cis_imag, sizeof(float), p->seq_len * head_size / 2, f); +} + +// ---------------------------------------------------------------------------- +// neural net blocks + +void copy(float *a, float *b, int size) { + for (int i = 0; i < size; i++) { + a[i] = b[i]; + } +} + +void accum(float *a, float *b, int size) { + for (int i = 0; i < size; i++) { + a[i] += b[i]; + } +} + +void rmsnorm(float* o, float* x, float* weight, int size) { + // calculate sum of squares + float ss = 0.0f; + for (int j = 0; j < size; j++) { + ss += x[j] * x[j]; + } + ss /= size; + ss += 1e-5f; + ss = 1.0f / sqrt(ss); + // normalize and scale + for (int j = 0; j < size; j++) { + o[j] = weight[j] * (ss * x[j]); + } +} + +void softmax(float* x, int size) { + if(size == 1) { + x[0] = 1.0f; + return; + } + // find max value (for numerical stability) + float max_val = x[0]; + for (int i = 1; i < size; i++) { + if (x[i] > max_val) { + max_val = x[i]; + } + } + // e^x + for (int i = 0; i < size; i++) { + x[i] = exp(x[i] - max_val); + } + // normalize + float sum = 0.0f; + for (int i = 0; i < size; i++) { + sum += x[i]; + } + for (int i = 0; i < size; i++) { + x[i] /= sum; + } +} + +void matmul(float* xout, float* x, float* w, int n, int d) { + // W (d,n) @ x (n,) -> xout (d,) + for (int i = 0; i < d; i++) { + float val = 0.0f; + for (int j = 0; j < n; j++) { + val += w[i * n + j] * x[j]; + } + xout[i] = val; + } +} + +void transformer(int token, int pos, Config* p, RunState* s, TransformerWeights* w) { + + // a few convenice variables + float *x = s->x; + int dim = p->dim; + int hidden_dim = p->hidden_dim; + int head_size = dim / p->n_heads; + + // copy the token embedding into x + float* content_row = &(w->token_embedding_table[token * dim]); + copy(x, content_row, dim); + + // pluck out the "pos" row of freq_cis_real and freq_cis_imag + float* freq_cis_real_row = w->freq_cis_real + pos * head_size / 2; + float* freq_cis_imag_row = w->freq_cis_imag + pos * head_size / 2; + + // forward all the layers + for(int l = 0; l < p->n_layers; l++) { + + // attention rmsnorm + rmsnorm(s->xb, x, w->rms_att_weight + l*dim, dim); + + // qkv matmuls for this position + matmul(s->q, s->xb, w->wq + l*dim*dim, dim, dim); + matmul(s->k, s->xb, w->wk + l*dim*dim, dim, dim); + matmul(s->v, s->xb, w->wv + l*dim*dim, dim, dim); + + // apply RoPE rotation to the q and k vectors for each head + for (int h = 0; h < p->n_heads; h++) { + // get the q and k vectors for this head + float* q = s->q + h * head_size; + float* k = s->k + h * head_size; + // rotate q and k by the freq_cis_real and freq_cis_imag + for (int i = 0; i < head_size; i+=2) { + float q0 = q[i]; + float q1 = q[i+1]; + float k0 = k[i]; + float k1 = k[i+1]; + float fcr = freq_cis_real_row[i/2]; + float fci = freq_cis_imag_row[i/2]; + q[i] = q0 * fcr - q1 * fci; + q[i+1] = q0 * fci + q1 * fcr; + k[i] = k0 * fcr - k1 * fci; + k[i+1] = k0 * fci + k1 * fcr; + } + } + + // save key,value at this time step (pos) to our kv cache + int loff = l * p->seq_len * dim; // kv cache layer offset for convenience + float* key_cache_row = s->key_cache + loff + pos * dim; + float* value_cache_row = s->value_cache + loff + pos 
* dim; + copy(key_cache_row, s->k, dim); + copy(value_cache_row, s->v, dim); + + // multihead attention. iterate over all heads + for (int h = 0; h < p->n_heads; h++) { + // get the query vector for this head + float* q = s->q + h * head_size; + // iterate over all timesteps, including the current one + for (int t = 0; t <= pos; t++) { + // get the key vector for this head and at this timestep + float* k = s->key_cache + loff + t * dim + h * head_size; + // calculate the attention score as the dot product of q and k + float score = 0.0f; + for (int i = 0; i < head_size; i++) { + score += q[i] * k[i]; + } + score /= sqrtf(head_size); + // save the score to the attention buffer + s->att[t] = score; + } + + // softmax the scores to get attention weights, from 0..pos inclusively + softmax(s->att, pos + 1); + + // weighted sum of the values, store back into xb + for (int i = 0; i < head_size; i++) { + float val = 0.0f; + for (int t = 0; t <= pos; t++) { + val += s->att[t] * s->value_cache[loff + t * dim + h * head_size + i]; // note bad locality + } + s->xb[h * head_size + i] = val; + } + } + + // final matmul to get the output of the attention + matmul(s->xb2, s->xb, w->wo + l*dim*dim, dim, dim); + + // residual connection back into x + accum(x, s->xb2, dim); + + // ffn rmsnorm + rmsnorm(s->xb, x, w->rms_ffn_weight + l*dim, dim); + + // Now for FFN in PyTorch we have: self.w2(F.silu(self.w1(x)) * self.w3(x)) + // first calculate self.w1(x) and self.w3(x) + matmul(s->hb, s->xb, w->w1 + l*dim*hidden_dim, dim, hidden_dim); + matmul(s->hb2, s->xb, w->w3 + l*dim*hidden_dim, dim, hidden_dim); + + // F.silu; silu(x)=x*σ(x),where σ(x) is the logistic sigmoid + for (int i = 0; i < hidden_dim; i++) { + s->hb[i] = s->hb[i] * (1.0f / (1.0f + expf(-s->hb[i]))); + } + + // elementwise multiply with w3(x) + for (int i = 0; i < hidden_dim; i++) { + s->hb[i] = s->hb[i] * s->hb2[i]; + } + + // final matmul to get the output of the ffn + matmul(s->xb, s->hb, w->w2 + l*dim*hidden_dim, hidden_dim, dim); + + // residual connection + accum(x, s->xb, dim); + } + + // final rmsnorm + rmsnorm(x, x, w->rms_final_weight, dim); + + // classifier into logits + matmul(s->logits, x, w->token_embedding_table, p->dim, p->vocab_size); +} + +int sample(float* probabilities, int n) { + // sample index from probabilities, they must sum to 1 + float r = (float)rand() / (float)RAND_MAX; + float cdf = 0.0f; + for (int i = 0; i < n; i++) { + cdf += probabilities[i]; + if (r < cdf) { + return i; + } + } + return n - 1; // in case of rounding errors +} + +int argmax(float* v, int n) { + // return argmax of v in elements 0..n + int max_i = 0; + float max_p = v[0]; + for (int i = 1; i < n; i++) { + if (v[i] > max_p) { + max_i = i; + max_p = v[i]; + } + } + return max_i; +} + +// ---------------------------------------------------------------------------- + +int main(int argc, char *argv[]) { + setbuf(stdout, NULL); // disable stdout buffering + + // poor man's C argparse + char *checkpoint = NULL; + float temperature = 0.9f; + // 'checkpoint' is necessary arg + if (argc < 2) { + printf("Usage: %s [temperature] [seed]\n", argv[0]); + return 1; + } + checkpoint = argv[1]; + // temperature is optional + if (argc >= 3) { + temperature = atof(argv[2]); + } + // seed is optional + if (argc >= 4) { + unsigned int seed = atoi(argv[3]); + srand(seed); + } else { + time_t current_time; + time(¤t_time); + srand((unsigned int)current_time); + } + + // read in the config header + Config config; + FILE *file = fopen(checkpoint, "rb"); + if (!file) { + 
printf("Unable to open file!"); + return 1; + } + fread(&config, sizeof(Config), 1, file); + + // create and init the Transformer + TransformerWeights weights; + malloc_weights(&weights, &config); + checkpoint_init_weights(&weights, &config, file); + fclose(file); + + // create and init the application RunState + RunState state; + malloc_run_state(&state, &config); + + // the current position we are in + int next; + int token = 1; // 1 = BOS token in Llama-2 sentencepiece + int pos = 0; + while (pos < config.seq_len) { + + // forward the transformer to get logits for the next token + transformer(token, pos, &config, &state, &weights); + + // sample the next token + if(temperature == 0.0f) { + // greedy argmax sampling + next = argmax(state.logits, config.vocab_size); + } else { + // apply the temperature to the logits + for (int q=0; q" or etc. Can also specify a file, use as: "FILE:prompt.txt" +num_samples = 1 # number of samples to draw +max_new_tokens = 100 # number of tokens generated in each sample +temperature = 1.0 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions +top_k = 300 # retain only the top_k most likely tokens, clamp others to have 0 probability +seed = 1337 +device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc. +#dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16' +dtype = "float32" +compile = False # use PyTorch 2.0 to compile the model to be faster +exec(open('configurator.py').read()) # overrides from command line or config file +# ----------------------------------------------------------------------------- + +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul +torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn +device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast +ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype] +ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype) + +# init from a model saved in a specific directory +ckpt_path = os.path.join(out_dir, 'ckpt.pt') +checkpoint = torch.load(ckpt_path, map_location=device) +gptconf = ModelArgs(**checkpoint['model_args']) +model = Transformer(gptconf) +state_dict = checkpoint['model'] +unwanted_prefix = '_orig_mod.' 
+for k,v in list(state_dict.items()): + if k.startswith(unwanted_prefix): + state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k) +model.load_state_dict(state_dict, strict=False) + +model.export() # model.bin + +model.eval() +model.to(device) +if compile: + print("Compiling the model...") + model = torch.compile(model) # requires PyTorch 2.0 (optional) + +# load the tokenizer +enc = Tokenizer() + +# encode the beginning of the prompt +if start.startswith('FILE:'): + with open(start[5:], 'r', encoding='utf-8') as f: + start = f.read() +start_ids = enc.encode(start, bos=True, eos=False) +x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...]) + +# run generation +with torch.no_grad(): + with ctx: + for k in range(num_samples): + y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k) + print(enc.decode(y[0].tolist())) + print('---------------') diff --git a/tinystories.py b/tinystories.py new file mode 100644 index 00000000..9a69cb87 --- /dev/null +++ b/tinystories.py @@ -0,0 +1,166 @@ +""" +Download, preprocess and serve the TinyStories dataset as a DataLoader. +""" + +import argparse +import glob +import json +import os +import random +from typing import List +from concurrent.futures import ThreadPoolExecutor, as_completed + +import numpy as np +import requests +import torch +import torch.distributed as dist +from tqdm import tqdm + +from tokenizer import Tokenizer + +DATA_CACHE_DIR = "data" + +def download_file(url: str, fname: str, chunk_size=1024): + """Helper function to download a file from a given url""" + resp = requests.get(url, stream=True) + total = int(resp.headers.get("content-length", 0)) + with open(fname, "wb") as file, tqdm( + desc=fname, + total=total, + unit="iB", + unit_scale=True, + unit_divisor=1024, + ) as bar: + for data in resp.iter_content(chunk_size=chunk_size): + size = file.write(data) + bar.update(size) + + +def download(): + """Downloads the dataset to disk.""" + os.makedirs(DATA_CACHE_DIR, exist_ok=True) + + # download the TinyStories dataset, unless it's already downloaded + data_url = "https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStories_all_data.tar.gz" + data_filename = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data.tar.gz") + if not os.path.exists(data_filename): + print(f"Downloading {data_url} to {data_filename}...") + download_file(data_url, data_filename) + else: + print(f"{data_filename} already exists, skipping download...") + + # unpack the tar.gz file into all the data shards (json files) + data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data") + if not os.path.exists(data_dir): + os.makedirs(data_dir, exist_ok=True) + print(f"Unpacking {data_filename}...") + os.system(f"tar -xzf {data_filename} -C {data_dir}") + else: + print(f"{data_dir} already exists, skipping unpacking...") + + # print a single example just for debugging and such + shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json"))) + with open(shard_filenames[0], "r") as f: + data = json.load(f) + print("Download done.") + print(f"Number of shards: {len(shard_filenames)}") + print(f"Example story:\n{data[0]}") + +def pretokenize(): + enc = Tokenizer() + + def process_shard(shard): + with open(shard, "r") as f: + data = json.load(f) + all_tokens = [] + for example in tqdm(data): + text = example["story"] + text = text.strip() # get rid of leading/trailing whitespace + tokens = enc.encode(text, bos=True, eos=False) # encode the text, use BOS + all_tokens.extend(tokens) + # convert to uint16 
nparray + all_tokens = np.array(all_tokens, dtype=np.uint16) + # write to disk + tokenized_filename = shard.replace(".json", ".bin") + with open(tokenized_filename, "wb") as f: + f.write(all_tokens.tobytes()) + print(f"Saved {tokenized_filename}") + + # iterate the shards and tokenize all of them one by one + data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data") + shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json"))) + + # process all the shards in a threadpool + with ThreadPoolExecutor(max_workers=8) as executor: + executor.map(process_shard, shard_filenames) + + print("Done.") + + +class PretokDataset(torch.utils.data.IterableDataset): + """Loads pretokenized examples from disk and yields them as PyTorch tensors.""" + + def __init__(self, split, max_seq_len): + super().__init__() + self.split = split + self.max_seq_len = max_seq_len + + def __iter__(self): + # get worker info within a DataLoader + worker_info = torch.utils.data.get_worker_info() + worker_id = worker_info.id if worker_info else 0 + # get DDP rank info + rank = dist.get_rank() if dist.is_initialized() else 0 + # combine the worker_id and worker_rank to create a unique seed for rng + seed = 42 + worker_id + 1337 * rank + rng = random.Random(seed) + print(f"Created a PretokDataset with rng seed {seed}") + data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data") + shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.bin"))) + # train/test split. let's use only shard 0 for test split, rest train + shard_filenames = shard_filenames[1:] if self.split == "train" else shard_filenames[:1] + while True: + rng.shuffle(shard_filenames) + for shard in shard_filenames: + # open the dataset for reading but keep it on disk with memmap + m = np.memmap(shard, dtype=np.uint16, mode="r") + num_batches = len(m) // self.max_seq_len + num_batches -= 1 # drop the last partial batch + assert num_batches > 0, "this shard is way too small? investigate." + ixs = list(range(num_batches)) + rng.shuffle(ixs) + for ix in ixs: + start = ix * self.max_seq_len + end = start + self.max_seq_len + 1 + # calling .astype will copy the data into a new numpy array, now in RAM + chunk = torch.from_numpy((m[start:end]).astype(np.int64)) + x = chunk[:-1] + y = chunk[1:] + yield x, y + + +class Task: + + @staticmethod + def iter_batches(split, batch_size, max_seq_len, device, num_workers=0): + ds = PretokDataset(split, max_seq_len) + dl = torch.utils.data.DataLoader( + ds, batch_size=batch_size, pin_memory=True, num_workers=num_workers + ) + for x, y in dl: + x = x.to(device, non_blocking=True) + y = y.to(device, non_blocking=True) + yield x, y + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("stage", type=str, choices=["download", "train_tokenizer", "pretokenize"]) + args = parser.parse_args() + + # depending on the stage call the appropriate function + fun = { + "download": download, + "pretokenize": pretokenize, + } + fun[args.stage]() \ No newline at end of file diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 00000000..22bccbcb Binary files /dev/null and b/tokenizer.model differ diff --git a/tokenizer.py b/tokenizer.py new file mode 100644 index 00000000..5466454e --- /dev/null +++ b/tokenizer.py @@ -0,0 +1,40 @@ +# Taken from llama code and lightly modified +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 
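+#
+# Thin wrapper around the Llama 2 SentencePiece model (TOKENIZER_MODEL = "tokenizer.model").
+# Example usage (sketch):
+#   enc = Tokenizer()
+#   ids = enc.encode("Once upon a time", bos=True, eos=False)
+#   text = enc.decode(ids)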
+ +import os +from logging import getLogger +from typing import List + +from sentencepiece import SentencePieceProcessor + +TOKENIZER_MODEL = "tokenizer.model" # the llama sentencepiece tokenizer model + +class Tokenizer: + def __init__(self): + model_path = TOKENIZER_MODEL + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + print(f"Loaded SentencePiece model from {model_path}") + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.pad_id() + print( + f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}" + ) + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + def encode(self, s: str, bos: bool, eos: bool) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + return self.sp_model.decode(t) diff --git a/train.py b/train.py new file mode 100644 index 00000000..2488bb32 --- /dev/null +++ b/train.py @@ -0,0 +1,328 @@ +""" +This training script can be run both on a single gpu in debug mode, +and also in a larger training run with distributed data parallel (ddp). + +To run on a single GPU small debug run, example: +$ python -m train.py --compile=False --eval_iters=10 --batch_size=8 + +To run with DDP on 4 gpus on 1 node, example: +$ torchrun --standalone --nproc_per_node=4 train.py +PYTHONPATH=/home/ubuntu/miniconda3/envs/pytorch2/lib/python3.10/site-packages torchrun --standalone --nproc_per_node=4 train.py --compile=False --wandb_log=True + +To run with DDP on 4 gpus across 2 nodes, example: +- Run on the first (master) node with example IP 123.456.123.456: +$ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=123.456.123.456 --master_port=1234 train.py +- Run on the worker node: +$ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr=123.456.123.456 --master_port=1234 train.py +(If your cluster does not have Infiniband interconnect prepend NCCL_IB_DISABLE=1) +""" + +import math +import os +import time +from contextlib import nullcontext +from datetime import datetime +from functools import partial + +import torch +from model import Transformer, ModelArgs +from torch.distributed import destroy_process_group, init_process_group +from torch.nn.parallel import DistributedDataParallel as DDP + +from tinystories import Task + +# ----------------------------------------------------------------------------- +# I/O +out_dir = "out" +eval_interval = 2000 +log_interval = 1 +eval_iters = 100 +eval_only = False # if True, script exits right after the first eval +always_save_checkpoint = False # if True, always save a checkpoint after each eval +init_from = "scratch" # 'scratch' or 'resume' +# wandb logging +wandb_log = False # disabled by default +wandb_project = "llamac" +wandb_run_name = "run" + datetime.now().strftime("%Y_%m_%d_%H_%M_%S") +# data +batch_size = 128 # if gradient_accumulation_steps > 1, this is the micro-batch size +max_seq_len = 256 +# model +dim = 288 +n_layers = 6 +n_heads = 6 +multiple_of = 32 +dropout = 0.0 +# adamw optimizer +gradient_accumulation_steps = 4 # used to simulate larger batch sizes +learning_rate = 5e-4 # max learning rate +max_iters = 100000 # total number of training iterations +weight_decay = 1e-1 +beta1 = 0.9 +beta2 = 0.95 +grad_clip = 1.0 # clip gradients 
at this value, or disable if == 0.0 +# learning rate decay settings +decay_lr = True # whether to decay the learning rate +warmup_iters = 1000 # how many steps to warm up for +# system +device = "cuda" # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks +dtype = "bfloat16" # float32|bfloat16|float16 +compile = True # use PyTorch 2.0 to compile the model to be faster +# ----------------------------------------------------------------------------- +config_keys = [ + k + for k, v in globals().items() + if not k.startswith("_") and isinstance(v, (int, float, bool, str)) +] +exec(open("configurator.py").read()) # overrides from command line or config file +config = {k: globals()[k] for k in config_keys} # will be useful for logging +# ----------------------------------------------------------------------------- + +# fixing some hyperparams to sensible defaults +lr_decay_iters = max_iters # should be ~= max_iters per Chinchilla +min_lr = 0.0 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla + +# various inits, derived attributes, I/O setup +ddp = int(os.environ.get("RANK", -1)) != -1 # is this a ddp run? +if ddp: + init_process_group(backend="nccl") + ddp_rank = int(os.environ["RANK"]) + ddp_local_rank = int(os.environ["LOCAL_RANK"]) + ddp_world_size = int(os.environ["WORLD_SIZE"]) + device = f"cuda:{ddp_local_rank}" + torch.cuda.set_device(device) + master_process = ddp_rank == 0 # this process will do logging, checkpointing etc. + seed_offset = ddp_rank # each process gets a different seed + # world_size number of processes will be training simultaneously, so we can scale + # down the desired gradient accumulation iterations per process proportionally + assert gradient_accumulation_steps % ddp_world_size == 0 + gradient_accumulation_steps //= ddp_world_size +else: + # if not ddp, we are running on a single gpu, and one process + master_process = True + seed_offset = 0 + ddp_world_size = 1 +tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * max_seq_len +if master_process: + print(f"tokens per iteration will be: {tokens_per_iter:,}") + print(f"breaks down as: {gradient_accumulation_steps} grad accum steps * {ddp_world_size} processes * {batch_size} batch size * {max_seq_len} max seq len") + +if master_process: + os.makedirs(out_dir, exist_ok=True) +torch.manual_seed(1337 + seed_offset) +torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul +torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn +device_type = "cuda" if "cuda" in device else "cpu" # for later use in torch.autocast +# note: float16 data type will automatically use a GradScaler +ptdtype = {"float32": torch.float32, "bfloat16": torch.bfloat16, "float16": torch.float16}[dtype] +ctx = ( + nullcontext() + if device_type == "cpu" + else torch.amp.autocast(device_type=device_type, dtype=ptdtype) +) + +# task-specific setup +iter_batches = partial( + Task.iter_batches, + batch_size=batch_size, + max_seq_len=max_seq_len, + device=device, + num_workers=0, +) + +# init these up here, can override if init_from='resume' (i.e. 
from a checkpoint) +iter_num = 0 +best_val_loss = 1e9 + +# model init +model_args = dict( + dim=dim, + n_layers=n_layers, + n_heads=n_heads, + n_kv_heads=n_heads, + vocab_size=32000, + multiple_of=multiple_of, + max_seq_len=max_seq_len, + #dropout=dropout, +) # start with model_args from command line +if init_from == "scratch": + # init a new model from scratch + print("Initializing a new model from scratch") + gptconf = ModelArgs(**model_args) + model = Transformer(gptconf) +elif init_from == "resume": + print(f"Resuming training from {out_dir}") + # resume training from a checkpoint. + ckpt_path = os.path.join(out_dir, "ckpt.pt") + checkpoint = torch.load(ckpt_path, map_location=device) + checkpoint_model_args = checkpoint["model_args"] + # force these config attributes to be equal otherwise we can't even resume training + # the rest of the attributes (e.g. dropout) can stay as desired from command line + for k in ["dim", "n_layers", "n_heads", "n_kv_heads", "vocab_size", "multiple_of", "max_seq_len"]: + model_args[k] = checkpoint_model_args[k] + # create the model + gptconf = ModelArgs(**model_args) + model = Transformer(gptconf) + state_dict = checkpoint["model"] + # fix the keys of the state dictionary :( + # honestly no idea how checkpoints sometimes get this prefix, have to debug more + unwanted_prefix = "_orig_mod." + for k, v in list(state_dict.items()): + if k.startswith(unwanted_prefix): + state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k) + model.load_state_dict(state_dict) + iter_num = checkpoint["iter_num"] + best_val_loss = checkpoint["best_val_loss"] +model.to(device) + +# initialize a GradScaler. If enabled=False scaler is a no-op +scaler = torch.cuda.amp.GradScaler(enabled=(dtype == "float16")) + +# optimizer +optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type) +if init_from == "resume": + optimizer.load_state_dict(checkpoint["optimizer"]) +checkpoint = None # free up memory + +# compile the model +if compile: + print("compiling the model... 
(takes a ~minute)") + unoptimized_model = model + model = torch.compile(model) # requires PyTorch 2.0 + +# wrap model into DDP container +if ddp: + model = DDP(model, device_ids=[ddp_local_rank]) + +# helps estimate an arbitrarily accurate loss over either split using many batches +@torch.no_grad() +def estimate_loss(): + out = {} + model.eval() + for split in ["train", "val"]: + batch_iter = iter_batches(split) + losses = torch.zeros(eval_iters) # keep on CPU + for k in range(eval_iters): + X, Y = next(batch_iter) + with ctx: + logits, loss = model(X, Y) + losses[k] = loss.item() + out[split] = losses.mean() + model.train() + return out + +# learning rate decay scheduler (cosine with warmup) +def get_lr(it): + # 1) linear warmup for warmup_iters steps + if it < warmup_iters: + return learning_rate * it / warmup_iters + # 2) if it > lr_decay_iters, return min learning rate + if it > lr_decay_iters: + return min_lr + # 3) in between, use cosine decay down to min learning rate + decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters) + assert 0 <= decay_ratio <= 1 + coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1 + return min_lr + coeff * (learning_rate - min_lr) + +# logging +if wandb_log and master_process: + import wandb + wandb.init(project=wandb_project, name=wandb_run_name, config=config) + +# training loop +train_batch_iter = iter_batches("train") +X, Y = next(train_batch_iter) # fetch the very first batch +t0 = time.time() +local_iter_num = 0 # number of iterations in the lifetime of this process +raw_model = model.module if ddp else model # unwrap DDP container if needed +running_mfu = -1.0 +while True: + # determine and set the learning rate for this iteration + lr = get_lr(iter_num) if decay_lr else learning_rate + for param_group in optimizer.param_groups: + param_group["lr"] = lr + + # evaluate the loss on train/val sets and write checkpoints + if iter_num % eval_interval == 0 and master_process: + losses = estimate_loss() + print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}") + if wandb_log: + try: + wandb.log( + { + "iter": iter_num, + "tokens": iter_num * tokens_per_iter, + "loss/train": losses["train"], + "loss/val": losses["val"], + "lr": lr, + "mfu": running_mfu * 100, # convert to percentage + } + ) + except Exception as e: + print(f"logging to wandb failed: {e}") + if losses["val"] < best_val_loss or always_save_checkpoint: + best_val_loss = losses["val"] + if iter_num > 0: + checkpoint = { + "model": raw_model.state_dict(), + "optimizer": optimizer.state_dict(), + "model_args": model_args, + "iter_num": iter_num, + "best_val_loss": best_val_loss, + "config": config, + } + print(f"saving checkpoint to {out_dir}") + torch.save(checkpoint, os.path.join(out_dir, "ckpt.pt")) + raw_model.export(os.path.join(out_dir, "model.bin")) + if iter_num == 0 and eval_only: + break + + # forward backward update, with optional gradient accumulation to simulate larger batch size + # and using the GradScaler if data type is float16 + for micro_step in range(gradient_accumulation_steps): + if ddp: + # in DDP training we only need to sync gradients at the last micro step. 
+ # the official way to do this is with model.no_sync() context manager, but + # I really dislike that this bloats the code and forces us to repeat code + # looking at the source of that context manager, it just toggles this variable + model.require_backward_grad_sync = micro_step == gradient_accumulation_steps - 1 + with ctx: + logits, loss = model(X, Y) + loss = loss / gradient_accumulation_steps + # immediately async prefetch next batch while model is doing the forward pass on the GPU + X, Y = next(train_batch_iter) + # backward pass, with gradient scaling if training in fp16 + scaler.scale(loss).backward() + # clip the gradient + if grad_clip != 0.0: + scaler.unscale_(optimizer) + torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) + # step the optimizer and scaler if training in fp16 + scaler.step(optimizer) + scaler.update() + # flush the gradients as soon as we can, no need for this memory anymore + optimizer.zero_grad(set_to_none=True) + + # timing and logging + t1 = time.time() + dt = t1 - t0 + t0 = t1 + if iter_num % log_interval == 0 and master_process: + # get loss as float, scale up due to the divide above. note: this is a CPU-GPU sync point + lossf = loss.item() * gradient_accumulation_steps + if local_iter_num >= 5: # let the training loop settle a bit + mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt) + running_mfu = mfu if running_mfu == -1.0 else 0.9 * running_mfu + 0.1 * mfu + print( + f"{iter_num} | loss {lossf:.4f} | lr {lr:e} | {dt*1000:.2f}ms | mfu {running_mfu*100:.2f}%" + ) + iter_num += 1 + local_iter_num += 1 + + # termination conditions + if iter_num > max_iters: + break + +if ddp: + destroy_process_group()
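For reference, `model.bin` begins with the seven-int header packed by `Transformer.export()` and read back into `Config` by `run.c`. A minimal sketch to sanity-check an exported file (assuming a `model.bin` produced by that `export()` sits in the working directory):

```python
import struct

# the header written by Transformer.export(): struct.pack('iiiiiii', ...)
with open("model.bin", "rb") as f:
    dim, hidden_dim, n_layers, n_heads, n_kv_heads, vocab_size, max_seq_len = \
        struct.unpack("iiiiiii", f.read(7 * 4))
print(f"dim={dim} hidden_dim={hidden_dim} n_layers={n_layers} n_heads={n_heads} "
      f"n_kv_heads={n_kv_heads} vocab_size={vocab_size} max_seq_len={max_seq_len}")
```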