diff --git a/training/README.md b/training/README.md index 8ccde88..a3f33eb 100644 --- a/training/README.md +++ b/training/README.md @@ -83,15 +83,17 @@ Downloads pretokenized TinyStories (Llama 2 BPE, 32K vocab) from HuggingFace. Pr ### 2. Build & Train ```bash -# Static baseline (classifier + softmax on CPU) -make train_large -./train_large stories110M.bin 256 100 1e-4 -./train_large --model stories110M.bin --steps 100 --lr 1e-4 - -# PR#19: ANE-offloaded classifier + softmax + rmsnorm_bwd -make train_large_ane -./train_large_ane stories110M.bin 256 100 1e-4 -./train_large_ane --no-ane-extras --steps 100 # disable ANE extras +# Static baseline (classifier + softmax on CPU) +make train_large +./train_large stories110M.bin 256 100 1e-4 +./train_large --model stories110M.bin --steps 100 --lr 1e-4 +./train_large --data ./tinystories_data00.bin --steps 100 --lr 1e-4 + +# PR#19: ANE-offloaded classifier + softmax + rmsnorm_bwd +make train_large_ane +./train_large_ane stories110M.bin 256 100 1e-4 +./train_large_ane --no-ane-extras --steps 100 # disable ANE extras +./train_large_ane --data ./tinystories_data00.bin --steps 100 --lr 1e-4 # Dynamic pipeline (no recompilation) cd training_dynamic && make train @@ -100,13 +102,14 @@ cd training_dynamic && make train ./train --steps 200 --lr 1e-4 # custom steps/lr ``` -**CLI flags (all pipelines):** +**CLI flags (`train_large` / `train_large_ane`):** - `--steps N` (default 10000) -- `--lr F` (default 3e-4) -- `--model PATH` — pretrained weights file -- `--ckpt PATH` — checkpoint file (preserved across exec() restarts) -- `--resume` — resume from checkpoint -- `--no-ane-extras` — (train_large_ane only) disable ANE classifier/softmax/rmsnorm_bwd +- `--lr F` (default 3e-4) +- `--model PATH` — pretrained weights file +- `--data PATH` — tokenized TinyStories `.bin` file (default: `tinystories_data00.bin`) +- `--ckpt PATH` — checkpoint file (preserved across exec() restarts) +- `--resume` — resume from checkpoint +- `--no-ane-extras` — (train_large_ane only) disable ANE classifier/softmax/rmsnorm_bwd ### 3. Monitor with Dashboard diff --git a/training/train_large.m b/training/train_large.m index 8b83bfc..96f8f7a 100644 --- a/training/train_large.m +++ b/training/train_large.m @@ -5,9 +5,9 @@ #include "stories_mil.h" #include "stories_cpu_ops.h" -#define CKPT_PATH_DEFAULT "ane_stories110M_ckpt.bin" -#define MODEL_PATH_DEFAULT "stories110M.bin" -#define DATA_PATH "tinystories_data00.bin" +#define CKPT_PATH_DEFAULT "ane_stories110M_ckpt.bin" +#define MODEL_PATH_DEFAULT "stories110M.bin" +#define DATA_PATH_DEFAULT "tinystories_data00.bin" // ===== Weight loading from llama2.c format ===== static bool load_pretrained(LayerWeights *lw, float *rms_final, float *embed, const char *path) { @@ -192,22 +192,24 @@ int main(int argc, char *argv[]) { float adam_b1=0.9f, adam_b2=0.999f, adam_eps=1e-8f; int adam_t = 0, start_step = 0; - // Parse args - const char *ckpt_path = CKPT_PATH_DEFAULT; - const char *model_path = MODEL_PATH_DEFAULT; - bool do_resume = false; - int pos = 0; - for (int i=1; i