Merge 9fbd4dff5b into 20cd236f61

2026-03-09 23:22:49 -04:00 · 2026-03-09 23:22:49 -04:00 · 79aec4b028
parent 20cd236f61 9fbd4dff5b
commit 79aec4b028
5 changed files with 176 additions and 0 deletions
--- a/CONTRIBUTION_SUBMISSION.md
+++ b/CONTRIBUTION_SUBMISSION.md
@ -0,0 +1,95 @@
+# Contribution submission guide
+
+This file summarizes what was done on branch `contribution/benchmark-m5-and-fixes` and how to submit it.
+
+---
+
+## 1. Benchmark (submit to Issue #3)
+
+**Link:** https://github.com/maderix/ANE/issues/3
+
+**Post this as a new comment:**
+
+```
+## M5 MacBook Pro benchmark (static pipeline, 20 steps)
+
+- **Chip:** Apple M5, 10-core (4P+6E)
+- **RAM:** 24 GB
+- **macOS:** 26.3 (Build 25D125)
+- **Run:** `./train_large --data ./tinystories_data00.bin --steps 20 --lr 1e-4`
+
+### Efficiency report
+- Total steps: 20
+- Wall time: 10423 ms (10.4 s)
+- Compile time: 7187 ms (69.0%)
+- Train time: 2542 ms (24.4%)
+- **Avg train: 127.1 ms/step**
+- ANE TFLOPS: 0.73 sustained
+- ANE utilization: 4.6% of 15.8 TFLOPS
+
+Full output, including the JSON lines, is pasted below (also saved as `benchmarks/my_m5_benchmark_output.txt` on the contribution branch).
+```
+
+Then paste the contents of `benchmarks/my_m5_benchmark_output.txt` in the same comment, or attach it.
+
+---
+
+## 2. Bug fix (PR)
+
+**Fix:** Guard short token datasets in `train_large_ane.m` and `training/training_dynamic/train.m`.
+
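+**Why:** Both pipelines compute `max_pos = n_tokens - SEQ - 1`, where `n_tokens` is an unsigned `size_t`. When `n_tokens < SEQ + 1`, the subtraction wraps around to a huge value, giving a huge random range and possible out-of-bounds reads. When `n_tokens == SEQ + 1`, `max_pos` is 0, so the random range is degenerate (zero-width). The guard rejects both cases (`n_tokens <= SEQ + 1`). `train_large.m` already had this guard; the other two pipelines did not.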
+
+**Changes:**
+- `training/train_large_ane.m`: After `n_tokens = data_len / 2`, add a check that fails early when the dataset is too short: it prints a clear error, calls `munmap` on the token data, closes the fd, and returns 1.
+- `training/training_dynamic/train.m`: Same guard added.
+
+**Suggested PR title:** `fix: guard short token datasets in train_large_ane and dynamic pipeline`
+
+**Suggested PR description:**
+
+```markdown
+## Summary
+- Add a token dataset length guard in `training/train_large_ane.m`
+- Add the same guard in `training/training_dynamic/train.m`
+- Fail early with a clear error when the dataset is too short for one (input, target) window
+
+## Why
+Both paths use `max_pos = n_tokens - SEQ - 1` with an unsigned `n_tokens`. When `n_tokens < SEQ + 1`, this subtraction wraps around, producing a huge range and potentially out-of-bounds reads; when `n_tokens == SEQ + 1`, `max_pos` is 0 and the range is zero-width. `train_large.m` already had this guard (lines 299–304); this PR aligns the other two pipelines.
+
+## Validation
+- `make -C training train_large_ane` — builds
+- `make -C training/training_dynamic train` — builds
+- With a too-short data file, both binaries exit with the new error message.
+```
+
+---
+
+## 3. Optional: benchmark data in repo
+
+Branch also adds:
+- `benchmarks/my_m5_benchmark_output.txt` — full benchmark log
+- One new entry in `benchmarks/community_results.json` for this M5 run (contributor: `log-wade`)
+
+You can either:
+- Include the `community_results.json` update in the same PR as the bug fix, or
+- Omit it and only post the benchmark to Issue #3 (maintainer may update the report from the issue).
+
+---
+
+## 4. Before opening the PR
+
+1. **Fork the repo** on GitHub (if you haven’t already): https://github.com/maderix/ANE → Fork.
+2. **Add your fork as a remote and push:**
+   ```bash
+   git remote add myfork git@github.com:YOUR_USERNAME/ANE.git
+   git push myfork contribution/benchmark-m5-and-fixes
+   ```
+3. Open a PR from `myfork/contribution/benchmark-m5-and-fixes` to the `main` branch of `maderix/ANE`.
+4. Post the benchmark comment to Issue #3 (link above).
+
+---
+
+## 5. Replace contributor name
+
+In `benchmarks/community_results.json`, the new entry uses `"contributor": "log-wade"`. Change that to your GitHub username if different.
--- a/benchmarks/community_results.json
+++ b/benchmarks/community_results.json
@ -94,6 +94,19 @@
      "peak_tflops_inmem": 12.17,
      "notes": "inmem_peak only, no training data submitted.",
      "contributor": "elijah-pelton"
+    },
+    {
+      "chip": "M5",
+      "cores": "10-core (4P+6E)",
+      "ram_gb": 24,
+      "macos": "26.3",
+      "ms_per_step": [125, 128],
+      "ane_ms": [9.1, 9.2],
+      "compile_ms": [3554, 3633],
+      "ane_tflops": [0.72, 0.74],
+      "ane_util_pct": [4.57, 4.70],
+      "notes": "MacBook Pro, static pipeline train_large, 20 steps, random init.",
+      "contributor": "log-wade"
    }
  ],
  "neural_engine_specs": {
--- a/benchmarks/my_m5_benchmark_output.txt
+++ b/benchmarks/my_m5_benchmark_output.txt
@ -0,0 +1,56 @@
+=== ANE Training: Stories110M (12 layers) ===
+dim=768 hidden=2048 heads=12 seq=256 vocab=32000 layers=12
+Cannot open stories110M.bin
+Pretrained load failed, using random init
+Params: 109.53M (transformer 84.95M + embed 24.58M)
+Kernels: 72 (60 weight-bearing + 12 static sdpaBwd2)
+Accum 10 steps per recompile | Adam LR=1.0e-04 b1=0.9 b2=0.999
+FLOPs/step: fwd=43487M bwd_dx=43487M bwd_dW=43487M sdpa_bwd=6040M total=174248M
+ANE FLOPs/step: 93013M (fwd+bwd_dx+sdpa_bwd) | CPU: dW+cls (cblas)
+
+Token data: 20658981 tokens (41.3 MB)
+  Compiling layer 1/12... (12 compiles)
  Compiling layer 2/12... (17 compiles)
  Compiling layer 3/12... (22 compiles)
  Compiling layer 4/12... (27 compiles)
  Compiling layer 5/12... (32 compiles)
  Compiling layer 6/12... (37 compiles)
  Compiling layer 7/12... (42 compiles)
  Compiling layer 8/12... (47 compiles)
  Compiling layer 9/12... (52 compiles)
  Compiling layer 10/12... (57 compiles)
  Compiling layer 11/12... (62 compiles)
  Compiling layer 12/12... (67 compiles)
  Compiled 60 kernels in 3554ms                    
+step 0    loss=10.3907
+{"type":"step","step":0,"loss":10.390698,"t_ane":12.288,"t_io":14.233,"t_cls":30.426,"t_elem":21.143,"t_rms":0.094,"t_cblas_wait":0.002,"compiles":72}
+{"type":"step","step":1,"loss":10.434500,"t_ane":10.653,"t_io":13.757,"t_cls":20.472,"t_elem":18.814,"t_rms":0.070,"t_cblas_wait":0.002,"compiles":72}
+{"type":"step","step":2,"loss":10.484736,"t_ane":10.050,"t_io":10.094,"t_cls":16.495,"t_elem":17.783,"t_rms":0.061,"t_cblas_wait":0.002,"compiles":72}
+{"type":"step","step":3,"loss":10.417551,"t_ane":9.755,"t_io":8.214,"t_cls":14.512,"t_elem":16.853,"t_rms":0.068,"t_cblas_wait":0.002,"compiles":72}
+{"type":"step","step":4,"loss":10.392599,"t_ane":9.537,"t_io":7.032,"t_cls":13.297,"t_elem":16.319,"t_rms":0.063,"t_cblas_wait":0.002,"compiles":72}
+{"type":"step","step":5,"loss":10.392069,"t_ane":9.404,"t_io":6.251,"t_cls":12.475,"t_elem":15.887,"t_rms":0.060,"t_cblas_wait":0.002,"compiles":72}
+{"type":"step","step":6,"loss":10.382063,"t_ane":9.313,"t_io":5.697,"t_cls":11.874,"t_elem":15.678,"t_rms":0.058,"t_cblas_wait":0.001,"compiles":72}
+{"type":"step","step":7,"loss":10.377501,"t_ane":9.238,"t_io":5.293,"t_cls":11.437,"t_elem":15.556,"t_rms":0.056,"t_cblas_wait":0.001,"compiles":72}
+{"type":"step","step":8,"loss":10.409813,"t_ane":9.174,"t_io":4.967,"t_cls":11.101,"t_elem":15.372,"t_rms":0.055,"t_cblas_wait":0.001,"compiles":72}
+{"type":"step","step":9,"loss":10.395181,"t_ane":9.138,"t_io":4.720,"t_cls":10.819,"t_elem":15.289,"t_rms":0.054,"t_cblas_wait":0.001,"compiles":72}
+  [batch 10: compile=3554ms train=1253.8ms (125.4ms/step) compiles=72]
+    ane=9.1 io=4.7 cls=10.8 elem=15.3 rms=0.1 cblas_wait=0.0 ms/step
+{"type":"batch","batch":10,"compile_ms":3554.3,"train_ms":1253.8,"ms_per_step":125.4}
+{"type":"perf","ane_tflops":0.742,"ane_util_pct":4.70}
+[exec() restart step 10, 72 compiles, loss=10.3952]
+[RESUMED step 10, loss=10.3952]
+Token data: 20658981 tokens (41.3 MB)
+  Compiling layer 1/12... (12 compiles)
  Compiling layer 2/12... (17 compiles)
  Compiling layer 3/12... (22 compiles)
  Compiling layer 4/12... (27 compiles)
  Compiling layer 5/12... (32 compiles)
  Compiling layer 6/12... (37 compiles)
  Compiling layer 7/12... (42 compiles)
  Compiling layer 8/12... (47 compiles)
  Compiling layer 9/12... (52 compiles)
  Compiling layer 10/12... (57 compiles)
  Compiling layer 11/12... (62 compiles)
  Compiling layer 12/12... (67 compiles)
  Compiled 60 kernels in 3633ms                    
+step 10   loss=10.2671
+{"type":"step","step":10,"loss":10.267123,"t_ane":13.398,"t_io":14.979,"t_cls":29.723,"t_elem":22.190,"t_rms":0.109,"t_cblas_wait":0.002,"compiles":72}
+{"type":"step","step":11,"loss":10.389436,"t_ane":11.150,"t_io":13.816,"t_cls":19.297,"t_elem":17.862,"t_rms":0.078,"t_cblas_wait":0.002,"compiles":72}
+{"type":"step","step":12,"loss":10.246490,"t_ane":10.356,"t_io":10.036,"t_cls":15.691,"t_elem":16.749,"t_rms":0.067,"t_cblas_wait":0.002,"compiles":72}
+{"type":"step","step":13,"loss":10.322395,"t_ane":9.971,"t_io":8.113,"t_cls":13.880,"t_elem":16.200,"t_rms":0.061,"t_cblas_wait":0.002,"compiles":72}
+{"type":"step","step":14,"loss":10.280519,"t_ane":9.708,"t_io":7.002,"t_cls":12.817,"t_elem":15.972,"t_rms":0.061,"t_cblas_wait":0.002,"compiles":72}
+{"type":"step","step":15,"loss":10.202168,"t_ane":9.575,"t_io":6.212,"t_cls":12.096,"t_elem":15.716,"t_rms":0.059,"t_cblas_wait":0.003,"compiles":72}
+{"type":"step","step":16,"loss":10.306752,"t_ane":9.450,"t_io":5.685,"t_cls":11.577,"t_elem":15.530,"t_rms":0.057,"t_cblas_wait":0.003,"compiles":72}
+{"type":"step","step":17,"loss":10.293774,"t_ane":9.361,"t_io":5.280,"t_cls":11.209,"t_elem":15.392,"t_rms":0.055,"t_cblas_wait":0.002,"compiles":72}
+{"type":"step","step":18,"loss":10.263789,"t_ane":9.278,"t_io":4.976,"t_cls":10.908,"t_elem":15.263,"t_rms":0.054,"t_cblas_wait":0.002,"compiles":72}
+{"type":"step","step":19,"loss":10.307909,"t_ane":9.237,"t_io":4.751,"t_cls":10.657,"t_elem":15.160,"t_rms":0.053,"t_cblas_wait":0.002,"compiles":72}
+  [batch 10: compile=3633ms train=1287.8ms (128.8ms/step) compiles=72]
+    ane=9.2 io=4.8 cls=10.7 elem=15.2 rms=0.1 cblas_wait=0.0 ms/step
+{"type":"batch","batch":10,"compile_ms":3632.9,"train_ms":1287.8,"ms_per_step":128.8}
+{"type":"perf","ane_tflops":0.722,"ane_util_pct":4.57}
+
+=== Efficiency Report ===
+Total steps:     20
+Wall time:       10423 ms (10.4 s)
+Compile time:    7187 ms (69.0%)
+Train time:      2542 ms (24.4%)
+Avg train:       127.1 ms/step
+ANE TFLOPS:      0.73 sustained
+Total TFLOPS:    1.37 (ANE+CPU)
+ANE utilization: 4.6% of 15.8 TFLOPS
--- a/training/train_large_ane.m
+++ b/training/train_large_ane.m
@ -285,6 +285,12 @@ int main(int argc, char *argv[]) {
        uint16_t *token_data = (uint16_t*)mmap(NULL, data_len, PROT_READ, MAP_PRIVATE, data_fd, 0);
        if (token_data == MAP_FAILED) { printf("mmap failed\n"); return 1; }
        size_t n_tokens = data_len / 2;
+        if (n_tokens <= (size_t)(SEQ + 1)) {
+            printf("Token data too short: need at least %d tokens, got %zu\n", SEQ + 2, n_tokens);
+            munmap(token_data, data_len);
+            close(data_fd);
+            return 1;
+        }
        printf("Token data: %zu tokens (%.1f MB)\n", n_tokens, data_len/1e6);

        // Gradient buffers
--- a/training/training_dynamic/train.m
+++ b/training/training_dynamic/train.m
@ -294,6 +294,12 @@ int main(int argc, char *argv[]) {
        uint16_t *token_data = (uint16_t*)mmap(NULL, data_len, PROT_READ, MAP_PRIVATE, data_fd, 0);
        if (token_data == MAP_FAILED) { printf("mmap failed\n"); return 1; }
        size_t n_tokens = data_len / 2;
+        if (n_tokens <= (size_t)(SEQ + 1)) {
+            printf("Token data too short: need at least %d tokens, got %zu\n", SEQ + 2, n_tokens);
+            munmap(token_data, data_len);
+            close(data_fd);
+            return 1;
+        }
        printf("Token data: %zu tokens (%.1f MB)\n", n_tokens, data_len/1e6);

        // Vocab compaction