diff --git a/training/README.md b/training/README.md index 9c4fb00..8ccde88 100644 --- a/training/README.md +++ b/training/README.md @@ -8,43 +8,68 @@ Training a 109M-parameter Llama2-architecture transformer (Stories110M) directly - **Model**: Stories110M — dim=768, hidden=2048, heads=12, layers=12, vocab=32000, seq=256 - **109.53M params** (84.95M transformer + 24.58M embedding) -- **72 ANE kernels** per compile (60 weight-bearing, 12 weight-free sdpaBwd2) -- **6 kernel types per layer**: fwdAttn, fwdFFN, ffnBwd, sdpaBwd1, sdpaBwd2, qkvBwd +- **SDPA causal mask workaround**: ANE hardware ignores attn_mask — decompose into Q@K^T (ANE conv) + mask+softmax (CPU) + scores@V (ANE conv) -## Performance +## Three Training Pipelines -| Component | Time (ms/step) | -|-----------|---------------| -| ANE eval | 9.6 | -| IO (fp16 conversion) | 4.1 | -| Classifier (cblas) | 9.1 | -| Cross-entropy + residuals | 14.4 | -| RMSNorm | 0.1 | -| **Total** | **107 ms/step** | +### 1. Static Baseline (`train_large`) +Original pipeline. Weights baked as constants in MIL kernels — recompile every 10 steps via `exec()` restart. + +- 60 weight-bearing + 12 weight-free kernels = 72 per compile batch +- Classifier + softmax + RMSNorm backward on CPU +- **106.7 ms/step**, 7.6s compile per restart + +### 2. Static + ANE Extras (`train_large_ane`) — PR#19 +Offloads classifier forward (32K conv), softmax, final RMSNorm, and RMSNorm backward to ANE. Bridge API for C-callable ANE access. + +- 86 kernels per compile batch (+24 rmsnorm_bwd, +1 classifier, +1 finalRms) +- **91.8 ms/step** (14% faster), 9.6s compile per restart +- Use `--no-ane-extras` to disable and fall back to CPU (for debugging) + +### 3. Dynamic Weight Pipeline (`training_dynamic/`) +Weights passed via IOSurface spatial dimension — compile 9 kernels once at startup, no recompilation needed. 
+ +- 9 shared kernels across all 12 layers +- **111 ms/step**, 0.4s one-time compile +- No exec() restart, no compile limit issues + +## Performance Comparison (20 Steps) + +| | Static Baseline | PR#19 + ANE extras | PR#19 no extras | Dynamic | +|---|---|---|---|---| +| **Wall time** | **10.1s** | **11.7s** | **10.7s** | **~2.6s** | +| Compile | 7.6s (75.7%) | 9.6s (81.6%) | 7.5s (69.7%) | 0.4s (15%) | +| Train | 2.1s (21.2%) | 1.8s (15.6%) | 2.9s (27.4%) | 2.2s (85%) | +| **ms/step** | **106.7** | **91.8** | **147.0** | **111** | +| Kernels/restart | 72 | 86 | 60 | 9 (once) | +| ANE TFLOPS | 0.87 | 1.15 | 0.72 | — | +| Total TFLOPS | 1.63 | 1.90 | 1.19 | — | + +**Key insights:** +- Dynamic wins on wall time for any practical run length (3.9x faster at 20 steps) +- PR#19 has the best per-step throughput (92ms) but compile overhead dominates short runs +- Static restarts every 10 steps, so dynamic's zero-recompile advantage compounds ## Files | File | Description | |------|-------------| -| `train_large.m` | Main training loop — 12-layer forward/backward, checkpoint, exec() restart | -| `stories_config.h` | Model config, structs, alloc helpers | +| `train_large.m` | Static baseline — 72 kernels, classifier/softmax on CPU | +| `train_large_ane.m` | PR#19 — 86 kernels, classifier/softmax/rmsnorm_bwd on ANE | +| `training_dynamic/train.m` | Dynamic pipeline — 9 kernels, weights via IOSurface | +| `training_dynamic/mil_dynamic.h` | MIL generators for dynamic weight kernels | +| `training_dynamic/config.h` | Model config (DIM=768, HIDDEN=2048, etc.) 
| +| `training_dynamic/io.h` | IOSurface I/O + MIL compilation helpers | +| `training_dynamic/cpu_ops.h` | CPU ops (SiLU backward, cross-entropy, Adam) | +| `stories_config.h` | Static pipeline config, structs, alloc helpers | | `stories_io.h` | IOSurface I/O, NEON fp16 conversion, kernel compile/eval | -| `stories_mil.h` | MIL program generators for all 6 ANE kernel types | -| `stories_cpu_ops.h` | vDSP-vectorized RMSNorm, cross-entropy, Adam, embedding ops | -| `dashboard.py` | TUI dashboard — loss curve, power/CPU/memory graphs, text generation | -| `tokenize.py` | Extract pretokenized TinyStories data | +| `stories_mil.h` | MIL generators for static pipeline (6 kernel types) | +| `stories_cpu_ops.h` | vDSP-vectorized RMSNorm, cross-entropy, Adam | +| `ane_classifier.h` | ANE classifier fwd (32K conv), softmax kernels | +| `ane_rmsnorm_bwd.h` | ANE rmsnorm backward kernel | +| `dashboard.py` | TUI dashboard — loss curve, power/CPU/memory graphs | | `Makefile` | Build targets | -## How it works - -1. **Forward pass**: Each layer runs fwdAttn (QKV + SDPA + Wo) and fwdFFN (W1 + SiLU(W3) + W2) on ANE via MIL-compiled kernels. Final RMSNorm + classifier matmul on CPU (cblas). - -2. **Backward pass**: Reverse layer order. ffnBwd, sdpaBwd1, sdpaBwd2, qkvBwd on ANE. Weight gradients (dW) via async cblas_sgemm on CPU. RMSNorm backward via vDSP. - -3. **Compile budget**: ANE has a ~119 compile limit per process. With 72 kernels per batch, we run 10 accumulation steps then `exec()` restart with checkpoint resume. - -4. **Data**: Real TinyStories text (20M tokens), mmap'd uint16 token IDs, random position sampling per step. - ## Usage ### 1. Download Training Data @@ -53,69 +78,63 @@ Training a 109M-parameter Llama2-architecture transformer (Stories110M) directly bash download_data.sh ``` -Downloads pretokenized TinyStories (Llama 2 BPE, 32K vocab) from [enio/TinyStories](https://huggingface.co/datasets/enio/TinyStories) on HuggingFace. 
Produces `tinystories_data00.bin` (~41 MB, ~20M tokens). +Downloads pretokenized TinyStories (Llama 2 BPE, 32K vocab) from HuggingFace. Produces `tinystories_data00.bin` (~41 MB, ~20M tokens). ### 2. Build & Train ```bash -# Baseline: classifier + softmax on CPU +# Static baseline (classifier + softmax on CPU) make train_large -./train_large --steps 100 # quick test -./train_large # full 10k steps -./train_large --resume # resume from checkpoint +./train_large stories110M.bin 256 100 1e-4 +./train_large --model stories110M.bin --steps 100 --lr 1e-4 -# ANE-offloaded: classifier + softmax on ANE (faster) +# PR#19: ANE-offloaded classifier + softmax + rmsnorm_bwd make train_large_ane -./train_large_ane --steps 100 +./train_large_ane stories110M.bin 256 100 1e-4 +./train_large_ane --no-ane-extras --steps 100 # disable ANE extras + +# Dynamic pipeline (no recompilation) +cd training_dynamic && make train +./train --scratch # train from random init +./train # resume from checkpoint +./train --steps 200 --lr 1e-4 # custom steps/lr ``` -**CLI flags:** `--steps N` (default 10000), `--lr F` (default 3e-4), `--resume`. +**CLI flags (all pipelines):** +- `--steps N` (default 10000) +- `--lr F` (default 3e-4) +- `--model PATH` — pretrained weights file +- `--ckpt PATH` — checkpoint file (preserved across exec() restarts) +- `--resume` — resume from checkpoint +- `--no-ane-extras` — (train_large_ane only) disable ANE classifier/softmax/rmsnorm_bwd ### 3. Monitor with Dashboard ```bash pip install blessed psutil numpy -sudo python3 dashboard.py # live mode (needs powermetrics) -sudo python3 dashboard.py --resume # attach to resumed training +sudo python3 dashboard.py # static pipeline +sudo python3 dashboard.py --dynamic # dynamic pipeline ``` ### 4. 
Benchmarking -Both programs print an **Efficiency Report** at completion: +All programs print an **Efficiency Report** at completion: ``` === Efficiency Report === -Total steps: 100 -Avg train: 107.0 ms/step -ANE TFLOPS: 2.45 sustained -ANE utilization: 15.5% of 15.8 TFLOPS +Total steps: 20 +Wall time: 11738 ms (11.7 s) +Compile time: 9583 ms (81.6%) +Train time: 1835 ms (15.6%) +Avg train: 91.8 ms/step +ANE TFLOPS: 1.15 sustained ``` -Per-batch timing breakdown during training: +## Key Techniques -``` -ane=9.6 io=4.1 cls=9.1 elem=14.4 rms=0.1 cblas_wait=2.3 ms/step -``` - -| Metric | What it measures | -|--------|-----------------| -| `ane` | ANE kernel evaluation | -| `io` | fp16↔fp32 IOSurface transfer | -| `cls` | Classifier matmul (CPU cblas) | -| `elem` | Embedding, residual adds, cross-entropy | -| `rms` | RMSNorm forward/backward | -| `cblas_wait` | Waiting for async dW gradient sgemms | - -Compare baseline vs ANE-offloaded: - -```bash -make train_large && ./train_large --steps 100 -make train_large_ane && ./train_large_ane --steps 100 -``` - -## Key techniques - -- **NEON vectorized fp16<->fp32**: ARM NEON intrinsics for fast IOSurface data transfer +- **NEON vectorized fp16↔fp32**: ARM NEON intrinsics for fast IOSurface data transfer - **vDSP cross-entropy**: `vDSP_mtrans` + `vvexpf` + `vDSP_sve` — 8x faster than scalar - **Async weight gradients**: cblas_sgemm dispatched to background queue, overlapped with ANE -- **SDPA causal mask workaround**: ANE hardware ignores attn_mask, so we decompose attention into Q@K^T (ANE conv) + mask+softmax (CPU) + scores@V (ANE conv) +- **Vocab compaction** (dynamic): 32K → 9.2K active tokens, 3.5x reduction in classifier work +- **Dynamic weight packing**: Activations + weights concatenated in IOSurface spatial dimension — one kernel serves all 12 layers +- **exec() restart**: Workaround for ANE ~119 compile limit per process diff --git a/training/dashboard.py b/training/dashboard.py index b55c12e..18203d7 100644 --- 
a/training/dashboard.py +++ b/training/dashboard.py @@ -1,6 +1,6 @@ """TUI dashboard for ANE training (train_large). Uses blessed for terminal UI.""" -import argparse, fcntl, math, os, re, select, signal, struct, subprocess, sys, time, threading +import argparse, fcntl, json, math, os, re, select, signal, struct, subprocess, sys, time, threading from collections import deque from pathlib import Path @@ -20,7 +20,9 @@ except ImportError: DIM, HIDDEN, HEADS, SEQ, VOCAB, NLAYERS = 768, 2048, 12, 256, 32000, 12 HD = DIM // HEADS -CKPT_PATH = 'ane_stories110M_ckpt.bin' +CKPT_PATH_STATIC = 'ane_stories110M_ckpt.bin' +CKPT_PATH_DYNAMIC = 'training_dynamic/ane_stories110M_dyn_ckpt.bin' +CKPT_PATH = CKPT_PATH_STATIC # set in main() based on --dynamic TOKENIZER_PATH = str(Path(__file__).resolve().parent.parent.parent / 'assets' / 'models' / 'tokenizer.bin') @@ -56,6 +58,9 @@ class State: self.mem_mb_history = deque(maxlen=300) self.proc_mem_mb_history = deque(maxlen=300) self.train_pid = None + self.step_timestamps = [] # (step, time.monotonic()) for running ms/step + self.train_start = None # wall clock when first step seen + self.compile_ms = 0.0 # total compile time S = State() @@ -278,23 +283,69 @@ def sysmetrics_thread(): RE_CONFIG = re.compile(r'dim=(\d+) hidden=(\d+) heads=(\d+) seq=(\d+) vocab=(\d+) layers=(\d+)') RE_PARAMS = re.compile(r'Params: ([\d.]+)M \(transformer ([\d.]+)M \+ embed ([\d.]+)M\)') RE_KERNELS = re.compile(r'Kernels: (\d+).*?(\d+) weight-bearing') +RE_KERNELS_DYN = re.compile(r'Kernels: (\d+) compiled, (\d+) weight-bearing') RE_ACCUM = re.compile(r'Accum (\d+).*LR=([\d.e+-]+)') -RE_STEP = re.compile(r'step\s+(\d+)\s+loss=([\d.]+)') +RE_STEP = re.compile(r'step\s+(\d+)\s+loss=([\d.]+)(?:\s+lr=([\d.e+-]+))?(?:\s+([\d.]+)ms/step)?') RE_BATCH = re.compile(r'\[batch (\d+): compile=([\d.]+)ms train=([\d.]+)ms \(([\d.]+)ms/step\) compiles=(\d+)\]') RE_TIMING = re.compile(r'ane=([\d.]+) io=([\d.]+) cls=([\d.]+) elem=([\d.]+) rms=([\d.]+) 
cblas_wait=([\d.]+)') +RE_TIMING_DYN = re.compile(r'ane_fwd=([\d.]+) io_fwd=([\d.]+) rms=([\d.]+) ane_bwd=([\d.]+) io_bwd=([\d.]+) silu=([\d.]+) rms_bwd=([\d.]+) cls=([\d.]+) cblas_wait=([\d.]+) dw_copy=([\d.]+)') RE_RESTART = re.compile(r'\[exec\(\) restart step (\d+)') RE_RESUME = re.compile(r'\[RESUMED step (\d+), loss=([\d.]+)\]') RE_FLOPS = re.compile(r'FLOPs/step: fwd=([\d.]+)M bwd_dx=([\d.]+)M bwd_dW=([\d.]+)M sdpa_bwd=([\d.]+)M total=([\d.]+)M') RE_ANE_FLOPS = re.compile(r'ANE FLOPs/step: ([\d.]+)M') RE_ANE_TFLOPS = re.compile(r'ANE TFLOPS:\s+([\d.]+)') RE_ANE_UTIL = re.compile(r'ANE utilization:\s+([\d.]+)%') -RE_EFFICIENCY = re.compile(r'(Total steps|Wall time|Compile time|Train time|Avg compile|Avg train|ANE TFLOPS|Total TFLOPS|ANE utilization):?\s+(.+)') +RE_EFFICIENCY = re.compile(r'(Total steps|Wall time|Compile time|Compile|Train time|Avg compile|Avg train|ANE TFLOPS|Total TFLOPS|ANE utilization):?\s+(.+)') +RE_COMPILED = re.compile(r'Compiled (\d+) kernels in (\d+)ms') RE_ANE_POWER = re.compile(r'ANE Power:\s+([\d.]+)\s*mW') RE_CPU_POWER = re.compile(r'CPU Power:\s+([\d.]+)\s*mW') RE_GPU_POWER = re.compile(r'GPU Power:\s+([\d.]+)\s*mW') def parse_line(line): S.logs.append(line) + # Parse JSON lines from static pipeline ({"type":"step",...} or {"type":"batch",...}) + stripped = line.strip() + if stripped.startswith('{'): + try: + j = json.loads(stripped) + jt = j.get('type') + if jt == 'step': + S.step, S.loss = j['step'], j['loss'] + S.loss_history.append((S.step, S.loss)) + S.best_loss = min(S.best_loss, S.loss) + S.compiles = j.get('compiles', S.compiles) + now = time.monotonic() + if S.train_start is None: + S.train_start = now + S.step_timestamps.append((S.step, now)) + if len(S.step_timestamps) >= 2: + dt = S.step_timestamps[-1][1] - S.step_timestamps[-2][1] + if dt > 0: + S.ms_per_step = dt * 1000 + # Extract component timing from JSON + ct = {} + for k in ('t_ane', 't_io', 't_cls', 't_elem', 't_rms', 't_cblas_wait'): + if k in j: + ct[k[2:]] 
= j[k] # strip 't_' prefix + if ct: + S.component_timing = ct + return + elif jt == 'batch': + S.batch_num = j.get('batch', S.batch_num) + compile_ms = j.get('compile_ms', 0) + train_ms = j.get('train_ms', 0) + S.ms_per_step = j.get('ms_per_step', S.ms_per_step) + S.compile_ms += compile_ms + S.compile_pct = 100 * S.compile_ms / (S.compile_ms + train_ms) if S.compile_ms + train_ms > 0 else 0 + return + elif jt == 'perf': + if 'ane_tflops' in j: + S.flops['ane_tflops'] = j['ane_tflops'] + if 'ane_util_pct' in j: + S.flops['ane_util'] = j['ane_util_pct'] + return + except (json.JSONDecodeError, KeyError): + pass m = RE_CONFIG.search(line) if m: S.model_config = dict(zip(['dim', 'hidden', 'heads', 'seq', 'vocab', 'layers'], map(int, m.groups()))) @@ -303,7 +354,7 @@ def parse_line(line): if m: S.params = {'total': float(m[1]), 'transformer': float(m[2]), 'embed': float(m[3])} return - m = RE_KERNELS.search(line) + m = RE_KERNELS_DYN.search(line) or RE_KERNELS.search(line) if m: S.kernels = {'total': int(m[1]), 'weight_bearing': int(m[2])} return @@ -323,6 +374,18 @@ def parse_line(line): m = RE_STEP.search(line) if m: S.step, S.loss = int(m[1]), float(m[2]) + if m[3]: + S.training['lr'] = m[3] + if m[4]: + S.ms_per_step = float(m[4]) + now = time.monotonic() + if S.train_start is None: + S.train_start = now + S.step_timestamps.append((S.step, now)) + if not m[4] and len(S.step_timestamps) >= 2: + dt = S.step_timestamps[-1][1] - S.step_timestamps[-2][1] + if dt > 0: + S.ms_per_step = dt * 1000 S.loss_history.append((S.step, S.loss)) S.best_loss = min(S.best_loss, S.loss) return @@ -334,6 +397,16 @@ def parse_line(line): S.compiles = int(m[5]) S.compile_pct = 100 * compile_ms / (compile_ms + train_ms) if compile_ms + train_ms > 0 else 0 return + m = RE_TIMING_DYN.search(line) + if m: + vals = list(map(float, m.groups())) + S.component_timing = { + 'ane_fwd': vals[0], 'io_fwd': vals[1], 'rms': vals[2], + 'ane_bwd': vals[3], 'io_bwd': vals[4], 'silu': vals[5], + 
'rms_bwd': vals[6], 'cls': vals[7], 'cblas_wait': vals[8], 'dw_copy': vals[9], + '_dynamic': True + } + return m = RE_TIMING.search(line) if m: S.component_timing = dict(zip(['ane', 'io', 'cls', 'elem', 'rms', 'cblas_wait'], map(float, m.groups()))) @@ -346,6 +419,11 @@ def parse_line(line): if m: S.flops['ane_util'] = float(m[1]) return + m = RE_COMPILED.search(line) + if m: + S.compiles = int(m[1]) + S.compile_ms += float(m[2]) + return m = RE_EFFICIENCY.search(line) if m: S.efficiency[m[1].strip()] = m[2].strip() @@ -514,23 +592,49 @@ def draw(term): # Training stats (right panel) sr = row step_str = f'{S.step}' + (f'/{S.total_steps}' if S.total_steps and S.total_steps < 999999 else '') - put(sr, mid_x + 1, f' Step: {step_str} Loss: {S.loss:.4f}' if S.loss else ' Step: --', term.yellow) + # Elapsed time + elapsed = 0.0 + if S.train_start: + elapsed = time.monotonic() - S.train_start + elapsed_str = f'{elapsed:.1f}s' if elapsed < 60 else f'{elapsed/60:.1f}m' + put(sr, mid_x + 1, f' Step: {step_str} Loss: {S.loss:.4f} [{elapsed_str}]' if S.loss else ' Step: --', term.yellow) sr += 1 - put(sr, mid_x + 1, f' Best: {S.best_loss:.4f} ms/step: {S.ms_per_step:.1f}' if S.best_loss < float('inf') else ' Best: --') + # ms/step + steps/sec + sps = 1000.0 / S.ms_per_step if S.ms_per_step > 0 else 0 + put(sr, mid_x + 1, f' Best: {S.best_loss:.4f} {S.ms_per_step:.1f}ms/step ({sps:.1f} steps/s)' if S.best_loss < float('inf') else ' Best: --') sr += 1 + # TFLOPS ane_tflops = S.flops.get('ane_tflops', 0) ane_util = S.flops.get('ane_util', 0) + total_tflops = 0 + if S.ms_per_step > 0 and S.flops.get('ane', 0) > 0: + if not ane_tflops: + ane_tflops = (S.flops['ane'] * 1e6) / (S.ms_per_step * 1e-3) / 1e12 + total_tflops = (S.flops.get('total', 0) * 1e6) / (S.ms_per_step * 1e-3) / 1e12 + if not ane_util and ane_tflops: + ane_util = 100.0 * ane_tflops / 15.8 + compile_str = f' Compile: {S.compile_ms/1000:.1f}s' if S.compile_ms > 0 else '' if ane_tflops: - put(sr, mid_x + 1, f' ANE: 
{ane_tflops:.2f}T Compile: {S.compile_pct:.0f}% Util: {ane_util:.1f}%') - else: - put(sr, mid_x + 1, f' Compile: {S.compile_pct:.0f}%') + tflops_str = f' ANE: {ane_tflops:.2f}T' + if total_tflops: + tflops_str += f' Total: {total_tflops:.2f}T' + tflops_str += f' Util: {ane_util:.1f}%{compile_str}' + put(sr, mid_x + 1, tflops_str) + elif compile_str: + put(sr, mid_x + 1, f'{compile_str}') sr += 1 ct = S.component_timing if ct: - put(sr, mid_x + 1, f' ane={ct.get("ane", 0):.1f} io={ct.get("io", 0):.1f} cls={ct.get("cls", 0):.1f} elem={ct.get("elem", 0):.1f}') - sr += 1 - put(sr, mid_x + 1, f' rms={ct.get("rms", 0):.1f} cblas_wait={ct.get("cblas_wait", 0):.1f} ms/step') - sr += 1 + if ct.get('_dynamic'): + put(sr, mid_x + 1, f' fwd={ct.get("ane_fwd",0):.1f} bwd={ct.get("ane_bwd",0):.1f} io={ct.get("io_fwd",0)+ct.get("io_bwd",0):.1f} silu={ct.get("silu",0):.1f}') + sr += 1 + put(sr, mid_x + 1, f' cls={ct.get("cls",0):.1f} rms={ct.get("rms",0)+ct.get("rms_bwd",0):.1f} dw={ct.get("dw_copy",0):.1f} ms/step') + sr += 1 + else: + put(sr, mid_x + 1, f' ane={ct.get("ane", 0):.1f} io={ct.get("io", 0):.1f} cls={ct.get("cls", 0):.1f} elem={ct.get("elem", 0):.1f}') + sr += 1 + put(sr, mid_x + 1, f' rms={ct.get("rms", 0):.1f} cblas_wait={ct.get("cblas_wait", 0):.1f} ms/step') + sr += 1 pw = S.power if any(pw.values()): put(sr, mid_x + 1, '\u2500 Power ' + '\u2500' * max(0, right_w - 9), term.cyan) @@ -659,10 +763,24 @@ def set_nonblock(fd): fl = fcntl.fcntl(fd, fcntl.F_GETFL) fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK) -def spawn_training(resume=False, steps=10000): - cmd = 'make train_large 2>&1 && ./train_large' +def spawn_training(resume=False, steps=10000, dynamic=False, ane=False, scratch=False, + lr=None, accum=None, no_ane_extras=False): + if dynamic: + cmd = 'cd training_dynamic && make 2>&1 && ./train' + elif ane: + cmd = 'make train_large_ane 2>&1 && ./train_large_ane' + else: + cmd = 'make train_large 2>&1 && ./train_large' if resume: cmd += ' --resume' + if 
scratch and dynamic: + cmd += ' --scratch' + if lr is not None: + cmd += f' --lr {lr}' + if accum is not None and dynamic: + cmd += f' --accum {accum}' + if no_ane_extras and ane: + cmd += ' --no-ane-extras' cmd += f' --steps {steps}' proc = subprocess.Popen( ['bash', '-c', cmd], @@ -686,6 +804,12 @@ def spawn_powermetrics(): def main(): parser = argparse.ArgumentParser(description='ANE Training Dashboard (stories110M)') parser.add_argument('--resume', action='store_true', help='Resume from checkpoint') + parser.add_argument('--dynamic', action='store_true', help='Dynamic weight pipeline (training_dynamic/)') + parser.add_argument('--ane', action='store_true', help='PR#19: ANE-offloaded classifier/softmax/rmsnorm_bwd') + parser.add_argument('--no-ane-extras', action='store_true', help='Disable ANE extras (use with --ane)') + parser.add_argument('--scratch', action='store_true', help='Train from scratch (random init)') + parser.add_argument('--lr', type=float, default=None, help='Learning rate') + parser.add_argument('--accum', type=int, default=None, help='Gradient accumulation steps') parser.add_argument('--infinite', action='store_true', help='Train indefinitely') parser.add_argument('--no-powermetrics', action='store_true') parser.add_argument('--no-generate', action='store_true', help='Disable text generation') @@ -696,10 +820,15 @@ def main(): args.steps = 999999999 S.total_steps = args.steps + global CKPT_PATH + CKPT_PATH = CKPT_PATH_DYNAMIC if args.dynamic else CKPT_PATH_STATIC + term = Terminal() procs = [] - train_proc = spawn_training(resume=args.resume, steps=args.steps) + train_proc = spawn_training(resume=args.resume, steps=args.steps, dynamic=args.dynamic, + scratch=args.scratch, lr=args.lr, accum=args.accum, + ane=args.ane, no_ane_extras=args.no_ane_extras) S.train_pid = train_proc.pid procs.append(train_proc) @@ -839,7 +968,9 @@ def main(): if train_proc: train_proc.terminate() train_proc.wait() - train_proc = spawn_training(resume=True, 
steps=args.steps) + train_proc = spawn_training(resume=True, steps=args.steps, dynamic=args.dynamic, + lr=args.lr, accum=args.accum, + ane=args.ane, no_ane_extras=args.no_ane_extras) S.train_pid = train_proc.pid procs = [p for p in procs if p.poll() is None] procs.append(train_proc) diff --git a/training/test_dynamic_matmul.m b/training/test_dynamic_matmul.m new file mode 100644 index 0000000..72addbd --- /dev/null +++ b/training/test_dynamic_matmul.m @@ -0,0 +1,333 @@ +// test_dynamic_matmul.m — Benchmark dynamic matmul on ANE (no recompile) +// Layout: input [1, D, 1, S+D] — activations in sp[0:S], weight rows in sp[S:S+D] +// MIL: slice → reshape → matmul → reshape → output +#import +#import +#import +#import +#import +#import +#include +#include + +#include "stories_io.h" + +// Generate MIL for y = x @ W where both come from input IOSurface +// Input: [1, IC, 1, SEQ+OC] fp32 +// sp[0:SEQ] = activations x[IC, SEQ] +// sp[SEQ:SEQ+OC] = weight W[IC, OC] (each channel d holds W[d, :]) +// Output: [1, OC, 1, SEQ] fp32 +static NSString *gen_dynamic_matmul_mil(int ic, int oc, int seq) { + NSMutableString *m = [NSMutableString string]; + [m appendString:@"program(1.3)\n" + "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " + "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"9.0\"}})]\n{\n"]; + int sp_total = seq + oc; + [m appendFormat:@" func main(tensor x) {\n", ic, sp_total]; + // Cast to fp16 + [m appendString:@" string to16 = const()[name = string(\"to16\"), val = string(\"fp16\")];\n"]; + [m appendFormat:@" tensor xh = cast(dtype = to16, x = x)[name = string(\"cin\")];\n", ic, sp_total]; + // Slice activations [1, IC, 1, SEQ] + [m appendString:@" tensor ba = const()[name = string(\"ba\"), val = tensor([0,0,0,0])];\n"]; + [m appendFormat:@" tensor sa = const()[name = string(\"sa\"), val = tensor([1,%d,1,%d])];\n", ic, seq]; + [m appendFormat:@" tensor act = 
slice_by_size(x=xh,begin=ba,size=sa)[name=string(\"act\")];\n", ic, seq]; + // Slice weight [1, IC, 1, OC] + [m appendFormat:@" tensor bw = const()[name = string(\"bw\"), val = tensor([0,0,0,%d])];\n", seq]; + [m appendFormat:@" tensor sw = const()[name = string(\"sw\"), val = tensor([1,%d,1,%d])];\n", ic, oc]; + [m appendFormat:@" tensor wt = slice_by_size(x=xh,begin=bw,size=sw)[name=string(\"wt\")];\n", ic, oc]; + // Reshape act: [1,IC,1,SEQ] → [1,1,IC,SEQ] → transpose → [1,1,SEQ,IC] + [m appendFormat:@" tensor ra = const()[name = string(\"ra\"), val = tensor([1,1,%d,%d])];\n", ic, seq]; + [m appendFormat:@" tensor a2 = reshape(shape=ra,x=act)[name=string(\"a2\")];\n", ic, seq]; + [m appendString:@" tensor pm = const()[name = string(\"pm\"), val = tensor([0,1,3,2])];\n"]; + [m appendFormat:@" tensor a3 = transpose(perm=pm,x=a2)[name=string(\"a3\")];\n", seq, ic]; + // Reshape weight: [1,IC,1,OC] → [1,1,IC,OC] + [m appendFormat:@" tensor rw = const()[name = string(\"rw\"), val = tensor([1,1,%d,%d])];\n", ic, oc]; + [m appendFormat:@" tensor W = reshape(shape=rw,x=wt)[name=string(\"W\")];\n", ic, oc]; + // matmul: [1,1,SEQ,IC] @ [1,1,IC,OC] → [1,1,SEQ,OC] + [m appendString:@" bool bF = const()[name = string(\"bF\"), val = bool(false)];\n"]; + [m appendFormat:@" tensor yh = matmul(transpose_x=bF,transpose_y=bF,x=a3,y=W)[name=string(\"mm\")];\n", seq, oc]; + // Reshape+transpose back: [1,1,SEQ,OC] → transpose → [1,1,OC,SEQ] → reshape → [1,OC,1,SEQ] + [m appendFormat:@" tensor yt = transpose(perm=pm,x=yh)[name=string(\"yt\")];\n", oc, seq]; + [m appendFormat:@" tensor ro = const()[name = string(\"ro\"), val = tensor([1,%d,1,%d])];\n", oc, seq]; + [m appendFormat:@" tensor yr = reshape(shape=ro,x=yt)[name=string(\"yr\")];\n", oc, seq]; + // Cast back to fp32 + [m appendString:@" string to32 = const()[name = string(\"to32\"), val = string(\"fp32\")];\n"]; + [m appendFormat:@" tensor y = cast(dtype = to32, x = yr)[name = string(\"cout\")];\n", oc, seq]; + [m 
appendString:@" } -> (y);\n}\n"]; + return m; +} + +// Tiled version: splits OC into tiles, each tile is a separate kernel +// For W[IC, OC], tile along OC: each tile handles W[:, t*T:(t+1)*T] +// Input per tile: [1, IC, 1, SEQ+T] +// Output per tile: [1, T, 1, SEQ] +typedef struct { + Kern **tiles; + int n_tiles, tile_oc, ic, oc, seq; +} TiledMatmul; + +static TiledMatmul *compile_tiled_matmul(int ic, int oc, int tile_oc, int seq) { + TiledMatmul *tm = (TiledMatmul*)calloc(1, sizeof(TiledMatmul)); + tm->ic = ic; tm->oc = oc; tm->seq = seq; tm->tile_oc = tile_oc; + tm->n_tiles = (oc + tile_oc - 1) / tile_oc; + tm->tiles = (Kern**)calloc(tm->n_tiles, sizeof(Kern*)); + for (int t = 0; t < tm->n_tiles; t++) { + int this_oc = (t == tm->n_tiles-1 && oc % tile_oc) ? (oc % tile_oc) : tile_oc; + NSString *mil = gen_dynamic_matmul_mil(ic, this_oc, seq); + int in_bytes = ic * (seq + this_oc) * 4; + int out_bytes = this_oc * seq * 4; + tm->tiles[t] = compile_kern_mil_w(mil, @{}, in_bytes, out_bytes); + if (!tm->tiles[t]) { printf("Tile %d compile FAIL\n", t); for (int j = 0; j < t; j++) free_kern(tm->tiles[j]); free(tm->tiles); free(tm); return NULL; } // release already-compiled tiles so a failed tile size does not leak + } + return tm; +} + +// Write activations + weight tile into IOSurface +// act: [IC, SEQ] column-major (channel-first) +// W: [IC, OC] — full weight matrix, we extract the tile +static void write_tile_input(TiledMatmul *tm, int tile_idx, const float *act, const float *W) { + Kern *k = tm->tiles[tile_idx]; + int ic = tm->ic, seq = tm->seq, toc = tm->tile_oc; + int oc_off = tile_idx * toc; + int this_oc = (tile_idx == tm->n_tiles-1 && tm->oc % toc) ? 
(tm->oc % toc) : toc; + + IOSurfaceLock(k->ioIn, 0, NULL); + float *buf = (float*)IOSurfaceGetBaseAddress(k->ioIn); + // Activations: buf[d * (seq+this_oc) + t] = act[d * seq + t] + for (int d = 0; d < ic; d++) { + memcpy(buf + d*(seq+this_oc), act + d*seq, seq*sizeof(float)); + // Weight: buf[d * (seq+this_oc) + seq + c] = W[d * oc + oc_off + c] + for (int c = 0; c < this_oc; c++) + buf[d*(seq+this_oc) + seq + c] = W[d*tm->oc + oc_off + c]; + } + IOSurfaceUnlock(k->ioIn, 0, NULL); +} + +// Read tile output into full output buffer +static void read_tile_output(TiledMatmul *tm, int tile_idx, float *out) { + Kern *k = tm->tiles[tile_idx]; + int seq = tm->seq, toc = tm->tile_oc; + int oc_off = tile_idx * toc; + int this_oc = (tile_idx == tm->n_tiles-1 && tm->oc % toc) ? (tm->oc % toc) : toc; + + IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL); + float *obuf = (float*)IOSurfaceGetBaseAddress(k->ioOut); + for (int c = 0; c < this_oc; c++) + memcpy(out + (oc_off+c)*seq, obuf + c*seq, seq*sizeof(float)); + IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL); +} + +int main(int argc, char **argv) { + @autoreleasepool { + mach_timebase_info(&g_tb); + ane_init(); + + // === Test 1: Single 64×64 dynamic matmul (correctness) === + printf("=== Test 1: 64×64 dynamic matmul correctness ===\n"); + { + int D = 64, S = 64; + NSString *mil = gen_dynamic_matmul_mil(D, D, S); + int in_b = D * (S+D) * 4, out_b = D * S * 4; + Kern *k = compile_kern_mil_w(mil, @{}, in_b, out_b); + if (!k) { printf("FAIL\n"); return 1; } + + // Identity test + IOSurfaceLock(k->ioIn, 0, NULL); + float *inp = (float*)IOSurfaceGetBaseAddress(k->ioIn); + memset(inp, 0, in_b); + for (int d = 0; d < D; d++) + for (int s = 0; s < S; s++) + inp[d*(S+D) + s] = (float)(d*S + s) * 0.001f; + for (int d = 0; d < D; d++) + for (int c = 0; c < D; c++) + inp[d*(S+D) + S + c] = (d == c) ? 
1.0f : 0.0f; + IOSurfaceUnlock(k->ioIn, 0, NULL); + + ane_eval(k); + IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL); + float *out = (float*)IOSurfaceGetBaseAddress(k->ioOut); + float me = 0; + for (int d = 0; d < D; d++) + for (int s = 0; s < S; s++) { + float e = fabsf(out[d*S+s] - inp[d*(S+D)+s]); + if (e > me) me = e; + } + IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL); + printf("Identity: max_err=%.6f %s\n", me, me < 0.01 ? "PASS" : "FAIL"); + + // 2× test + IOSurfaceLock(k->ioIn, 0, NULL); + for (int d = 0; d < D; d++) + for (int c = 0; c < D; c++) + inp[d*(S+D) + S + c] = (d == c) ? 2.0f : 0.0f; + IOSurfaceUnlock(k->ioIn, 0, NULL); + ane_eval(k); + IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL); + float sr = 0; int cnt = 0; + for (int i = 0; i < D*S; i++) + if (fabsf(inp[i/(S)*((S)+D) + i%S]) > 0.001f) { sr += out[i]/inp[i/S*(S+D)+i%S]; cnt++; } + IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL); + printf("2× W: ratio=%.3f %s\n\n", cnt?sr/cnt:0, fabsf(sr/cnt-2.0f)<0.1?"PASS":"FAIL"); + free_kern(k); + } + + // === Test 2: 768×768 single kernel (if it compiles) === + printf("=== Test 2: 768×768 single dynamic matmul ===\n"); + { + int D = 768, S = 256; + int sp_total = S + D; // 256 + 768 = 1024 + int in_b = D * sp_total * 4; // 768 * 1024 * 4 = 3.1MB + int out_b = D * S * 4; // 768 * 256 * 4 = 786KB + printf("IOSurface: in=%.1fMB out=%.1fKB\n", in_b/1e6, out_b/1e3); + + NSString *mil = gen_dynamic_matmul_mil(D, D, S); + uint64_t t0 = mach_absolute_time(); + Kern *k = compile_kern_mil_w(mil, @{}, in_b, out_b); + double compile_ms = tb_ms(mach_absolute_time() - t0); + if (!k) { printf("768×768 compile FAIL\n"); } + else { + printf("Compile: %.1fms\n", compile_ms); + // Random weights + float *act = (float*)calloc(D*S, sizeof(float)); + float *W = (float*)calloc(D*D, sizeof(float)); + for (int i = 0; i < D*S; i++) act[i] = ((float)arc4random() / UINT32_MAX - 0.5f) * 0.1f; + for (int i = 0; i < D*D; i++) W[i] = ((float)arc4random() 
/ UINT32_MAX - 0.5f) * 0.01f; + + // Write to IOSurface + IOSurfaceLock(k->ioIn, 0, NULL); + float *inp = (float*)IOSurfaceGetBaseAddress(k->ioIn); + for (int d = 0; d < D; d++) { + memcpy(inp + d*(S+D), act + d*S, S*4); + memcpy(inp + d*(S+D) + S, W + d*D, D*4); + } + IOSurfaceUnlock(k->ioIn, 0, NULL); + + // Warmup + for (int i = 0; i < 3; i++) ane_eval(k); + + // Benchmark + int iters = 50; + t0 = mach_absolute_time(); + for (int i = 0; i < iters; i++) ane_eval(k); + double total_ms = tb_ms(mach_absolute_time() - t0); + double per_eval = total_ms / iters; + double flops = 2.0 * D * D * S; // matmul FLOPs + double gflops = flops / (per_eval * 1e6); + printf("768×768×256 matmul: %.3fms/eval %.1f GFLOP/s\n", per_eval, gflops); + + // Benchmark with IO write (simulating weight update) + t0 = mach_absolute_time(); + for (int i = 0; i < iters; i++) { + IOSurfaceLock(k->ioIn, 0, NULL); + float *p = (float*)IOSurfaceGetBaseAddress(k->ioIn); + for (int d = 0; d < D; d++) + memcpy(p + d*(S+D) + S, W + d*D, D*4); + IOSurfaceUnlock(k->ioIn, 0, NULL); + ane_eval(k); + } + total_ms = tb_ms(mach_absolute_time() - t0); + per_eval = total_ms / iters; + gflops = flops / (per_eval * 1e6); + printf("With weight IO: %.3fms/eval %.1f GFLOP/s\n", per_eval, gflops); + + free(act); free(W); free_kern(k); + } + } + + // === Test 3: Tiled matmul benchmark === + int tile_sizes[] = {64, 128, 256, 384, 768}; + int n_tiles_test = sizeof(tile_sizes)/sizeof(tile_sizes[0]); + printf("\n=== Test 3: Tiled 768×768 matmul (varying tile_oc) ===\n"); + printf("%-10s %-8s %-10s %-12s %-10s\n", "tile_oc", "tiles", "compile", "eval/ms", "GFLOP/s"); + { + int D = 768, S = 256; + float *act = (float*)calloc(D*S, sizeof(float)); + float *W = (float*)calloc(D*D, sizeof(float)); + float *out_full = (float*)calloc(D*S, sizeof(float)); + for (int i = 0; i < D*S; i++) act[i] = ((float)arc4random() / UINT32_MAX - 0.5f) * 0.1f; + for (int i = 0; i < D*D; i++) W[i] = ((float)arc4random() / UINT32_MAX - 0.5f) * 
0.01f; + + for (int ti = 0; ti < n_tiles_test; ti++) { + int T = tile_sizes[ti]; + if (T > D) continue; + uint64_t t0 = mach_absolute_time(); + TiledMatmul *tm = compile_tiled_matmul(D, D, T, S); + double compile_ms = tb_ms(mach_absolute_time() - t0); + if (!tm) { printf("%-10d FAIL\n", T); continue; } + + // Warmup + for (int w = 0; w < 2; w++) { + for (int t = 0; t < tm->n_tiles; t++) { + write_tile_input(tm, t, act, W); + ane_eval(tm->tiles[t]); + } + } + + // Benchmark (with IO) + int iters = 20; + t0 = mach_absolute_time(); + for (int i = 0; i < iters; i++) { + for (int t = 0; t < tm->n_tiles; t++) { + write_tile_input(tm, t, act, W); + ane_eval(tm->tiles[t]); + read_tile_output(tm, t, out_full); + } + } + double total_ms = tb_ms(mach_absolute_time() - t0); + double per_matmul = total_ms / iters; + double flops = 2.0 * D * D * S; + double gflops = flops / (per_matmul * 1e6); + printf("%-10d %-8d %-10.0fms %-12.3fms %-10.1f\n", + T, tm->n_tiles, compile_ms, per_matmul, gflops); + + for (int t = 0; t < tm->n_tiles; t++) free_kern(tm->tiles[t]); + free(tm->tiles); free(tm); + } + + // === Correctness check: compare with cblas === + printf("\n=== Correctness: dynamic matmul vs cblas_sgemm ===\n"); + { + int T = 768; // full, no tiling + TiledMatmul *tm = compile_tiled_matmul(D, D, T, S); + if (tm) { + write_tile_input(tm, 0, act, W); + ane_eval(tm->tiles[0]); + read_tile_output(tm, 0, out_full); + + // Reference: cblas y = act^T @ W → y[s,oc] = sum_d act[d,s]*W[d,oc] + // act is stored act[d*S+s] ([D,S] row-major, ld=S); W is stored W[d*D+oc] ([D,D] row-major, ld=D) + // We want out[oc,s] = sum_d act[d,s] * W[d,oc] + // = W^T @ act where W^T is [D,D] and act is [D,S] → out is [D,S] + float *ref = (float*)calloc(D*S, sizeof(float)); + // out[oc*S+s] = sum_d W[d*D+oc] * act[d*S+s] + // This is: (W^T) @ act in row-major: M=D,N=S,K=D, lda=D, ldb=S, ldc=S + // cblas: C = alpha*op(A)*B + beta*C + // op(A)=W^T [D×D] via CblasTrans, B=act [D×S], C=ref [D×S] stored ref[oc*S+s] to match out_full + cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans, + D, S, D, 1.0f, W, D, act, S, 0.0f, ref, S); + 
float me = 0; + for (int i = 0; i < D*S; i++) { + float e = fabsf(out_full[i] - ref[i]); + if (e > me) me = e; + } + printf("vs cblas: max_err=%.6f %s\n", me, me < 1.0 ? "PASS" : "FAIL"); + free(ref); + for (int t = 0; t < tm->n_tiles; t++) free_kern(tm->tiles[t]); + free(tm->tiles); free(tm); + } + } + + free(act); free(W); free(out_full); + } + + // === Summary for training === + printf("\n=== Summary ===\n"); + printf("Stories110M: 12 layers × 10 matmuls/layer = 120 matmuls/step\n"); + printf("Sizes: Wq/Wk/Wv/Wo [768,768], W1/W3 [2048,768], W2 [768,2048]\n"); + printf("With dynamic weights: compile once, update IOSurface every step\n"); + + printf("\nDone.\n"); + } + return 0; +}
diff --git a/training/test_weight_patch.m b/training/test_weight_patch.m
new file mode 100644
index 0000000..13473b7
--- /dev/null
+++ b/training/test_weight_patch.m
@@ -0,0 +1,478 @@
+// test_weight_patch.m — Test whether ANE weights can be patched after compile
+//
+// Compiles one conv kernel with identity weights, then tries several ways of
+// changing the weights that kernel uses without recompiling it. Each experiment
+// prints the mean out/in ratio it observed, which exposes the effective weight:
+//   1. rewrite weights/w.bin on disk, then unload + reload the model
+//   2. scan this process's memory for the fp16 weight matrix and overwrite it
+//   3. list the methods of a few private _ANE* classes
+//   4. pass an _ANEWeight array, then an IOSurface, as the request's weightsBuffer
+//   5. carry the "weights" in the input IOSurface (element-wise mul)
+//   6. carry a whole weight matrix in the input IOSurface (matmul)
+//
+// NOTE(review): in this copy the header names after #import/#include and the
+// type annotations in the MIL text (after "tensor", "dict", "func main") are
+// absent, although the appendFormat: calls still pass integer arguments for
+// them. This looks like lost text rather than intent — confirm before building.
+#import
+#import
+#import
+#import
+#import
+#import
+#import
+#import
+#include
+#include
+
+#include "stories_io.h"
+
+// MIL: fp32 in → cast fp16 → conv → cast fp32 out (matches inmem_peak.m pattern)
+//
+// Builds the MIL program text for one conv whose weight "W" is a constant read
+// from the blob file @model_path/weights/w.bin at offset 64. The conv is declared
+// with strides [1, 1], pad [0, 0, 0, 0], dilations [1, 1], groups 1 and pad_type
+// "valid". ic, oc and sp are forwarded to the appendFormat: calls (presumably
+// input channels, output channels and spatial size — main passes IC, OC, SP).
+static NSString *gen_conv_mil(int ic, int oc, int sp) {
+    NSMutableString *m = [NSMutableString string];
+    [m appendString:@"program(1.3)\n"
+        "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
+        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
+        "{\"coremltools-version\", \"9.0\"}})]\n{\n"];
+    [m appendFormat:@" func main(tensor x) {\n", ic, sp];
+    [m appendString:
+        @" string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
+        " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n"
+        " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n"
+        " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n"
+        " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
+        " string to16 = const()[name = string(\"to16\"), val = string(\"fp16\")];\n"];
+    [m appendFormat:@" tensor xh = cast(dtype = to16, x = x)[name = string(\"cast_in\")];\n", ic, sp];
+    [m appendFormat:@" tensor W = const()[name = string(\"W\"), "
+        "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w.bin\"), offset = uint64(64)))];\n",
+        oc, ic, oc, ic];
+    [m appendFormat:@" tensor yh = conv(dilations = dl, groups = gr, pad = pd, pad_type = pt, strides = st, weight = W, x = xh)"
+        "[name = string(\"conv\")];\n", oc, sp];
+    [m appendString:@" string to32 = const()[name = string(\"to32\"), val = string(\"fp32\")];\n"];
+    [m appendFormat:@" tensor y = cast(dtype = to32, x = yh)[name = string(\"cast_out\")];\n", oc, sp];
+    [m appendString:@" } -> (y);\n}\n"];
+    return m;
+}
+
+// Mean of out[i]/inp[i] over the elements whose |inp[i]| exceeds 0.01, or 0 when
+// no element qualifies. Reads off the effective weight scale after an eval.
+static float mean_ratio(const float *out, const float *inp, int n) {
+    float sum = 0; int cnt = 0;
+    for (int i = 0; i < n; i++)
+        if (fabsf(inp[i]) > 0.01f) { sum += out[i]/inp[i]; cnt++; }
+    return cnt > 0 ? sum/cnt : 0;
+}
+
+// Walks the readable VM regions of this task looking for an oc×ic uint16 matrix
+// that is exactly diag(0x3C00) or diag(0x4000): fp16 1.0 or 2.0 on the diagonal,
+// zero elsewhere.
+//   patch == 0: print every hit with its protection bits and count it in
+//               *n1x / *n2x (either pointer may be NULL).
+//   patch != 0: overwrite every hit that lies in a writable region with
+//               diag(0x4200), i.e. 3× identity in fp16; the counters are unused.
+// Candidate bytes are dereferenced directly, so this trusts the read protection
+// reported for each region.
+static void scan_identity_weights(int oc, int ic, int patch, int *n1x, int *n2x) {
+    static const uint16_t pat1[8] = {0x3C00, 0, 0, 0, 0, 0, 0, 0};
+    static const uint16_t pat2[8] = {0x4000, 0, 0, 0, 0, 0, 0, 0};
+    const size_t need = (size_t)(oc*ic*2); // bytes in one fp16 weight matrix
+    mach_port_t task = mach_task_self();
+    vm_address_t addr = 0; vm_size_t sz; natural_t depth = 1;
+    while (1) {
+        struct vm_region_submap_info_64 info;
+        mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_COUNT_64;
+        if (vm_region_recurse_64(task, &addr, &sz, &depth, (vm_region_recurse_info_t)&info, &count) != KERN_SUCCESS) break;
+        if (info.is_submap) { depth++; continue; } // retry the same address one nesting level deeper
+        if (!(info.protection & VM_PROT_READ) || sz < need) { addr += sz; continue; }
+        uint8_t *base = (uint8_t*)addr;
+        for (size_t off = 0; off + need <= sz; off += 2) {
+            // Cheap 16-byte prefilter before comparing the whole matrix
+            int w = 0;
+            if (memcmp(base+off, pat1, 16) == 0) w = 1;
+            else if (memcmp(base+off, pat2, 16) == 0) w = 2;
+            if (!w) continue;
+            uint16_t *p = (uint16_t*)(base+off), diag = (w==1)?0x3C00:0x4000;
+            int match = 1;
+            for (int r = 0; r < oc && match; r++)
+                for (int c = 0; c < ic && match; c++)
+                    if (p[r*ic+c] != ((r==c)?diag:0)) match = 0;
+            if (!match) continue;
+            if (!patch) {
+                if (w==1) { if (n1x) (*n1x)++; }
+                else { if (n2x) (*n2x)++; }
+                printf(" FOUND %dx @%p prot=%d/%d %s\n", w, (void*)(addr+off),
+                    info.protection, info.max_protection, (info.protection&VM_PROT_WRITE)?"WR":"RO");
+            } else if (info.protection & VM_PROT_WRITE) {
+                printf(" Patching %dx @%p to 3x\n", w, (void*)(addr+off));
+                for (int r = 0; r < oc; r++)
+                    for (int c = 0; c < ic; c++)
+                        p[r*ic+c] = (r==c) ? 0x4200 : 0; // fp16(3.0)
+            }
+        }
+        addr += sz;
+    }
+}
+
+// Runs the experiments in order and prints their results. Returns 1 if the
+// initial conv kernel fails to compile, 0 otherwise; the PASS/FAIL lines are
+// informational and do not change the exit status.
+int main(int argc, char **argv) {
+    @autoreleasepool {
+        mach_timebase_info(&g_tb);
+        ane_init();
+
+        int IC = 256, OC = 256, SP = 64;
+        int io_bytes = IC * SP * 4; // fp32
+
+        // Identity weight
+        float *W_id = (float*)calloc(OC*IC, sizeof(float));
+        for (int i = 0; i < IC; i++) W_id[i*IC+i] = 1.0f;
+
+        NSString *mil = gen_conv_mil(IC, OC, SP);
+        NSDictionary *wd = @{@"@model_path/weights/w.bin": @{@"offset":@0, @"data":build_blob(W_id, OC, IC)}};
+
+        printf("=== Compiling conv %dx%d sp=%d ===\n", OC, IC, SP);
+        Kern *k = compile_kern_mil_w(mil, wd, io_bytes, io_bytes);
+        if (!k) { printf("COMPILE FAILED\n"); free(W_id); return 1; }
+        printf("Compile OK!\n");
+
+        // Write fp32 input
+        IOSurfaceLock(k->ioIn, 0, NULL);
+        float *inp = (float*)IOSurfaceGetBaseAddress(k->ioIn);
+        for (int i = 0; i < IC*SP; i++) inp[i] = (i % 100) * 0.01f;
+        IOSurfaceUnlock(k->ioIn, 0, NULL);
+
+        // Eval with identity
+        ane_eval(k);
+        IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
+        float *out = (float*)IOSurfaceGetBaseAddress(k->ioOut);
+        printf("In: [%.3f, %.3f, %.3f, %.3f]\n", inp[0], inp[1], inp[2], inp[3]);
+        printf("Out: [%.3f, %.3f, %.3f, %.3f]\n", out[0], out[1], out[2], out[3]);
+        float max_err = 0;
+        for (int i = 0; i < OC*SP; i++) {
+            float err = fabsf(out[i] - inp[i]);
+            if (err > max_err) max_err = err;
+        }
+        IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
+        printf("Identity max_err=%.6f %s\n\n", max_err, max_err < 0.1 ? "PASS" : "FAIL");
+
+        // === Approach 1: Patch weight on disk, unload+reload ===
+        printf("=== Approach 1: Disk patch + unload/reload ===\n");
+        float *W_2x = (float*)calloc(OC*IC, sizeof(float));
+        for (int i = 0; i < IC; i++) W_2x[i*IC+i] = 2.0f;
+        // A failed write would make the reload below look like "cached", so say so
+        if (![build_blob(W_2x, OC, IC) writeToFile:
+                [(__bridge NSString*)k->tmpDir stringByAppendingPathComponent:@"weights/w.bin"] atomically:YES])
+            printf("WARNING: could not rewrite weights/w.bin\n");
+
+        id mdl = (__bridge id)k->model;
+        NSError *e = nil;
+        ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
+        e = nil;
+        BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
+        printf("Reload: %s\n", ok?"OK":"FAIL");
+        if (ok) {
+            // Re-create request after reload
+            id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), k->ioIn);
+            id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), k->ioOut);
+            CFRelease(k->request);
+            k->request = (void*)CFBridgingRetain(((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+                @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+                @[wI], @[@0], @[wO], @[@0], nil, nil, @0));
+            ane_eval(k);
+            IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
+            printf("Out: [%.3f, %.3f, %.3f, %.3f]\n", out[0], out[1], out[2], out[3]);
+            float ratio = mean_ratio(out, inp, OC*SP);
+            IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
+            printf("Ratio: %.3f (2.0=patched, 1.0=cached)\n\n", ratio);
+        }
+
+        // === Approach 2: Memory scan ===
+        printf("=== Approach 2: Memory scan ===\n");
+        int f1 = 0, f2 = 0;
+        scan_identity_weights(OC, IC, 0, &f1, &f2);
+        printf("Found: 1x=%d 2x=%d\n", f1, f2);
+
+        // Now patch ALL found weight patterns to 3× and re-eval
+        if (f1 > 0 || f2 > 0) {
+            printf("Patching all found patterns to 3x identity...\n");
+            scan_identity_weights(OC, IC, 1, NULL, NULL);
+
+            printf("\n=== Eval after memory patch (expect 3x) ===\n");
+            ane_eval(k);
+            IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
+            printf("Out: [%.3f, %.3f, %.3f, %.3f]\n", out[0], out[1], out[2], out[3]);
+            float ratio = mean_ratio(out, inp, OC*SP);
+            IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
+            printf("Ratio: %.3f (3.0=mem patch works!, 1.0=ANE uses SRAM copy)\n", ratio);
+        }
+        printf("\n");
+
+        // === Approach 3: Explore classes ===
+        printf("=== ANE classes ===\n");
+        const char *cn[] = {"_ANEWeight", "_ANEProgramForEvaluation", "_ANEChainingRequest", NULL};
+        for (int i = 0; cn[i]; i++) {
+            Class cls = NSClassFromString([NSString stringWithUTF8String:cn[i]]);
+            if (!cls) { printf("%s: NOT FOUND\n", cn[i]); continue; }
+            printf("%s:\n", cn[i]);
+            // Instance methods first ("-"), then class methods via the metaclass ("+")
+            unsigned int mc = 0; Method *ms = class_copyMethodList(cls, &mc);
+            for (unsigned j = 0; j < mc; j++) printf(" - %s\n", sel_getName(method_getName(ms[j])));
+            free(ms);
+            mc = 0; ms = class_copyMethodList(object_getClass(cls), &mc);
+            for (unsigned j = 0; j < mc; j++) printf(" + %s\n", sel_getName(method_getName(ms[j])));
+            free(ms); printf("\n");
+        }
+        // valueForKey: raises for a key the model does not expose; report that instead of staying silent
+        @try { printf("programHandle: %s\n", [[[mdl valueForKey:@"programHandle"] description] UTF8String]); }
+        @catch(id x) { printf("programHandle: unavailable\n"); }
+        @try { printf("intermediateBufferHandle: %s\n", [[[mdl valueForKey:@"intermediateBufferHandle"] description] UTF8String]); }
+        @catch(id x) { printf("intermediateBufferHandle: unavailable\n"); }
+
+        // === Approach 4: _ANEWeight + updateWeightURL ===
+        printf("\n=== Approach 4: _ANEWeight API ===\n");
+        Class AW = NSClassFromString(@"_ANEWeight");
+        if (AW) {
+            // Write 5× identity weights to a new file
+            float *W_5x = (float*)calloc(OC*IC, sizeof(float));
+            for (int i = 0; i < IC; i++) W_5x[i*IC+i] = 5.0f;
+            NSString *wpath = [NSTemporaryDirectory() stringByAppendingPathComponent:@"patched_w.bin"];
+            if (![build_blob(W_5x, OC, IC) writeToFile:wpath atomically:YES])
+                printf(" WARNING: could not write %s\n", [wpath UTF8String]);
+            free(W_5x);
+
+            NSURL *wurl = [NSURL fileURLWithPath:wpath];
+            id wobj = ((id(*)(Class,SEL,id,id))objc_msgSend)(AW,
+                @selector(weightWithSymbolAndURL:weightURL:), @"W", wurl);
+            printf(" _ANEWeight: %s\n", wobj ? [[wobj description] UTF8String] : "nil");
+            if (wobj) {
+                printf(" weightSymbol: %s\n", [((id(*)(id,SEL))objc_msgSend)(wobj, @selector(weightSymbol)) UTF8String]);
+                printf(" weightURL: %s\n", [[((id(*)(id,SEL))objc_msgSend)(wobj, @selector(weightURL)) description] UTF8String]);
+            }
+
+            // Try to pass as weightsBuffer in request
+            printf("\n Trying weightsBuffer in request...\n");
+            id wI2 = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), k->ioIn);
+            id wO2 = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), k->ioOut);
+
+            // Try passing weight array as weightsBuffer
+            if (wobj) {
+                CFRelease(k->request);
+                k->request = (void*)CFBridgingRetain(((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+                    @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+                    @[wI2], @[@0], @[wO2], @[@0], @[wobj], nil, @0));
+                printf(" Request with weightsBuffer created\n");
+                @try {
+                    ane_eval(k);
+                    IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
+                    printf(" Out: [%.3f, %.3f, %.3f, %.3f]\n", out[0], out[1], out[2], out[3]);
+                    float ratio = mean_ratio(out, inp, OC*SP);
+                    IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
+                    printf(" Ratio: %.3f (5.0=weightsBuffer works!)\n", ratio);
+                } @catch(NSException *ex) {
+                    printf(" Eval exception: %s\n", [[ex description] UTF8String]);
+                }
+            }
+
+            // Also try IOSurface as weightsBuffer
+            printf("\n Trying IOSurface as weightsBuffer...\n");
+            IOSurfaceRef wSurf = make_surface(OC*IC*2); // fp16 weights
+            IOSurfaceLock(wSurf, 0, NULL);
+            _Float16 *wfp16 = (_Float16*)IOSurfaceGetBaseAddress(wSurf);
+            for (int r = 0; r < OC; r++)
+                for (int c2 = 0; c2 < IC; c2++)
+                    wfp16[r*IC+c2] = (r==c2) ? (_Float16)7.0f : (_Float16)0.0f; // 7× identity
+            IOSurfaceUnlock(wSurf, 0, NULL);
+            id wSurfObj = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), wSurf);
+            CFRelease(k->request);
+            k->request = (void*)CFBridgingRetain(((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+                @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+                @[wI2], @[@0], @[wO2], @[@0], wSurfObj, nil, @0));
+            @try {
+                ane_eval(k);
+                IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
+                printf(" Out: [%.3f, %.3f, %.3f, %.3f]\n", out[0], out[1], out[2], out[3]);
+                float ratio = mean_ratio(out, inp, OC*SP);
+                IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
+                printf(" Ratio: %.3f (7.0=IOSurface weights work!)\n", ratio);
+            } @catch(NSException *ex) {
+                printf(" Eval exception: %s\n", [[ex description] UTF8String]);
+            }
+            CFRelease(wSurf);
+        }
+
+        // === Approach 5: Weights packed into input IOSurface (fp16 with cast) ===
+        printf("\n=== Approach 5: Dynamic weights via input IOSurface ===\n");
+        // Element-wise mul: x * w where both come from input
+        // Input [1, IC*2, 1, SP] fp32 → cast fp16 → slice → mul → cast fp32
+        {
+            int C5 = IC;
+            NSMutableString *m5 = [NSMutableString string];
+            [m5 appendString:@"program(1.3)\n"
+                "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
+                "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
+                "{\"coremltools-version\", \"9.0\"}})]\n{\n"];
+            [m5 appendFormat:@" func main(tensor x) {\n", C5*2, SP];
+            [m5 appendString:@" string to16 = const()[name = string(\"to16\"), val = string(\"fp16\")];\n"];
+            [m5 appendFormat:@" tensor xh = cast(dtype = to16, x = x)[name = string(\"cin\")];\n", C5*2, SP];
+            [m5 appendFormat:@" tensor b0 = const()[name = string(\"b0\"), val = tensor([0,0,0,0])];\n"];
+            [m5 appendFormat:@" tensor s0 = const()[name = string(\"s0\"), val = tensor([1,%d,1,%d])];\n", C5, SP];
+            [m5 appendFormat:@" tensor data = slice_by_size(x=xh,begin=b0,size=s0)[name=string(\"data\")];\n", C5, SP];
+            [m5 appendFormat:@" tensor b1 = const()[name = string(\"b1\"), val = tensor([0,%d,0,0])];\n", C5];
+            [m5 appendFormat:@" tensor wt = slice_by_size(x=xh,begin=b1,size=s0)[name=string(\"wt\")];\n", C5, SP];
+            [m5 appendFormat:@" tensor yh = mul(x=data,y=wt)[name=string(\"mul\")];\n", C5, SP];
+            [m5 appendString:@" string to32 = const()[name = string(\"to32\"), val = string(\"fp32\")];\n"];
+            [m5 appendFormat:@" tensor y = cast(dtype = to32, x = yh)[name = string(\"cout\")];\n", C5, SP];
+            [m5 appendString:@" } -> (y);\n}\n"];
+
+            int io5_in = C5*2*SP*4;
+            int io5_out = C5*SP*4;
+            Kern *k5 = compile_kern_mil_w(m5, @{}, io5_in, io5_out);
+            if (k5) {
+                printf("Compile OK!\n");
+                IOSurfaceLock(k5->ioIn, 0, NULL);
+                float *in5 = (float*)IOSurfaceGetBaseAddress(k5->ioIn);
+                for (int i = 0; i < C5*SP; i++) in5[i] = (i%100)*0.01f;
+                for (int i = 0; i < C5*SP; i++) in5[C5*SP+i] = 2.0f;
+                IOSurfaceUnlock(k5->ioIn, 0, NULL);
+                ane_eval(k5);
+                IOSurfaceLock(k5->ioOut, kIOSurfaceLockReadOnly, NULL);
+                float *out5 = (float*)IOSurfaceGetBaseAddress(k5->ioOut);
+                printf("data=[%.3f,%.3f,%.3f], w=2.0 → out=[%.3f,%.3f,%.3f]\n",
+                    in5[0],in5[1],in5[2], out5[0],out5[1],out5[2]);
+                IOSurfaceUnlock(k5->ioOut, kIOSurfaceLockReadOnly, NULL);
+
+                // Change weight dynamically — NO recompile!
+                IOSurfaceLock(k5->ioIn, 0, NULL);
+                for (int i = 0; i < C5*SP; i++) in5[C5*SP+i] = 5.0f;
+                IOSurfaceUnlock(k5->ioIn, 0, NULL);
+                ane_eval(k5);
+                IOSurfaceLock(k5->ioOut, kIOSurfaceLockReadOnly, NULL);
+                printf("w=5.0 → out=[%.3f,%.3f,%.3f] (expect 5×)\n", out5[0],out5[1],out5[2]);
+                IOSurfaceUnlock(k5->ioOut, kIOSurfaceLockReadOnly, NULL);
+                free_kern(k5);
+            } else printf("Compile FAILED\n");
+        }
+
+        // === Approach 6: matmul with dynamic weights from input ===
+        printf("\n=== Approach 6: matmul with dynamic W from input ===\n");
+        // Pack x[1,D,S,1] and W[1,D,1,D] into input, then reshape+matmul
+        // Input shape: [1, D+D*D, 1, S] — first D channels=activations, rest=weight matrix flattened
+        // Actually, matmul needs [1,H,S,D] shapes. Let's try:
+        // Input: [1, D*(S+D), 1, 1] reshaped as needed
+        // Simpler: just test matmul with two sliced inputs
+        {
+            int D6 = 64, S6 = 64; // small for test
+            // Input: [1, D6+D6, S6, D6] — but that's 4D...
+            // Actually ANE matmul works on [1,H,M,K] @ [1,H,K,N] → [1,H,M,N]
+            // Let's pack x[1,1,S6,D6] and W[1,1,D6,D6] into [1,2,S6,D6]
+            // Then slice → matmul
+            NSMutableString *m6 = [NSMutableString string];
+            [m6 appendString:@"program(1.3)\n"
+                "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
+                "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
+                "{\"coremltools-version\", \"9.0\"}})]\n{\n"];
+            // Input: [1, D6+D6, 1, S6*D6] — flatten everything, then reshape
+            // Actually simplest: two separate regions in channel dim
+            // x_data: [1, D6, 1, S6] and W: [1, D6*D6, 1, 1]
+            // Total input channels: D6 + D6*D6
+            int total_ch = D6 + D6*D6;
+            [m6 appendFormat:@" func main(tensor x) {\n", total_ch, S6];
+            [m6 appendString:@" string to16 = const()[name = string(\"to16\"), val = string(\"fp16\")];\n"];
+            [m6 appendFormat:@" tensor xh = cast(dtype = to16, x = x)[name = string(\"cin\")];\n", total_ch, S6];
+            // Slice activations: [1, D6, 1, S6]
+            [m6 appendFormat:@" tensor b0 = const()[name = string(\"b0\"), val = tensor([0,0,0,0])];\n"];
+            [m6 appendFormat:@" tensor sa = const()[name = string(\"sa\"), val = tensor([1,%d,1,%d])];\n", D6, S6];
+            [m6 appendFormat:@" tensor act = slice_by_size(x=xh,begin=b0,size=sa)[name=string(\"act\")];\n", D6, S6];
+            // Slice weight: [1, D6*D6, 1, S6] but we only need [D6, D6] → reshape
+            [m6 appendFormat:@" tensor bw = const()[name = string(\"bw\"), val = tensor([0,%d,0,0])];\n", D6];
+            [m6 appendFormat:@" tensor sw = const()[name = string(\"sw\"), val = tensor([1,%d,1,%d])];\n", D6*D6, S6];
+            [m6 appendFormat:@" tensor wf = slice_by_size(x=xh,begin=bw,size=sw)[name=string(\"wf\")];\n", D6*D6, S6];
+            // Reshape weight to [1, D6, D6, S6] for matmul-like operation
+            // Actually for conv: weight needs to be [OC, IC, 1, 1] const. Can't use dynamic weight with conv.
+            // For matmul: need [1, 1, D6, D6] or similar
+            // Let's try: reshape wf to [1, D6, D6, S6], take first slice [:,:,:,0] → no, that's hard
+            // Simpler: reshape to [D6, D6] and use matmul
+            // But matmul expects specific ranks... let me try:
+            [m6 appendFormat:@" tensor ws = const()[name = string(\"ws\"), val = tensor([1, 1, %d, %d])];\n", D6, D6];
+            // Only take first column of wf to get [1, D6*D6, 1, 1]
+            [m6 appendFormat:@" tensor sw1 = const()[name = string(\"sw1\"), val = tensor([1,%d,1,1])];\n", D6*D6];
+            [m6 appendFormat:@" tensor wf1 = slice_by_size(x=wf,begin=b0,size=sw1)[name=string(\"wf1\")];\n", D6*D6];
+            [m6 appendFormat:@" tensor W = reshape(shape=ws,x=wf1)[name=string(\"W\")];\n", D6, D6];
+            // Reshape act to [1, 1, S6, D6] for matmul
+            [m6 appendFormat:@" tensor as2 = const()[name = string(\"as2\"), val = tensor([1, 1, %d, %d])];\n", D6, S6];
+            [m6 appendFormat:@" tensor pm = const()[name = string(\"pm\"), val = tensor([0, 1, 3, 2])];\n"];
+            [m6 appendFormat:@" tensor a2 = reshape(shape=as2,x=act)[name=string(\"a2\")];\n", D6, S6];
+            [m6 appendFormat:@" tensor a3 = transpose(perm=pm,x=a2)[name=string(\"a3\")];\n", S6, D6];
+            // matmul: [1,1,S6,D6] @ [1,1,D6,D6] → [1,1,S6,D6]
+            [m6 appendString:@" bool bF = const()[name = string(\"bF\"), val = bool(false)];\n"];
+            [m6 appendFormat:@" tensor yh = matmul(transpose_x = bF, transpose_y = bF, x = a3, y = W)[name = string(\"mm\")];\n", S6, D6];
+            // Reshape back to [1, D6, 1, S6]
+            [m6 appendFormat:@" tensor yt = transpose(perm=pm,x=yh)[name=string(\"yt\")];\n", D6, S6];
+            [m6 appendFormat:@" tensor os = const()[name = string(\"os\"), val = tensor([1,%d,1,%d])];\n", D6, S6];
+            [m6 appendFormat:@" tensor yr = reshape(shape=os,x=yt)[name=string(\"yr\")];\n", D6, S6];
+            [m6 appendString:@" string to32 = const()[name = string(\"to32\"), val = string(\"fp32\")];\n"];
+            [m6 appendFormat:@" tensor y = cast(dtype = to32, x = yr)[name = string(\"cout\")];\n", D6, S6];
+            [m6 appendString:@" } -> (y);\n}\n"];
+
+            int io6_in = total_ch * S6 * 4;
+            int io6_out = D6 * S6 * 4;
+            Kern *k6 = compile_kern_mil_w(m6, @{}, io6_in, io6_out);
+            if (k6) {
+                printf("Dynamic matmul compile OK!\n");
+                // Set up: identity W, ramp input
+                IOSurfaceLock(k6->ioIn, 0, NULL);
+                float *in6 = (float*)IOSurfaceGetBaseAddress(k6->ioIn);
+                memset(in6, 0, io6_in);
+                // Activations: [D6, S6] in channel-first layout
+                for (int d = 0; d < D6; d++)
+                    for (int s = 0; s < S6; s++)
+                        in6[d*S6+s] = (d*S6+s) * 0.001f;
+                // Weight: identity matrix [D6, D6] packed in channels D6..D6+D6*D6, only col 0
+                float *wbase = in6 + D6*S6;
+                for (int r = 0; r < D6; r++)
+                    for (int c = 0; c < D6; c++)
+                        wbase[(r*D6+c)*S6] = (r==c) ? 1.0f : 0.0f; // only sp=0 matters
+                IOSurfaceUnlock(k6->ioIn, 0, NULL);
+
+                ane_eval(k6);
+                IOSurfaceLock(k6->ioOut, kIOSurfaceLockReadOnly, NULL);
+                float *out6 = (float*)IOSurfaceGetBaseAddress(k6->ioOut);
+                printf("Identity W: in=[%.4f,%.4f,%.4f] out=[%.4f,%.4f,%.4f]\n",
+                    in6[0],in6[1],in6[2], out6[0],out6[1],out6[2]);
+
+                // Check
+                float me6 = 0;
+                for (int i = 0; i < D6*S6; i++) {
+                    float e6 = fabsf(out6[i] - in6[i]);
+                    if (e6 > me6) me6 = e6;
+                }
+                IOSurfaceUnlock(k6->ioOut, kIOSurfaceLockReadOnly, NULL);
+                printf("max_err=%.6f %s\n", me6, me6 < 0.1 ? "PASS" : "FAIL");
+
+                // Now: 2× identity — just change the IOSurface weight, no recompile!
+                IOSurfaceLock(k6->ioIn, 0, NULL);
+                for (int r = 0; r < D6; r++)
+                    for (int c = 0; c < D6; c++)
+                        wbase[(r*D6+c)*S6] = (r==c) ? 2.0f : 0.0f;
+                IOSurfaceUnlock(k6->ioIn, 0, NULL);
+                ane_eval(k6);
+                IOSurfaceLock(k6->ioOut, kIOSurfaceLockReadOnly, NULL);
+                printf("2× W: in=[%.4f,%.4f] out=[%.4f,%.4f] (expect 2×)\n",
+                    in6[0],in6[1], out6[0],out6[1]);
+                IOSurfaceUnlock(k6->ioOut, kIOSurfaceLockReadOnly, NULL);
+                free_kern(k6);
+            } else printf("Dynamic matmul compile FAILED\n");
+        }
+
+        free_kern(k); free(W_id); free(W_2x);
+        printf("\nDone.\n");
+    }
+    return 0;
+}
diff --git a/training/train_large.m b/training/train_large.m index f71bf52..dbe8ec9 100644 --- a/training/train_large.m +++ b/training/train_large.m @@ -5,19 +5,15 @@ #include "stories_mil.h" #include "stories_cpu_ops.h" -#define DEFAULT_CKPT_PATH "ane_stories110M_ckpt.bin" -#define DEFAULT_MODEL_PATH "../../assets/models/stories110M.bin" -#define DEFAULT_DATA_PATH "tinystories_data00.bin" +#define CKPT_PATH_DEFAULT "ane_stories110M_ckpt.bin" +#define MODEL_PATH_DEFAULT "../../assets/models/stories110M.bin" +#define DATA_PATH "tinystories_data00.bin" static const char *get_path(const char *env_var, const char *default_val) { const char *v = getenv(env_var); return (v && v[0]) ? v : default_val; } -#define CKPT_PATH get_path("ANE_CKPT_PATH", DEFAULT_CKPT_PATH) -#define MODEL_PATH get_path("ANE_MODEL_PATH", DEFAULT_MODEL_PATH) -#define DATA_PATH get_path("ANE_DATA_PATH", DEFAULT_DATA_PATH) - // ===== Weight loading from llama2.c format ===== static bool load_pretrained(LayerWeights *lw, float *rms_final, float *embed, const char *path) { FILE *f = fopen(path, "rb"); @@ -211,12 +207,24 @@ int main(int argc, char *argv[]) { float adam_b1=0.9f, adam_b2=0.999f, adam_eps=1e-8f; int adam_t = 0, start_step = 0; - // Parse args + // Parse args (env vars set defaults, CLI flags override) + const char *ckpt_path = get_path("ANE_CKPT_PATH", CKPT_PATH_DEFAULT); + const char *model_path = get_path("ANE_MODEL_PATH", MODEL_PATH_DEFAULT); bool do_resume = false; + int pos = 0; for (int i=1; i