mirror of https://github.com/maderix/ANE.git
334 lines
16 KiB
Objective-C
334 lines
16 KiB
Objective-C
// test_dynamic_matmul.m — Benchmark dynamic matmul on ANE (no recompile)
|
||
// Layout: input [1, D, 1, S+D] — activations in sp[0:S], weight rows in sp[S:S+D]
|
||
// MIL: slice → reshape → matmul → reshape → output
|
||
#import <Foundation/Foundation.h>
|
||
#import <objc/runtime.h>
|
||
#import <objc/message.h>
|
||
#import <dlfcn.h>
|
||
#import <IOSurface/IOSurface.h>
|
||
#import <mach/mach_time.h>
|
||
#include <arm_neon.h>
|
||
#include <Accelerate/Accelerate.h>
|
||
|
||
#include "stories_io.h"
|
||
|
||
// Generate MIL for y = x @ W where both come from input IOSurface
|
||
// Input: [1, IC, 1, SEQ+OC] fp32
|
||
// sp[0:SEQ] = activations x[IC, SEQ]
|
||
// sp[SEQ:SEQ+OC] = weight W[IC, OC] (each channel d holds W[d, :])
|
||
// Output: [1, OC, 1, SEQ] fp32
|
||
static NSString *gen_dynamic_matmul_mil(int ic, int oc, int seq) {
|
||
NSMutableString *m = [NSMutableString string];
|
||
[m appendString:@"program(1.3)\n"
|
||
"[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
|
||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
|
||
"{\"coremltools-version\", \"9.0\"}})]\n{\n"];
|
||
int sp_total = seq + oc;
|
||
[m appendFormat:@" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n", ic, sp_total];
|
||
// Cast to fp16
|
||
[m appendString:@" string to16 = const()[name = string(\"to16\"), val = string(\"fp16\")];\n"];
|
||
[m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> xh = cast(dtype = to16, x = x)[name = string(\"cin\")];\n", ic, sp_total];
|
||
// Slice activations [1, IC, 1, SEQ]
|
||
[m appendString:@" tensor<int32, [4]> ba = const()[name = string(\"ba\"), val = tensor<int32, [4]>([0,0,0,0])];\n"];
|
||
[m appendFormat:@" tensor<int32, [4]> sa = const()[name = string(\"sa\"), val = tensor<int32, [4]>([1,%d,1,%d])];\n", ic, seq];
|
||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> act = slice_by_size(x=xh,begin=ba,size=sa)[name=string(\"act\")];\n", ic, seq];
|
||
// Slice weight [1, IC, 1, OC]
|
||
[m appendFormat:@" tensor<int32, [4]> bw = const()[name = string(\"bw\"), val = tensor<int32, [4]>([0,0,0,%d])];\n", seq];
|
||
[m appendFormat:@" tensor<int32, [4]> sw = const()[name = string(\"sw\"), val = tensor<int32, [4]>([1,%d,1,%d])];\n", ic, oc];
|
||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> wt = slice_by_size(x=xh,begin=bw,size=sw)[name=string(\"wt\")];\n", ic, oc];
|
||
// Reshape act: [1,IC,1,SEQ] → [1,1,IC,SEQ] → transpose → [1,1,SEQ,IC]
|
||
[m appendFormat:@" tensor<int32, [4]> ra = const()[name = string(\"ra\"), val = tensor<int32, [4]>([1,1,%d,%d])];\n", ic, seq];
|
||
[m appendFormat:@" tensor<fp16, [1,1,%d,%d]> a2 = reshape(shape=ra,x=act)[name=string(\"a2\")];\n", ic, seq];
|
||
[m appendString:@" tensor<int32, [4]> pm = const()[name = string(\"pm\"), val = tensor<int32, [4]>([0,1,3,2])];\n"];
|
||
[m appendFormat:@" tensor<fp16, [1,1,%d,%d]> a3 = transpose(perm=pm,x=a2)[name=string(\"a3\")];\n", seq, ic];
|
||
// Reshape weight: [1,IC,1,OC] → [1,1,IC,OC]
|
||
[m appendFormat:@" tensor<int32, [4]> rw = const()[name = string(\"rw\"), val = tensor<int32, [4]>([1,1,%d,%d])];\n", ic, oc];
|
||
[m appendFormat:@" tensor<fp16, [1,1,%d,%d]> W = reshape(shape=rw,x=wt)[name=string(\"W\")];\n", ic, oc];
|
||
// matmul: [1,1,SEQ,IC] @ [1,1,IC,OC] → [1,1,SEQ,OC]
|
||
[m appendString:@" bool bF = const()[name = string(\"bF\"), val = bool(false)];\n"];
|
||
[m appendFormat:@" tensor<fp16, [1,1,%d,%d]> yh = matmul(transpose_x=bF,transpose_y=bF,x=a3,y=W)[name=string(\"mm\")];\n", seq, oc];
|
||
// Reshape+transpose back: [1,1,SEQ,OC] → transpose → [1,1,OC,SEQ] → reshape → [1,OC,1,SEQ]
|
||
[m appendFormat:@" tensor<fp16, [1,1,%d,%d]> yt = transpose(perm=pm,x=yh)[name=string(\"yt\")];\n", oc, seq];
|
||
[m appendFormat:@" tensor<int32, [4]> ro = const()[name = string(\"ro\"), val = tensor<int32, [4]>([1,%d,1,%d])];\n", oc, seq];
|
||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> yr = reshape(shape=ro,x=yt)[name=string(\"yr\")];\n", oc, seq];
|
||
// Cast back to fp32
|
||
[m appendString:@" string to32 = const()[name = string(\"to32\"), val = string(\"fp32\")];\n"];
|
||
[m appendFormat:@" tensor<fp32, [1,%d,1,%d]> y = cast(dtype = to32, x = yr)[name = string(\"cout\")];\n", oc, seq];
|
||
[m appendString:@" } -> (y);\n}\n"];
|
||
return m;
|
||
}
|
||
|
||
// Tiled version: splits OC into tiles, each tile is a separate kernel
|
||
// For W[IC, OC], tile along OC: each tile handles W[:, t*T:(t+1)*T]
|
||
// Input per tile: [1, IC, 1, SEQ+T]
|
||
// Output per tile: [1, T, 1, SEQ]
|
||
typedef struct {
|
||
Kern **tiles;
|
||
int n_tiles, tile_oc, ic, oc, seq;
|
||
} TiledMatmul;
|
||
|
||
static TiledMatmul *compile_tiled_matmul(int ic, int oc, int tile_oc, int seq) {
|
||
TiledMatmul *tm = (TiledMatmul*)calloc(1, sizeof(TiledMatmul));
|
||
tm->ic = ic; tm->oc = oc; tm->seq = seq; tm->tile_oc = tile_oc;
|
||
tm->n_tiles = (oc + tile_oc - 1) / tile_oc;
|
||
tm->tiles = (Kern**)calloc(tm->n_tiles, sizeof(Kern*));
|
||
for (int t = 0; t < tm->n_tiles; t++) {
|
||
int this_oc = (t == tm->n_tiles-1 && oc % tile_oc) ? (oc % tile_oc) : tile_oc;
|
||
NSString *mil = gen_dynamic_matmul_mil(ic, this_oc, seq);
|
||
int in_bytes = ic * (seq + this_oc) * 4;
|
||
int out_bytes = this_oc * seq * 4;
|
||
tm->tiles[t] = compile_kern_mil_w(mil, @{}, in_bytes, out_bytes);
|
||
if (!tm->tiles[t]) { printf("Tile %d compile FAIL\n", t); return NULL; }
|
||
}
|
||
return tm;
|
||
}
|
||
|
||
// Write activations + weight tile into IOSurface
|
||
// act: [IC, SEQ] column-major (channel-first)
|
||
// W: [IC, OC] — full weight matrix, we extract the tile
|
||
static void write_tile_input(TiledMatmul *tm, int tile_idx, const float *act, const float *W) {
|
||
Kern *k = tm->tiles[tile_idx];
|
||
int ic = tm->ic, seq = tm->seq, toc = tm->tile_oc;
|
||
int oc_off = tile_idx * toc;
|
||
int this_oc = (tile_idx == tm->n_tiles-1 && tm->oc % toc) ? (tm->oc % toc) : toc;
|
||
|
||
IOSurfaceLock(k->ioIn, 0, NULL);
|
||
float *buf = (float*)IOSurfaceGetBaseAddress(k->ioIn);
|
||
// Activations: buf[d * (seq+this_oc) + t] = act[d * seq + t]
|
||
for (int d = 0; d < ic; d++) {
|
||
memcpy(buf + d*(seq+this_oc), act + d*seq, seq*sizeof(float));
|
||
// Weight: buf[d * (seq+this_oc) + seq + c] = W[d * oc + oc_off + c]
|
||
for (int c = 0; c < this_oc; c++)
|
||
buf[d*(seq+this_oc) + seq + c] = W[d*tm->oc + oc_off + c];
|
||
}
|
||
IOSurfaceUnlock(k->ioIn, 0, NULL);
|
||
}
|
||
|
||
// Read tile output into full output buffer
|
||
static void read_tile_output(TiledMatmul *tm, int tile_idx, float *out) {
|
||
Kern *k = tm->tiles[tile_idx];
|
||
int seq = tm->seq, toc = tm->tile_oc;
|
||
int oc_off = tile_idx * toc;
|
||
int this_oc = (tile_idx == tm->n_tiles-1 && tm->oc % toc) ? (tm->oc % toc) : toc;
|
||
|
||
IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||
float *obuf = (float*)IOSurfaceGetBaseAddress(k->ioOut);
|
||
for (int c = 0; c < this_oc; c++)
|
||
memcpy(out + (oc_off+c)*seq, obuf + c*seq, seq*sizeof(float));
|
||
IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||
}
|
||
|
||
int main(int argc, char **argv) {
|
||
@autoreleasepool {
|
||
mach_timebase_info(&g_tb);
|
||
ane_init();
|
||
|
||
// === Test 1: Single 64×64 dynamic matmul (correctness) ===
|
||
printf("=== Test 1: 64×64 dynamic matmul correctness ===\n");
|
||
{
|
||
int D = 64, S = 64;
|
||
NSString *mil = gen_dynamic_matmul_mil(D, D, S);
|
||
int in_b = D * (S+D) * 4, out_b = D * S * 4;
|
||
Kern *k = compile_kern_mil_w(mil, @{}, in_b, out_b);
|
||
if (!k) { printf("FAIL\n"); return 1; }
|
||
|
||
// Identity test
|
||
IOSurfaceLock(k->ioIn, 0, NULL);
|
||
float *inp = (float*)IOSurfaceGetBaseAddress(k->ioIn);
|
||
memset(inp, 0, in_b);
|
||
for (int d = 0; d < D; d++)
|
||
for (int s = 0; s < S; s++)
|
||
inp[d*(S+D) + s] = (float)(d*S + s) * 0.001f;
|
||
for (int d = 0; d < D; d++)
|
||
for (int c = 0; c < D; c++)
|
||
inp[d*(S+D) + S + c] = (d == c) ? 1.0f : 0.0f;
|
||
IOSurfaceUnlock(k->ioIn, 0, NULL);
|
||
|
||
ane_eval(k);
|
||
IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||
float *out = (float*)IOSurfaceGetBaseAddress(k->ioOut);
|
||
float me = 0;
|
||
for (int d = 0; d < D; d++)
|
||
for (int s = 0; s < S; s++) {
|
||
float e = fabsf(out[d*S+s] - inp[d*(S+D)+s]);
|
||
if (e > me) me = e;
|
||
}
|
||
IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||
printf("Identity: max_err=%.6f %s\n", me, me < 0.01 ? "PASS" : "FAIL");
|
||
|
||
// 2× test
|
||
IOSurfaceLock(k->ioIn, 0, NULL);
|
||
for (int d = 0; d < D; d++)
|
||
for (int c = 0; c < D; c++)
|
||
inp[d*(S+D) + S + c] = (d == c) ? 2.0f : 0.0f;
|
||
IOSurfaceUnlock(k->ioIn, 0, NULL);
|
||
ane_eval(k);
|
||
IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||
float sr = 0; int cnt = 0;
|
||
for (int i = 0; i < D*S; i++)
|
||
if (fabsf(inp[i/(S)*((S)+D) + i%S]) > 0.001f) { sr += out[i]/inp[i/S*(S+D)+i%S]; cnt++; }
|
||
IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||
printf("2× W: ratio=%.3f %s\n\n", cnt?sr/cnt:0, fabsf(sr/cnt-2.0f)<0.1?"PASS":"FAIL");
|
||
free_kern(k);
|
||
}
|
||
|
||
// === Test 2: 768×768 single kernel (if it compiles) ===
|
||
printf("=== Test 2: 768×768 single dynamic matmul ===\n");
|
||
{
|
||
int D = 768, S = 256;
|
||
int sp_total = S + D; // 256 + 768 = 1024
|
||
int in_b = D * sp_total * 4; // 768 * 1024 * 4 = 3.1MB
|
||
int out_b = D * S * 4; // 768 * 256 * 4 = 786KB
|
||
printf("IOSurface: in=%.1fMB out=%.1fKB\n", in_b/1e6, out_b/1e3);
|
||
|
||
NSString *mil = gen_dynamic_matmul_mil(D, D, S);
|
||
uint64_t t0 = mach_absolute_time();
|
||
Kern *k = compile_kern_mil_w(mil, @{}, in_b, out_b);
|
||
double compile_ms = tb_ms(mach_absolute_time() - t0);
|
||
if (!k) { printf("768×768 compile FAIL\n"); }
|
||
else {
|
||
printf("Compile: %.1fms\n", compile_ms);
|
||
// Random weights
|
||
float *act = (float*)calloc(D*S, sizeof(float));
|
||
float *W = (float*)calloc(D*D, sizeof(float));
|
||
for (int i = 0; i < D*S; i++) act[i] = ((float)arc4random() / UINT32_MAX - 0.5f) * 0.1f;
|
||
for (int i = 0; i < D*D; i++) W[i] = ((float)arc4random() / UINT32_MAX - 0.5f) * 0.01f;
|
||
|
||
// Write to IOSurface
|
||
IOSurfaceLock(k->ioIn, 0, NULL);
|
||
float *inp = (float*)IOSurfaceGetBaseAddress(k->ioIn);
|
||
for (int d = 0; d < D; d++) {
|
||
memcpy(inp + d*(S+D), act + d*S, S*4);
|
||
memcpy(inp + d*(S+D) + S, W + d*D, D*4);
|
||
}
|
||
IOSurfaceUnlock(k->ioIn, 0, NULL);
|
||
|
||
// Warmup
|
||
for (int i = 0; i < 3; i++) ane_eval(k);
|
||
|
||
// Benchmark
|
||
int iters = 50;
|
||
t0 = mach_absolute_time();
|
||
for (int i = 0; i < iters; i++) ane_eval(k);
|
||
double total_ms = tb_ms(mach_absolute_time() - t0);
|
||
double per_eval = total_ms / iters;
|
||
double flops = 2.0 * D * D * S; // matmul FLOPs
|
||
double gflops = flops / (per_eval * 1e6);
|
||
printf("768×768×256 matmul: %.3fms/eval %.1f GFLOP/s\n", per_eval, gflops);
|
||
|
||
// Benchmark with IO write (simulating weight update)
|
||
t0 = mach_absolute_time();
|
||
for (int i = 0; i < iters; i++) {
|
||
IOSurfaceLock(k->ioIn, 0, NULL);
|
||
float *p = (float*)IOSurfaceGetBaseAddress(k->ioIn);
|
||
for (int d = 0; d < D; d++)
|
||
memcpy(p + d*(S+D) + S, W + d*D, D*4);
|
||
IOSurfaceUnlock(k->ioIn, 0, NULL);
|
||
ane_eval(k);
|
||
}
|
||
total_ms = tb_ms(mach_absolute_time() - t0);
|
||
per_eval = total_ms / iters;
|
||
gflops = flops / (per_eval * 1e6);
|
||
printf("With weight IO: %.3fms/eval %.1f GFLOP/s\n", per_eval, gflops);
|
||
|
||
free(act); free(W); free_kern(k);
|
||
}
|
||
}
|
||
|
||
// === Test 3: Tiled matmul benchmark ===
|
||
int tile_sizes[] = {64, 128, 256, 384, 768};
|
||
int n_tiles_test = sizeof(tile_sizes)/sizeof(tile_sizes[0]);
|
||
printf("\n=== Test 3: Tiled 768×768 matmul (varying tile_oc) ===\n");
|
||
printf("%-10s %-8s %-10s %-12s %-10s\n", "tile_oc", "tiles", "compile", "eval/ms", "GFLOP/s");
|
||
{
|
||
int D = 768, S = 256;
|
||
float *act = (float*)calloc(D*S, sizeof(float));
|
||
float *W = (float*)calloc(D*D, sizeof(float));
|
||
float *out_full = (float*)calloc(D*S, sizeof(float));
|
||
for (int i = 0; i < D*S; i++) act[i] = ((float)arc4random() / UINT32_MAX - 0.5f) * 0.1f;
|
||
for (int i = 0; i < D*D; i++) W[i] = ((float)arc4random() / UINT32_MAX - 0.5f) * 0.01f;
|
||
|
||
for (int ti = 0; ti < n_tiles_test; ti++) {
|
||
int T = tile_sizes[ti];
|
||
if (T > D) continue;
|
||
uint64_t t0 = mach_absolute_time();
|
||
TiledMatmul *tm = compile_tiled_matmul(D, D, T, S);
|
||
double compile_ms = tb_ms(mach_absolute_time() - t0);
|
||
if (!tm) { printf("%-10d FAIL\n", T); continue; }
|
||
|
||
// Warmup
|
||
for (int w = 0; w < 2; w++) {
|
||
for (int t = 0; t < tm->n_tiles; t++) {
|
||
write_tile_input(tm, t, act, W);
|
||
ane_eval(tm->tiles[t]);
|
||
}
|
||
}
|
||
|
||
// Benchmark (with IO)
|
||
int iters = 20;
|
||
t0 = mach_absolute_time();
|
||
for (int i = 0; i < iters; i++) {
|
||
for (int t = 0; t < tm->n_tiles; t++) {
|
||
write_tile_input(tm, t, act, W);
|
||
ane_eval(tm->tiles[t]);
|
||
read_tile_output(tm, t, out_full);
|
||
}
|
||
}
|
||
double total_ms = tb_ms(mach_absolute_time() - t0);
|
||
double per_matmul = total_ms / iters;
|
||
double flops = 2.0 * D * D * S;
|
||
double gflops = flops / (per_matmul * 1e6);
|
||
printf("%-10d %-8d %-10.0fms %-12.3fms %-10.1f\n",
|
||
T, tm->n_tiles, compile_ms, per_matmul, gflops);
|
||
|
||
for (int t = 0; t < tm->n_tiles; t++) free_kern(tm->tiles[t]);
|
||
free(tm->tiles); free(tm);
|
||
}
|
||
|
||
// === Correctness check: compare with cblas ===
|
||
printf("\n=== Correctness: dynamic matmul vs cblas_sgemm ===\n");
|
||
{
|
||
int T = 768; // full, no tiling
|
||
TiledMatmul *tm = compile_tiled_matmul(D, D, T, S);
|
||
if (tm) {
|
||
write_tile_input(tm, 0, act, W);
|
||
ane_eval(tm->tiles[0]);
|
||
read_tile_output(tm, 0, out_full);
|
||
|
||
// Reference: cblas y = act^T @ W → y[s,oc] = sum_d act[d,s]*W[d,oc]
|
||
// act is [D,S] col-major, W is [D,D] row-major
|
||
// We want out[oc,s] = sum_d act[d,s] * W[d,oc]
|
||
// = W^T @ act where W^T is [D,D] and act is [D,S] → out is [D,S]
|
||
float *ref = (float*)calloc(D*S, sizeof(float));
|
||
// out[oc*S+s] = sum_d W[d*D+oc] * act[d*S+s]
|
||
// This is: (W^T) @ act in column-major: M=D,N=S,K=D
|
||
// cblas: C = alpha*A*B + beta*C
|
||
// A=W^T [D×D], B=act [D×S], C=ref [D×S]
|
||
cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
|
||
D, S, D, 1.0f, W, D, act, D, 0.0f, ref, D);
|
||
float me = 0;
|
||
for (int i = 0; i < D*S; i++) {
|
||
float e = fabsf(out_full[i] - ref[i]);
|
||
if (e > me) me = e;
|
||
}
|
||
printf("vs cblas: max_err=%.6f %s\n", me, me < 1.0 ? "PASS" : "FAIL");
|
||
free(ref);
|
||
for (int t = 0; t < tm->n_tiles; t++) free_kern(tm->tiles[t]);
|
||
free(tm->tiles); free(tm);
|
||
}
|
||
}
|
||
|
||
free(act); free(W); free(out_full);
|
||
}
|
||
|
||
// === Summary for training ===
|
||
printf("\n=== Summary ===\n");
|
||
printf("Stories110M: 12 layers × 10 matmuls/layer = 120 matmuls/step\n");
|
||
printf("Sizes: Wq/Wk/Wv/Wo [768,768], W1/W3 [2048,768], W2 [768,2048]\n");
|
||
printf("With dynamic weights: compile once, update IOSurface every step\n");
|
||
|
||
printf("\nDone.\n");
|
||
}
|
||
return 0;
|
||
}
|