This commit is contained in:
Darko 2026-03-04 15:30:17 +08:00 committed by GitHub
commit e626968d30
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
19 changed files with 1257 additions and 757 deletions

7
.gitignore vendored Normal file
View File

@ -0,0 +1,7 @@
*.o
ane_probe
api_explore
inmem_basic
tiny_train
tiny_train_m1
train_large

View File

@ -8,6 +8,7 @@
static mach_timebase_info_data_t g_tb;
static double ticksToMs(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }
static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly
NSData *buildWeightBlob(int ch, int depth) {
NSUInteger wsize = ch * ch * 2;
@ -27,28 +28,45 @@ NSData *buildWeightBlob(int ch, int depth) {
NSString *genMIL(int ch, int sp, int depth) {
NSMutableString *m = [NSMutableString string];
[m appendString:@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, {\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, {\"coremltools-version\", \"9.0\"}})]\n{\n"];
[m appendFormat:@" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n", ch, sp];
[m appendString:@" string c_pad_type_0 = const()[name = string(\"c_pad_type_0\"), val = string(\"valid\")];\n"
@" tensor<int32, [2]> c_strides_0 = const()[name = string(\"c_strides_0\"), val = tensor<int32, [2]>([1, 1])];\n"
@" tensor<int32, [4]> c_pad_0 = const()[name = string(\"c_pad_0\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
@" tensor<int32, [2]> c_dilations_0 = const()[name = string(\"c_dilations_0\"), val = tensor<int32, [2]>([1, 1])];\n"
@" int32 c_groups_0 = const()[name = string(\"c_groups_0\"), val = int32(1)];\n"
@" string x_to_fp16_dtype_0 = const()[name = string(\"x_to_fp16_dtype_0\"), val = string(\"fp16\")];\n"];
[m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> x_to_fp16 = cast(dtype = x_to_fp16_dtype_0, x = x)[name = string(\"cast_in\")];\n", ch, sp];
[m appendString:@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"];
if (g_fp16_io) {
// fp16 I/O path no cast ops (M1/M2 compatible)
[m appendFormat:@" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n", ch, sp];
} else {
// fp32 I/O path cast to/from fp16 internally (M4+ native)
[m appendFormat:@" func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n", ch, sp];
}
[m appendString:
@" tensor<string, []> c_pad_type_0 = const()[name = tensor<string, []>(\"c_pad_type_0\"), val = tensor<string, []>(\"valid\")];\n"
@" tensor<int32, [2]> c_strides_0 = const()[name = tensor<string, []>(\"c_strides_0\"), val = tensor<int32, [2]>([1, 1])];\n"
@" tensor<int32, [4]> c_pad_0 = const()[name = tensor<string, []>(\"c_pad_0\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
@" tensor<int32, [2]> c_dilations_0 = const()[name = tensor<string, []>(\"c_dilations_0\"), val = tensor<int32, [2]>([1, 1])];\n"
@" tensor<int32, []> c_groups_0 = const()[name = tensor<string, []>(\"c_groups_0\"), val = tensor<int32, []>(1)];\n"];
NSString *prev;
if (g_fp16_io) {
prev = @"x";
} else {
[m appendString:@" tensor<string, []> x_to_fp16_dtype_0 = const()[name = tensor<string, []>(\"x_to_fp16_dtype_0\"), val = tensor<string, []>(\"fp16\")];\n"];
[m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> x_to_fp16 = cast(dtype = x_to_fp16_dtype_0, x = x)[name = tensor<string, []>(\"cast_in\")];\n", ch, sp];
prev = @"x_to_fp16";
}
NSUInteger cs = 64 + ch*ch*2;
NSString *prev = @"x_to_fp16";
for (int i = 0; i < depth; i++) {
[m appendFormat:@" tensor<fp16, [%d, %d, 1, 1]> W%d = const()[name = string(\"W%d\"), val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n",
[m appendFormat:@" tensor<fp16, [%d, %d, 1, 1]> W%d = const()[name = tensor<string, []>(\"W%d\"), val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(%lu)))];\n",
ch, ch, i, i, ch, ch, (unsigned long)(64 + i*cs)];
NSString *out = [NSString stringWithFormat:@"c%d", i];
[m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> %@ = conv(dilations = c_dilations_0, groups = c_groups_0, pad = c_pad_0, pad_type = c_pad_type_0, strides = c_strides_0, weight = W%d, x = %@)[name = string(\"%@\")];\n",
[m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> %@ = conv(dilations = c_dilations_0, groups = c_groups_0, pad = c_pad_0, pad_type = c_pad_type_0, strides = c_strides_0, weight = W%d, x = %@)[name = tensor<string, []>(\"%@\")];\n",
ch, sp, out, i, prev, out];
prev = out;
}
[m appendString:@" string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"];
[m appendFormat:@" tensor<fp32, [1, %d, 1, %d]> c = cast(dtype = to_fp32, x = %@)[name = string(\"cast_out\")];\n", ch, sp, prev];
[m appendString:@" } -> (c);\n}\n"];
if (g_fp16_io) {
[m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> c = identity(x = %@)[name = tensor<string, []>(\"out\")];\n", ch, sp, prev];
[m appendString:@" } -> (c);\n}\n"];
} else {
[m appendString:@" tensor<string, []> to_fp32 = const()[name = tensor<string, []>(\"to_fp32\"), val = tensor<string, []>(\"fp32\")];\n"];
[m appendFormat:@" tensor<fp32, [1, %d, 1, %d]> c = cast(dtype = to_fp32, x = %@)[name = tensor<string, []>(\"cast_out\")];\n", ch, sp, prev];
[m appendString:@" } -> (c);\n}\n"];
}
return m;
}
@ -68,9 +86,18 @@ double bench(int ch, int sp, int depth) {
[fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
[milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
[wb writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(compileWithQoS:options:error:),21,@{},&e)){[fm removeItemAtPath:td error:nil];return -3;}
if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(compileWithQoS:options:error:),21,@{},&e)){
[fm removeItemAtPath:td error:nil];
if (!g_fp16_io) {
printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
g_fp16_io = 1;
return bench(ch, sp, depth);
}
return -3;
}
if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(loadWithQoS:options:error:),21,@{},&e)){[fm removeItemAtPath:td error:nil];return -4;}
NSUInteger bytes=ch*sp*4;
size_t bpe = g_fp16_io ? 2 : 4;
NSUInteger bytes=ch*sp*bpe;
IOSurfaceRef ioI=IOSurfaceCreate((__bridge CFDictionaryRef)@{(id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,(id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),(id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0});
IOSurfaceRef ioO=IOSurfaceCreate((__bridge CFDictionaryRef)@{(id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,(id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),(id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0});
id wI=((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO,@selector(objectWithIOSurface:),ioI);

View File

@ -5,6 +5,9 @@
#include <string.h>
#include <math.h>
// Set by caller: 1 = fp16 I/O (M1/M2 fallback, no cast ops), 0 = fp32 I/O with cast (M4+)
extern int g_fp16_io;
// Build an FP16 weight blob with the required header structure.
// weights_f32: source weights in row-major [out_ch, in_ch]
// Returns NSData with header + FP16 weights
@ -30,21 +33,32 @@ static NSData *mil_build_weight_blob(const float *weights_f32, int out_ch, int i
// Input W: [1, out_ch, in_ch] fp32
// Output: [1, out_ch, spatial] fp32
static NSString *mil_gen_matmul(int in_ch, int out_ch, int spatial) {
if (g_fp16_io) {
return [NSString stringWithFormat:
@"program(1.0)\n"
"[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n"
"{\n"
" func main<ios16>(tensor<fp16, [1, %d, %d]> x, tensor<fp16, [1, %d, %d]> W) {\n"
" tensor<bool, []> tx = const()[name = tensor<string, []>(\"tx\"), val = tensor<bool, []>(false)];\n"
" tensor<bool, []> ty = const()[name = tensor<string, []>(\"ty\"), val = tensor<bool, []>(false)];\n"
" tensor<fp16, [1, %d, %d]> y = matmul(transpose_x = tx, transpose_y = ty, x = W, y = x)[name = tensor<string, []>(\"mm\")];\n"
" } -> (y);\n"
"}\n",
in_ch, spatial, out_ch, in_ch, out_ch, spatial];
}
return [NSString stringWithFormat:
@"program(1.3)\n"
"[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
"{\"coremltools-version\", \"9.0\"}})]\n"
@"program(1.0)\n"
"[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n"
"{\n"
" func main<ios18>(tensor<fp32, [1, %d, %d]> x, tensor<fp32, [1, %d, %d]> W) {\n"
" string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n"
" tensor<fp16, [1, %d, %d]> x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_x\")];\n"
" tensor<fp16, [1, %d, %d]> W16 = cast(dtype = to_fp16, x = W)[name = string(\"cast_W\")];\n"
" bool tx = const()[name = string(\"tx\"), val = bool(false)];\n"
" bool ty = const()[name = string(\"ty\"), val = bool(false)];\n"
" tensor<fp16, [1, %d, %d]> y16 = matmul(transpose_x = tx, transpose_y = ty, x = W16, y = x16)[name = string(\"mm\")];\n"
" string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"
" tensor<fp32, [1, %d, %d]> y = cast(dtype = to_fp32, x = y16)[name = string(\"cast_out\")];\n"
" func main<ios16>(tensor<fp32, [1, %d, %d]> x, tensor<fp32, [1, %d, %d]> W) {\n"
" tensor<string, []> to_fp16 = const()[name = tensor<string, []>(\"to_fp16\"), val = tensor<string, []>(\"fp16\")];\n"
" tensor<fp16, [1, %d, %d]> x16 = cast(dtype = to_fp16, x = x)[name = tensor<string, []>(\"cast_x\")];\n"
" tensor<fp16, [1, %d, %d]> W16 = cast(dtype = to_fp16, x = W)[name = tensor<string, []>(\"cast_W\")];\n"
" tensor<bool, []> tx = const()[name = tensor<string, []>(\"tx\"), val = tensor<bool, []>(false)];\n"
" tensor<bool, []> ty = const()[name = tensor<string, []>(\"ty\"), val = tensor<bool, []>(false)];\n"
" tensor<fp16, [1, %d, %d]> y16 = matmul(transpose_x = tx, transpose_y = ty, x = W16, y = x16)[name = tensor<string, []>(\"mm\")];\n"
" tensor<string, []> to_fp32 = const()[name = tensor<string, []>(\"to_fp32\"), val = tensor<string, []>(\"fp32\")];\n"
" tensor<fp32, [1, %d, %d]> y = cast(dtype = to_fp32, x = y16)[name = tensor<string, []>(\"cast_out\")];\n"
" } -> (y);\n"
"}\n",
in_ch, spatial, out_ch, in_ch,
@ -54,26 +68,45 @@ static NSString *mil_gen_matmul(int in_ch, int out_ch, int spatial) {
// Keep the baked-weight version for reference (used in inference-only scenarios)
static NSString *mil_gen_conv(int in_ch, int out_ch, int spatial) {
if (g_fp16_io) {
return [NSString stringWithFormat:
@"program(1.0)\n"
"[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n"
"{\n"
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
" tensor<string, []> c_pad_type = const()[name = tensor<string, []>(\"c_pad_type\"), val = tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> c_strides = const()[name = tensor<string, []>(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> c_pad = const()[name = tensor<string, []>(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> c_dilations = const()[name = tensor<string, []>(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, []> c_groups = const()[name = tensor<string, []>(\"c_groups\"), val = tensor<int32, []>(1)];\n"
" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<fp16, [1, %d, 1, %d]> y = conv(dilations = c_dilations, groups = c_groups, "
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W, x = x)[name = tensor<string, []>(\"conv\")];\n"
" } -> (y);\n"
"}\n",
in_ch, spatial,
out_ch, in_ch, out_ch, in_ch,
out_ch, spatial];
}
return [NSString stringWithFormat:
@"program(1.3)\n"
"[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
"{\"coremltools-version\", \"9.0\"}})]\n"
@"program(1.0)\n"
"[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n"
"{\n"
" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
" string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n"
" tensor<int32, [2]> c_strides = const()[name = string(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> c_pad = const()[name = string(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> c_dilations = const()[name = string(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
" int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n"
" string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n"
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n"
" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
" func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
" tensor<string, []> c_pad_type = const()[name = tensor<string, []>(\"c_pad_type\"), val = tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> c_strides = const()[name = tensor<string, []>(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> c_pad = const()[name = tensor<string, []>(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> c_dilations = const()[name = tensor<string, []>(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, []> c_groups = const()[name = tensor<string, []>(\"c_groups\"), val = tensor<int32, []>(1)];\n"
" tensor<string, []> to_fp16 = const()[name = tensor<string, []>(\"to_fp16\"), val = tensor<string, []>(\"fp16\")];\n"
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = tensor<string, []>(\"cast_in\")];\n"
" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<fp16, [1, %d, 1, %d]> y16 = conv(dilations = c_dilations, groups = c_groups, "
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W, x = x16)[name = string(\"conv\")];\n"
" string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"
" tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = to_fp32, x = y16)[name = string(\"cast_out\")];\n"
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W, x = x16)[name = tensor<string, []>(\"conv\")];\n"
" tensor<string, []> to_fp32 = const()[name = tensor<string, []>(\"to_fp32\"), val = tensor<string, []>(\"fp32\")];\n"
" tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = to_fp32, x = y16)[name = tensor<string, []>(\"cast_out\")];\n"
" } -> (y);\n"
"}\n",
in_ch, spatial, in_ch, spatial,
@ -88,36 +121,65 @@ static NSString *mil_gen_conv(int in_ch, int out_ch, int spatial) {
// where cs = 64 + dim*dim*2
static NSString *mil_gen_qkv(int dim, int spatial) {
NSUInteger cs = 64 + (NSUInteger)dim * dim * 2;
if (g_fp16_io) {
return [NSString stringWithFormat:
@"program(1.0)\n"
"[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n"
"{\n"
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
" tensor<string, []> c_pad_type = const()[name = tensor<string, []>(\"c_pad_type\"), val = tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> c_strides = const()[name = tensor<string, []>(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> c_pad = const()[name = tensor<string, []>(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> c_dilations = const()[name = tensor<string, []>(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, []> c_groups = const()[name = tensor<string, []>(\"c_groups\"), val = tensor<int32, []>(1)];\n"
" tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = tensor<string, []>(\"Wq\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = tensor<string, []>(\"Wk\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(%lu)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = tensor<string, []>(\"Wv\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(%lu)))];\n"
" tensor<fp16, [1, %d, 1, %d]> q = conv(dilations = c_dilations, groups = c_groups, "
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wq, x = x)[name = tensor<string, []>(\"conv_q\")];\n"
" tensor<fp16, [1, %d, 1, %d]> k = conv(dilations = c_dilations, groups = c_groups, "
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wk, x = x)[name = tensor<string, []>(\"conv_k\")];\n"
" tensor<fp16, [1, %d, 1, %d]> v = conv(dilations = c_dilations, groups = c_groups, "
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wv, x = x)[name = tensor<string, []>(\"conv_v\")];\n"
" } -> (q, k, v);\n"
"}\n",
dim, spatial,
dim, dim, dim, dim,
dim, dim, dim, dim, (unsigned long)(64 + cs),
dim, dim, dim, dim, (unsigned long)(64 + 2*cs),
dim, spatial, dim, spatial, dim, spatial];
}
return [NSString stringWithFormat:
@"program(1.3)\n"
"[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
"{\"coremltools-version\", \"9.0\"}})]\n"
@"program(1.0)\n"
"[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n"
"{\n"
" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
" string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n"
" tensor<int32, [2]> c_strides = const()[name = string(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> c_pad = const()[name = string(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> c_dilations = const()[name = string(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
" int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n"
" string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n"
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n"
" tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = string(\"Wq\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = string(\"Wk\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = string(\"Wv\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n"
" func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
" tensor<string, []> c_pad_type = const()[name = tensor<string, []>(\"c_pad_type\"), val = tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> c_strides = const()[name = tensor<string, []>(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> c_pad = const()[name = tensor<string, []>(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> c_dilations = const()[name = tensor<string, []>(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, []> c_groups = const()[name = tensor<string, []>(\"c_groups\"), val = tensor<int32, []>(1)];\n"
" tensor<string, []> to_fp16 = const()[name = tensor<string, []>(\"to_fp16\"), val = tensor<string, []>(\"fp16\")];\n"
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = tensor<string, []>(\"cast_in\")];\n"
" tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = tensor<string, []>(\"Wq\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = tensor<string, []>(\"Wk\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(%lu)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = tensor<string, []>(\"Wv\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(%lu)))];\n"
" tensor<fp16, [1, %d, 1, %d]> q16 = conv(dilations = c_dilations, groups = c_groups, "
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wq, x = x16)[name = string(\"conv_q\")];\n"
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wq, x = x16)[name = tensor<string, []>(\"conv_q\")];\n"
" tensor<fp16, [1, %d, 1, %d]> k16 = conv(dilations = c_dilations, groups = c_groups, "
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wk, x = x16)[name = string(\"conv_k\")];\n"
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wk, x = x16)[name = tensor<string, []>(\"conv_k\")];\n"
" tensor<fp16, [1, %d, 1, %d]> v16 = conv(dilations = c_dilations, groups = c_groups, "
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wv, x = x16)[name = string(\"conv_v\")];\n"
" string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"
" tensor<fp32, [1, %d, 1, %d]> q = cast(dtype = to_fp32, x = q16)[name = string(\"cast_q\")];\n"
" tensor<fp32, [1, %d, 1, %d]> k = cast(dtype = to_fp32, x = k16)[name = string(\"cast_k\")];\n"
" tensor<fp32, [1, %d, 1, %d]> v = cast(dtype = to_fp32, x = v16)[name = string(\"cast_v\")];\n"
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wv, x = x16)[name = tensor<string, []>(\"conv_v\")];\n"
" tensor<string, []> to_fp32 = const()[name = tensor<string, []>(\"to_fp32\"), val = tensor<string, []>(\"fp32\")];\n"
" tensor<fp32, [1, %d, 1, %d]> q = cast(dtype = to_fp32, x = q16)[name = tensor<string, []>(\"cast_q\")];\n"
" tensor<fp32, [1, %d, 1, %d]> k = cast(dtype = to_fp32, x = k16)[name = tensor<string, []>(\"cast_k\")];\n"
" tensor<fp32, [1, %d, 1, %d]> v = cast(dtype = to_fp32, x = v16)[name = tensor<string, []>(\"cast_v\")];\n"
" } -> (q, k, v);\n"
"}\n",
dim, spatial, dim, spatial,
@ -173,31 +235,55 @@ static NSData *mil_build_ffn_up_weight_blob(const float *w1, const float *w3, in
// Generate MIL for fused FFN up: w1 + w3 parallel convs
static NSString *mil_gen_ffn_up(int dim, int hidden_dim, int spatial) {
NSUInteger cs = 64 + (NSUInteger)hidden_dim * dim * 2;
if (g_fp16_io) {
return [NSString stringWithFormat:
@"program(1.0)\n"
"[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n"
"{\n"
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
" tensor<string, []> c_pad_type = const()[name = tensor<string, []>(\"c_pad_type\"), val = tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> c_strides = const()[name = tensor<string, []>(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> c_pad = const()[name = tensor<string, []>(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> c_dilations = const()[name = tensor<string, []>(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, []> c_groups = const()[name = tensor<string, []>(\"c_groups\"), val = tensor<int32, []>(1)];\n"
" tensor<fp16, [%d, %d, 1, 1]> W1 = const()[name = tensor<string, []>(\"W1\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> W3 = const()[name = tensor<string, []>(\"W3\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(%lu)))];\n"
" tensor<fp16, [1, %d, 1, %d]> out1 = conv(dilations = c_dilations, groups = c_groups, "
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W1, x = x)[name = tensor<string, []>(\"conv_w1\")];\n"
" tensor<fp16, [1, %d, 1, %d]> out3 = conv(dilations = c_dilations, groups = c_groups, "
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W3, x = x)[name = tensor<string, []>(\"conv_w3\")];\n"
" } -> (out1, out3);\n"
"}\n",
dim, spatial,
hidden_dim, dim, hidden_dim, dim,
hidden_dim, dim, hidden_dim, dim, (unsigned long)(64 + cs),
hidden_dim, spatial, hidden_dim, spatial];
}
return [NSString stringWithFormat:
@"program(1.3)\n"
"[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
"{\"coremltools-version\", \"9.0\"}})]\n"
@"program(1.0)\n"
"[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n"
"{\n"
" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
" string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n"
" tensor<int32, [2]> c_strides = const()[name = string(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> c_pad = const()[name = string(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> c_dilations = const()[name = string(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
" int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n"
" string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n"
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n"
" tensor<fp16, [%d, %d, 1, 1]> W1 = const()[name = string(\"W1\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> W3 = const()[name = string(\"W3\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n"
" func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
" tensor<string, []> c_pad_type = const()[name = tensor<string, []>(\"c_pad_type\"), val = tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> c_strides = const()[name = tensor<string, []>(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> c_pad = const()[name = tensor<string, []>(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> c_dilations = const()[name = tensor<string, []>(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, []> c_groups = const()[name = tensor<string, []>(\"c_groups\"), val = tensor<int32, []>(1)];\n"
" tensor<string, []> to_fp16 = const()[name = tensor<string, []>(\"to_fp16\"), val = tensor<string, []>(\"fp16\")];\n"
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = tensor<string, []>(\"cast_in\")];\n"
" tensor<fp16, [%d, %d, 1, 1]> W1 = const()[name = tensor<string, []>(\"W1\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> W3 = const()[name = tensor<string, []>(\"W3\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(%lu)))];\n"
" tensor<fp16, [1, %d, 1, %d]> h1 = conv(dilations = c_dilations, groups = c_groups, "
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W1, x = x16)[name = string(\"conv_w1\")];\n"
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W1, x = x16)[name = tensor<string, []>(\"conv_w1\")];\n"
" tensor<fp16, [1, %d, 1, %d]> h3 = conv(dilations = c_dilations, groups = c_groups, "
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W3, x = x16)[name = string(\"conv_w3\")];\n"
" string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"
" tensor<fp32, [1, %d, 1, %d]> out1 = cast(dtype = to_fp32, x = h1)[name = string(\"cast_h1\")];\n"
" tensor<fp32, [1, %d, 1, %d]> out3 = cast(dtype = to_fp32, x = h3)[name = string(\"cast_h3\")];\n"
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W3, x = x16)[name = tensor<string, []>(\"conv_w3\")];\n"
" tensor<string, []> to_fp32 = const()[name = tensor<string, []>(\"to_fp32\"), val = tensor<string, []>(\"fp32\")];\n"
" tensor<fp32, [1, %d, 1, %d]> out1 = cast(dtype = to_fp32, x = h1)[name = tensor<string, []>(\"cast_h1\")];\n"
" tensor<fp32, [1, %d, 1, %d]> out3 = cast(dtype = to_fp32, x = h3)[name = tensor<string, []>(\"cast_h3\")];\n"
" } -> (out1, out3);\n"
"}\n",
dim, spatial, dim, spatial,

View File

@ -9,22 +9,43 @@
// Transpose back to [S, out_dim] row-major
static void ane_conv_eval(ANEKernel *kernel, const float *x, float *y,
int S, int in_dim, int out_dim) {
float *x_t = (float*)malloc(S * in_dim * sizeof(float));
for (int t = 0; t < S; t++)
for (int i = 0; i < in_dim; i++)
x_t[i*S + t] = x[t*in_dim + i];
if (g_fp16_io) {
// fp16 I/O path: transpose + convert float→fp16, write, eval, read fp16→float + transpose
_Float16 *x_t = (_Float16*)malloc(S * in_dim * sizeof(_Float16));
for (int t = 0; t < S; t++)
for (int i = 0; i < in_dim; i++)
x_t[i*S + t] = (_Float16)x[t*in_dim + i];
ane_write_input(kernel, 0, x_t, S * in_dim * sizeof(float));
ane_eval(kernel);
ane_write_input(kernel, 0, x_t, S * in_dim * sizeof(_Float16));
ane_eval(kernel);
float *y_t = (float*)malloc(S * out_dim * sizeof(float));
ane_read_output(kernel, 0, y_t, S * out_dim * sizeof(float));
_Float16 *y_t = (_Float16*)malloc(S * out_dim * sizeof(_Float16));
ane_read_output(kernel, 0, y_t, S * out_dim * sizeof(_Float16));
for (int t = 0; t < S; t++)
for (int i = 0; i < out_dim; i++)
y[t*out_dim + i] = y_t[i*S + t];
for (int t = 0; t < S; t++)
for (int i = 0; i < out_dim; i++)
y[t*out_dim + i] = (float)y_t[i*S + t];
free(x_t); free(y_t);
free(x_t); free(y_t);
} else {
// fp32 I/O path: transpose, write, eval, read, transpose back
float *x_t = (float*)malloc(S * in_dim * sizeof(float));
for (int t = 0; t < S; t++)
for (int i = 0; i < in_dim; i++)
x_t[i*S + t] = x[t*in_dim + i];
ane_write_input(kernel, 0, x_t, S * in_dim * sizeof(float));
ane_eval(kernel);
float *y_t = (float*)malloc(S * out_dim * sizeof(float));
ane_read_output(kernel, 0, y_t, S * out_dim * sizeof(float));
for (int t = 0; t < S; t++)
for (int i = 0; i < out_dim; i++)
y[t*out_dim + i] = y_t[i*S + t];
free(x_t); free(y_t);
}
}
// CPU matmul fallback: y = W @ x, W[out_dim, in_dim], x[S, in_dim] → y[S, out_dim]

View File

@ -151,8 +151,9 @@ static int model_load_weights(Model *m, const char *path) {
static ANEKernel *compile_conv_kernel(const float *weights, int in_ch, int out_ch, int spatial) {
NSData *wb = mil_build_weight_blob(weights, out_ch, in_ch);
NSString *mil = mil_gen_conv(in_ch, out_ch, spatial);
size_t inBytes = (size_t)in_ch * spatial * 4;
size_t outBytes = (size_t)out_ch * spatial * 4;
size_t bpe = g_fp16_io ? 2 : 4;
size_t inBytes = (size_t)in_ch * spatial * bpe;
size_t outBytes = (size_t)out_ch * spatial * bpe;
return ane_compile([mil dataUsingEncoding:NSUTF8StringEncoding], wb, 1, &inBytes, 1, &outBytes);
}
@ -161,9 +162,31 @@ static int model_compile_kernels(Model *m, int seq_len) {
m->seq_len = seq_len;
int d = m->cfg.dim, hd = m->cfg.hidden_dim, vs = m->cfg.vocab_size;
int S = seq_len;
printf("Compiling %d ANE conv kernels (S=%d)...\n", N_LAYERS * 7 + 1, S);
printf("Compiling %d ANE conv kernels (S=%d, %s I/O)...\n",
N_LAYERS * 7 + 1, S, g_fp16_io ? "fp16" : "fp32");
for (int l = 0; l < N_LAYERS; l++) {
// Try first layer as canary — if cast op fails, retry with fp16 I/O
m->kern_q[0] = compile_conv_kernel(m->wq[0], d, d, S);
if (!m->kern_q[0] && !g_fp16_io) {
printf(" Compile failed, retrying with fp16 I/O (M1/M2 fallback)...\n");
g_fp16_io = 1;
m->kern_q[0] = compile_conv_kernel(m->wq[0], d, d, S);
}
if (!m->kern_q[0]) { fprintf(stderr, "L0 kern_q fail\n"); return -1; }
m->kern_k[0] = compile_conv_kernel(m->wk[0], d, d, S);
m->kern_v[0] = compile_conv_kernel(m->wv[0], d, d, S);
m->kern_o[0] = compile_conv_kernel(m->wo[0], d, d, S);
m->kern_w1[0] = compile_conv_kernel(m->w1[0], d, hd, S);
m->kern_w2[0] = compile_conv_kernel(m->w2[0], hd, d, S);
m->kern_w3[0] = compile_conv_kernel(m->w3[0], d, hd, S);
if (!m->kern_k[0] || !m->kern_v[0] || !m->kern_o[0] ||
!m->kern_w1[0] || !m->kern_w2[0] || !m->kern_w3[0]) {
fprintf(stderr, "L0 compile fail\n"); return -1;
}
printf(" Layer 0 OK\n");
for (int l = 1; l < N_LAYERS; l++) {
m->kern_q[l] = compile_conv_kernel(m->wq[l], d, d, S);
m->kern_k[l] = compile_conv_kernel(m->wk[l], d, d, S);
m->kern_v[l] = compile_conv_kernel(m->wv[l], d, d, S);
@ -171,20 +194,18 @@ static int model_compile_kernels(Model *m, int seq_len) {
m->kern_w1[l] = compile_conv_kernel(m->w1[l], d, hd, S);
m->kern_w2[l] = compile_conv_kernel(m->w2[l], hd, d, S);
m->kern_w3[l] = compile_conv_kernel(m->w3[l], d, hd, S);
if (!m->kern_q[l]) { fprintf(stderr, "L%d kern_q fail\n",l); return -1; }
if (!m->kern_k[l]) { fprintf(stderr, "L%d kern_k fail\n",l); return -1; }
if (!m->kern_v[l]) { fprintf(stderr, "L%d kern_v fail\n",l); return -1; }
if (!m->kern_o[l]) { fprintf(stderr, "L%d kern_o fail\n",l); return -1; }
if (!m->kern_w1[l]) { fprintf(stderr, "L%d kern_w1 fail\n",l); return -1; }
if (!m->kern_w2[l]) { fprintf(stderr, "L%d kern_w2 fail\n",l); return -1; }
if (!m->kern_w3[l]) { fprintf(stderr, "L%d kern_w3 fail\n",l); return -1; }
if (!m->kern_q[l] || !m->kern_k[l] || !m->kern_v[l] || !m->kern_o[l] ||
!m->kern_w1[l] || !m->kern_w2[l] || !m->kern_w3[l]) {
fprintf(stderr, "L%d compile fail\n", l); return -1;
}
printf(" Layer %d OK\n", l);
}
m->kern_cls = compile_conv_kernel(m->wcls, d, vs, S);
if (!m->kern_cls) {
fprintf(stderr, "Classifier kernel compile failed (dim=%d→vocab=%d too large?), using CPU for cls\n", d, vs);
}
printf(" All kernels compiled (%d conv + %s)\n", N_LAYERS * 7, m->kern_cls ? "cls" : "cls=CPU");
printf(" All kernels compiled (%d conv + %s, %s I/O)\n",
N_LAYERS * 7, m->kern_cls ? "cls" : "cls=CPU", g_fp16_io ? "fp16" : "fp32");
return 0;
}

View File

@ -4,15 +4,13 @@
#include "stories_io.h"
#define MIL_HDR \
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, " \
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " \
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
#define CONV_CONST \
" string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" \
" tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n" \
" tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n" \
" tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n" \
" int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
" tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n" \
" tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n" \
" tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n" \
" tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n" \
" tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"
// SDPA forward + taps: x_in → rmsnorm → QKV+SDPA+Wo → concat(o_out, Q, K, V, attn_out, xnorm)
static NSString *gen_sdpa_fwd_taps(void) {
@ -20,53 +18,53 @@ static NSString *gen_sdpa_fwd_taps(void) {
float invd = 1.0f/(float)DIM;
NSMutableString *m = [NSMutableString string];
[m appendString:MIL_HDR];
[m appendFormat:@" func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ];
[m appendFormat:@" tensor<int32, [1]> rax = const()[name=string(\"rax\"), val=tensor<int32, [1]>([1])];\n"];
[m appendFormat:@" bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"];
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ];
[m appendFormat:@" fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", invd];
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ];
[m appendFormat:@" fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"];
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ];
[m appendFormat:@" fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"];
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> xr = mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,1]> rw = const()[name=string(\"rw\"), val=tensor<fp16, [1,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/rms1.bin\"), offset=uint64(64)))];\n", DIM, DIM];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> xn = mul(x=xr,y=rw)[name=string(\"xn\")];\n", DIM, SEQ];
[m appendFormat:@" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> sq = mul(x=x,y=x)[name=tensor<string, []>(\"sq\")];\n", DIM, SEQ];
[m appendFormat:@" tensor<int32, [1]> rax = const()[name=tensor<string, []>(\"rax\"), val=tensor<int32, [1]>([1])];\n"];
[m appendFormat:@" tensor<bool, []> kd = const()[name=tensor<string, []>(\"kd\"), val=tensor<bool, []>(true)];\n"];
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=tensor<string, []>(\"ss\")];\n", SEQ];
[m appendFormat:@" tensor<fp16, []> invd = const()[name=tensor<string, []>(\"invd\"), val=tensor<fp16, []>(%f)];\n", invd];
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> ss2 = mul(x=ss,y=invd)[name=tensor<string, []>(\"ss2\")];\n", SEQ];
[m appendFormat:@" tensor<fp16, []> eps = const()[name=tensor<string, []>(\"eps\"), val=tensor<fp16, []>(0.00001)];\n"];
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> ss3 = add(x=ss2,y=eps)[name=tensor<string, []>(\"ss3\")];\n", SEQ];
[m appendFormat:@" tensor<fp16, []> nhalf = const()[name=tensor<string, []>(\"nhalf\"), val=tensor<fp16, []>(-0.5)];\n"];
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> rrms = pow(x=ss3,y=nhalf)[name=tensor<string, []>(\"rrms\")];\n", SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> xr = mul(x=x,y=rrms)[name=tensor<string, []>(\"xr\")];\n", DIM, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,1]> rw = const()[name=tensor<string, []>(\"rw\"), val=tensor<fp16, [1,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/rms1.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM, DIM];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> xn = mul(x=xr,y=rw)[name=tensor<string, []>(\"xn\")];\n", DIM, SEQ];
[m appendString:@CONV_CONST];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wq = const()[name=string(\"Wq\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wq.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wk = const()[name=string(\"Wk\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wk.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wv = const()[name=string(\"Wv\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wv.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wo = const()[name=string(\"Wo\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wo.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> qf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wq,x=xn)[name=string(\"cq\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> kf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wk,x=xn)[name=string(\"ck\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> vf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wv,x=xn)[name=string(\"cv\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<int32, [4]> qsh = const()[name=string(\"qsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
[m appendString:@" tensor<int32, [4]> pm = const()[name=string(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> q4 = reshape(shape=qsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=q4)[name=string(\"tq\")];\n", HEADS,SEQ,HD];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> k4 = reshape(shape=qsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=k4)[name=string(\"tk\")];\n", HEADS,SEQ,HD];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> v4 = reshape(shape=qsh,x=vf)[name=string(\"rv\")];\n", HEADS,HD,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> v = transpose(perm=pm,x=v4)[name=string(\"tv\")];\n", HEADS,SEQ,HD];
[m appendString:@" bool tx = const()[name=string(\"tx\"), val=bool(false)];\n"];
[m appendString:@" bool ty = const()[name=string(\"ty\"), val=bool(true)];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> sc1 = matmul(transpose_x=tx,transpose_y=ty,x=q,y=k)[name=string(\"mm1\")];\n", HEADS,SEQ,SEQ];
[m appendFormat:@" fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> sc2 = mul(x=sc1,y=scv)[name=string(\"scl\")];\n", HEADS,SEQ,SEQ];
[m appendFormat:@" tensor<fp16, [1,1,%d,%d]> cm = const()[name=string(\"cm\"), val=tensor<fp16, [1,1,%d,%d]>(BLOBFILE(path=string(\"@model_path/weights/mask.bin\"), offset=uint64(64)))];\n", SEQ,SEQ,SEQ,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> ms = add(x=sc2,y=cm)[name=string(\"msk\")];\n", HEADS,SEQ,SEQ];
[m appendString:@" int32 sax = const()[name=string(\"sax\"), val=int32(-1)];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> aw = softmax(axis=sax,x=ms)[name=string(\"sm\")];\n", HEADS,SEQ,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> a4 = matmul(transpose_x=tx,transpose_y=tx,x=aw,y=v)[name=string(\"mm2\")];\n", HEADS,SEQ,HD];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> at = transpose(perm=pm,x=a4)[name=string(\"ta\")];\n", HEADS,HD,SEQ];
[m appendFormat:@" tensor<int32, [4]> os = const()[name=string(\"os\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> af = reshape(shape=os,x=at)[name=string(\"ra\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> oo = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wo,x=af)[name=string(\"co\")];\n", DIM,SEQ];
[m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
[m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(oo,qf,kf,vf,af,xn))[name=string(\"cat\")];\n", 6*DIM,SEQ];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wq = const()[name=tensor<string, []>(\"Wq\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/wq.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,DIM,DIM,DIM];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wk = const()[name=tensor<string, []>(\"Wk\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/wk.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,DIM,DIM,DIM];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wv = const()[name=tensor<string, []>(\"Wv\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/wv.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,DIM,DIM,DIM];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wo = const()[name=tensor<string, []>(\"Wo\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/wo.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,DIM,DIM,DIM];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> qf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wq,x=xn)[name=tensor<string, []>(\"cq\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> kf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wk,x=xn)[name=tensor<string, []>(\"ck\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> vf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wv,x=xn)[name=tensor<string, []>(\"cv\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<int32, [4]> qsh = const()[name=tensor<string, []>(\"qsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
[m appendString:@" tensor<int32, [4]> pm = const()[name=tensor<string, []>(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> q4 = reshape(shape=qsh,x=qf)[name=tensor<string, []>(\"rq\")];\n", HEADS,HD,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=q4)[name=tensor<string, []>(\"tq\")];\n", HEADS,SEQ,HD];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> k4 = reshape(shape=qsh,x=kf)[name=tensor<string, []>(\"rk\")];\n", HEADS,HD,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=k4)[name=tensor<string, []>(\"tk\")];\n", HEADS,SEQ,HD];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> v4 = reshape(shape=qsh,x=vf)[name=tensor<string, []>(\"rv\")];\n", HEADS,HD,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> v = transpose(perm=pm,x=v4)[name=tensor<string, []>(\"tv\")];\n", HEADS,SEQ,HD];
[m appendString:@" tensor<bool, []> tx = const()[name=tensor<string, []>(\"tx\"), val=tensor<bool, []>(false)];\n"];
[m appendString:@" tensor<bool, []> ty = const()[name=tensor<string, []>(\"ty\"), val=tensor<bool, []>(true)];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> sc1 = matmul(transpose_x=tx,transpose_y=ty,x=q,y=k)[name=tensor<string, []>(\"mm1\")];\n", HEADS,SEQ,SEQ];
[m appendFormat:@" tensor<fp16, []> scv = const()[name=tensor<string, []>(\"scv\"), val=tensor<fp16, []>(%f)];\n", sc];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> sc2 = mul(x=sc1,y=scv)[name=tensor<string, []>(\"scl\")];\n", HEADS,SEQ,SEQ];
[m appendFormat:@" tensor<fp16, [1,1,%d,%d]> cm = const()[name=tensor<string, []>(\"cm\"), val=tensor<fp16, [1,1,%d,%d]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/mask.bin\"), offset=tensor<uint64, []>(64)))];\n", SEQ,SEQ,SEQ,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> ms = add(x=sc2,y=cm)[name=tensor<string, []>(\"msk\")];\n", HEADS,SEQ,SEQ];
[m appendString:@" tensor<int32, []> sax = const()[name=tensor<string, []>(\"sax\"), val=tensor<int32, []>(-1)];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> aw = softmax(axis=sax,x=ms)[name=tensor<string, []>(\"sm\")];\n", HEADS,SEQ,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> a4 = matmul(transpose_x=tx,transpose_y=tx,x=aw,y=v)[name=tensor<string, []>(\"mm2\")];\n", HEADS,SEQ,HD];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> at = transpose(perm=pm,x=a4)[name=tensor<string, []>(\"ta\")];\n", HEADS,HD,SEQ];
[m appendFormat:@" tensor<int32, [4]> os = const()[name=tensor<string, []>(\"os\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> af = reshape(shape=os,x=at)[name=tensor<string, []>(\"ra\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> oo = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wo,x=af)[name=tensor<string, []>(\"co\")];\n", DIM,SEQ];
[m appendString:@" tensor<int32, []> cax = const()[name=tensor<string, []>(\"cax\"), val=tensor<int32, []>(1)];\n"];
[m appendString:@" tensor<bool, []> cid = const()[name=tensor<string, []>(\"cid\"), val=tensor<bool, []>(false)];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(oo,qf,kf,vf,af,xn))[name=tensor<string, []>(\"cat\")];\n", 6*DIM,SEQ];
[m appendString:@" } -> (out);\n}\n"];
return m;
}
@ -76,33 +74,33 @@ static NSString *gen_ffn_fwd_taps(void) {
float invd = 1.0f/(float)DIM;
NSMutableString *m = [NSMutableString string];
[m appendString:MIL_HDR];
[m appendFormat:@" func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ];
[m appendFormat:@" tensor<int32, [1]> rax = const()[name=string(\"rax\"), val=tensor<int32, [1]>([1])];\n"];
[m appendFormat:@" bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"];
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ];
[m appendFormat:@" fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", invd];
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ];
[m appendFormat:@" fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"];
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ];
[m appendFormat:@" fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"];
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> xr = mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,1]> rw = const()[name=string(\"rw\"), val=tensor<fp16, [1,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/rms2.bin\"), offset=uint64(64)))];\n", DIM, DIM];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> xn = mul(x=xr,y=rw)[name=string(\"xn\")];\n", DIM, SEQ];
[m appendFormat:@" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> sq = mul(x=x,y=x)[name=tensor<string, []>(\"sq\")];\n", DIM, SEQ];
[m appendFormat:@" tensor<int32, [1]> rax = const()[name=tensor<string, []>(\"rax\"), val=tensor<int32, [1]>([1])];\n"];
[m appendFormat:@" tensor<bool, []> kd = const()[name=tensor<string, []>(\"kd\"), val=tensor<bool, []>(true)];\n"];
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=tensor<string, []>(\"ss\")];\n", SEQ];
[m appendFormat:@" tensor<fp16, []> invd = const()[name=tensor<string, []>(\"invd\"), val=tensor<fp16, []>(%f)];\n", invd];
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> ss2 = mul(x=ss,y=invd)[name=tensor<string, []>(\"ss2\")];\n", SEQ];
[m appendFormat:@" tensor<fp16, []> eps = const()[name=tensor<string, []>(\"eps\"), val=tensor<fp16, []>(0.00001)];\n"];
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> ss3 = add(x=ss2,y=eps)[name=tensor<string, []>(\"ss3\")];\n", SEQ];
[m appendFormat:@" tensor<fp16, []> nhalf = const()[name=tensor<string, []>(\"nhalf\"), val=tensor<fp16, []>(-0.5)];\n"];
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> rrms = pow(x=ss3,y=nhalf)[name=tensor<string, []>(\"rrms\")];\n", SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> xr = mul(x=x,y=rrms)[name=tensor<string, []>(\"xr\")];\n", DIM, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,1]> rw = const()[name=tensor<string, []>(\"rw\"), val=tensor<fp16, [1,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/rms2.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM, DIM];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> xn = mul(x=xr,y=rw)[name=tensor<string, []>(\"xn\")];\n", DIM, SEQ];
[m appendString:@CONV_CONST];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> W1 = const()[name=string(\"W1\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w1.bin\"), offset=uint64(64)))];\n", HIDDEN,DIM,HIDDEN,DIM];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> W3 = const()[name=string(\"W3\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w3.bin\"), offset=uint64(64)))];\n", HIDDEN,DIM,HIDDEN,DIM];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> W2 = const()[name=string(\"W2\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w2.bin\"), offset=uint64(64)))];\n", DIM,HIDDEN,DIM,HIDDEN];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> h1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1,x=xn)[name=string(\"c1\")];\n", HIDDEN,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> h3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3,x=xn)[name=string(\"c3\")];\n", HIDDEN,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> sig = sigmoid(x=h1)[name=string(\"sg\")];\n", HIDDEN,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> silu = mul(x=h1,y=sig)[name=string(\"si\")];\n", HIDDEN,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> gate = mul(x=silu,y=h3)[name=string(\"gt\")];\n", HIDDEN,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2,x=gate)[name=string(\"c2\")];\n", DIM,SEQ];
[m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
[m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(y,h1,h3,gate,xn))[name=string(\"cat\")];\n", 2*DIM+3*HIDDEN,SEQ];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> W1 = const()[name=tensor<string, []>(\"W1\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/w1.bin\"), offset=tensor<uint64, []>(64)))];\n", HIDDEN,DIM,HIDDEN,DIM];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> W3 = const()[name=tensor<string, []>(\"W3\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/w3.bin\"), offset=tensor<uint64, []>(64)))];\n", HIDDEN,DIM,HIDDEN,DIM];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> W2 = const()[name=tensor<string, []>(\"W2\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/w2.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,HIDDEN,DIM,HIDDEN];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> h1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1,x=xn)[name=tensor<string, []>(\"c1\")];\n", HIDDEN,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> h3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3,x=xn)[name=tensor<string, []>(\"c3\")];\n", HIDDEN,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> sig = sigmoid(x=h1)[name=tensor<string, []>(\"sg\")];\n", HIDDEN,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> silu = mul(x=h1,y=sig)[name=tensor<string, []>(\"si\")];\n", HIDDEN,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> gate = mul(x=silu,y=h3)[name=tensor<string, []>(\"gt\")];\n", HIDDEN,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2,x=gate)[name=tensor<string, []>(\"c2\")];\n", DIM,SEQ];
[m appendString:@" tensor<int32, []> cax = const()[name=tensor<string, []>(\"cax\"), val=tensor<int32, []>(1)];\n"];
[m appendString:@" tensor<bool, []> cid = const()[name=tensor<string, []>(\"cid\"), val=tensor<bool, []>(false)];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(y,h1,h3,gate,xn))[name=tensor<string, []>(\"cat\")];\n", 2*DIM+3*HIDDEN,SEQ];
[m appendString:@" } -> (out);\n}\n"];
return m;
}
@ -111,36 +109,36 @@ static NSString *gen_ffn_fwd_taps(void) {
static NSString *gen_ffn_bwd(void) {
NSMutableString *m = [NSMutableString string];
[m appendString:MIL_HDR];
[m appendFormat:@" func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM+2*HIDDEN, SEQ];
[m appendFormat:@" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM+2*HIDDEN, SEQ];
[m appendString:@CONV_CONST];
[m appendString:@" tensor<int32, [4]> bd = const()[name=string(\"bd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
[m appendFormat:@" tensor<int32, [4]> sd = const()[name=string(\"sd\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dffn = slice_by_size(x=x,begin=bd,size=sd)[name=string(\"s0\")];\n", DIM, SEQ];
[m appendFormat:@" tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
[m appendFormat:@" tensor<int32, [4]> s1 = const()[name=string(\"s1\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", HIDDEN, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> h1 = slice_by_size(x=x,begin=b1,size=s1)[name=string(\"s1x\")];\n", HIDDEN, SEQ];
[m appendFormat:@" tensor<int32, [4]> b3 = const()[name=string(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM+HIDDEN];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> h3 = slice_by_size(x=x,begin=b3,size=s1)[name=string(\"s3x\")];\n", HIDDEN, SEQ];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> W2t = const()[name=string(\"W2t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w2t.bin\"), offset=uint64(64)))];\n", HIDDEN, DIM, HIDDEN, DIM];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dsilu = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2t,x=dffn)[name=string(\"cw2\")];\n", HIDDEN, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> sig = sigmoid(x=h1)[name=string(\"sg\")];\n", HIDDEN, SEQ];
[m appendString:@" fp16 one = const()[name=string(\"one\"), val=fp16(1.0)];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> oms = sub(x=one,y=sig)[name=string(\"oms\")];\n", HIDDEN, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> homs = mul(x=h1,y=oms)[name=string(\"homs\")];\n", HIDDEN, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> brk = add(x=one,y=homs)[name=string(\"brk\")];\n", HIDDEN, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dsd = mul(x=sig,y=brk)[name=string(\"dsd\")];\n", HIDDEN, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> t1 = mul(x=dsilu,y=h3)[name=string(\"t1\")];\n", HIDDEN, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dh1 = mul(x=t1,y=dsd)[name=string(\"dh1\")];\n", HIDDEN, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> slh = mul(x=h1,y=sig)[name=string(\"slh\")];\n", HIDDEN, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dh3 = mul(x=dsilu,y=slh)[name=string(\"dh3\")];\n", HIDDEN, SEQ];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> W1t = const()[name=string(\"W1t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w1t.bin\"), offset=uint64(64)))];\n", DIM, HIDDEN, DIM, HIDDEN];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> W3t = const()[name=string(\"W3t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w3t.bin\"), offset=uint64(64)))];\n", DIM, HIDDEN, DIM, HIDDEN];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dx1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1t,x=dh1)[name=string(\"cw1\")];\n", DIM, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dx3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3t,x=dh3)[name=string(\"cw3\")];\n", DIM, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dx = add(x=dx1,y=dx3)[name=string(\"adx\")];\n", DIM, SEQ];
[m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
[m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dx,dh1,dh3))[name=string(\"cat\")];\n", DIM+2*HIDDEN, SEQ];
[m appendString:@" tensor<int32, [4]> bd = const()[name=tensor<string, []>(\"bd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
[m appendFormat:@" tensor<int32, [4]> sd = const()[name=tensor<string, []>(\"sd\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dffn = slice_by_size(x=x,begin=bd,size=sd)[name=tensor<string, []>(\"s0\")];\n", DIM, SEQ];
[m appendFormat:@" tensor<int32, [4]> b1 = const()[name=tensor<string, []>(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
[m appendFormat:@" tensor<int32, [4]> s1 = const()[name=tensor<string, []>(\"s1\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", HIDDEN, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> h1 = slice_by_size(x=x,begin=b1,size=s1)[name=tensor<string, []>(\"s1x\")];\n", HIDDEN, SEQ];
[m appendFormat:@" tensor<int32, [4]> b3 = const()[name=tensor<string, []>(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM+HIDDEN];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> h3 = slice_by_size(x=x,begin=b3,size=s1)[name=tensor<string, []>(\"s3x\")];\n", HIDDEN, SEQ];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> W2t = const()[name=tensor<string, []>(\"W2t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/w2t.bin\"), offset=tensor<uint64, []>(64)))];\n", HIDDEN, DIM, HIDDEN, DIM];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dsilu = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2t,x=dffn)[name=tensor<string, []>(\"cw2\")];\n", HIDDEN, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> sig = sigmoid(x=h1)[name=tensor<string, []>(\"sg\")];\n", HIDDEN, SEQ];
[m appendString:@" tensor<fp16, []> one = const()[name=tensor<string, []>(\"one\"), val=tensor<fp16, []>(1.0)];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> oms = sub(x=one,y=sig)[name=tensor<string, []>(\"oms\")];\n", HIDDEN, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> homs = mul(x=h1,y=oms)[name=tensor<string, []>(\"homs\")];\n", HIDDEN, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> brk = add(x=one,y=homs)[name=tensor<string, []>(\"brk\")];\n", HIDDEN, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dsd = mul(x=sig,y=brk)[name=tensor<string, []>(\"dsd\")];\n", HIDDEN, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> t1 = mul(x=dsilu,y=h3)[name=tensor<string, []>(\"t1\")];\n", HIDDEN, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dh1 = mul(x=t1,y=dsd)[name=tensor<string, []>(\"dh1\")];\n", HIDDEN, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> slh = mul(x=h1,y=sig)[name=tensor<string, []>(\"slh\")];\n", HIDDEN, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dh3 = mul(x=dsilu,y=slh)[name=tensor<string, []>(\"dh3\")];\n", HIDDEN, SEQ];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> W1t = const()[name=tensor<string, []>(\"W1t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/w1t.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM, HIDDEN, DIM, HIDDEN];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> W3t = const()[name=tensor<string, []>(\"W3t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/w3t.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM, HIDDEN, DIM, HIDDEN];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dx1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1t,x=dh1)[name=tensor<string, []>(\"cw1\")];\n", DIM, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dx3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3t,x=dh3)[name=tensor<string, []>(\"cw3\")];\n", DIM, SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dx = add(x=dx1,y=dx3)[name=tensor<string, []>(\"adx\")];\n", DIM, SEQ];
[m appendString:@" tensor<int32, []> cax = const()[name=tensor<string, []>(\"cax\"), val=tensor<int32, []>(1)];\n"];
[m appendString:@" tensor<bool, []> cid = const()[name=tensor<string, []>(\"cid\"), val=tensor<bool, []>(false)];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dx,dh1,dh3))[name=tensor<string, []>(\"cat\")];\n", DIM+2*HIDDEN, SEQ];
[m appendString:@" } -> (out);\n}\n"];
return m;
}
@ -149,23 +147,23 @@ static NSString *gen_ffn_bwd(void) {
static NSString *gen_qkvb(void) {
NSMutableString *m = [NSMutableString string];
[m appendString:MIL_HDR];
[m appendFormat:@" func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", 3*DIM, SEQ];
[m appendFormat:@" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n", 3*DIM, SEQ];
[m appendString:@CONV_CONST];
[m appendFormat:@" tensor<int32, [4]> sz = const()[name=string(\"sz\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
[m appendString:@" tensor<int32, [4]> b0 = const()[name=string(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dq = slice_by_size(x=x,begin=b0,size=sz)[name=string(\"s0\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dk = slice_by_size(x=x,begin=b1,size=sz)[name=string(\"s1\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<int32, [4]> b2 = const()[name=string(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*DIM];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dv = slice_by_size(x=x,begin=b2,size=sz)[name=string(\"s2\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wqt = const()[name=string(\"Wqt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wqt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wkt = const()[name=string(\"Wkt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wkt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wvt = const()[name=string(\"Wvt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wvt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dxq = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wqt,x=dq)[name=string(\"cq\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dxk = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wkt,x=dk)[name=string(\"ck\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dxv = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wvt,x=dv)[name=string(\"cv\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dxqk = add(x=dxq,y=dxk)[name=string(\"aqk\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = add(x=dxqk,y=dxv)[name=string(\"out\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<int32, [4]> sz = const()[name=tensor<string, []>(\"sz\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
[m appendString:@" tensor<int32, [4]> b0 = const()[name=tensor<string, []>(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dq = slice_by_size(x=x,begin=b0,size=sz)[name=tensor<string, []>(\"s0\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<int32, [4]> b1 = const()[name=tensor<string, []>(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dk = slice_by_size(x=x,begin=b1,size=sz)[name=tensor<string, []>(\"s1\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<int32, [4]> b2 = const()[name=tensor<string, []>(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*DIM];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dv = slice_by_size(x=x,begin=b2,size=sz)[name=tensor<string, []>(\"s2\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wqt = const()[name=tensor<string, []>(\"Wqt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/wqt.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,DIM,DIM,DIM];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wkt = const()[name=tensor<string, []>(\"Wkt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/wkt.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,DIM,DIM,DIM];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wvt = const()[name=tensor<string, []>(\"Wvt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/wvt.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,DIM,DIM,DIM];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dxq = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wqt,x=dq)[name=tensor<string, []>(\"cq\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dxk = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wkt,x=dk)[name=tensor<string, []>(\"ck\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dxv = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wvt,x=dv)[name=tensor<string, []>(\"cv\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dxqk = add(x=dxq,y=dxk)[name=tensor<string, []>(\"aqk\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = add(x=dxqk,y=dxv)[name=tensor<string, []>(\"out\")];\n", DIM,SEQ];
[m appendString:@" } -> (out);\n}\n"];
return m;
}
@ -175,49 +173,49 @@ static NSString *gen_sdpa_bwd1(void) {
float sc = 1.0f/sqrtf((float)HD);
NSMutableString *m = [NSMutableString string];
[m appendString:MIL_HDR];
[m appendFormat:@" func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", 4*DIM, SEQ];
[m appendFormat:@" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n", 4*DIM, SEQ];
[m appendString:@CONV_CONST];
[m appendFormat:@" tensor<int32, [4]> sz = const()[name=string(\"sz\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
[m appendString:@" tensor<int32, [4]> b0 = const()[name=string(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> qf = slice_by_size(x=x,begin=b0,size=sz)[name=string(\"s0\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> kf = slice_by_size(x=x,begin=b1,size=sz)[name=string(\"s1\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<int32, [4]> b2 = const()[name=string(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*DIM];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> vf = slice_by_size(x=x,begin=b2,size=sz)[name=string(\"s2\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<int32, [4]> b3 = const()[name=string(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 3*DIM];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dx2f = slice_by_size(x=x,begin=b3,size=sz)[name=string(\"s3\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wot = const()[name=string(\"Wot\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wot.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> df = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wot,x=dx2f)[name=string(\"cwo\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<int32, [4]> rsh = const()[name=string(\"rsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
[m appendString:@" tensor<int32, [4]> pm = const()[name=string(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> qr = reshape(shape=rsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=qr)[name=string(\"tq\")];\n", HEADS,SEQ,HD];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> kr = reshape(shape=rsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=kr)[name=string(\"tk\")];\n", HEADS,SEQ,HD];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> vr = reshape(shape=rsh,x=vf)[name=string(\"rv\")];\n", HEADS,HD,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> v = transpose(perm=pm,x=vr)[name=string(\"tv\")];\n", HEADS,SEQ,HD];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dr = reshape(shape=rsh,x=df)[name=string(\"rd\")];\n", HEADS,HD,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> da = transpose(perm=pm,x=dr)[name=string(\"td\")];\n", HEADS,SEQ,HD];
[m appendString:@" bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"];
[m appendString:@" bool bT = const()[name=string(\"bT\"), val=bool(true)];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> sc1 = matmul(transpose_x=bF,transpose_y=bT,x=q,y=k)[name=string(\"mm1\")];\n", HEADS,SEQ,SEQ];
[m appendFormat:@" fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> sc2 = mul(x=sc1,y=scv)[name=string(\"scl\")];\n", HEADS,SEQ,SEQ];
[m appendFormat:@" tensor<fp16, [1,1,%d,%d]> cm = const()[name=string(\"cm\"), val=tensor<fp16, [1,1,%d,%d]>(BLOBFILE(path=string(\"@model_path/weights/mask.bin\"), offset=uint64(64)))];\n", SEQ,SEQ,SEQ,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> ms = add(x=sc2,y=cm)[name=string(\"msk\")];\n", HEADS,SEQ,SEQ];
[m appendString:@" int32 sax = const()[name=string(\"sax\"), val=int32(-1)];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> probs = softmax(axis=sax,x=ms)[name=string(\"sm\")];\n", HEADS,SEQ,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dv4 = matmul(transpose_x=bT,transpose_y=bF,x=probs,y=da)[name=string(\"dv\")];\n", HEADS,SEQ,HD];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dp4 = matmul(transpose_x=bF,transpose_y=bT,x=da,y=v)[name=string(\"dp\")];\n", HEADS,SEQ,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dvt = transpose(perm=pm,x=dv4)[name=string(\"dvt\")];\n", HEADS,HD,SEQ];
[m appendFormat:@" tensor<int32, [4]> dvs = const()[name=string(\"dvs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dvf = reshape(shape=dvs,x=dvt)[name=string(\"dvf\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<int32, [4]> scs = const()[name=string(\"scs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", SCORE_CH,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> pf = reshape(shape=scs,x=probs)[name=string(\"pf\")];\n", SCORE_CH,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dpf = reshape(shape=scs,x=dp4)[name=string(\"dpf\")];\n", SCORE_CH,SEQ];
[m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
[m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dvf,pf,dpf))[name=string(\"cat\")];\n", DIM+2*SCORE_CH,SEQ];
[m appendFormat:@" tensor<int32, [4]> sz = const()[name=tensor<string, []>(\"sz\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
[m appendString:@" tensor<int32, [4]> b0 = const()[name=tensor<string, []>(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> qf = slice_by_size(x=x,begin=b0,size=sz)[name=tensor<string, []>(\"s0\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<int32, [4]> b1 = const()[name=tensor<string, []>(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> kf = slice_by_size(x=x,begin=b1,size=sz)[name=tensor<string, []>(\"s1\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<int32, [4]> b2 = const()[name=tensor<string, []>(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*DIM];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> vf = slice_by_size(x=x,begin=b2,size=sz)[name=tensor<string, []>(\"s2\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<int32, [4]> b3 = const()[name=tensor<string, []>(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 3*DIM];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dx2f = slice_by_size(x=x,begin=b3,size=sz)[name=tensor<string, []>(\"s3\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wot = const()[name=tensor<string, []>(\"Wot\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/wot.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,DIM,DIM,DIM];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> df = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wot,x=dx2f)[name=tensor<string, []>(\"cwo\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<int32, [4]> rsh = const()[name=tensor<string, []>(\"rsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
[m appendString:@" tensor<int32, [4]> pm = const()[name=tensor<string, []>(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> qr = reshape(shape=rsh,x=qf)[name=tensor<string, []>(\"rq\")];\n", HEADS,HD,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=qr)[name=tensor<string, []>(\"tq\")];\n", HEADS,SEQ,HD];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> kr = reshape(shape=rsh,x=kf)[name=tensor<string, []>(\"rk\")];\n", HEADS,HD,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=kr)[name=tensor<string, []>(\"tk\")];\n", HEADS,SEQ,HD];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> vr = reshape(shape=rsh,x=vf)[name=tensor<string, []>(\"rv\")];\n", HEADS,HD,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> v = transpose(perm=pm,x=vr)[name=tensor<string, []>(\"tv\")];\n", HEADS,SEQ,HD];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dr = reshape(shape=rsh,x=df)[name=tensor<string, []>(\"rd\")];\n", HEADS,HD,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> da = transpose(perm=pm,x=dr)[name=tensor<string, []>(\"td\")];\n", HEADS,SEQ,HD];
[m appendString:@" tensor<bool, []> bF = const()[name=tensor<string, []>(\"bF\"), val=tensor<bool, []>(false)];\n"];
[m appendString:@" tensor<bool, []> bT = const()[name=tensor<string, []>(\"bT\"), val=tensor<bool, []>(true)];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> sc1 = matmul(transpose_x=bF,transpose_y=bT,x=q,y=k)[name=tensor<string, []>(\"mm1\")];\n", HEADS,SEQ,SEQ];
[m appendFormat:@" tensor<fp16, []> scv = const()[name=tensor<string, []>(\"scv\"), val=tensor<fp16, []>(%f)];\n", sc];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> sc2 = mul(x=sc1,y=scv)[name=tensor<string, []>(\"scl\")];\n", HEADS,SEQ,SEQ];
[m appendFormat:@" tensor<fp16, [1,1,%d,%d]> cm = const()[name=tensor<string, []>(\"cm\"), val=tensor<fp16, [1,1,%d,%d]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/mask.bin\"), offset=tensor<uint64, []>(64)))];\n", SEQ,SEQ,SEQ,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> ms = add(x=sc2,y=cm)[name=tensor<string, []>(\"msk\")];\n", HEADS,SEQ,SEQ];
[m appendString:@" tensor<int32, []> sax = const()[name=tensor<string, []>(\"sax\"), val=tensor<int32, []>(-1)];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> probs = softmax(axis=sax,x=ms)[name=tensor<string, []>(\"sm\")];\n", HEADS,SEQ,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dv4 = matmul(transpose_x=bT,transpose_y=bF,x=probs,y=da)[name=tensor<string, []>(\"dv\")];\n", HEADS,SEQ,HD];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dp4 = matmul(transpose_x=bF,transpose_y=bT,x=da,y=v)[name=tensor<string, []>(\"dp\")];\n", HEADS,SEQ,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dvt = transpose(perm=pm,x=dv4)[name=tensor<string, []>(\"dvt\")];\n", HEADS,HD,SEQ];
[m appendFormat:@" tensor<int32, [4]> dvs = const()[name=tensor<string, []>(\"dvs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dvf = reshape(shape=dvs,x=dvt)[name=tensor<string, []>(\"dvf\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<int32, [4]> scs = const()[name=tensor<string, []>(\"scs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", SCORE_CH,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> pf = reshape(shape=scs,x=probs)[name=tensor<string, []>(\"pf\")];\n", SCORE_CH,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dpf = reshape(shape=scs,x=dp4)[name=tensor<string, []>(\"dpf\")];\n", SCORE_CH,SEQ];
[m appendString:@" tensor<int32, []> cax = const()[name=tensor<string, []>(\"cax\"), val=tensor<int32, []>(1)];\n"];
[m appendString:@" tensor<bool, []> cid = const()[name=tensor<string, []>(\"cid\"), val=tensor<bool, []>(false)];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dvf,pf,dpf))[name=tensor<string, []>(\"cat\")];\n", DIM+2*SCORE_CH,SEQ];
[m appendString:@" } -> (out);\n}\n"];
return m;
}
@ -228,46 +226,46 @@ static NSString *gen_sdpa_bwd2(void) {
int bwd2_in = 2*SCORE_CH + 2*DIM;
NSMutableString *m = [NSMutableString string];
[m appendString:MIL_HDR];
[m appendFormat:@" func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", bwd2_in, SEQ];
[m appendFormat:@" tensor<int32, [4]> sz_sc = const()[name=string(\"szsc\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", SCORE_CH, SEQ];
[m appendString:@" tensor<int32, [4]> b0 = const()[name=string(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> pf = slice_by_size(x=x,begin=b0,size=sz_sc)[name=string(\"s0\")];\n", SCORE_CH,SEQ];
[m appendFormat:@" tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", SCORE_CH];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dpf = slice_by_size(x=x,begin=b1,size=sz_sc)[name=string(\"s1\")];\n", SCORE_CH,SEQ];
[m appendFormat:@" tensor<int32, [4]> sz_d = const()[name=string(\"szd\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
[m appendFormat:@" tensor<int32, [4]> b2 = const()[name=string(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*SCORE_CH];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> qf = slice_by_size(x=x,begin=b2,size=sz_d)[name=string(\"s2\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<int32, [4]> b3 = const()[name=string(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*SCORE_CH+DIM];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> kf = slice_by_size(x=x,begin=b3,size=sz_d)[name=string(\"s3\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<int32, [4]> ssh = const()[name=string(\"ssh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,SEQ,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> probs = reshape(shape=ssh,x=pf)[name=string(\"rp\")];\n", HEADS,SEQ,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dp = reshape(shape=ssh,x=dpf)[name=string(\"rdp\")];\n", HEADS,SEQ,SEQ];
[m appendFormat:@" tensor<int32, [4]> rsh = const()[name=string(\"rsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
[m appendString:@" tensor<int32, [4]> pm = const()[name=string(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> qr = reshape(shape=rsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=qr)[name=string(\"tq\")];\n", HEADS,SEQ,HD];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> kr = reshape(shape=rsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=kr)[name=string(\"tk\")];\n", HEADS,SEQ,HD];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> pdp = mul(x=probs,y=dp)[name=string(\"pdp\")];\n", HEADS,SEQ,SEQ];
[m appendString:@" tensor<int32, [1]> rax = const()[name=string(\"rax\"), val=tensor<int32, [1]>([-1])];\n"];
[m appendString:@" bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,%d,1]> spdp = reduce_sum(x=pdp,axes=rax,keep_dims=kd)[name=string(\"rs\")];\n", HEADS,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dps = sub(x=dp,y=spdp)[name=string(\"dps\")];\n", HEADS,SEQ,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> ds0 = mul(x=probs,y=dps)[name=string(\"ds0\")];\n", HEADS,SEQ,SEQ];
[m appendFormat:@" fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> ds = mul(x=ds0,y=scv)[name=string(\"ds\")];\n", HEADS,SEQ,SEQ];
[m appendString:@" bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"];
[m appendString:@" bool bT = const()[name=string(\"bT\"), val=bool(true)];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dq4 = matmul(transpose_x=bF,transpose_y=bF,x=ds,y=k)[name=string(\"dq\")];\n", HEADS,SEQ,HD];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dk4 = matmul(transpose_x=bT,transpose_y=bF,x=ds,y=q)[name=string(\"dk\")];\n", HEADS,SEQ,HD];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dqt = transpose(perm=pm,x=dq4)[name=string(\"dqt\")];\n", HEADS,HD,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dkt = transpose(perm=pm,x=dk4)[name=string(\"dkt\")];\n", HEADS,HD,SEQ];
[m appendFormat:@" tensor<int32, [4]> fs = const()[name=string(\"fs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dqf = reshape(shape=fs,x=dqt)[name=string(\"dqf\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dkf = reshape(shape=fs,x=dkt)[name=string(\"dkf\")];\n", DIM,SEQ];
[m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
[m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dqf,dkf))[name=string(\"cat\")];\n", 2*DIM,SEQ];
[m appendFormat:@" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n", bwd2_in, SEQ];
[m appendFormat:@" tensor<int32, [4]> sz_sc = const()[name=tensor<string, []>(\"szsc\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", SCORE_CH, SEQ];
[m appendString:@" tensor<int32, [4]> b0 = const()[name=tensor<string, []>(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> pf = slice_by_size(x=x,begin=b0,size=sz_sc)[name=tensor<string, []>(\"s0\")];\n", SCORE_CH,SEQ];
[m appendFormat:@" tensor<int32, [4]> b1 = const()[name=tensor<string, []>(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", SCORE_CH];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dpf = slice_by_size(x=x,begin=b1,size=sz_sc)[name=tensor<string, []>(\"s1\")];\n", SCORE_CH,SEQ];
[m appendFormat:@" tensor<int32, [4]> sz_d = const()[name=tensor<string, []>(\"szd\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
[m appendFormat:@" tensor<int32, [4]> b2 = const()[name=tensor<string, []>(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*SCORE_CH];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> qf = slice_by_size(x=x,begin=b2,size=sz_d)[name=tensor<string, []>(\"s2\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<int32, [4]> b3 = const()[name=tensor<string, []>(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*SCORE_CH+DIM];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> kf = slice_by_size(x=x,begin=b3,size=sz_d)[name=tensor<string, []>(\"s3\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<int32, [4]> ssh = const()[name=tensor<string, []>(\"ssh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,SEQ,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> probs = reshape(shape=ssh,x=pf)[name=tensor<string, []>(\"rp\")];\n", HEADS,SEQ,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dp = reshape(shape=ssh,x=dpf)[name=tensor<string, []>(\"rdp\")];\n", HEADS,SEQ,SEQ];
[m appendFormat:@" tensor<int32, [4]> rsh = const()[name=tensor<string, []>(\"rsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
[m appendString:@" tensor<int32, [4]> pm = const()[name=tensor<string, []>(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> qr = reshape(shape=rsh,x=qf)[name=tensor<string, []>(\"rq\")];\n", HEADS,HD,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=qr)[name=tensor<string, []>(\"tq\")];\n", HEADS,SEQ,HD];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> kr = reshape(shape=rsh,x=kf)[name=tensor<string, []>(\"rk\")];\n", HEADS,HD,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=kr)[name=tensor<string, []>(\"tk\")];\n", HEADS,SEQ,HD];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> pdp = mul(x=probs,y=dp)[name=tensor<string, []>(\"pdp\")];\n", HEADS,SEQ,SEQ];
[m appendString:@" tensor<int32, [1]> rax = const()[name=tensor<string, []>(\"rax\"), val=tensor<int32, [1]>([-1])];\n"];
[m appendString:@" tensor<bool, []> kd = const()[name=tensor<string, []>(\"kd\"), val=tensor<bool, []>(true)];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,%d,1]> spdp = reduce_sum(x=pdp,axes=rax,keep_dims=kd)[name=tensor<string, []>(\"rs\")];\n", HEADS,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dps = sub(x=dp,y=spdp)[name=tensor<string, []>(\"dps\")];\n", HEADS,SEQ,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> ds0 = mul(x=probs,y=dps)[name=tensor<string, []>(\"ds0\")];\n", HEADS,SEQ,SEQ];
[m appendFormat:@" tensor<fp16, []> scv = const()[name=tensor<string, []>(\"scv\"), val=tensor<fp16, []>(%f)];\n", sc];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> ds = mul(x=ds0,y=scv)[name=tensor<string, []>(\"ds\")];\n", HEADS,SEQ,SEQ];
[m appendString:@" tensor<bool, []> bF = const()[name=tensor<string, []>(\"bF\"), val=tensor<bool, []>(false)];\n"];
[m appendString:@" tensor<bool, []> bT = const()[name=tensor<string, []>(\"bT\"), val=tensor<bool, []>(true)];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dq4 = matmul(transpose_x=bF,transpose_y=bF,x=ds,y=k)[name=tensor<string, []>(\"dq\")];\n", HEADS,SEQ,HD];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dk4 = matmul(transpose_x=bT,transpose_y=bF,x=ds,y=q)[name=tensor<string, []>(\"dk\")];\n", HEADS,SEQ,HD];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dqt = transpose(perm=pm,x=dq4)[name=tensor<string, []>(\"dqt\")];\n", HEADS,HD,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dkt = transpose(perm=pm,x=dk4)[name=tensor<string, []>(\"dkt\")];\n", HEADS,HD,SEQ];
[m appendFormat:@" tensor<int32, [4]> fs = const()[name=tensor<string, []>(\"fs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dqf = reshape(shape=fs,x=dqt)[name=tensor<string, []>(\"dqf\")];\n", DIM,SEQ];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dkf = reshape(shape=fs,x=dkt)[name=tensor<string, []>(\"dkf\")];\n", DIM,SEQ];
[m appendString:@" tensor<int32, []> cax = const()[name=tensor<string, []>(\"cax\"), val=tensor<int32, []>(1)];\n"];
[m appendString:@" tensor<bool, []> cid = const()[name=tensor<string, []>(\"cid\"), val=tensor<bool, []>(false)];\n"];
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dqf,dkf))[name=tensor<string, []>(\"cat\")];\n", 2*DIM,SEQ];
[m appendString:@" } -> (out);\n}\n"];
return m;
}

View File

@ -50,6 +50,8 @@ static IOSurfaceRef make_surface(size_t bytes) {
(id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0});
}
static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly
int main() {
@autoreleasepool {
setbuf(stdout, NULL);
@ -106,28 +108,43 @@ int main() {
memcpy(blob+128, w, ws);
NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES];
NSString *mil = [NSString stringWithFormat:
@"program(1.3)\n"
"[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
"{\"coremltools-version\", \"9.0\"}})]\n"
"{\n"
" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
" string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
" tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
" tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
" int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
" string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n"
" tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n"
" tensor<fp16, [%d,%d,1,1]> W = const()[name=string(\"W\"), "
"val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n"
" tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
"[name=string(\"conv\")];\n"
" string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n"
" tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n"
" } -> (y);\n"
"}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP];
NSFileManager *fm = [NSFileManager defaultManager];
retry_compile:;
NSString *mil;
if (g_fp16_io) {
mil = [NSString stringWithFormat:
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
" tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
" tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
" tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
" tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"
" tensor<fp16, [%d,%d,1,1]> W = const()[name=tensor<string, []>(\"W\"), "
"val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n"
" tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)"
"[name=tensor<string, []>(\"conv\")];\n"
" } -> (y);\n}\n", CH, SP, CH, CH, CH, CH, CH, SP];
} else {
mil = [NSString stringWithFormat:
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
" tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
" tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
" tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
" tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"
" tensor<string, []> to16 = const()[name=tensor<string, []>(\"to16\"), val=tensor<string, []>(\"fp16\")];\n"
" tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=tensor<string, []>(\"cin\")];\n"
" tensor<fp16, [%d,%d,1,1]> W = const()[name=tensor<string, []>(\"W\"), "
"val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n"
" tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
"[name=tensor<string, []>(\"conv\")];\n"
" tensor<string, []> to32 = const()[name=tensor<string, []>(\"to32\"), val=tensor<string, []>(\"fp32\")];\n"
" tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=tensor<string, []>(\"cout\")];\n"
" } -> (y);\n}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP];
}
NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:),
@ -135,23 +152,33 @@ int main() {
id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
NSFileManager *fm = [NSFileManager defaultManager];
[fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"]
withIntermediateDirectories:YES attributes:nil error:nil];
[md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
[wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
NSError *e = nil;
((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
BOOL compiled = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
if (!compiled && !g_fp16_io) {
printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
g_fp16_io = 1;
[fm removeItemAtPath:td error:nil];
goto retry_compile;
}
((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
int ioBytes = CH * SP * 4;
int ioBytes = CH * SP * (g_fp16_io ? 2 : 4);
IOSurfaceRef ioIn = make_surface(ioBytes);
IOSurfaceRef ioOut = make_surface(ioBytes);
IOSurfaceLock(ioIn, 0, NULL);
float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
for (int c = 0; c < CH; c++) for (int s = 0; s < SP; s++) inp[c*SP+s] = (float)(s+1) * 0.1f;
if (g_fp16_io) {
_Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn);
for (int c = 0; c < CH; c++) for (int s = 0; s < SP; s++) inp[c*SP+s] = (_Float16)((float)(s+1) * 0.1f);
} else {
float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
for (int c = 0; c < CH; c++) for (int s = 0; s < SP; s++) inp[c*SP+s] = (float)(s+1) * 0.1f;
}
IOSurfaceUnlock(ioIn, 0, NULL);
// Baseline eval
@ -165,9 +192,16 @@ int main() {
printf(" Baseline eval (weightsBuffer=nil, procIdx=0): %s\n", ok ? "OK" : "FAIL");
IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL);
float *out0 = (float*)IOSurfaceGetBaseAddress(ioOut);
float baseline_0 = out0[0], baseline_1 = out0[1];
printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", out0[0], out0[1], out0[2], out0[3]);
float baseline_0, baseline_1;
if (g_fp16_io) {
_Float16 *out0 = (_Float16*)IOSurfaceGetBaseAddress(ioOut);
baseline_0 = (float)out0[0]; baseline_1 = (float)out0[1];
printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", (float)out0[0], (float)out0[1], (float)out0[2], (float)out0[3]);
} else {
float *out0 = (float*)IOSurfaceGetBaseAddress(ioOut);
baseline_0 = out0[0]; baseline_1 = out0[1];
printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", out0[0], out0[1], out0[2], out0[3]);
}
IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL);
// Test weightsBuffer: IOSurface with 3x identity weights
@ -194,10 +228,18 @@ int main() {
printf(" Eval with weightsBuffer: %s\n", ok ? "OK" : e ? [[e description] UTF8String] : "FAIL");
if (ok) {
IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL);
float *outW = (float*)IOSurfaceGetBaseAddress(ioOut);
printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outW[0], outW[1], outW[2], outW[3]);
bool changed = fabsf(outW[0] - baseline_0) > 0.001f;
bool is_3x = fabsf(outW[0] - baseline_0 * 3.0f) < 0.1f;
float outW_0;
if (g_fp16_io) {
_Float16 *outW = (_Float16*)IOSurfaceGetBaseAddress(ioOut);
outW_0 = (float)outW[0];
printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", (float)outW[0], (float)outW[1], (float)outW[2], (float)outW[3]);
} else {
float *outW = (float*)IOSurfaceGetBaseAddress(ioOut);
outW_0 = outW[0];
printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outW[0], outW[1], outW[2], outW[3]);
}
bool changed = fabsf(outW_0 - baseline_0) > 0.001f;
bool is_3x = fabsf(outW_0 - baseline_0 * 3.0f) < 0.1f;
printf(" weightsBuffer: output %s", changed ? "CHANGED" : "unchanged");
if (changed) printf(" (%s)", is_3x ? "matches 3x — WORKS!" : "but not 3x as expected");
printf("\n");

View File

@ -81,13 +81,11 @@ int main() {
// === Approach 1: Non-causal SDPA (baseline) ===
printf("=== Non-causal SDPA (baseline) ===\n");
NSString *sdpa_mil = [NSString stringWithFormat:
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
" func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp16, [1, %d, %d, %d]> q, "
"tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v) {\n"
" tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
"query = q, key = k, value = v)[name = string(\"sdpa\")];\n"
"query = q, key = k, value = v)[name = tensor<string, []>(\"sdpa\")];\n"
" } -> (att);\n}\n",
HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD];
Kern kSDPA = compile_mil(sdpa_mil);
@ -100,13 +98,11 @@ int main() {
// scores = Q @ K^T [1, HEADS, SEQ, SEQ]
printf("\n=== Decomposed causal attention ===\n");
NSString *qkt_mil = [NSString stringWithFormat:
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
" func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp16, [1, %d, %d, %d]> q, "
"tensor<fp16, [1, %d, %d, %d]> k) {\n"
" tensor<fp16, [1, %d, %d, %d]> scores = matmul("
"x = q, y = k, transpose_y = true)[name = string(\"qkt\")];\n"
"x = q, y = k, transpose_y = true)[name = tensor<string, []>(\"qkt\")];\n"
" } -> (scores);\n}\n",
HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, SEQ];
Kern kQKT = compile_mil(qkt_mil);
@ -114,13 +110,11 @@ int main() {
// Step 3: scores_softmax @ V output [1, HEADS, SEQ, HD]
NSString *sv_mil = [NSString stringWithFormat:
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
" func main<ios18>(tensor<fp16, [1, %d, %d, %d]> s, "
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp16, [1, %d, %d, %d]> s, "
"tensor<fp16, [1, %d, %d, %d]> v) {\n"
" tensor<fp16, [1, %d, %d, %d]> out = matmul("
"x = s, y = v)[name = string(\"sv\")];\n"
"x = s, y = v)[name = tensor<string, []>(\"sv\")];\n"
" } -> (out);\n}\n",
HEADS, SEQ, SEQ, HEADS, SEQ, HD, HEADS, SEQ, HD];
Kern kSV = compile_mil(sv_mil);

View File

@ -187,13 +187,11 @@ int main() {
printf("Test 1: no mask\n");
{
NSString *mil = [NSString stringWithFormat:
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
" func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp16, [1, %d, %d, %d]> q, "
"tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v) {\n"
" tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
"query = q, key = k, value = v)[name = string(\"sdpa\")];\n"
"query = q, key = k, value = v)[name = tensor<string, []>(\"sdpa\")];\n"
" } -> (att);\n}\n",
HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD];
Model m = compile_model(mil, nil);
@ -209,14 +207,12 @@ int main() {
{
NSString *maskStr = build_inline_causal_mask(SEQ);
NSString *mil = [NSString stringWithFormat:
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
" func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp16, [1, %d, %d, %d]> q, "
"tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v) {\n"
" %@ mask = const()[name = string(\"mask\"), val = %@];\n"
" %@ mask = const()[name = tensor<string, []>(\"mask\"), val = %@];\n"
" tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
"query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n"
"query = q, key = k, value = v, attn_mask = mask)[name = tensor<string, []>(\"sdpa\")];\n"
" } -> (att);\n}\n",
HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD,
[NSString stringWithFormat:@"tensor<fp16, [1, 1, %d, %d]>", SEQ, SEQ], maskStr,
@ -233,15 +229,13 @@ int main() {
printf("\nTest 3: BLOBFILE causal mask\n");
{
NSString *mil = [NSString stringWithFormat:
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
" func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp16, [1, %d, %d, %d]> q, "
"tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v) {\n"
" tensor<fp16, [1, 1, %d, %d]> mask = const()[name = string(\"mask\"), "
"val = tensor<fp16, [1, 1, %d, %d]>(BLOBFILE(path = string(\"@model_path/weights/mask.bin\"), offset = uint64(64)))];\n"
" tensor<fp16, [1, 1, %d, %d]> mask = const()[name = tensor<string, []>(\"mask\"), "
"val = tensor<fp16, [1, 1, %d, %d]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/mask.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
"query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n"
"query = q, key = k, value = v, attn_mask = mask)[name = tensor<string, []>(\"sdpa\")];\n"
" } -> (att);\n}\n",
HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD,
SEQ, SEQ, SEQ, SEQ, HEADS, SEQ, HD];
@ -258,14 +252,12 @@ int main() {
printf("\nTest 4: mask as runtime input\n");
{
NSString *mil = [NSString stringWithFormat:
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
" func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp16, [1, %d, %d, %d]> q, "
"tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v, "
"tensor<fp16, [1, 1, %d, %d]> mask) {\n"
" tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
"query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n"
"query = q, key = k, value = v, attn_mask = mask)[name = tensor<string, []>(\"sdpa\")];\n"
" } -> (att);\n}\n",
HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD,
SEQ, SEQ, HEADS, SEQ, HD];

View File

@ -82,19 +82,17 @@ static void cleanup_kern(Kern *k) {
static NSString *gen_conv_mil(int ic, int oc, int icg, int groups, int sp) {
return [NSString stringWithFormat:
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
" func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w.bin\"), offset = uint64(64)))];\n"
" string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
" int32 gr = const()[name = string(\"gr\"), val = int32(%d)];\n"
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(%d)];\n"
" tensor<fp16, [1, %d, 1, %d]> y = conv(dilations = dl, groups = gr, pad = pd, "
"pad_type = pt, strides = st, weight = W, x = x)[name = string(\"cv\")];\n"
"pad_type = pt, strides = st, weight = W, x = x)[name = tensor<string, []>(\"cv\")];\n"
" } -> (y);\n}\n", ic, sp, oc, icg, oc, icg, groups, oc, sp];
}

View File

@ -130,64 +130,62 @@ int main() {
float scale_val = 1.0f / sqrtf((float)HD);
NSString *mil = [NSString stringWithFormat:
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
" func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
// Conv boilerplate
" string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
" int32 gr1 = const()[name = string(\"g1\"), val = int32(1)];\n"
" tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, []> gr1 = const()[name = tensor<string, []>(\"g1\"), val = tensor<int32, []>(1)];\n"
// QKV weights
" tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = string(\"Wq\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wq.bin\"), offset = uint64(64)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = string(\"Wk\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wk.bin\"), offset = uint64(64)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = string(\"Wv\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wv.bin\"), offset = uint64(64)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> Wout = const()[name = string(\"Wo\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wo.bin\"), offset = uint64(64)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = tensor<string, []>(\"Wq\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wq.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = tensor<string, []>(\"Wk\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wk.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = tensor<string, []>(\"Wv\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wv.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> Wout = const()[name = tensor<string, []>(\"Wo\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wo.bin\"), offset = tensor<uint64, []>(64)))];\n"
// QKV projections
" tensor<fp16, [1, %d, 1, %d]> q_flat = conv(dilations = dl, groups = gr1, pad = pd, "
"pad_type = pt, strides = st, weight = Wq, x = x)[name = string(\"cq\")];\n"
"pad_type = pt, strides = st, weight = Wq, x = x)[name = tensor<string, []>(\"cq\")];\n"
" tensor<fp16, [1, %d, 1, %d]> k_flat = conv(dilations = dl, groups = gr1, pad = pd, "
"pad_type = pt, strides = st, weight = Wk, x = x)[name = string(\"ck\")];\n"
"pad_type = pt, strides = st, weight = Wk, x = x)[name = tensor<string, []>(\"ck\")];\n"
" tensor<fp16, [1, %d, 1, %d]> v_flat = conv(dilations = dl, groups = gr1, pad = pd, "
"pad_type = pt, strides = st, weight = Wv, x = x)[name = string(\"cv\")];\n"
"pad_type = pt, strides = st, weight = Wv, x = x)[name = tensor<string, []>(\"cv\")];\n"
// Reshape: [1, DIM, 1, SEQ] [1, HEADS, HD, SEQ] transpose [1, HEADS, SEQ, HD]
" tensor<int32, [4]> qsh = const()[name = string(\"qsh\"), val = tensor<int32, [4]>([1, %d, %d, %d])];\n"
" tensor<fp16, [1, %d, %d, %d]> q_4d = reshape(shape = qsh, x = q_flat)[name = string(\"rq\")];\n"
" tensor<int32, [4]> perm = const()[name = string(\"pm\"), val = tensor<int32, [4]>([0, 1, 3, 2])];\n"
" tensor<fp16, [1, %d, %d, %d]> q = transpose(perm = perm, x = q_4d)[name = string(\"tq\")];\n"
" tensor<fp16, [1, %d, %d, %d]> k_4d = reshape(shape = qsh, x = k_flat)[name = string(\"rk\")];\n"
" tensor<fp16, [1, %d, %d, %d]> k = transpose(perm = perm, x = k_4d)[name = string(\"tk\")];\n"
" tensor<fp16, [1, %d, %d, %d]> v_4d = reshape(shape = qsh, x = v_flat)[name = string(\"rv\")];\n"
" tensor<fp16, [1, %d, %d, %d]> v = transpose(perm = perm, x = v_4d)[name = string(\"tv\")];\n"
" tensor<int32, [4]> qsh = const()[name = tensor<string, []>(\"qsh\"), val = tensor<int32, [4]>([1, %d, %d, %d])];\n"
" tensor<fp16, [1, %d, %d, %d]> q_4d = reshape(shape = qsh, x = q_flat)[name = tensor<string, []>(\"rq\")];\n"
" tensor<int32, [4]> perm = const()[name = tensor<string, []>(\"pm\"), val = tensor<int32, [4]>([0, 1, 3, 2])];\n"
" tensor<fp16, [1, %d, %d, %d]> q = transpose(perm = perm, x = q_4d)[name = tensor<string, []>(\"tq\")];\n"
" tensor<fp16, [1, %d, %d, %d]> k_4d = reshape(shape = qsh, x = k_flat)[name = tensor<string, []>(\"rk\")];\n"
" tensor<fp16, [1, %d, %d, %d]> k = transpose(perm = perm, x = k_4d)[name = tensor<string, []>(\"tk\")];\n"
" tensor<fp16, [1, %d, %d, %d]> v_4d = reshape(shape = qsh, x = v_flat)[name = tensor<string, []>(\"rv\")];\n"
" tensor<fp16, [1, %d, %d, %d]> v = transpose(perm = perm, x = v_4d)[name = tensor<string, []>(\"tv\")];\n"
// Q @ K^T
" bool ty = const()[name = string(\"ty\"), val = bool(true)];\n"
" bool tx = const()[name = string(\"tx\"), val = bool(false)];\n"
" tensor<fp16, [1, %d, %d, %d]> scores = matmul(transpose_x = tx, transpose_y = ty, x = q, y = k)[name = string(\"mm1\")];\n"
" tensor<bool, []> ty = const()[name = tensor<string, []>(\"ty\"), val = tensor<bool, []>(true)];\n"
" tensor<bool, []> tx = const()[name = tensor<string, []>(\"tx\"), val = tensor<bool, []>(false)];\n"
" tensor<fp16, [1, %d, %d, %d]> scores = matmul(transpose_x = tx, transpose_y = ty, x = q, y = k)[name = tensor<string, []>(\"mm1\")];\n"
// Scale
" fp16 sc = const()[name = string(\"sc\"), val = fp16(%f)];\n"
" tensor<fp16, [1, %d, %d, %d]> scaled = mul(x = scores, y = sc)[name = string(\"scl\")];\n"
" tensor<fp16, []> sc = const()[name = tensor<string, []>(\"sc\"), val = fp16(%f)];\n"
" tensor<fp16, [1, %d, %d, %d]> scaled = mul(x = scores, y = sc)[name = tensor<string, []>(\"scl\")];\n"
// Causal mask
" tensor<fp16, [1, 1, %d, %d]> cmask = const()[name = string(\"cm\"), "
"val = tensor<fp16, [1, 1, %d, %d]>(BLOBFILE(path = string(\"@model_path/weights/mask.bin\"), offset = uint64(64)))];\n"
" tensor<fp16, [1, %d, %d, %d]> masked = add(x = scaled, y = cmask)[name = string(\"msk\")];\n"
" tensor<fp16, [1, 1, %d, %d]> cmask = const()[name = tensor<string, []>(\"cm\"), "
"val = tensor<fp16, [1, 1, %d, %d]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/mask.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<fp16, [1, %d, %d, %d]> masked = add(x = scaled, y = cmask)[name = tensor<string, []>(\"msk\")];\n"
// Softmax
" int32 sax = const()[name = string(\"sax\"), val = int32(-1)];\n"
" tensor<fp16, [1, %d, %d, %d]> attn_w = softmax(axis = sax, x = masked)[name = string(\"sm\")];\n"
" tensor<int32, []> sax = const()[name = tensor<string, []>(\"sax\"), val = tensor<int32, []>(-1)];\n"
" tensor<fp16, [1, %d, %d, %d]> attn_w = softmax(axis = sax, x = masked)[name = tensor<string, []>(\"sm\")];\n"
// scores @ V
" tensor<fp16, [1, %d, %d, %d]> attn_4d = matmul(transpose_x = tx, transpose_y = tx, x = attn_w, y = v)[name = string(\"mm2\")];\n"
" tensor<fp16, [1, %d, %d, %d]> attn_4d = matmul(transpose_x = tx, transpose_y = tx, x = attn_w, y = v)[name = tensor<string, []>(\"mm2\")];\n"
// Reshape back: [1, HEADS, SEQ, HD] transpose [1, HEADS, HD, SEQ] reshape [1, DIM, 1, SEQ]
" tensor<fp16, [1, %d, %d, %d]> attn_t = transpose(perm = perm, x = attn_4d)[name = string(\"ta\")];\n"
" tensor<int32, [4]> osh = const()[name = string(\"osh\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
" tensor<fp16, [1, %d, 1, %d]> attn_flat = reshape(shape = osh, x = attn_t)[name = string(\"ra\")];\n"
" tensor<fp16, [1, %d, %d, %d]> attn_t = transpose(perm = perm, x = attn_4d)[name = tensor<string, []>(\"ta\")];\n"
" tensor<int32, [4]> osh = const()[name = tensor<string, []>(\"osh\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
" tensor<fp16, [1, %d, 1, %d]> attn_flat = reshape(shape = osh, x = attn_t)[name = tensor<string, []>(\"ra\")];\n"
// Wo projection
" tensor<fp16, [1, %d, 1, %d]> out = conv(dilations = dl, groups = gr1, pad = pd, "
"pad_type = pt, strides = st, weight = Wout, x = attn_flat)[name = string(\"co\")];\n"
"pad_type = pt, strides = st, weight = Wout, x = attn_flat)[name = tensor<string, []>(\"co\")];\n"
" } -> (out);\n}\n",
DIM, SEQ, // input
DIM,DIM,DIM,DIM, DIM,DIM,DIM,DIM, // Wq, Wk
@ -317,30 +315,28 @@ int main() {
printf("\n=== Test 2: Fused FFN benchmark ===\n");
{
NSString *mil = [NSString stringWithFormat:
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
" func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
" string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
" int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
" tensor<fp16, [%d, %d, 1, 1]> W1 = const()[name = string(\"W1\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w1.bin\"), offset = uint64(64)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> W3 = const()[name = string(\"W3\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w3.bin\"), offset = uint64(64)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> W2 = const()[name = string(\"W2\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w2.bin\"), offset = uint64(64)))];\n"
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
" tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
" tensor<fp16, [%d, %d, 1, 1]> W1 = const()[name = tensor<string, []>(\"W1\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w1.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> W3 = const()[name = tensor<string, []>(\"W3\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w3.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> W2 = const()[name = tensor<string, []>(\"W2\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w2.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<fp16, [1, %d, 1, %d]> h1 = conv(dilations = dl, groups = gr, pad = pd, "
"pad_type = pt, strides = st, weight = W1, x = x)[name = string(\"c1\")];\n"
"pad_type = pt, strides = st, weight = W1, x = x)[name = tensor<string, []>(\"c1\")];\n"
" tensor<fp16, [1, %d, 1, %d]> h3 = conv(dilations = dl, groups = gr, pad = pd, "
"pad_type = pt, strides = st, weight = W3, x = x)[name = string(\"c3\")];\n"
" tensor<fp16, [1, %d, 1, %d]> sig = sigmoid(x = h1)[name = string(\"sg\")];\n"
" tensor<fp16, [1, %d, 1, %d]> silu = mul(x = h1, y = sig)[name = string(\"si\")];\n"
" tensor<fp16, [1, %d, 1, %d]> gate = mul(x = silu, y = h3)[name = string(\"gt\")];\n"
"pad_type = pt, strides = st, weight = W3, x = x)[name = tensor<string, []>(\"c3\")];\n"
" tensor<fp16, [1, %d, 1, %d]> sig = sigmoid(x = h1)[name = tensor<string, []>(\"sg\")];\n"
" tensor<fp16, [1, %d, 1, %d]> silu = mul(x = h1, y = sig)[name = tensor<string, []>(\"si\")];\n"
" tensor<fp16, [1, %d, 1, %d]> gate = mul(x = silu, y = h3)[name = tensor<string, []>(\"gt\")];\n"
" tensor<fp16, [1, %d, 1, %d]> out = conv(dilations = dl, groups = gr, pad = pd, "
"pad_type = pt, strides = st, weight = W2, x = gate)[name = string(\"c2\")];\n"
"pad_type = pt, strides = st, weight = W2, x = gate)[name = tensor<string, []>(\"c2\")];\n"
" } -> (out);\n}\n",
DIM, SEQ,
HIDDEN,DIM,HIDDEN,DIM, HIDDEN,DIM,HIDDEN,DIM, DIM,HIDDEN,DIM,HIDDEN,

View File

@ -15,6 +15,8 @@
#define HIDDEN 2048
#define SEQ 64
static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly
static Class g_D, g_I, g_AR, g_AIO;
static void ane_init(void) {
dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
@ -58,47 +60,77 @@ int main() {
// MIL: slice input 2 convs add
printf("=== Fused W1b+W3b backward (slice+conv+add) ===\n");
NSString *mil = [NSString stringWithFormat:
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n" // [1, HIDDEN*2, 1, SEQ]
" string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
// Slice: dh1 = x16[:, 0:HIDDEN, :, :], dh3 = x16[:, HIDDEN:2*HIDDEN, :, :]
" tensor<int32, [4]> b1 = const()[name = string(\"b1\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [4]> s1 = const()[name = string(\"s1\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
" tensor<fp16, [1, %d, 1, %d]> dh1 = slice_by_size(x = x16, begin = b1, size = s1)[name = string(\"sl1\")];\n"
" tensor<int32, [4]> b3 = const()[name = string(\"b3\"), val = tensor<int32, [4]>([0, %d, 0, 0])];\n"
" tensor<int32, [4]> s3 = const()[name = string(\"s3\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
" tensor<fp16, [1, %d, 1, %d]> dh3 = slice_by_size(x = x16, begin = b3, size = s3)[name = string(\"sl3\")];\n"
// Conv: W1^T @ dh1, W3^T @ dh3
" string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
" int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
// W1^T: [DIM, HIDDEN, 1, 1] (transposed from [HIDDEN, DIM])
" tensor<fp16, [%d, %d, 1, 1]> W1t = const()[name = string(\"W1t\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w1t.bin\"), offset = uint64(64)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> W3t = const()[name = string(\"W3t\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w3t.bin\"), offset = uint64(64)))];\n"
" tensor<fp16, [1, %d, 1, %d]> dx1 = conv(dilations = dl, groups = gr, pad = pd, "
"pad_type = pt, strides = st, weight = W1t, x = dh1)[name = string(\"cv1\")];\n"
" tensor<fp16, [1, %d, 1, %d]> dx3 = conv(dilations = dl, groups = gr, pad = pd, "
"pad_type = pt, strides = st, weight = W3t, x = dh3)[name = string(\"cv3\")];\n"
// Add
" tensor<fp16, [1, %d, 1, %d]> sum = add(x = dx1, y = dx3)[name = string(\"ad\")];\n"
" string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
" tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = sum)[name = string(\"co\")];\n"
" } -> (y);\n}\n",
HIDDEN*2, SEQ, HIDDEN*2, SEQ,
HIDDEN, SEQ, HIDDEN, SEQ, // slice1
HIDDEN, HIDDEN, SEQ, HIDDEN, SEQ, // slice3
DIM, HIDDEN, DIM, HIDDEN, // W1t
DIM, HIDDEN, DIM, HIDDEN, // W3t
DIM, SEQ, DIM, SEQ, // dx1, dx3
DIM, SEQ, DIM, SEQ]; // sum, y
retry_compile:;
NSString *mil;
if (g_fp16_io) {
mil = [NSString stringWithFormat:
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
" tensor<int32, [4]> b1 = const()[name = tensor<string, []>(\"b1\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [4]> s1 = const()[name = tensor<string, []>(\"s1\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
" tensor<fp16, [1, %d, 1, %d]> dh1 = slice_by_size(x = x, begin = b1, size = s1)[name = tensor<string, []>(\"sl1\")];\n"
" tensor<int32, [4]> b3 = const()[name = tensor<string, []>(\"b3\"), val = tensor<int32, [4]>([0, %d, 0, 0])];\n"
" tensor<int32, [4]> s3 = const()[name = tensor<string, []>(\"s3\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
" tensor<fp16, [1, %d, 1, %d]> dh3 = slice_by_size(x = x, begin = b3, size = s3)[name = tensor<string, []>(\"sl3\")];\n"
" tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
" tensor<fp16, [%d, %d, 1, 1]> W1t = const()[name = tensor<string, []>(\"W1t\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w1t.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> W3t = const()[name = tensor<string, []>(\"W3t\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w3t.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<fp16, [1, %d, 1, %d]> dx1 = conv(dilations = dl, groups = gr, pad = pd, "
"pad_type = pt, strides = st, weight = W1t, x = dh1)[name = tensor<string, []>(\"cv1\")];\n"
" tensor<fp16, [1, %d, 1, %d]> dx3 = conv(dilations = dl, groups = gr, pad = pd, "
"pad_type = pt, strides = st, weight = W3t, x = dh3)[name = tensor<string, []>(\"cv3\")];\n"
" tensor<fp16, [1, %d, 1, %d]> y = add(x = dx1, y = dx3)[name = tensor<string, []>(\"ad\")];\n"
" } -> (y);\n}\n",
HIDDEN*2, SEQ,
HIDDEN, SEQ, HIDDEN, SEQ,
HIDDEN, HIDDEN, SEQ, HIDDEN, SEQ,
DIM, HIDDEN, DIM, HIDDEN,
DIM, HIDDEN, DIM, HIDDEN,
DIM, SEQ, DIM, SEQ,
DIM, SEQ];
} else {
mil = [NSString stringWithFormat:
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
" tensor<string, []> d1 = const()[name = tensor<string, []>(\"d1\"), val = tensor<string, []>(\"fp16\")];\n"
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = tensor<string, []>(\"cx\")];\n"
" tensor<int32, [4]> b1 = const()[name = tensor<string, []>(\"b1\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [4]> s1 = const()[name = tensor<string, []>(\"s1\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
" tensor<fp16, [1, %d, 1, %d]> dh1 = slice_by_size(x = x16, begin = b1, size = s1)[name = tensor<string, []>(\"sl1\")];\n"
" tensor<int32, [4]> b3 = const()[name = tensor<string, []>(\"b3\"), val = tensor<int32, [4]>([0, %d, 0, 0])];\n"
" tensor<int32, [4]> s3 = const()[name = tensor<string, []>(\"s3\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
" tensor<fp16, [1, %d, 1, %d]> dh3 = slice_by_size(x = x16, begin = b3, size = s3)[name = tensor<string, []>(\"sl3\")];\n"
" tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
" tensor<fp16, [%d, %d, 1, 1]> W1t = const()[name = tensor<string, []>(\"W1t\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w1t.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> W3t = const()[name = tensor<string, []>(\"W3t\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w3t.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<fp16, [1, %d, 1, %d]> dx1 = conv(dilations = dl, groups = gr, pad = pd, "
"pad_type = pt, strides = st, weight = W1t, x = dh1)[name = tensor<string, []>(\"cv1\")];\n"
" tensor<fp16, [1, %d, 1, %d]> dx3 = conv(dilations = dl, groups = gr, pad = pd, "
"pad_type = pt, strides = st, weight = W3t, x = dh3)[name = tensor<string, []>(\"cv3\")];\n"
" tensor<fp16, [1, %d, 1, %d]> sum = add(x = dx1, y = dx3)[name = tensor<string, []>(\"ad\")];\n"
" tensor<string, []> d2 = const()[name = tensor<string, []>(\"d2\"), val = tensor<string, []>(\"fp32\")];\n"
" tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = sum)[name = tensor<string, []>(\"co\")];\n"
" } -> (y);\n}\n",
HIDDEN*2, SEQ, HIDDEN*2, SEQ,
HIDDEN, SEQ, HIDDEN, SEQ,
HIDDEN, HIDDEN, SEQ, HIDDEN, SEQ,
DIM, HIDDEN, DIM, HIDDEN,
DIM, HIDDEN, DIM, HIDDEN,
DIM, SEQ, DIM, SEQ,
DIM, SEQ, DIM, SEQ];
}
NSDictionary *wd = @{
@"@model_path/weights/w1t.bin": @{@"offset":@0, @"data":build_blob_t(W1, HIDDEN, DIM)},
@ -119,6 +151,12 @@ int main() {
NSError *e = nil;
BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
if (!ok && !g_fp16_io) {
printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
g_fp16_io = 1;
[[NSFileManager defaultManager] removeItemAtPath:td error:nil];
goto retry_compile;
}
printf("Compile: %s\n", ok?"OK":"FAIL");
if (!ok) { printf(" %s\n", e?[[e description] UTF8String]:""); return 1; }
ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
@ -130,13 +168,21 @@ int main() {
float *dh3 = (float*)malloc(SEQ*HIDDEN*sizeof(float));
for (int i = 0; i < SEQ*HIDDEN; i++) { dh1[i]=0.01f*sinf(i*0.007f); dh3[i]=0.01f*cosf(i*0.011f); }
IOSurfaceRef ioI = make_surface(HIDDEN*2*SEQ*4), ioO = make_surface(DIM*SEQ*4);
size_t bpe = g_fp16_io ? 2 : 4;
IOSurfaceRef ioI = make_surface(HIDDEN*2*SEQ*bpe), ioO = make_surface(DIM*SEQ*bpe);
IOSurfaceLock(ioI, 0, NULL);
float *dst = (float*)IOSurfaceGetBaseAddress(ioI);
// Channel-first: channels 0..HIDDEN-1 = dh1, channels HIDDEN..2*HIDDEN-1 = dh3
for (int t = 0; t < SEQ; t++) {
for (int c = 0; c < HIDDEN; c++) dst[c*SEQ+t] = dh1[t*HIDDEN+c];
for (int c = 0; c < HIDDEN; c++) dst[(HIDDEN+c)*SEQ+t] = dh3[t*HIDDEN+c];
if (g_fp16_io) {
_Float16 *dst = (_Float16*)IOSurfaceGetBaseAddress(ioI);
for (int t = 0; t < SEQ; t++) {
for (int c = 0; c < HIDDEN; c++) dst[c*SEQ+t] = (_Float16)dh1[t*HIDDEN+c];
for (int c = 0; c < HIDDEN; c++) dst[(HIDDEN+c)*SEQ+t] = (_Float16)dh3[t*HIDDEN+c];
}
} else {
float *dst = (float*)IOSurfaceGetBaseAddress(ioI);
for (int t = 0; t < SEQ; t++) {
for (int c = 0; c < HIDDEN; c++) dst[c*SEQ+t] = dh1[t*HIDDEN+c];
for (int c = 0; c < HIDDEN; c++) dst[(HIDDEN+c)*SEQ+t] = dh3[t*HIDDEN+c];
}
}
IOSurfaceUnlock(ioI, 0, NULL);
@ -164,13 +210,22 @@ int main() {
}
IOSurfaceLock(ioO, kIOSurfaceLockReadOnly, NULL);
float *src = (float*)IOSurfaceGetBaseAddress(ioO);
float maxd = 0;
for (int t = 0; t < SEQ; t++)
for (int c = 0; c < DIM; c++) {
float d = fabsf(src[c*SEQ+t] - ref[t*DIM+c]);
if (d > maxd) maxd = d;
}
if (g_fp16_io) {
_Float16 *src = (_Float16*)IOSurfaceGetBaseAddress(ioO);
for (int t = 0; t < SEQ; t++)
for (int c = 0; c < DIM; c++) {
float d = fabsf((float)src[c*SEQ+t] - ref[t*DIM+c]);
if (d > maxd) maxd = d;
}
} else {
float *src = (float*)IOSurfaceGetBaseAddress(ioO);
for (int t = 0; t < SEQ; t++)
for (int c = 0; c < DIM; c++) {
float d = fabsf(src[c*SEQ+t] - ref[t*DIM+c]);
if (d > maxd) maxd = d;
}
}
IOSurfaceUnlock(ioO, kIOSurfaceLockReadOnly, NULL);
printf("dx max diff: %.6f\n", maxd);

View File

@ -12,6 +12,8 @@
#define DIM 768
#define SEQ 64
static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly
static Class g_D, g_I, g_AR, g_AIO;
static mach_timebase_info_data_t g_tb;
static void ane_init(void) {
@ -56,7 +58,10 @@ static Kern compile_mil(NSString *mil, NSDictionary *wd) {
}
NSError *e = nil;
if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
printf("compile FAIL: %s\n", e?[[e localizedDescription] UTF8String]:""); return k;
printf("compile %s: %s\n", g_fp16_io ? "FAIL" : "failed (will retry)",
e ? [[e localizedDescription] UTF8String] : "");
[[NSFileManager defaultManager] removeItemAtPath:td error:nil];
return k;
}
((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
k.model = mdl; k.td = td;
@ -85,67 +90,108 @@ static void cleanup_kern(Kern *k) {
// Fused QKV: 3 convs + concat in one MIL
static NSString *gen_fused_qkv_mil(void) {
if (g_fp16_io) {
return [NSString stringWithFormat:
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
" tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
" tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = tensor<string, []>(\"Wq\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wq.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = tensor<string, []>(\"Wk\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wk.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = tensor<string, []>(\"Wv\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wv.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<fp16, [1, %d, 1, %d]> q = conv(dilations = dl, groups = gr, pad = pd, "
"pad_type = pt, strides = st, weight = Wq, x = x)[name = tensor<string, []>(\"cq\")];\n"
" tensor<fp16, [1, %d, 1, %d]> k = conv(dilations = dl, groups = gr, pad = pd, "
"pad_type = pt, strides = st, weight = Wk, x = x)[name = tensor<string, []>(\"ck\")];\n"
" tensor<fp16, [1, %d, 1, %d]> v = conv(dilations = dl, groups = gr, pad = pd, "
"pad_type = pt, strides = st, weight = Wv, x = x)[name = tensor<string, []>(\"cv\")];\n"
" tensor<int32, []> ax = const()[name = tensor<string, []>(\"ax\"), val = tensor<int32, []>(1)];\n"
" tensor<bool, []> inter = const()[name = tensor<string, []>(\"il\"), val = tensor<bool, []>(false)];\n"
" tensor<fp16, [1, %d, 1, %d]> y = concat(axis = ax, interleave = inter, values = (q, k, v))[name = tensor<string, []>(\"cat\")];\n"
" } -> (y);\n}\n",
DIM, SEQ,
DIM, DIM, DIM, DIM,
DIM, DIM, DIM, DIM,
DIM, DIM, DIM, DIM,
DIM, SEQ, DIM, SEQ, DIM, SEQ,
DIM*3, SEQ];
}
return [NSString stringWithFormat:
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
" string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
" string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
" int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
" tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = string(\"Wq\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wq.bin\"), offset = uint64(64)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = string(\"Wk\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wk.bin\"), offset = uint64(64)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = string(\"Wv\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wv.bin\"), offset = uint64(64)))];\n"
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
" tensor<string, []> d1 = const()[name = tensor<string, []>(\"d1\"), val = tensor<string, []>(\"fp16\")];\n"
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = tensor<string, []>(\"cx\")];\n"
" tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
" tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = tensor<string, []>(\"Wq\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wq.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = tensor<string, []>(\"Wk\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wk.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = tensor<string, []>(\"Wv\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wv.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<fp16, [1, %d, 1, %d]> q = conv(dilations = dl, groups = gr, pad = pd, "
"pad_type = pt, strides = st, weight = Wq, x = x16)[name = string(\"cq\")];\n"
"pad_type = pt, strides = st, weight = Wq, x = x16)[name = tensor<string, []>(\"cq\")];\n"
" tensor<fp16, [1, %d, 1, %d]> k = conv(dilations = dl, groups = gr, pad = pd, "
"pad_type = pt, strides = st, weight = Wk, x = x16)[name = string(\"ck\")];\n"
"pad_type = pt, strides = st, weight = Wk, x = x16)[name = tensor<string, []>(\"ck\")];\n"
" tensor<fp16, [1, %d, 1, %d]> v = conv(dilations = dl, groups = gr, pad = pd, "
"pad_type = pt, strides = st, weight = Wv, x = x16)[name = string(\"cv\")];\n"
" int32 ax = const()[name = string(\"ax\"), val = int32(1)];\n"
" bool inter = const()[name = string(\"il\"), val = bool(false)];\n"
" tensor<fp16, [1, %d, 1, %d]> qkv = concat(axis = ax, interleave = inter, values = (q, k, v))[name = string(\"cat\")];\n"
" string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
" tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = qkv)[name = string(\"co\")];\n"
"pad_type = pt, strides = st, weight = Wv, x = x16)[name = tensor<string, []>(\"cv\")];\n"
" tensor<int32, []> ax = const()[name = tensor<string, []>(\"ax\"), val = tensor<int32, []>(1)];\n"
" tensor<bool, []> inter = const()[name = tensor<string, []>(\"il\"), val = tensor<bool, []>(false)];\n"
" tensor<fp16, [1, %d, 1, %d]> qkv = concat(axis = ax, interleave = inter, values = (q, k, v))[name = tensor<string, []>(\"cat\")];\n"
" tensor<string, []> d2 = const()[name = tensor<string, []>(\"d2\"), val = tensor<string, []>(\"fp32\")];\n"
" tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = qkv)[name = tensor<string, []>(\"co\")];\n"
" } -> (y);\n}\n",
DIM, SEQ, DIM, SEQ,
DIM, DIM, DIM, DIM, // Wq
DIM, DIM, DIM, DIM, // Wk
DIM, DIM, DIM, DIM, // Wv
DIM, SEQ, // q
DIM, SEQ, // k
DIM, SEQ, // v
DIM*3, SEQ, // concat
DIM*3, SEQ]; // output
DIM, DIM, DIM, DIM,
DIM, DIM, DIM, DIM,
DIM, DIM, DIM, DIM,
DIM, SEQ, DIM, SEQ, DIM, SEQ,
DIM*3, SEQ, DIM*3, SEQ];
}
// Single conv MIL for comparison
static NSString *gen_single_mil(void) {
if (g_fp16_io) {
return [NSString stringWithFormat:
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
" tensor<fp16, [1, %d, 1, %d]> y = conv(dilations = dl, groups = gr, pad = pd, "
"pad_type = pt, strides = st, weight = W, x = x)[name = tensor<string, []>(\"cv\")];\n"
" } -> (y);\n}\n",
DIM, SEQ, DIM, DIM, DIM, DIM, DIM, SEQ];
}
return [NSString stringWithFormat:
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
" string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w.bin\"), offset = uint64(64)))];\n"
" string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
" int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
" tensor<string, []> d1 = const()[name = tensor<string, []>(\"d1\"), val = tensor<string, []>(\"fp16\")];\n"
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = tensor<string, []>(\"cx\")];\n"
" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
" tensor<fp16, [1, %d, 1, %d]> y16 = conv(dilations = dl, groups = gr, pad = pd, "
"pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n"
" string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
" tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n"
"pad_type = pt, strides = st, weight = W, x = x16)[name = tensor<string, []>(\"cv\")];\n"
" tensor<string, []> d2 = const()[name = tensor<string, []>(\"d2\"), val = tensor<string, []>(\"fp32\")];\n"
" tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = y16)[name = tensor<string, []>(\"co\")];\n"
" } -> (y);\n}\n",
DIM, SEQ, DIM, SEQ, DIM, DIM, DIM, DIM, DIM, SEQ, DIM, SEQ];
}
@ -170,12 +216,18 @@ int main() {
for (int i = 0; i < SEQ*DIM; i++) x[i] = 0.1f*(2*drand48()-1);
// === Compile fused QKV ===
retry_compile:;
NSDictionary *fused_wd = @{
@"@model_path/weights/wq.bin": @{@"offset":@0, @"data":build_blob(Wq, DIM, DIM)},
@"@model_path/weights/wk.bin": @{@"offset":@0, @"data":build_blob(Wk, DIM, DIM)},
@"@model_path/weights/wv.bin": @{@"offset":@0, @"data":build_blob(Wv, DIM, DIM)},
};
Kern kFused = compile_mil(gen_fused_qkv_mil(), fused_wd);
if (!kFused.model && !g_fp16_io) {
printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
g_fp16_io = 1;
goto retry_compile;
}
printf("Fused QKV: %s\n", kFused.model ? "OK" : "FAIL");
// === Compile 3 separate ===
@ -187,16 +239,24 @@ int main() {
if (!kFused.model || !kQ.model) goto done;
// IOSurfaces
size_t in_bytes = DIM*SEQ*4, out1_bytes = DIM*SEQ*4, out3_bytes = DIM*3*SEQ*4;
size_t bpe = g_fp16_io ? 2 : 4;
size_t in_bytes = DIM*SEQ*bpe, out1_bytes = DIM*SEQ*bpe, out3_bytes = DIM*3*SEQ*bpe;
IOSurfaceRef ioIn = make_surface(in_bytes);
IOSurfaceRef ioFused = make_surface(out3_bytes);
IOSurfaceRef ioQ = make_surface(out1_bytes), ioK = make_surface(out1_bytes), ioV = make_surface(out1_bytes);
IOSurfaceLock(ioIn, 0, NULL);
float *dst = (float*)IOSurfaceGetBaseAddress(ioIn);
for (int t = 0; t < SEQ; t++)
for (int c = 0; c < DIM; c++)
dst[c*SEQ+t] = x[t*DIM+c];
if (g_fp16_io) {
_Float16 *dst = (_Float16*)IOSurfaceGetBaseAddress(ioIn);
for (int t = 0; t < SEQ; t++)
for (int c = 0; c < DIM; c++)
dst[c*SEQ+t] = (_Float16)x[t*DIM+c];
} else {
float *dst = (float*)IOSurfaceGetBaseAddress(ioIn);
for (int t = 0; t < SEQ; t++)
for (int c = 0; c < DIM; c++)
dst[c*SEQ+t] = x[t*DIM+c];
}
IOSurfaceUnlock(ioIn, 0, NULL);
// Eval fused
@ -212,17 +272,30 @@ int main() {
IOSurfaceLock(ioQ, kIOSurfaceLockReadOnly, NULL);
IOSurfaceLock(ioK, kIOSurfaceLockReadOnly, NULL);
IOSurfaceLock(ioV, kIOSurfaceLockReadOnly, NULL);
float *fo = (float*)IOSurfaceGetBaseAddress(ioFused);
float *qo = (float*)IOSurfaceGetBaseAddress(ioQ);
float *ko = (float*)IOSurfaceGetBaseAddress(ioK);
float *vo = (float*)IOSurfaceGetBaseAddress(ioV);
float dq=0, dk=0, dv=0;
for (int c = 0; c < DIM; c++)
for (int t = 0; t < SEQ; t++) {
float d1 = fabsf(fo[c*SEQ+t] - qo[c*SEQ+t]); if(d1>dq) dq=d1;
float d2 = fabsf(fo[(DIM+c)*SEQ+t] - ko[c*SEQ+t]); if(d2>dk) dk=d2;
float d3 = fabsf(fo[(DIM*2+c)*SEQ+t] - vo[c*SEQ+t]); if(d3>dv) dv=d3;
}
if (g_fp16_io) {
_Float16 *fo = (_Float16*)IOSurfaceGetBaseAddress(ioFused);
_Float16 *qo = (_Float16*)IOSurfaceGetBaseAddress(ioQ);
_Float16 *ko = (_Float16*)IOSurfaceGetBaseAddress(ioK);
_Float16 *vo = (_Float16*)IOSurfaceGetBaseAddress(ioV);
for (int c = 0; c < DIM; c++)
for (int t = 0; t < SEQ; t++) {
float d1 = fabsf((float)fo[c*SEQ+t] - (float)qo[c*SEQ+t]); if(d1>dq) dq=d1;
float d2 = fabsf((float)fo[(DIM+c)*SEQ+t] - (float)ko[c*SEQ+t]); if(d2>dk) dk=d2;
float d3 = fabsf((float)fo[(DIM*2+c)*SEQ+t] - (float)vo[c*SEQ+t]); if(d3>dv) dv=d3;
}
} else {
float *fo = (float*)IOSurfaceGetBaseAddress(ioFused);
float *qo = (float*)IOSurfaceGetBaseAddress(ioQ);
float *ko = (float*)IOSurfaceGetBaseAddress(ioK);
float *vo = (float*)IOSurfaceGetBaseAddress(ioV);
for (int c = 0; c < DIM; c++)
for (int t = 0; t < SEQ; t++) {
float d1 = fabsf(fo[c*SEQ+t] - qo[c*SEQ+t]); if(d1>dq) dq=d1;
float d2 = fabsf(fo[(DIM+c)*SEQ+t] - ko[c*SEQ+t]); if(d2>dk) dk=d2;
float d3 = fabsf(fo[(DIM*2+c)*SEQ+t] - vo[c*SEQ+t]); if(d3>dv) dv=d3;
}
}
IOSurfaceUnlock(ioFused, kIOSurfaceLockReadOnly, NULL);
IOSurfaceUnlock(ioQ, kIOSurfaceLockReadOnly, NULL);
IOSurfaceUnlock(ioK, kIOSurfaceLockReadOnly, NULL);

View File

@ -10,6 +10,8 @@
static mach_timebase_info_data_t g_tb;
static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }
static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly
static void dump_class(const char *name) {
Class cls = NSClassFromString([NSString stringWithUTF8String:name]);
if (!cls) { printf(" %s: NOT FOUND\n", name); return; }
@ -118,28 +120,43 @@ int main() {
NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES];
free(w);
NSString *mil = [NSString stringWithFormat:
@"program(1.3)\n"
"[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
"{\"coremltools-version\", \"9.0\"}})]\n"
"{\n"
" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
" string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
" tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
" tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
" int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
" string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n"
" tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n"
" tensor<fp16, [%d,%d,1,1]> W = const()[name=string(\"W\"), "
"val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n"
" tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
"[name=string(\"conv\")];\n"
" string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n"
" tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n"
" } -> (y);\n"
"}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP];
retry_compile:;
NSString *mil;
if (g_fp16_io) {
mil = [NSString stringWithFormat:
@"program(1.0)\n"
"[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
" tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
" tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
" tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
" tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"
" tensor<fp16, [%d,%d,1,1]> W = const()[name=tensor<string, []>(\"W\"), "
"val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n"
" tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)"
"[name=tensor<string, []>(\"conv\")];\n"
" } -> (y);\n}\n", CH, SP, CH, CH, CH, CH, CH, SP];
} else {
mil = [NSString stringWithFormat:
@"program(1.0)\n"
"[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
" tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
" tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
" tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
" tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"
" tensor<string, []> to16 = const()[name=tensor<string, []>(\"to16\"), val=tensor<string, []>(\"fp16\")];\n"
" tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=tensor<string, []>(\"cin\")];\n"
" tensor<fp16, [%d,%d,1,1]> W = const()[name=tensor<string, []>(\"W\"), "
"val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n"
" tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
"[name=tensor<string, []>(\"conv\")];\n"
" tensor<string, []> to32 = const()[name=tensor<string, []>(\"to32\"), val=tensor<string, []>(\"fp32\")];\n"
" tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=tensor<string, []>(\"cout\")];\n"
" } -> (y);\n}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP];
}
NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:),
@ -153,10 +170,15 @@ int main() {
[wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
NSError *e = nil;
((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
BOOL compiled = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
if (!compiled && !g_fp16_io) {
printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
g_fp16_io = 1;
goto retry_compile;
}
((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
int ioBytes = CH * SP * 4; // fp32
int ioBytes = CH * SP * (g_fp16_io ? 2 : 4);
IOSurfaceRef ioIn = make_surface(ioBytes);
IOSurfaceRef ioOut = make_surface(ioBytes);
id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn);
@ -174,8 +196,13 @@ int main() {
if (req) {
IOSurfaceLock(ioIn, 0, NULL);
float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
for (int i = 0; i < CH*SP; i++) inp[i] = 1.0f;
if (g_fp16_io) {
_Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn);
for (int i = 0; i < CH*SP; i++) inp[i] = (_Float16)1.0f;
} else {
float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
for (int i = 0; i < CH*SP; i++) inp[i] = 1.0f;
}
IOSurfaceUnlock(ioIn, 0, NULL);
BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(

View File

@ -10,6 +10,8 @@
static mach_timebase_info_data_t g_tb;
static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }
static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly
static IOSurfaceRef make_surface(size_t bytes) {
return IOSurfaceCreate((__bridge CFDictionaryRef)@{
(id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
@ -38,37 +40,49 @@ int main() {
for (int i = 0; i < CH*CH; i++) wp[i] = (_Float16)(0.01f * (i % 100 - 50));
NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES];
NSString *mil = [NSString stringWithFormat:
@"program(1.3)\n"
"[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
"{\"coremltools-version\", \"9.0\"}})]\n"
"{\n"
" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
" string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
" tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
" tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
" int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
" string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n"
" tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n"
" tensor<fp16, [%d,%d,1,1]> W = const()[name=string(\"W\"), "
"val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n"
" tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
"[name=string(\"conv\")];\n"
" string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n"
" tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n"
" } -> (y);\n"
"}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP];
NSDictionary *weights = @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}};
NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding];
NSFileManager *fm = [NSFileManager defaultManager];
printf("=== QoS Sweep: compile/load/eval with varying QoS ===\n");
printf("Kernel: %dx%d conv, spatial=%d (%.1f MFLOPS)\n", CH, CH, SP, 2.0*CH*CH*SP/1e6);
printf("%4s %10s %10s %10s %10s %s\n", "QoS", "Compile", "Load", "Eval(1)", "Eval(avg10)", "Status");
retry_mil:;
NSString *mil;
if (g_fp16_io) {
mil = [NSString stringWithFormat:
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
" tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
" tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
" tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
" tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"
" tensor<fp16, [%d,%d,1,1]> W = const()[name=tensor<string, []>(\"W\"), "
"val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n"
" tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)"
"[name=tensor<string, []>(\"conv\")];\n"
" } -> (y);\n}\n", CH, SP, CH, CH, CH, CH, CH, SP];
} else {
mil = [NSString stringWithFormat:
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
" tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
" tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
" tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
" tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"
" tensor<string, []> to16 = const()[name=tensor<string, []>(\"to16\"), val=tensor<string, []>(\"fp16\")];\n"
" tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=tensor<string, []>(\"cin\")];\n"
" tensor<fp16, [%d,%d,1,1]> W = const()[name=tensor<string, []>(\"W\"), "
"val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n"
" tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
"[name=tensor<string, []>(\"conv\")];\n"
" tensor<string, []> to32 = const()[name=tensor<string, []>(\"to32\"), val=tensor<string, []>(\"fp32\")];\n"
" tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=tensor<string, []>(\"cout\")];\n"
" } -> (y);\n}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP];
}
NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding];
unsigned int qos_values[] = {0, 1, 5, 10, 15, 17, 19, 21, 25, 31, 33, 40, 47, 50, 55, 60, 63};
int n_qos = sizeof(qos_values)/sizeof(qos_values[0]);
@ -98,6 +112,12 @@ int main() {
double cms = tb_ms(mach_absolute_time() - t0);
if (!cok) {
if (!g_fp16_io) {
printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
g_fp16_io = 1;
[fm removeItemAtPath:td error:nil];
goto retry_mil;
}
printf("%4u %10s %10s %10s %10s COMPILE_FAIL\n", qos, "-", "-", "-", "-");
[fm removeItemAtPath:td error:nil];
continue;
@ -115,7 +135,7 @@ int main() {
continue;
}
int ioBytes = CH * SP * 4;
int ioBytes = CH * SP * (g_fp16_io ? 2 : 4);
IOSurfaceRef ioIn = make_surface(ioBytes);
IOSurfaceRef ioOut = make_surface(ioBytes);
id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn);
@ -125,8 +145,13 @@ int main() {
@[wI], @[@0], @[wO], @[@0], nil, nil, @0);
IOSurfaceLock(ioIn, 0, NULL);
float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
for (int i = 0; i < CH*SP; i++) inp[i] = 0.5f;
if (g_fp16_io) {
_Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn);
for (int i = 0; i < CH*SP; i++) inp[i] = (_Float16)0.5f;
} else {
float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
for (int i = 0; i < CH*SP; i++) inp[i] = 0.5f;
}
IOSurfaceUnlock(ioIn, 0, NULL);
t0 = mach_absolute_time();

View File

@ -34,30 +34,42 @@ static NSData *build_weight_blob(_Float16 *w, int rows, int cols) {
return [NSData dataWithBytesNoCopy:b length:tot freeWhenDone:YES];
}
// Generate MIL for a simple conv: fp32 in cast fp16 conv cast fp32 out
static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly
// Generate MIL for a simple conv (fp16 I/O when g_fp16_io, else fp32 with casts)
static NSString *gen_mil(int ch, int sp) {
if (g_fp16_io) {
return [NSString stringWithFormat:
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
" tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
" tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
" tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
" tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"
" tensor<fp16, [%d,%d,1,1]> W = const()[name=tensor<string, []>(\"W\"), "
"val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n"
" tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)"
"[name=tensor<string, []>(\"conv\")];\n"
" } -> (y);\n}\n", ch, sp, ch, ch, ch, ch, ch, sp];
}
return [NSString stringWithFormat:
@"program(1.3)\n"
"[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
"{\"coremltools-version\", \"9.0\"}})]\n"
"{\n"
" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
" string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
" tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
" tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
" int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
" string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n"
" tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n"
" tensor<fp16, [%d,%d,1,1]> W = const()[name=string(\"W\"), "
"val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n"
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
" tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
" tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
" tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
" tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"
" tensor<string, []> to16 = const()[name=tensor<string, []>(\"to16\"), val=tensor<string, []>(\"fp16\")];\n"
" tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=tensor<string, []>(\"cin\")];\n"
" tensor<fp16, [%d,%d,1,1]> W = const()[name=tensor<string, []>(\"W\"), "
"val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n"
" tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
"[name=string(\"conv\")];\n"
" string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n"
" tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n"
" } -> (y);\n"
"}\n", ch, sp, ch, sp, ch, ch, ch, ch, ch, sp, ch, sp];
"[name=tensor<string, []>(\"conv\")];\n"
" tensor<string, []> to32 = const()[name=tensor<string, []>(\"to32\"), val=tensor<string, []>(\"fp32\")];\n"
" tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=tensor<string, []>(\"cout\")];\n"
" } -> (y);\n}\n", ch, sp, ch, sp, ch, ch, ch, ch, ch, sp, ch, sp];
}
int main() {
@ -88,6 +100,9 @@ int main() {
for (int i = 0; i < CH; i++) weightsB[i*CH+i] = (_Float16)3.0f;
NSData *wdataA = build_weight_blob(weightsA, CH, CH);
NSFileManager *fm = [NSFileManager defaultManager];
retry_compile:;
NSString *mil = gen_mil(CH, SP);
NSDictionary *weights = @{
@"@model_path/weights/weight.bin": @{@"offset": @0, @"data": wdataA}
@ -103,13 +118,18 @@ int main() {
id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
NSFileManager *fm = [NSFileManager defaultManager];
[fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
[milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
[wdataA writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
NSError *e = nil;
BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
if (!ok && !g_fp16_io) {
printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
g_fp16_io = 1;
[fm removeItemAtPath:td error:nil];
goto retry_compile;
}
if (!ok) { printf("FAIL: compile: %s\n", [[e description] UTF8String]); return 1; }
ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
if (!ok) { printf("FAIL: load: %s\n", [[e description] UTF8String]); return 1; }
@ -117,9 +137,10 @@ int main() {
printf(" Compile+load: %.1fms\n", compile_ms);
printf(" tmpDir: %s\n", [td UTF8String]);
// Build request and IOSurfaces (fp32 I/O)
int inBytes = CH * SP * 4; // fp32
int outBytes = CH * SP * 4;
// Build request and IOSurfaces
size_t bpe = g_fp16_io ? 2 : 4;
int inBytes = CH * SP * bpe;
int outBytes = CH * SP * bpe;
IOSurfaceRef ioIn = make_surface(inBytes);
IOSurfaceRef ioOut = make_surface(outBytes);
id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn);
@ -130,10 +151,17 @@ int main() {
// Write input: channel c, spatial s = (c*SP + s + 1) * 0.01
IOSurfaceLock(ioIn, 0, NULL);
float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
for (int c = 0; c < CH; c++)
for (int s = 0; s < SP; s++)
inp[c*SP+s] = (float)(c*SP + s + 1) * 0.01f;
if (g_fp16_io) {
_Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn);
for (int c = 0; c < CH; c++)
for (int s = 0; s < SP; s++)
inp[c*SP+s] = (_Float16)((float)(c*SP + s + 1) * 0.01f);
} else {
float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
for (int c = 0; c < CH; c++)
for (int s = 0; s < SP; s++)
inp[c*SP+s] = (float)(c*SP + s + 1) * 0.01f;
}
IOSurfaceUnlock(ioIn, 0, NULL);
// Eval with weights A
@ -142,13 +170,17 @@ int main() {
if (!ok) { printf("FAIL: eval: %s\n", e ? [[e description] UTF8String] : "?"); return 1; }
IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL);
float *outA = (float*)IOSurfaceGetBaseAddress(ioOut);
printf(" Output A[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outA[0], outA[1], outA[2], outA[3]);
float *outA_copy = (float*)malloc(CH * SP * sizeof(float));
if (g_fp16_io) {
_Float16 *outA = (_Float16*)IOSurfaceGetBaseAddress(ioOut);
for (int i = 0; i < CH*SP; i++) outA_copy[i] = (float)outA[i];
} else {
float *outA = (float*)IOSurfaceGetBaseAddress(ioOut);
memcpy(outA_copy, outA, CH * SP * sizeof(float));
}
printf(" Output A[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outA_copy[0], outA_copy[1], outA_copy[2], outA_copy[3]);
printf(" Output A[%d..%d]: [%.4f, %.4f, %.4f, %.4f]\n", CH*SP-4, CH*SP-1,
outA[CH*SP-4], outA[CH*SP-3], outA[CH*SP-2], outA[CH*SP-1]);
// Save copy
float *outA_copy = (float*)malloc(outBytes);
memcpy(outA_copy, outA, outBytes);
outA_copy[CH*SP-4], outA_copy[CH*SP-3], outA_copy[CH*SP-2], outA_copy[CH*SP-1]);
IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL);
// === Step 3: Overwrite weight file with B, unload+load ===
@ -189,10 +221,17 @@ int main() {
// Re-write same input
IOSurfaceLock(ioIn, 0, NULL);
inp = (float*)IOSurfaceGetBaseAddress(ioIn);
for (int c = 0; c < CH; c++)
for (int s = 0; s < SP; s++)
inp[c*SP+s] = (float)(c*SP + s + 1) * 0.01f;
if (g_fp16_io) {
_Float16 *inp2 = (_Float16*)IOSurfaceGetBaseAddress(ioIn);
for (int c = 0; c < CH; c++)
for (int s = 0; s < SP; s++)
inp2[c*SP+s] = (_Float16)((float)(c*SP + s + 1) * 0.01f);
} else {
float *inp2 = (float*)IOSurfaceGetBaseAddress(ioIn);
for (int c = 0; c < CH; c++)
for (int s = 0; s < SP; s++)
inp2[c*SP+s] = (float)(c*SP + s + 1) * 0.01f;
}
IOSurfaceUnlock(ioIn, 0, NULL);
// Eval with (possibly reloaded) weights B
@ -201,16 +240,23 @@ int main() {
if (!ok) { printf("FAIL: eval after reload: %s\n", e ? [[e description] UTF8String] : "?"); return 1; }
IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL);
float *outB = (float*)IOSurfaceGetBaseAddress(ioOut);
printf(" Output B[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outB[0], outB[1], outB[2], outB[3]);
float *outB_f = (float*)malloc(CH * SP * sizeof(float));
if (g_fp16_io) {
_Float16 *outB = (_Float16*)IOSurfaceGetBaseAddress(ioOut);
for (int i = 0; i < CH*SP; i++) outB_f[i] = (float)outB[i];
} else {
float *outB = (float*)IOSurfaceGetBaseAddress(ioOut);
memcpy(outB_f, outB, CH * SP * sizeof(float));
}
printf(" Output B[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outB_f[0], outB_f[1], outB_f[2], outB_f[3]);
printf(" Output B[%d..%d]: [%.4f, %.4f, %.4f, %.4f]\n", CH*SP-4, CH*SP-1,
outB[CH*SP-4], outB[CH*SP-3], outB[CH*SP-2], outB[CH*SP-1]);
outB_f[CH*SP-4], outB_f[CH*SP-3], outB_f[CH*SP-2], outB_f[CH*SP-1]);
// Check: did the output change?
bool changed = false;
float max_diff = 0;
for (int i = 0; i < CH*SP; i++) {
float d = fabsf(outB[i] - outA_copy[i]);
float d = fabsf(outB_f[i] - outA_copy[i]);
if (d > max_diff) max_diff = d;
if (d > 0.001f) changed = true;
}
@ -219,11 +265,12 @@ int main() {
float max_3x_err = 0;
for (int i = 0; i < CH*SP; i++) {
float expected = outA_copy[i] * 3.0f;
float err = fabsf(outB[i] - expected);
float err = fabsf(outB_f[i] - expected);
if (err > max_3x_err) max_3x_err = err;
if (err > 0.1f) correct_3x = false;
}
IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL);
free(outB_f);
printf("\n=== RESULT ===\n");
printf(" Max A-B diff: %.6f\n", max_diff);

View File

@ -59,25 +59,43 @@ static NSData *build_blob_transposed(const float *w, int rows, int cols) {
return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
}
static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly
static NSString *gen_conv_mil(int in_ch, int out_ch, int sp) {
if (g_fp16_io) {
// fp16 I/O path no cast ops (M1/M2 compatible)
return [NSString stringWithFormat:
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
" tensor<fp16, [1, %d, 1, %d]> y = conv(dilations = dl, groups = gr, pad = pd, "
"pad_type = pt, strides = st, weight = W, x = x)[name = tensor<string, []>(\"cv\")];\n"
" } -> (y);\n}\n",
in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp];
}
// fp32 I/O path cast to/from fp16 internally (M4+ native)
return [NSString stringWithFormat:
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
" string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
" string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
" int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
" tensor<string, []> d1 = const()[name = tensor<string, []>(\"d1\"), val = tensor<string, []>(\"fp16\")];\n"
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = tensor<string, []>(\"cx\")];\n"
" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
" tensor<fp16, [1, %d, 1, %d]> y16 = conv(dilations = dl, groups = gr, pad = pd, "
"pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n"
" string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
" tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n"
"pad_type = pt, strides = st, weight = W, x = x16)[name = tensor<string, []>(\"cv\")];\n"
" tensor<string, []> d2 = const()[name = tensor<string, []>(\"d2\"), val = tensor<string, []>(\"fp32\")];\n"
" tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = y16)[name = tensor<string, []>(\"co\")];\n"
" } -> (y);\n}\n",
in_ch, sp, in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp, out_ch, sp];
}
@ -106,10 +124,19 @@ static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp)
[milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
[blob writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
NSError *e = nil;
if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) return NULL;
if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
if (!g_fp16_io) {
// M1/M2 ANE doesn't support cast op retry with fp16 I/O
printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
g_fp16_io = 1;
return compile_kern_with_blob(blob, in_ch, out_ch, sp);
}
return NULL;
}
if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) return NULL;
__sync_fetch_and_add(&g_compile_count, 1);
size_t inB = in_ch * sp * 4, outB = out_ch * sp * 4;
size_t bpe = g_fp16_io ? 2 : 4;
size_t inB = in_ch * sp * bpe, outB = out_ch * sp * bpe;
IOSurfaceRef ioI = make_surface(inB), ioO = make_surface(outB);
id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI);
id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO);
@ -140,27 +167,43 @@ static void free_kern(Kern *k) {
}
static void ane_eval_k(Kern *k, const float *in, float *out, int in_ch, int out_ch, int sp) {
float *tmp = (float*)malloc(in_ch * sp * sizeof(float));
for (int t = 0; t < sp; t++)
for (int c = 0; c < in_ch; c++)
tmp[c*sp + t] = in[t*in_ch + c];
// Transpose [S,C] -> [C,S] and write to IOSurface
IOSurfaceLock(k->ioIn, 0, NULL);
memcpy(IOSurfaceGetBaseAddress(k->ioIn), tmp, in_ch * sp * sizeof(float));
void *base_in = IOSurfaceGetBaseAddress(k->ioIn);
if (g_fp16_io) {
_Float16 *dst = (_Float16*)base_in;
for (int t = 0; t < sp; t++)
for (int c = 0; c < in_ch; c++)
dst[c*sp + t] = (_Float16)in[t*in_ch + c];
} else {
float *dst = (float*)base_in;
for (int t = 0; t < sp; t++)
for (int c = 0; c < in_ch; c++)
dst[c*sp + t] = in[t*in_ch + c];
}
IOSurfaceUnlock(k->ioIn, 0, NULL);
free(tmp);
NSError *e = nil;
id mdl = (__bridge id)k->model;
id req = (__bridge id)k->request;
((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
float *tmp2 = (float*)malloc(out_ch * sp * sizeof(float));
// Read output, transpose [C,S] -> [S,C]
IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
memcpy(tmp2, IOSurfaceGetBaseAddress(k->ioOut), out_ch * sp * sizeof(float));
void *base_out = IOSurfaceGetBaseAddress(k->ioOut);
if (g_fp16_io) {
_Float16 *src = (_Float16*)base_out;
for (int t = 0; t < sp; t++)
for (int c = 0; c < out_ch; c++)
out[t*out_ch + c] = (float)src[c*sp + t];
} else {
float *src = (float*)base_out;
for (int t = 0; t < sp; t++)
for (int c = 0; c < out_ch; c++)
out[t*out_ch + c] = src[c*sp + t];
}
IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
for (int t = 0; t < sp; t++)
for (int c = 0; c < out_ch; c++)
out[t*out_ch + c] = tmp2[c*sp + t];
free(tmp2);
}
// === Checkpoint: save/restore training state for exec() restart ===
@ -173,6 +216,7 @@ typedef struct {
float lr;
double cum_compile_ms, cum_train_ms, cum_wall_ms;
int cum_steps, cum_batches;
int fp16_io; // persisted: 1 if ANE needs fp16 I/O (M1/M2)
} CkptHeader;
static void save_checkpoint(const char *path, int step, float loss,
@ -180,7 +224,7 @@ static void save_checkpoint(const char *path, int step, float loss,
const float *W1, const float *W2,
double cc, double ct, double cw, int cs, int cb) {
FILE *f = fopen(path, "wb");
CkptHeader hdr = {step, loss, D, H, S, total_steps, lr, cc, ct, cw, cs, cb};
CkptHeader hdr = {step, loss, D, H, S, total_steps, lr, cc, ct, cw, cs, cb, g_fp16_io};
fwrite(&hdr, sizeof(hdr), 1, f);
fwrite(W1, sizeof(float), H * D, f);
fwrite(W2, sizeof(float), D * H, f);
@ -241,8 +285,9 @@ int main(int argc, char *argv[]) {
start_step = hdr.step;
total_steps = hdr.total_steps;
lr = hdr.lr;
g_fp16_io = hdr.fp16_io;
resuming = true;
printf("[RESUMED at step %d, loss=%.6f, compiles reset]\n", start_step, hdr.loss);
printf("[RESUMED at step %d, loss=%.6f, fp16_io=%d, compiles reset]\n", start_step, hdr.loss, g_fp16_io);
}
}

View File

@ -59,34 +59,50 @@ static NSData *build_blob_transposed(const float *w, int rows, int cols) {
return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
}
static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly
static NSString *gen_conv_mil(int in_ch, int out_ch, int sp) {
if (g_fp16_io) {
return [NSString stringWithFormat:
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
" tensor<fp16, [1, %d, 1, %d]> y = conv(dilations = dl, groups = gr, pad = pd, "
"pad_type = pt, strides = st, weight = W, x = x)[name = tensor<string, []>(\"cv\")];\n"
" } -> (y);\n}\n",
in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp];
}
return [NSString stringWithFormat:
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
" string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
" string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
" int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
" func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
" tensor<string, []> d1 = const()[name = tensor<string, []>(\"d1\"), val = tensor<string, []>(\"fp16\")];\n"
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = tensor<string, []>(\"cx\")];\n"
" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
" tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
" tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
" tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
" tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
" tensor<fp16, [1, %d, 1, %d]> y16 = conv(dilations = dl, groups = gr, pad = pd, "
"pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n"
" string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
" tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n"
"pad_type = pt, strides = st, weight = W, x = x16)[name = tensor<string, []>(\"cv\")];\n"
" tensor<string, []> d2 = const()[name = tensor<string, []>(\"d2\"), val = tensor<string, []>(\"fp32\")];\n"
" tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = y16)[name = tensor<string, []>(\"co\")];\n"
" } -> (y);\n}\n",
in_ch, sp, in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp, out_ch, sp];
}
typedef struct {
id model;
void *model; // CFBridgingRetain'd _ANEInMemoryModel
IOSurfaceRef ioIn, ioOut;
id request;
NSString *tmpDir;
void *request; // CFBridgingRetain'd _ANERequest
void *tmpDir; // CFBridgingRetain'd NSString
} Kern;
static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp) {
@ -103,9 +119,17 @@ static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp)
[milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
[blob writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
NSError *e = nil;
if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) return NULL;
if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
if (!g_fp16_io) {
printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
g_fp16_io = 1;
return compile_kern_with_blob(blob, in_ch, out_ch, sp);
}
return NULL;
}
if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) return NULL;
size_t inB = in_ch * sp * 4, outB = out_ch * sp * 4;
size_t bpe = g_fp16_io ? 2 : 4;
size_t inB = in_ch * sp * bpe, outB = out_ch * sp * bpe;
IOSurfaceRef ioI = make_surface(inB), ioO = make_surface(outB);
id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI);
id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO);
@ -113,40 +137,60 @@ static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp)
@selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
@[wI], @[@0], @[wO], @[@0], nil, nil, @0);
Kern *k = calloc(1, sizeof(Kern));
k->model = mdl; k->ioIn = ioI; k->ioOut = ioO; k->request = req; k->tmpDir = td;
k->model = (void*)CFBridgingRetain(mdl);
k->ioIn = ioI; k->ioOut = ioO;
k->request = (void*)CFBridgingRetain(req);
k->tmpDir = (void*)CFBridgingRetain(td);
return k;
}
static void free_kern(Kern *k) {
if (!k) return;
id mdl = (__bridge id)k->model;
NSError *e = nil;
((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(k->model, @selector(unloadWithQoS:error:), 21, &e);
((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
CFRelease(k->ioIn); CFRelease(k->ioOut);
[[NSFileManager defaultManager] removeItemAtPath:k->tmpDir error:nil];
NSString *td = (__bridge id)k->tmpDir;
[[NSFileManager defaultManager] removeItemAtPath:td error:nil];
CFRelease(k->model); CFRelease(k->request); CFRelease(k->tmpDir);
free(k);
}
// ANE eval: input [S, in_ch] row-major [in_ch, S] channels-first
static void ane_eval(Kern *k, const float *in, float *out, int in_ch, int out_ch, int sp) {
float *tmp = (float*)malloc(in_ch * sp * sizeof(float));
for (int t = 0; t < sp; t++)
for (int c = 0; c < in_ch; c++)
tmp[c*sp + t] = in[t*in_ch + c];
IOSurfaceLock(k->ioIn, 0, NULL);
memcpy(IOSurfaceGetBaseAddress(k->ioIn), tmp, in_ch * sp * sizeof(float));
void *base_in = IOSurfaceGetBaseAddress(k->ioIn);
if (g_fp16_io) {
_Float16 *dst = (_Float16*)base_in;
for (int t = 0; t < sp; t++)
for (int c = 0; c < in_ch; c++)
dst[c*sp + t] = (_Float16)in[t*in_ch + c];
} else {
float *dst = (float*)base_in;
for (int t = 0; t < sp; t++)
for (int c = 0; c < in_ch; c++)
dst[c*sp + t] = in[t*in_ch + c];
}
IOSurfaceUnlock(k->ioIn, 0, NULL);
free(tmp);
NSError *e = nil;
id mdl = (__bridge id)k->model;
id req = (__bridge id)k->request;
((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
k->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, k->request, &e);
float *tmp2 = (float*)malloc(out_ch * sp * sizeof(float));
mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
memcpy(tmp2, IOSurfaceGetBaseAddress(k->ioOut), out_ch * sp * sizeof(float));
void *base_out = IOSurfaceGetBaseAddress(k->ioOut);
if (g_fp16_io) {
_Float16 *src = (_Float16*)base_out;
for (int t = 0; t < sp; t++)
for (int c = 0; c < out_ch; c++)
out[t*out_ch + c] = (float)src[c*sp + t];
} else {
float *src = (float*)base_out;
for (int t = 0; t < sp; t++)
for (int c = 0; c < out_ch; c++)
out[t*out_ch + c] = src[c*sp + t];
}
IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
for (int t = 0; t < sp; t++)
for (int c = 0; c < out_ch; c++)
out[t*out_ch + c] = tmp2[c*sp + t];
free(tmp2);
}
int main(int argc, char *argv[]) {

View File

@ -8,6 +8,8 @@
#include <math.h>
#include "backward.h"
int g_fp16_io = 0; // M1/M2: use fp16 I/O when cast op unsupported
static mach_timebase_info_data_t g_tb;
static double ticksToMs(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }