mirror of https://github.com/maderix/ANE.git
Fix MIL syntax for cross-generation ANE compatibility
The MIL scalar types used shorthand syntax (string("x"), int32(1)) that
only works on M4. Changed to the canonical verbose format that CoreML's
own compiler emits (tensor<string, []>("x"), tensor<int32, []>(1)).
Also targets program(1.0) with <ios16> instead of program(1.3)/<ios18>,
and simplifies buildInfo to just coremlc-version.
For conv-based kernels, adds runtime fp16 I/O fallback — M1/M2 ANE
doesn't support the cast op (fp32<->fp16), so on first compile failure
it retries with native fp16 inputs/outputs and does the conversion on
the CPU side. The fallback is persisted across exec() restarts.
Note: matmul and scaled_dot_product_attention ops still fail on M1/M2 —
these are M4+ ANE ops. The attention tests (test_ane_causal_attn,
test_ane_sdpa5, test_full_fused attention part) require M4 hardware.
Conv-based kernels (training, QKV projections, FFN) work on all generations.
Tested on M1 Pro, macOS 26.3 (Tahoe).
This commit is contained in:
parent
893f58e725
commit
709b60208f
|
|
@ -0,0 +1,7 @@
|
|||
*.o
|
||||
ane_probe
|
||||
api_explore
|
||||
inmem_basic
|
||||
tiny_train
|
||||
tiny_train_m1
|
||||
train_large
|
||||
|
|
@ -5,6 +5,9 @@
|
|||
#include <string.h>
|
||||
#include <math.h>
|
||||
|
||||
// Set by caller: 1 = fp16 I/O (M1/M2 fallback, no cast ops), 0 = fp32 I/O with cast (M4+)
|
||||
extern int g_fp16_io;
|
||||
|
||||
// Build an FP16 weight blob with the required header structure.
|
||||
// weights_f32: source weights in row-major [out_ch, in_ch]
|
||||
// Returns NSData with header + FP16 weights
|
||||
|
|
@ -30,21 +33,32 @@ static NSData *mil_build_weight_blob(const float *weights_f32, int out_ch, int i
|
|||
// Input W: [1, out_ch, in_ch] fp32 (fp16 when g_fp16_io is set)
|
||||
// Output: [1, out_ch, spatial] fp32 (fp16 when g_fp16_io is set)
|
||||
static NSString *mil_gen_matmul(int in_ch, int out_ch, int spatial) {
|
||||
if (g_fp16_io) {
|
||||
return [NSString stringWithFormat:
|
||||
@"program(1.0)\n"
|
||||
"[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n"
|
||||
"{\n"
|
||||
" func main<ios16>(tensor<fp16, [1, %d, %d]> x, tensor<fp16, [1, %d, %d]> W) {\n"
|
||||
" tensor<bool, []> tx = const()[name = tensor<string, []>(\"tx\"), val = tensor<bool, []>(false)];\n"
|
||||
" tensor<bool, []> ty = const()[name = tensor<string, []>(\"ty\"), val = tensor<bool, []>(false)];\n"
|
||||
" tensor<fp16, [1, %d, %d]> y = matmul(transpose_x = tx, transpose_y = ty, x = W, y = x)[name = tensor<string, []>(\"mm\")];\n"
|
||||
" } -> (y);\n"
|
||||
"}\n",
|
||||
in_ch, spatial, out_ch, in_ch, out_ch, spatial];
|
||||
}
|
||||
return [NSString stringWithFormat:
|
||||
@"program(1.3)\n"
|
||||
"[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
|
||||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
|
||||
"{\"coremltools-version\", \"9.0\"}})]\n"
|
||||
@"program(1.0)\n"
|
||||
"[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n"
|
||||
"{\n"
|
||||
" func main<ios18>(tensor<fp32, [1, %d, %d]> x, tensor<fp32, [1, %d, %d]> W) {\n"
|
||||
" string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n"
|
||||
" tensor<fp16, [1, %d, %d]> x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_x\")];\n"
|
||||
" tensor<fp16, [1, %d, %d]> W16 = cast(dtype = to_fp16, x = W)[name = string(\"cast_W\")];\n"
|
||||
" bool tx = const()[name = string(\"tx\"), val = bool(false)];\n"
|
||||
" bool ty = const()[name = string(\"ty\"), val = bool(false)];\n"
|
||||
" tensor<fp16, [1, %d, %d]> y16 = matmul(transpose_x = tx, transpose_y = ty, x = W16, y = x16)[name = string(\"mm\")];\n"
|
||||
" string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"
|
||||
" tensor<fp32, [1, %d, %d]> y = cast(dtype = to_fp32, x = y16)[name = string(\"cast_out\")];\n"
|
||||
" func main<ios16>(tensor<fp32, [1, %d, %d]> x, tensor<fp32, [1, %d, %d]> W) {\n"
|
||||
" tensor<string, []> to_fp16 = const()[name = tensor<string, []>(\"to_fp16\"), val = tensor<string, []>(\"fp16\")];\n"
|
||||
" tensor<fp16, [1, %d, %d]> x16 = cast(dtype = to_fp16, x = x)[name = tensor<string, []>(\"cast_x\")];\n"
|
||||
" tensor<fp16, [1, %d, %d]> W16 = cast(dtype = to_fp16, x = W)[name = tensor<string, []>(\"cast_W\")];\n"
|
||||
" tensor<bool, []> tx = const()[name = tensor<string, []>(\"tx\"), val = tensor<bool, []>(false)];\n"
|
||||
" tensor<bool, []> ty = const()[name = tensor<string, []>(\"ty\"), val = tensor<bool, []>(false)];\n"
|
||||
" tensor<fp16, [1, %d, %d]> y16 = matmul(transpose_x = tx, transpose_y = ty, x = W16, y = x16)[name = tensor<string, []>(\"mm\")];\n"
|
||||
" tensor<string, []> to_fp32 = const()[name = tensor<string, []>(\"to_fp32\"), val = tensor<string, []>(\"fp32\")];\n"
|
||||
" tensor<fp32, [1, %d, %d]> y = cast(dtype = to_fp32, x = y16)[name = tensor<string, []>(\"cast_out\")];\n"
|
||||
" } -> (y);\n"
|
||||
"}\n",
|
||||
in_ch, spatial, out_ch, in_ch,
|
||||
|
|
@ -54,26 +68,45 @@ static NSString *mil_gen_matmul(int in_ch, int out_ch, int spatial) {
|
|||
|
||||
// Keep the baked-weight version for reference (used in inference-only scenarios)
|
||||
static NSString *mil_gen_conv(int in_ch, int out_ch, int spatial) {
|
||||
if (g_fp16_io) {
|
||||
return [NSString stringWithFormat:
|
||||
@"program(1.0)\n"
|
||||
"[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n"
|
||||
"{\n"
|
||||
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
|
||||
" tensor<string, []> c_pad_type = const()[name = tensor<string, []>(\"c_pad_type\"), val = tensor<string, []>(\"valid\")];\n"
|
||||
" tensor<int32, [2]> c_strides = const()[name = tensor<string, []>(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> c_pad = const()[name = tensor<string, []>(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> c_dilations = const()[name = tensor<string, []>(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, []> c_groups = const()[name = tensor<string, []>(\"c_groups\"), val = tensor<int32, []>(1)];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> y = conv(dilations = c_dilations, groups = c_groups, "
|
||||
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W, x = x)[name = tensor<string, []>(\"conv\")];\n"
|
||||
" } -> (y);\n"
|
||||
"}\n",
|
||||
in_ch, spatial,
|
||||
out_ch, in_ch, out_ch, in_ch,
|
||||
out_ch, spatial];
|
||||
}
|
||||
return [NSString stringWithFormat:
|
||||
@"program(1.3)\n"
|
||||
"[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
|
||||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
|
||||
"{\"coremltools-version\", \"9.0\"}})]\n"
|
||||
@"program(1.0)\n"
|
||||
"[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n"
|
||||
"{\n"
|
||||
" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
|
||||
" string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n"
|
||||
" tensor<int32, [2]> c_strides = const()[name = string(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> c_pad = const()[name = string(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> c_dilations = const()[name = string(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n"
|
||||
" string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
|
||||
" func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
|
||||
" tensor<string, []> c_pad_type = const()[name = tensor<string, []>(\"c_pad_type\"), val = tensor<string, []>(\"valid\")];\n"
|
||||
" tensor<int32, [2]> c_strides = const()[name = tensor<string, []>(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> c_pad = const()[name = tensor<string, []>(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> c_dilations = const()[name = tensor<string, []>(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, []> c_groups = const()[name = tensor<string, []>(\"c_groups\"), val = tensor<int32, []>(1)];\n"
|
||||
" tensor<string, []> to_fp16 = const()[name = tensor<string, []>(\"to_fp16\"), val = tensor<string, []>(\"fp16\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = tensor<string, []>(\"cast_in\")];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> y16 = conv(dilations = c_dilations, groups = c_groups, "
|
||||
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W, x = x16)[name = string(\"conv\")];\n"
|
||||
" string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"
|
||||
" tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = to_fp32, x = y16)[name = string(\"cast_out\")];\n"
|
||||
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W, x = x16)[name = tensor<string, []>(\"conv\")];\n"
|
||||
" tensor<string, []> to_fp32 = const()[name = tensor<string, []>(\"to_fp32\"), val = tensor<string, []>(\"fp32\")];\n"
|
||||
" tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = to_fp32, x = y16)[name = tensor<string, []>(\"cast_out\")];\n"
|
||||
" } -> (y);\n"
|
||||
"}\n",
|
||||
in_ch, spatial, in_ch, spatial,
|
||||
|
|
@ -88,36 +121,65 @@ static NSString *mil_gen_conv(int in_ch, int out_ch, int spatial) {
|
|||
// where cs = 64 + dim*dim*2
|
||||
static NSString *mil_gen_qkv(int dim, int spatial) {
|
||||
NSUInteger cs = 64 + (NSUInteger)dim * dim * 2;
|
||||
if (g_fp16_io) {
|
||||
return [NSString stringWithFormat:
|
||||
@"program(1.0)\n"
|
||||
"[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n"
|
||||
"{\n"
|
||||
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
|
||||
" tensor<string, []> c_pad_type = const()[name = tensor<string, []>(\"c_pad_type\"), val = tensor<string, []>(\"valid\")];\n"
|
||||
" tensor<int32, [2]> c_strides = const()[name = tensor<string, []>(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> c_pad = const()[name = tensor<string, []>(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> c_dilations = const()[name = tensor<string, []>(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, []> c_groups = const()[name = tensor<string, []>(\"c_groups\"), val = tensor<int32, []>(1)];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = tensor<string, []>(\"Wq\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = tensor<string, []>(\"Wk\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(%lu)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = tensor<string, []>(\"Wv\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(%lu)))];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> q = conv(dilations = c_dilations, groups = c_groups, "
|
||||
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wq, x = x)[name = tensor<string, []>(\"conv_q\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> k = conv(dilations = c_dilations, groups = c_groups, "
|
||||
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wk, x = x)[name = tensor<string, []>(\"conv_k\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> v = conv(dilations = c_dilations, groups = c_groups, "
|
||||
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wv, x = x)[name = tensor<string, []>(\"conv_v\")];\n"
|
||||
" } -> (q, k, v);\n"
|
||||
"}\n",
|
||||
dim, spatial,
|
||||
dim, dim, dim, dim,
|
||||
dim, dim, dim, dim, (unsigned long)(64 + cs),
|
||||
dim, dim, dim, dim, (unsigned long)(64 + 2*cs),
|
||||
dim, spatial, dim, spatial, dim, spatial];
|
||||
}
|
||||
return [NSString stringWithFormat:
|
||||
@"program(1.3)\n"
|
||||
"[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
|
||||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
|
||||
"{\"coremltools-version\", \"9.0\"}})]\n"
|
||||
@"program(1.0)\n"
|
||||
"[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n"
|
||||
"{\n"
|
||||
" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
|
||||
" string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n"
|
||||
" tensor<int32, [2]> c_strides = const()[name = string(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> c_pad = const()[name = string(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> c_dilations = const()[name = string(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n"
|
||||
" string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = string(\"Wq\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = string(\"Wk\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = string(\"Wv\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n"
|
||||
" func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
|
||||
" tensor<string, []> c_pad_type = const()[name = tensor<string, []>(\"c_pad_type\"), val = tensor<string, []>(\"valid\")];\n"
|
||||
" tensor<int32, [2]> c_strides = const()[name = tensor<string, []>(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> c_pad = const()[name = tensor<string, []>(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> c_dilations = const()[name = tensor<string, []>(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, []> c_groups = const()[name = tensor<string, []>(\"c_groups\"), val = tensor<int32, []>(1)];\n"
|
||||
" tensor<string, []> to_fp16 = const()[name = tensor<string, []>(\"to_fp16\"), val = tensor<string, []>(\"fp16\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = tensor<string, []>(\"cast_in\")];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = tensor<string, []>(\"Wq\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = tensor<string, []>(\"Wk\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(%lu)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = tensor<string, []>(\"Wv\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(%lu)))];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> q16 = conv(dilations = c_dilations, groups = c_groups, "
|
||||
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wq, x = x16)[name = string(\"conv_q\")];\n"
|
||||
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wq, x = x16)[name = tensor<string, []>(\"conv_q\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> k16 = conv(dilations = c_dilations, groups = c_groups, "
|
||||
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wk, x = x16)[name = string(\"conv_k\")];\n"
|
||||
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wk, x = x16)[name = tensor<string, []>(\"conv_k\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> v16 = conv(dilations = c_dilations, groups = c_groups, "
|
||||
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wv, x = x16)[name = string(\"conv_v\")];\n"
|
||||
" string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"
|
||||
" tensor<fp32, [1, %d, 1, %d]> q = cast(dtype = to_fp32, x = q16)[name = string(\"cast_q\")];\n"
|
||||
" tensor<fp32, [1, %d, 1, %d]> k = cast(dtype = to_fp32, x = k16)[name = string(\"cast_k\")];\n"
|
||||
" tensor<fp32, [1, %d, 1, %d]> v = cast(dtype = to_fp32, x = v16)[name = string(\"cast_v\")];\n"
|
||||
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wv, x = x16)[name = tensor<string, []>(\"conv_v\")];\n"
|
||||
" tensor<string, []> to_fp32 = const()[name = tensor<string, []>(\"to_fp32\"), val = tensor<string, []>(\"fp32\")];\n"
|
||||
" tensor<fp32, [1, %d, 1, %d]> q = cast(dtype = to_fp32, x = q16)[name = tensor<string, []>(\"cast_q\")];\n"
|
||||
" tensor<fp32, [1, %d, 1, %d]> k = cast(dtype = to_fp32, x = k16)[name = tensor<string, []>(\"cast_k\")];\n"
|
||||
" tensor<fp32, [1, %d, 1, %d]> v = cast(dtype = to_fp32, x = v16)[name = tensor<string, []>(\"cast_v\")];\n"
|
||||
" } -> (q, k, v);\n"
|
||||
"}\n",
|
||||
dim, spatial, dim, spatial,
|
||||
|
|
@ -173,31 +235,55 @@ static NSData *mil_build_ffn_up_weight_blob(const float *w1, const float *w3, in
|
|||
// Generate MIL for fused FFN up: w1 + w3 parallel convs
|
||||
static NSString *mil_gen_ffn_up(int dim, int hidden_dim, int spatial) {
|
||||
NSUInteger cs = 64 + (NSUInteger)hidden_dim * dim * 2;
|
||||
if (g_fp16_io) {
|
||||
return [NSString stringWithFormat:
|
||||
@"program(1.0)\n"
|
||||
"[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n"
|
||||
"{\n"
|
||||
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
|
||||
" tensor<string, []> c_pad_type = const()[name = tensor<string, []>(\"c_pad_type\"), val = tensor<string, []>(\"valid\")];\n"
|
||||
" tensor<int32, [2]> c_strides = const()[name = tensor<string, []>(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> c_pad = const()[name = tensor<string, []>(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> c_dilations = const()[name = tensor<string, []>(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, []> c_groups = const()[name = tensor<string, []>(\"c_groups\"), val = tensor<int32, []>(1)];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W1 = const()[name = tensor<string, []>(\"W1\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W3 = const()[name = tensor<string, []>(\"W3\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(%lu)))];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> out1 = conv(dilations = c_dilations, groups = c_groups, "
|
||||
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W1, x = x)[name = tensor<string, []>(\"conv_w1\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> out3 = conv(dilations = c_dilations, groups = c_groups, "
|
||||
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W3, x = x)[name = tensor<string, []>(\"conv_w3\")];\n"
|
||||
" } -> (out1, out3);\n"
|
||||
"}\n",
|
||||
dim, spatial,
|
||||
hidden_dim, dim, hidden_dim, dim,
|
||||
hidden_dim, dim, hidden_dim, dim, (unsigned long)(64 + cs),
|
||||
hidden_dim, spatial, hidden_dim, spatial];
|
||||
}
|
||||
return [NSString stringWithFormat:
|
||||
@"program(1.3)\n"
|
||||
"[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
|
||||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
|
||||
"{\"coremltools-version\", \"9.0\"}})]\n"
|
||||
@"program(1.0)\n"
|
||||
"[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n"
|
||||
"{\n"
|
||||
" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
|
||||
" string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n"
|
||||
" tensor<int32, [2]> c_strides = const()[name = string(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> c_pad = const()[name = string(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> c_dilations = const()[name = string(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n"
|
||||
" string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W1 = const()[name = string(\"W1\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W3 = const()[name = string(\"W3\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n"
|
||||
" func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
|
||||
" tensor<string, []> c_pad_type = const()[name = tensor<string, []>(\"c_pad_type\"), val = tensor<string, []>(\"valid\")];\n"
|
||||
" tensor<int32, [2]> c_strides = const()[name = tensor<string, []>(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> c_pad = const()[name = tensor<string, []>(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> c_dilations = const()[name = tensor<string, []>(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, []> c_groups = const()[name = tensor<string, []>(\"c_groups\"), val = tensor<int32, []>(1)];\n"
|
||||
" tensor<string, []> to_fp16 = const()[name = tensor<string, []>(\"to_fp16\"), val = tensor<string, []>(\"fp16\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = tensor<string, []>(\"cast_in\")];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W1 = const()[name = tensor<string, []>(\"W1\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W3 = const()[name = tensor<string, []>(\"W3\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(%lu)))];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> h1 = conv(dilations = c_dilations, groups = c_groups, "
|
||||
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W1, x = x16)[name = string(\"conv_w1\")];\n"
|
||||
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W1, x = x16)[name = tensor<string, []>(\"conv_w1\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> h3 = conv(dilations = c_dilations, groups = c_groups, "
|
||||
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W3, x = x16)[name = string(\"conv_w3\")];\n"
|
||||
" string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"
|
||||
" tensor<fp32, [1, %d, 1, %d]> out1 = cast(dtype = to_fp32, x = h1)[name = string(\"cast_h1\")];\n"
|
||||
" tensor<fp32, [1, %d, 1, %d]> out3 = cast(dtype = to_fp32, x = h3)[name = string(\"cast_h3\")];\n"
|
||||
"pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W3, x = x16)[name = tensor<string, []>(\"conv_w3\")];\n"
|
||||
" tensor<string, []> to_fp32 = const()[name = tensor<string, []>(\"to_fp32\"), val = tensor<string, []>(\"fp32\")];\n"
|
||||
" tensor<fp32, [1, %d, 1, %d]> out1 = cast(dtype = to_fp32, x = h1)[name = tensor<string, []>(\"cast_h1\")];\n"
|
||||
" tensor<fp32, [1, %d, 1, %d]> out3 = cast(dtype = to_fp32, x = h3)[name = tensor<string, []>(\"cast_h3\")];\n"
|
||||
" } -> (out1, out3);\n"
|
||||
"}\n",
|
||||
dim, spatial, dim, spatial,
|
||||
|
|
|
|||
|
|
@ -4,15 +4,13 @@
|
|||
#include "stories_io.h"
|
||||
|
||||
#define MIL_HDR \
|
||||
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, " \
|
||||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " \
|
||||
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
|
||||
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
|
||||
#define CONV_CONST \
|
||||
" string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" \
|
||||
" tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n" \
|
||||
" tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n" \
|
||||
" tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n" \
|
||||
" int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
|
||||
" tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n" \
|
||||
" tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n" \
|
||||
" tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n" \
|
||||
" tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n" \
|
||||
" tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"
|
||||
|
||||
// SDPA forward + taps: x_in → rmsnorm → QKV+SDPA+Wo → concat(o_out, Q, K, V, attn_out, xnorm)
|
||||
static NSString *gen_sdpa_fwd_taps(void) {
|
||||
|
|
@ -20,53 +18,53 @@ static NSString *gen_sdpa_fwd_taps(void) {
|
|||
float invd = 1.0f/(float)DIM;
|
||||
NSMutableString *m = [NSMutableString string];
|
||||
[m appendString:MIL_HDR];
|
||||
[m appendFormat:@" func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM, SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ];
|
||||
[m appendFormat:@" tensor<int32, [1]> rax = const()[name=string(\"rax\"), val=tensor<int32, [1]>([1])];\n"];
|
||||
[m appendFormat:@" bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ];
|
||||
[m appendFormat:@" fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", invd];
|
||||
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ];
|
||||
[m appendFormat:@" fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ];
|
||||
[m appendFormat:@" fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> xr = mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,1]> rw = const()[name=string(\"rw\"), val=tensor<fp16, [1,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/rms1.bin\"), offset=uint64(64)))];\n", DIM, DIM];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> xn = mul(x=xr,y=rw)[name=string(\"xn\")];\n", DIM, SEQ];
|
||||
[m appendFormat:@" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM, SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> sq = mul(x=x,y=x)[name=tensor<string, []>(\"sq\")];\n", DIM, SEQ];
|
||||
[m appendFormat:@" tensor<int32, [1]> rax = const()[name=tensor<string, []>(\"rax\"), val=tensor<int32, [1]>([1])];\n"];
|
||||
[m appendFormat:@" tensor<bool, []> kd = const()[name=tensor<string, []>(\"kd\"), val=tensor<bool, []>(true)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=tensor<string, []>(\"ss\")];\n", SEQ];
|
||||
[m appendFormat:@" tensor<fp16, []> invd = const()[name=tensor<string, []>(\"invd\"), val=tensor<fp16, []>(%f)];\n", invd];
|
||||
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> ss2 = mul(x=ss,y=invd)[name=tensor<string, []>(\"ss2\")];\n", SEQ];
|
||||
[m appendFormat:@" tensor<fp16, []> eps = const()[name=tensor<string, []>(\"eps\"), val=tensor<fp16, []>(0.00001)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> ss3 = add(x=ss2,y=eps)[name=tensor<string, []>(\"ss3\")];\n", SEQ];
|
||||
[m appendFormat:@" tensor<fp16, []> nhalf = const()[name=tensor<string, []>(\"nhalf\"), val=tensor<fp16, []>(-0.5)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> rrms = pow(x=ss3,y=nhalf)[name=tensor<string, []>(\"rrms\")];\n", SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> xr = mul(x=x,y=rrms)[name=tensor<string, []>(\"xr\")];\n", DIM, SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,1]> rw = const()[name=tensor<string, []>(\"rw\"), val=tensor<fp16, [1,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/rms1.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM, DIM];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> xn = mul(x=xr,y=rw)[name=tensor<string, []>(\"xn\")];\n", DIM, SEQ];
|
||||
[m appendString:@CONV_CONST];
|
||||
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wq = const()[name=string(\"Wq\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wq.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
|
||||
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wk = const()[name=string(\"Wk\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wk.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
|
||||
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wv = const()[name=string(\"Wv\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wv.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
|
||||
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wo = const()[name=string(\"Wo\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wo.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> qf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wq,x=xn)[name=string(\"cq\")];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> kf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wk,x=xn)[name=string(\"ck\")];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> vf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wv,x=xn)[name=string(\"cv\")];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> qsh = const()[name=string(\"qsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
|
||||
[m appendString:@" tensor<int32, [4]> pm = const()[name=string(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> q4 = reshape(shape=qsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=q4)[name=string(\"tq\")];\n", HEADS,SEQ,HD];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> k4 = reshape(shape=qsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=k4)[name=string(\"tk\")];\n", HEADS,SEQ,HD];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> v4 = reshape(shape=qsh,x=vf)[name=string(\"rv\")];\n", HEADS,HD,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> v = transpose(perm=pm,x=v4)[name=string(\"tv\")];\n", HEADS,SEQ,HD];
|
||||
[m appendString:@" bool tx = const()[name=string(\"tx\"), val=bool(false)];\n"];
|
||||
[m appendString:@" bool ty = const()[name=string(\"ty\"), val=bool(true)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> sc1 = matmul(transpose_x=tx,transpose_y=ty,x=q,y=k)[name=string(\"mm1\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendFormat:@" fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> sc2 = mul(x=sc1,y=scv)[name=string(\"scl\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,1,%d,%d]> cm = const()[name=string(\"cm\"), val=tensor<fp16, [1,1,%d,%d]>(BLOBFILE(path=string(\"@model_path/weights/mask.bin\"), offset=uint64(64)))];\n", SEQ,SEQ,SEQ,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> ms = add(x=sc2,y=cm)[name=string(\"msk\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendString:@" int32 sax = const()[name=string(\"sax\"), val=int32(-1)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> aw = softmax(axis=sax,x=ms)[name=string(\"sm\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> a4 = matmul(transpose_x=tx,transpose_y=tx,x=aw,y=v)[name=string(\"mm2\")];\n", HEADS,SEQ,HD];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> at = transpose(perm=pm,x=a4)[name=string(\"ta\")];\n", HEADS,HD,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> os = const()[name=string(\"os\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> af = reshape(shape=os,x=at)[name=string(\"ra\")];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> oo = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wo,x=af)[name=string(\"co\")];\n", DIM,SEQ];
|
||||
[m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
|
||||
[m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(oo,qf,kf,vf,af,xn))[name=string(\"cat\")];\n", 6*DIM,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wq = const()[name=tensor<string, []>(\"Wq\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/wq.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,DIM,DIM,DIM];
|
||||
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wk = const()[name=tensor<string, []>(\"Wk\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/wk.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,DIM,DIM,DIM];
|
||||
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wv = const()[name=tensor<string, []>(\"Wv\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/wv.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,DIM,DIM,DIM];
|
||||
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wo = const()[name=tensor<string, []>(\"Wo\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/wo.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,DIM,DIM,DIM];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> qf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wq,x=xn)[name=tensor<string, []>(\"cq\")];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> kf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wk,x=xn)[name=tensor<string, []>(\"ck\")];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> vf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wv,x=xn)[name=tensor<string, []>(\"cv\")];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> qsh = const()[name=tensor<string, []>(\"qsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
|
||||
[m appendString:@" tensor<int32, [4]> pm = const()[name=tensor<string, []>(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> q4 = reshape(shape=qsh,x=qf)[name=tensor<string, []>(\"rq\")];\n", HEADS,HD,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=q4)[name=tensor<string, []>(\"tq\")];\n", HEADS,SEQ,HD];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> k4 = reshape(shape=qsh,x=kf)[name=tensor<string, []>(\"rk\")];\n", HEADS,HD,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=k4)[name=tensor<string, []>(\"tk\")];\n", HEADS,SEQ,HD];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> v4 = reshape(shape=qsh,x=vf)[name=tensor<string, []>(\"rv\")];\n", HEADS,HD,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> v = transpose(perm=pm,x=v4)[name=tensor<string, []>(\"tv\")];\n", HEADS,SEQ,HD];
|
||||
[m appendString:@" tensor<bool, []> tx = const()[name=tensor<string, []>(\"tx\"), val=tensor<bool, []>(false)];\n"];
|
||||
[m appendString:@" tensor<bool, []> ty = const()[name=tensor<string, []>(\"ty\"), val=tensor<bool, []>(true)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> sc1 = matmul(transpose_x=tx,transpose_y=ty,x=q,y=k)[name=tensor<string, []>(\"mm1\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, []> scv = const()[name=tensor<string, []>(\"scv\"), val=tensor<fp16, []>(%f)];\n", sc];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> sc2 = mul(x=sc1,y=scv)[name=tensor<string, []>(\"scl\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,1,%d,%d]> cm = const()[name=tensor<string, []>(\"cm\"), val=tensor<fp16, [1,1,%d,%d]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/mask.bin\"), offset=tensor<uint64, []>(64)))];\n", SEQ,SEQ,SEQ,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> ms = add(x=sc2,y=cm)[name=tensor<string, []>(\"msk\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendString:@" tensor<int32, []> sax = const()[name=tensor<string, []>(\"sax\"), val=tensor<int32, []>(-1)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> aw = softmax(axis=sax,x=ms)[name=tensor<string, []>(\"sm\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> a4 = matmul(transpose_x=tx,transpose_y=tx,x=aw,y=v)[name=tensor<string, []>(\"mm2\")];\n", HEADS,SEQ,HD];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> at = transpose(perm=pm,x=a4)[name=tensor<string, []>(\"ta\")];\n", HEADS,HD,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> os = const()[name=tensor<string, []>(\"os\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> af = reshape(shape=os,x=at)[name=tensor<string, []>(\"ra\")];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> oo = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wo,x=af)[name=tensor<string, []>(\"co\")];\n", DIM,SEQ];
|
||||
[m appendString:@" tensor<int32, []> cax = const()[name=tensor<string, []>(\"cax\"), val=tensor<int32, []>(1)];\n"];
|
||||
[m appendString:@" tensor<bool, []> cid = const()[name=tensor<string, []>(\"cid\"), val=tensor<bool, []>(false)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(oo,qf,kf,vf,af,xn))[name=tensor<string, []>(\"cat\")];\n", 6*DIM,SEQ];
|
||||
[m appendString:@" } -> (out);\n}\n"];
|
||||
return m;
|
||||
}
|
||||
|
|
@ -76,33 +74,33 @@ static NSString *gen_ffn_fwd_taps(void) {
|
|||
float invd = 1.0f/(float)DIM;
|
||||
NSMutableString *m = [NSMutableString string];
|
||||
[m appendString:MIL_HDR];
|
||||
[m appendFormat:@" func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM, SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ];
|
||||
[m appendFormat:@" tensor<int32, [1]> rax = const()[name=string(\"rax\"), val=tensor<int32, [1]>([1])];\n"];
|
||||
[m appendFormat:@" bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ];
|
||||
[m appendFormat:@" fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", invd];
|
||||
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ];
|
||||
[m appendFormat:@" fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ];
|
||||
[m appendFormat:@" fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> xr = mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,1]> rw = const()[name=string(\"rw\"), val=tensor<fp16, [1,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/rms2.bin\"), offset=uint64(64)))];\n", DIM, DIM];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> xn = mul(x=xr,y=rw)[name=string(\"xn\")];\n", DIM, SEQ];
|
||||
[m appendFormat:@" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM, SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> sq = mul(x=x,y=x)[name=tensor<string, []>(\"sq\")];\n", DIM, SEQ];
|
||||
[m appendFormat:@" tensor<int32, [1]> rax = const()[name=tensor<string, []>(\"rax\"), val=tensor<int32, [1]>([1])];\n"];
|
||||
[m appendFormat:@" tensor<bool, []> kd = const()[name=tensor<string, []>(\"kd\"), val=tensor<bool, []>(true)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=tensor<string, []>(\"ss\")];\n", SEQ];
|
||||
[m appendFormat:@" tensor<fp16, []> invd = const()[name=tensor<string, []>(\"invd\"), val=tensor<fp16, []>(%f)];\n", invd];
|
||||
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> ss2 = mul(x=ss,y=invd)[name=tensor<string, []>(\"ss2\")];\n", SEQ];
|
||||
[m appendFormat:@" tensor<fp16, []> eps = const()[name=tensor<string, []>(\"eps\"), val=tensor<fp16, []>(0.00001)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> ss3 = add(x=ss2,y=eps)[name=tensor<string, []>(\"ss3\")];\n", SEQ];
|
||||
[m appendFormat:@" tensor<fp16, []> nhalf = const()[name=tensor<string, []>(\"nhalf\"), val=tensor<fp16, []>(-0.5)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,1,1,%d]> rrms = pow(x=ss3,y=nhalf)[name=tensor<string, []>(\"rrms\")];\n", SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> xr = mul(x=x,y=rrms)[name=tensor<string, []>(\"xr\")];\n", DIM, SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,1]> rw = const()[name=tensor<string, []>(\"rw\"), val=tensor<fp16, [1,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/rms2.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM, DIM];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> xn = mul(x=xr,y=rw)[name=tensor<string, []>(\"xn\")];\n", DIM, SEQ];
|
||||
[m appendString:@CONV_CONST];
|
||||
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> W1 = const()[name=string(\"W1\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w1.bin\"), offset=uint64(64)))];\n", HIDDEN,DIM,HIDDEN,DIM];
|
||||
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> W3 = const()[name=string(\"W3\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w3.bin\"), offset=uint64(64)))];\n", HIDDEN,DIM,HIDDEN,DIM];
|
||||
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> W2 = const()[name=string(\"W2\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w2.bin\"), offset=uint64(64)))];\n", DIM,HIDDEN,DIM,HIDDEN];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> h1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1,x=xn)[name=string(\"c1\")];\n", HIDDEN,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> h3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3,x=xn)[name=string(\"c3\")];\n", HIDDEN,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> sig = sigmoid(x=h1)[name=string(\"sg\")];\n", HIDDEN,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> silu = mul(x=h1,y=sig)[name=string(\"si\")];\n", HIDDEN,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> gate = mul(x=silu,y=h3)[name=string(\"gt\")];\n", HIDDEN,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2,x=gate)[name=string(\"c2\")];\n", DIM,SEQ];
|
||||
[m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
|
||||
[m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(y,h1,h3,gate,xn))[name=string(\"cat\")];\n", 2*DIM+3*HIDDEN,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> W1 = const()[name=tensor<string, []>(\"W1\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/w1.bin\"), offset=tensor<uint64, []>(64)))];\n", HIDDEN,DIM,HIDDEN,DIM];
|
||||
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> W3 = const()[name=tensor<string, []>(\"W3\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/w3.bin\"), offset=tensor<uint64, []>(64)))];\n", HIDDEN,DIM,HIDDEN,DIM];
|
||||
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> W2 = const()[name=tensor<string, []>(\"W2\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/w2.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,HIDDEN,DIM,HIDDEN];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> h1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1,x=xn)[name=tensor<string, []>(\"c1\")];\n", HIDDEN,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> h3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3,x=xn)[name=tensor<string, []>(\"c3\")];\n", HIDDEN,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> sig = sigmoid(x=h1)[name=tensor<string, []>(\"sg\")];\n", HIDDEN,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> silu = mul(x=h1,y=sig)[name=tensor<string, []>(\"si\")];\n", HIDDEN,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> gate = mul(x=silu,y=h3)[name=tensor<string, []>(\"gt\")];\n", HIDDEN,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2,x=gate)[name=tensor<string, []>(\"c2\")];\n", DIM,SEQ];
|
||||
[m appendString:@" tensor<int32, []> cax = const()[name=tensor<string, []>(\"cax\"), val=tensor<int32, []>(1)];\n"];
|
||||
[m appendString:@" tensor<bool, []> cid = const()[name=tensor<string, []>(\"cid\"), val=tensor<bool, []>(false)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(y,h1,h3,gate,xn))[name=tensor<string, []>(\"cat\")];\n", 2*DIM+3*HIDDEN,SEQ];
|
||||
[m appendString:@" } -> (out);\n}\n"];
|
||||
return m;
|
||||
}
|
||||
|
|
@ -111,36 +109,36 @@ static NSString *gen_ffn_fwd_taps(void) {
|
|||
// Builds the MIL program text for the FFN backward kernel.
//
// Input  x  : fp16 [1, DIM + 2*HIDDEN, 1, SEQ], sliced along axis 1 into
//               dffn = channels [0, DIM)
//               h1   = channels [DIM, DIM + HIDDEN)
//               h3   = channels [DIM + HIDDEN, DIM + 2*HIDDEN)
// Output out: fp16 [1, DIM + 2*HIDDEN, 1, SEQ] = concat(dx, dh1, dh3) on axis 1.
//
// Weights are read from @model_path/weights/{w2t,w1t,w3t}.bin at offset 64.
// Every scalar and op name uses the verbose tensor<T, []> spelling and the
// function is declared for <ios16>; the shorthand string(...)/fp16(...) forms
// and the <ios18> header are intentionally not emitted.
//
// Returns the complete program as a string (header, main function, closing brace).
static NSString *gen_ffn_bwd(void) {
    NSMutableString *m = [NSMutableString string];
    [m appendString:MIL_HDR];
    [m appendFormat:@" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM+2*HIDDEN, SEQ];
    [m appendString:@CONV_CONST];

    // Split the packed input into dffn, h1 and h3 (h1 and h3 share the size constant s1).
    [m appendString:@" tensor<int32, [4]> bd = const()[name=tensor<string, []>(\"bd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
    [m appendFormat:@" tensor<int32, [4]> sd = const()[name=tensor<string, []>(\"sd\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dffn = slice_by_size(x=x,begin=bd,size=sd)[name=tensor<string, []>(\"s0\")];\n", DIM, SEQ];
    [m appendFormat:@" tensor<int32, [4]> b1 = const()[name=tensor<string, []>(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
    [m appendFormat:@" tensor<int32, [4]> s1 = const()[name=tensor<string, []>(\"s1\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", HIDDEN, SEQ];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> h1 = slice_by_size(x=x,begin=b1,size=s1)[name=tensor<string, []>(\"s1x\")];\n", HIDDEN, SEQ];
    [m appendFormat:@" tensor<int32, [4]> b3 = const()[name=tensor<string, []>(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM+HIDDEN];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> h3 = slice_by_size(x=x,begin=b3,size=s1)[name=tensor<string, []>(\"s3x\")];\n", HIDDEN, SEQ];

    // dsilu = conv(W2t, dffn): 1x1 conv with the w2t.bin weights maps DIM -> HIDDEN channels.
    [m appendFormat:@" tensor<fp16, [%d,%d,1,1]> W2t = const()[name=tensor<string, []>(\"W2t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/w2t.bin\"), offset=tensor<uint64, []>(64)))];\n", HIDDEN, DIM, HIDDEN, DIM];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dsilu = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2t,x=dffn)[name=tensor<string, []>(\"cw2\")];\n", HIDDEN, SEQ];

    // dsd = sig * (1 + h1 * (1 - sig)) with sig = sigmoid(h1).
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> sig = sigmoid(x=h1)[name=tensor<string, []>(\"sg\")];\n", HIDDEN, SEQ];
    [m appendString:@" tensor<fp16, []> one = const()[name=tensor<string, []>(\"one\"), val=tensor<fp16, []>(1.0)];\n"];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> oms = sub(x=one,y=sig)[name=tensor<string, []>(\"oms\")];\n", HIDDEN, SEQ];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> homs = mul(x=h1,y=oms)[name=tensor<string, []>(\"homs\")];\n", HIDDEN, SEQ];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> brk = add(x=one,y=homs)[name=tensor<string, []>(\"brk\")];\n", HIDDEN, SEQ];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dsd = mul(x=sig,y=brk)[name=tensor<string, []>(\"dsd\")];\n", HIDDEN, SEQ];

    // dh1 = dsilu * h3 * dsd;  dh3 = dsilu * (h1 * sig).
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> t1 = mul(x=dsilu,y=h3)[name=tensor<string, []>(\"t1\")];\n", HIDDEN, SEQ];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dh1 = mul(x=t1,y=dsd)[name=tensor<string, []>(\"dh1\")];\n", HIDDEN, SEQ];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> slh = mul(x=h1,y=sig)[name=tensor<string, []>(\"slh\")];\n", HIDDEN, SEQ];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dh3 = mul(x=dsilu,y=slh)[name=tensor<string, []>(\"dh3\")];\n", HIDDEN, SEQ];

    // dx = conv(W1t, dh1) + conv(W3t, dh3): both 1x1 convs map HIDDEN -> DIM channels.
    [m appendFormat:@" tensor<fp16, [%d,%d,1,1]> W1t = const()[name=tensor<string, []>(\"W1t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/w1t.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM, HIDDEN, DIM, HIDDEN];
    [m appendFormat:@" tensor<fp16, [%d,%d,1,1]> W3t = const()[name=tensor<string, []>(\"W3t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/w3t.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM, HIDDEN, DIM, HIDDEN];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dx1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1t,x=dh1)[name=tensor<string, []>(\"cw1\")];\n", DIM, SEQ];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dx3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3t,x=dh3)[name=tensor<string, []>(\"cw3\")];\n", DIM, SEQ];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dx = add(x=dx1,y=dx3)[name=tensor<string, []>(\"adx\")];\n", DIM, SEQ];

    // Pack the three results into a single output along the channel axis (axis 1).
    [m appendString:@" tensor<int32, []> cax = const()[name=tensor<string, []>(\"cax\"), val=tensor<int32, []>(1)];\n"];
    [m appendString:@" tensor<bool, []> cid = const()[name=tensor<string, []>(\"cid\"), val=tensor<bool, []>(false)];\n"];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dx,dh1,dh3))[name=tensor<string, []>(\"cat\")];\n", DIM+2*HIDDEN, SEQ];
    [m appendString:@" } -> (out);\n}\n"];
    return m;
}
|
||||
|
|
@ -149,23 +147,23 @@ static NSString *gen_ffn_bwd(void) {
|
|||
/// Builds the MIL program text for the QKV backward kernel.
///
/// The generated `main` takes a single fp16 tensor of shape
/// [1, 3*DIM, 1, SEQ], slices it along axis 1 into three
/// [1, DIM, 1, SEQ] pieces (dq, dk, dv), applies a conv with
/// [DIM, DIM, 1, 1] weights to each piece (weights read from wqt.bin,
/// wkt.bin and wvt.bin at byte offset 64), and returns the element-wise sum
/// of the three conv outputs as `out`.
///
/// Only the canonical verbose scalar syntax (`tensor<string, []>("x")`,
/// `tensor<uint64, []>(64)`) and the `<ios16>` target are emitted. The older
/// shorthand (`string("x")`, `uint64(64)`, `<ios18>`) must not be mixed in:
/// emitting both would declare `main` and every value twice.
///
/// @return A newly built string containing the complete MIL program.
static NSString *gen_qkvb(void) {
    NSMutableString *m = [NSMutableString string];
    [m appendString:MIL_HDR];
    [m appendFormat:@" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n", 3*DIM, SEQ];
    // NOTE(review): CONV_CONST is defined elsewhere in the file; it is expected
    // to declare the dl/gr/pd/pt/st values the conv ops below refer to.
    [m appendString:@CONV_CONST];

    // Split the packed input into its three DIM-channel slices.
    [m appendFormat:@" tensor<int32, [4]> sz = const()[name=tensor<string, []>(\"sz\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
    [m appendString:@" tensor<int32, [4]> b0 = const()[name=tensor<string, []>(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dq = slice_by_size(x=x,begin=b0,size=sz)[name=tensor<string, []>(\"s0\")];\n", DIM,SEQ];
    [m appendFormat:@" tensor<int32, [4]> b1 = const()[name=tensor<string, []>(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dk = slice_by_size(x=x,begin=b1,size=sz)[name=tensor<string, []>(\"s1\")];\n", DIM,SEQ];
    [m appendFormat:@" tensor<int32, [4]> b2 = const()[name=tensor<string, []>(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*DIM];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dv = slice_by_size(x=x,begin=b2,size=sz)[name=tensor<string, []>(\"s2\")];\n", DIM,SEQ];

    // Weight constants, one blob file per projection.
    [m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wqt = const()[name=tensor<string, []>(\"Wqt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/wqt.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,DIM,DIM,DIM];
    [m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wkt = const()[name=tensor<string, []>(\"Wkt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/wkt.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,DIM,DIM,DIM];
    [m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wvt = const()[name=tensor<string, []>(\"Wvt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/wvt.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,DIM,DIM,DIM];

    // Apply each weight to its slice, then sum the three results.
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dxq = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wqt,x=dq)[name=tensor<string, []>(\"cq\")];\n", DIM,SEQ];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dxk = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wkt,x=dk)[name=tensor<string, []>(\"ck\")];\n", DIM,SEQ];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dxv = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wvt,x=dv)[name=tensor<string, []>(\"cv\")];\n", DIM,SEQ];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dxqk = add(x=dxq,y=dxk)[name=tensor<string, []>(\"aqk\")];\n", DIM,SEQ];
    [m appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = add(x=dxqk,y=dxv)[name=tensor<string, []>(\"out\")];\n", DIM,SEQ];

    [m appendString:@" } -> (out);\n}\n"];
    return m;
}
|
||||
|
|
@ -175,49 +173,49 @@ static NSString *gen_sdpa_bwd1(void) {
|
|||
float sc = 1.0f/sqrtf((float)HD);
|
||||
NSMutableString *m = [NSMutableString string];
|
||||
[m appendString:MIL_HDR];
|
||||
[m appendFormat:@" func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", 4*DIM, SEQ];
|
||||
[m appendFormat:@" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n", 4*DIM, SEQ];
|
||||
[m appendString:@CONV_CONST];
|
||||
[m appendFormat:@" tensor<int32, [4]> sz = const()[name=string(\"sz\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
|
||||
[m appendString:@" tensor<int32, [4]> b0 = const()[name=string(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> qf = slice_by_size(x=x,begin=b0,size=sz)[name=string(\"s0\")];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> kf = slice_by_size(x=x,begin=b1,size=sz)[name=string(\"s1\")];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> b2 = const()[name=string(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*DIM];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> vf = slice_by_size(x=x,begin=b2,size=sz)[name=string(\"s2\")];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> b3 = const()[name=string(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 3*DIM];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dx2f = slice_by_size(x=x,begin=b3,size=sz)[name=string(\"s3\")];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wot = const()[name=string(\"Wot\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wot.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> df = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wot,x=dx2f)[name=string(\"cwo\")];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> rsh = const()[name=string(\"rsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
|
||||
[m appendString:@" tensor<int32, [4]> pm = const()[name=string(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> qr = reshape(shape=rsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=qr)[name=string(\"tq\")];\n", HEADS,SEQ,HD];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> kr = reshape(shape=rsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=kr)[name=string(\"tk\")];\n", HEADS,SEQ,HD];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> vr = reshape(shape=rsh,x=vf)[name=string(\"rv\")];\n", HEADS,HD,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> v = transpose(perm=pm,x=vr)[name=string(\"tv\")];\n", HEADS,SEQ,HD];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dr = reshape(shape=rsh,x=df)[name=string(\"rd\")];\n", HEADS,HD,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> da = transpose(perm=pm,x=dr)[name=string(\"td\")];\n", HEADS,SEQ,HD];
|
||||
[m appendString:@" bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"];
|
||||
[m appendString:@" bool bT = const()[name=string(\"bT\"), val=bool(true)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> sc1 = matmul(transpose_x=bF,transpose_y=bT,x=q,y=k)[name=string(\"mm1\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendFormat:@" fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> sc2 = mul(x=sc1,y=scv)[name=string(\"scl\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,1,%d,%d]> cm = const()[name=string(\"cm\"), val=tensor<fp16, [1,1,%d,%d]>(BLOBFILE(path=string(\"@model_path/weights/mask.bin\"), offset=uint64(64)))];\n", SEQ,SEQ,SEQ,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> ms = add(x=sc2,y=cm)[name=string(\"msk\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendString:@" int32 sax = const()[name=string(\"sax\"), val=int32(-1)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> probs = softmax(axis=sax,x=ms)[name=string(\"sm\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dv4 = matmul(transpose_x=bT,transpose_y=bF,x=probs,y=da)[name=string(\"dv\")];\n", HEADS,SEQ,HD];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dp4 = matmul(transpose_x=bF,transpose_y=bT,x=da,y=v)[name=string(\"dp\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dvt = transpose(perm=pm,x=dv4)[name=string(\"dvt\")];\n", HEADS,HD,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> dvs = const()[name=string(\"dvs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dvf = reshape(shape=dvs,x=dvt)[name=string(\"dvf\")];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> scs = const()[name=string(\"scs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", SCORE_CH,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> pf = reshape(shape=scs,x=probs)[name=string(\"pf\")];\n", SCORE_CH,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dpf = reshape(shape=scs,x=dp4)[name=string(\"dpf\")];\n", SCORE_CH,SEQ];
|
||||
[m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
|
||||
[m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dvf,pf,dpf))[name=string(\"cat\")];\n", DIM+2*SCORE_CH,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> sz = const()[name=tensor<string, []>(\"sz\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
|
||||
[m appendString:@" tensor<int32, [4]> b0 = const()[name=tensor<string, []>(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> qf = slice_by_size(x=x,begin=b0,size=sz)[name=tensor<string, []>(\"s0\")];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> b1 = const()[name=tensor<string, []>(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> kf = slice_by_size(x=x,begin=b1,size=sz)[name=tensor<string, []>(\"s1\")];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> b2 = const()[name=tensor<string, []>(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*DIM];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> vf = slice_by_size(x=x,begin=b2,size=sz)[name=tensor<string, []>(\"s2\")];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> b3 = const()[name=tensor<string, []>(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 3*DIM];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dx2f = slice_by_size(x=x,begin=b3,size=sz)[name=tensor<string, []>(\"s3\")];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [%d,%d,1,1]> Wot = const()[name=tensor<string, []>(\"Wot\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/wot.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,DIM,DIM,DIM];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> df = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wot,x=dx2f)[name=tensor<string, []>(\"cwo\")];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> rsh = const()[name=tensor<string, []>(\"rsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
|
||||
[m appendString:@" tensor<int32, [4]> pm = const()[name=tensor<string, []>(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> qr = reshape(shape=rsh,x=qf)[name=tensor<string, []>(\"rq\")];\n", HEADS,HD,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=qr)[name=tensor<string, []>(\"tq\")];\n", HEADS,SEQ,HD];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> kr = reshape(shape=rsh,x=kf)[name=tensor<string, []>(\"rk\")];\n", HEADS,HD,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=kr)[name=tensor<string, []>(\"tk\")];\n", HEADS,SEQ,HD];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> vr = reshape(shape=rsh,x=vf)[name=tensor<string, []>(\"rv\")];\n", HEADS,HD,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> v = transpose(perm=pm,x=vr)[name=tensor<string, []>(\"tv\")];\n", HEADS,SEQ,HD];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dr = reshape(shape=rsh,x=df)[name=tensor<string, []>(\"rd\")];\n", HEADS,HD,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> da = transpose(perm=pm,x=dr)[name=tensor<string, []>(\"td\")];\n", HEADS,SEQ,HD];
|
||||
[m appendString:@" tensor<bool, []> bF = const()[name=tensor<string, []>(\"bF\"), val=tensor<bool, []>(false)];\n"];
|
||||
[m appendString:@" tensor<bool, []> bT = const()[name=tensor<string, []>(\"bT\"), val=tensor<bool, []>(true)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> sc1 = matmul(transpose_x=bF,transpose_y=bT,x=q,y=k)[name=tensor<string, []>(\"mm1\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, []> scv = const()[name=tensor<string, []>(\"scv\"), val=tensor<fp16, []>(%f)];\n", sc];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> sc2 = mul(x=sc1,y=scv)[name=tensor<string, []>(\"scl\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,1,%d,%d]> cm = const()[name=tensor<string, []>(\"cm\"), val=tensor<fp16, [1,1,%d,%d]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/mask.bin\"), offset=tensor<uint64, []>(64)))];\n", SEQ,SEQ,SEQ,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> ms = add(x=sc2,y=cm)[name=tensor<string, []>(\"msk\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendString:@" tensor<int32, []> sax = const()[name=tensor<string, []>(\"sax\"), val=tensor<int32, []>(-1)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> probs = softmax(axis=sax,x=ms)[name=tensor<string, []>(\"sm\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dv4 = matmul(transpose_x=bT,transpose_y=bF,x=probs,y=da)[name=tensor<string, []>(\"dv\")];\n", HEADS,SEQ,HD];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dp4 = matmul(transpose_x=bF,transpose_y=bT,x=da,y=v)[name=tensor<string, []>(\"dp\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dvt = transpose(perm=pm,x=dv4)[name=tensor<string, []>(\"dvt\")];\n", HEADS,HD,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> dvs = const()[name=tensor<string, []>(\"dvs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dvf = reshape(shape=dvs,x=dvt)[name=tensor<string, []>(\"dvf\")];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> scs = const()[name=tensor<string, []>(\"scs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", SCORE_CH,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> pf = reshape(shape=scs,x=probs)[name=tensor<string, []>(\"pf\")];\n", SCORE_CH,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dpf = reshape(shape=scs,x=dp4)[name=tensor<string, []>(\"dpf\")];\n", SCORE_CH,SEQ];
|
||||
[m appendString:@" tensor<int32, []> cax = const()[name=tensor<string, []>(\"cax\"), val=tensor<int32, []>(1)];\n"];
|
||||
[m appendString:@" tensor<bool, []> cid = const()[name=tensor<string, []>(\"cid\"), val=tensor<bool, []>(false)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dvf,pf,dpf))[name=tensor<string, []>(\"cat\")];\n", DIM+2*SCORE_CH,SEQ];
|
||||
[m appendString:@" } -> (out);\n}\n"];
|
||||
return m;
|
||||
}
|
||||
|
|
@ -228,46 +226,46 @@ static NSString *gen_sdpa_bwd2(void) {
|
|||
int bwd2_in = 2*SCORE_CH + 2*DIM;
|
||||
NSMutableString *m = [NSMutableString string];
|
||||
[m appendString:MIL_HDR];
|
||||
[m appendFormat:@" func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", bwd2_in, SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> sz_sc = const()[name=string(\"szsc\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", SCORE_CH, SEQ];
|
||||
[m appendString:@" tensor<int32, [4]> b0 = const()[name=string(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> pf = slice_by_size(x=x,begin=b0,size=sz_sc)[name=string(\"s0\")];\n", SCORE_CH,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", SCORE_CH];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dpf = slice_by_size(x=x,begin=b1,size=sz_sc)[name=string(\"s1\")];\n", SCORE_CH,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> sz_d = const()[name=string(\"szd\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> b2 = const()[name=string(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*SCORE_CH];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> qf = slice_by_size(x=x,begin=b2,size=sz_d)[name=string(\"s2\")];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> b3 = const()[name=string(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*SCORE_CH+DIM];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> kf = slice_by_size(x=x,begin=b3,size=sz_d)[name=string(\"s3\")];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> ssh = const()[name=string(\"ssh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,SEQ,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> probs = reshape(shape=ssh,x=pf)[name=string(\"rp\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dp = reshape(shape=ssh,x=dpf)[name=string(\"rdp\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> rsh = const()[name=string(\"rsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
|
||||
[m appendString:@" tensor<int32, [4]> pm = const()[name=string(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> qr = reshape(shape=rsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=qr)[name=string(\"tq\")];\n", HEADS,SEQ,HD];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> kr = reshape(shape=rsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=kr)[name=string(\"tk\")];\n", HEADS,SEQ,HD];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> pdp = mul(x=probs,y=dp)[name=string(\"pdp\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendString:@" tensor<int32, [1]> rax = const()[name=string(\"rax\"), val=tensor<int32, [1]>([-1])];\n"];
|
||||
[m appendString:@" bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,1]> spdp = reduce_sum(x=pdp,axes=rax,keep_dims=kd)[name=string(\"rs\")];\n", HEADS,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dps = sub(x=dp,y=spdp)[name=string(\"dps\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> ds0 = mul(x=probs,y=dps)[name=string(\"ds0\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendFormat:@" fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> ds = mul(x=ds0,y=scv)[name=string(\"ds\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendString:@" bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"];
|
||||
[m appendString:@" bool bT = const()[name=string(\"bT\"), val=bool(true)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dq4 = matmul(transpose_x=bF,transpose_y=bF,x=ds,y=k)[name=string(\"dq\")];\n", HEADS,SEQ,HD];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dk4 = matmul(transpose_x=bT,transpose_y=bF,x=ds,y=q)[name=string(\"dk\")];\n", HEADS,SEQ,HD];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dqt = transpose(perm=pm,x=dq4)[name=string(\"dqt\")];\n", HEADS,HD,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dkt = transpose(perm=pm,x=dk4)[name=string(\"dkt\")];\n", HEADS,HD,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> fs = const()[name=string(\"fs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dqf = reshape(shape=fs,x=dqt)[name=string(\"dqf\")];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dkf = reshape(shape=fs,x=dkt)[name=string(\"dkf\")];\n", DIM,SEQ];
|
||||
[m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
|
||||
[m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dqf,dkf))[name=string(\"cat\")];\n", 2*DIM,SEQ];
|
||||
[m appendFormat:@" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n", bwd2_in, SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> sz_sc = const()[name=tensor<string, []>(\"szsc\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", SCORE_CH, SEQ];
|
||||
[m appendString:@" tensor<int32, [4]> b0 = const()[name=tensor<string, []>(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> pf = slice_by_size(x=x,begin=b0,size=sz_sc)[name=tensor<string, []>(\"s0\")];\n", SCORE_CH,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> b1 = const()[name=tensor<string, []>(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", SCORE_CH];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dpf = slice_by_size(x=x,begin=b1,size=sz_sc)[name=tensor<string, []>(\"s1\")];\n", SCORE_CH,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> sz_d = const()[name=tensor<string, []>(\"szd\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> b2 = const()[name=tensor<string, []>(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*SCORE_CH];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> qf = slice_by_size(x=x,begin=b2,size=sz_d)[name=tensor<string, []>(\"s2\")];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> b3 = const()[name=tensor<string, []>(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*SCORE_CH+DIM];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> kf = slice_by_size(x=x,begin=b3,size=sz_d)[name=tensor<string, []>(\"s3\")];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> ssh = const()[name=tensor<string, []>(\"ssh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,SEQ,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> probs = reshape(shape=ssh,x=pf)[name=tensor<string, []>(\"rp\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dp = reshape(shape=ssh,x=dpf)[name=tensor<string, []>(\"rdp\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> rsh = const()[name=tensor<string, []>(\"rsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
|
||||
[m appendString:@" tensor<int32, [4]> pm = const()[name=tensor<string, []>(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> qr = reshape(shape=rsh,x=qf)[name=tensor<string, []>(\"rq\")];\n", HEADS,HD,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=qr)[name=tensor<string, []>(\"tq\")];\n", HEADS,SEQ,HD];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> kr = reshape(shape=rsh,x=kf)[name=tensor<string, []>(\"rk\")];\n", HEADS,HD,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=kr)[name=tensor<string, []>(\"tk\")];\n", HEADS,SEQ,HD];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> pdp = mul(x=probs,y=dp)[name=tensor<string, []>(\"pdp\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendString:@" tensor<int32, [1]> rax = const()[name=tensor<string, []>(\"rax\"), val=tensor<int32, [1]>([-1])];\n"];
|
||||
[m appendString:@" tensor<bool, []> kd = const()[name=tensor<string, []>(\"kd\"), val=tensor<bool, []>(true)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,1]> spdp = reduce_sum(x=pdp,axes=rax,keep_dims=kd)[name=tensor<string, []>(\"rs\")];\n", HEADS,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dps = sub(x=dp,y=spdp)[name=tensor<string, []>(\"dps\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> ds0 = mul(x=probs,y=dps)[name=tensor<string, []>(\"ds0\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, []> scv = const()[name=tensor<string, []>(\"scv\"), val=tensor<fp16, []>(%f)];\n", sc];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> ds = mul(x=ds0,y=scv)[name=tensor<string, []>(\"ds\")];\n", HEADS,SEQ,SEQ];
|
||||
[m appendString:@" tensor<bool, []> bF = const()[name=tensor<string, []>(\"bF\"), val=tensor<bool, []>(false)];\n"];
|
||||
[m appendString:@" tensor<bool, []> bT = const()[name=tensor<string, []>(\"bT\"), val=tensor<bool, []>(true)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dq4 = matmul(transpose_x=bF,transpose_y=bF,x=ds,y=k)[name=tensor<string, []>(\"dq\")];\n", HEADS,SEQ,HD];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dk4 = matmul(transpose_x=bT,transpose_y=bF,x=ds,y=q)[name=tensor<string, []>(\"dk\")];\n", HEADS,SEQ,HD];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dqt = transpose(perm=pm,x=dq4)[name=tensor<string, []>(\"dqt\")];\n", HEADS,HD,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,%d,%d]> dkt = transpose(perm=pm,x=dk4)[name=tensor<string, []>(\"dkt\")];\n", HEADS,HD,SEQ];
|
||||
[m appendFormat:@" tensor<int32, [4]> fs = const()[name=tensor<string, []>(\"fs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dqf = reshape(shape=fs,x=dqt)[name=tensor<string, []>(\"dqf\")];\n", DIM,SEQ];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> dkf = reshape(shape=fs,x=dkt)[name=tensor<string, []>(\"dkf\")];\n", DIM,SEQ];
|
||||
[m appendString:@" tensor<int32, []> cax = const()[name=tensor<string, []>(\"cax\"), val=tensor<int32, []>(1)];\n"];
|
||||
[m appendString:@" tensor<bool, []> cid = const()[name=tensor<string, []>(\"cid\"), val=tensor<bool, []>(false)];\n"];
|
||||
[m appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dqf,dkf))[name=tensor<string, []>(\"cat\")];\n", 2*DIM,SEQ];
|
||||
[m appendString:@" } -> (out);\n}\n"];
|
||||
return m;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -50,6 +50,8 @@ static IOSurfaceRef make_surface(size_t bytes) {
|
|||
(id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0});
|
||||
}
|
||||
|
||||
// I/O precision switch read by main() when it builds the MIL text:
//   0       -> fp32 input/output, with cast ops around the fp16 conv (default)
//   nonzero -> fp16 input/output directly, no cast ops
//              (M1/M2: cast op unsupported, use fp16 I/O directly)
static int g_fp16_io = 0;
|
||||
|
||||
int main() {
|
||||
@autoreleasepool {
|
||||
setbuf(stdout, NULL);
|
||||
|
|
@ -106,28 +108,43 @@ int main() {
|
|||
memcpy(blob+128, w, ws);
|
||||
NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES];
|
||||
|
||||
NSString *mil = [NSString stringWithFormat:
|
||||
@"program(1.3)\n"
|
||||
"[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
|
||||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
|
||||
"{\"coremltools-version\", \"9.0\"}})]\n"
|
||||
"{\n"
|
||||
" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
|
||||
" string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
|
||||
" int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
|
||||
" string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n"
|
||||
" tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n"
|
||||
" tensor<fp16, [%d,%d,1,1]> W = const()[name=string(\"W\"), "
|
||||
"val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n"
|
||||
" tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
|
||||
"[name=string(\"conv\")];\n"
|
||||
" string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n"
|
||||
" tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n"
|
||||
" } -> (y);\n"
|
||||
"}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP];
|
||||
NSFileManager *fm = [NSFileManager defaultManager];
|
||||
|
||||
retry_compile:;
|
||||
NSString *mil;
|
||||
if (g_fp16_io) {
|
||||
mil = [NSString stringWithFormat:
|
||||
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
|
||||
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
|
||||
" tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
|
||||
" tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"
|
||||
" tensor<fp16, [%d,%d,1,1]> W = const()[name=tensor<string, []>(\"W\"), "
|
||||
"val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)"
|
||||
"[name=tensor<string, []>(\"conv\")];\n"
|
||||
" } -> (y);\n}\n", CH, SP, CH, CH, CH, CH, CH, SP];
|
||||
} else {
|
||||
mil = [NSString stringWithFormat:
|
||||
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
|
||||
" func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
|
||||
" tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
|
||||
" tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"
|
||||
" tensor<string, []> to16 = const()[name=tensor<string, []>(\"to16\"), val=tensor<string, []>(\"fp16\")];\n"
|
||||
" tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=tensor<string, []>(\"cin\")];\n"
|
||||
" tensor<fp16, [%d,%d,1,1]> W = const()[name=tensor<string, []>(\"W\"), "
|
||||
"val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
|
||||
"[name=tensor<string, []>(\"conv\")];\n"
|
||||
" tensor<string, []> to32 = const()[name=tensor<string, []>(\"to32\"), val=tensor<string, []>(\"fp32\")];\n"
|
||||
" tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=tensor<string, []>(\"cout\")];\n"
|
||||
" } -> (y);\n}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP];
|
||||
}
|
||||
|
||||
NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
|
||||
id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:),
|
||||
|
|
@ -135,23 +152,33 @@ int main() {
|
|||
id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
|
||||
id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
|
||||
NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
|
||||
NSFileManager *fm = [NSFileManager defaultManager];
|
||||
[fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"]
|
||||
withIntermediateDirectories:YES attributes:nil error:nil];
|
||||
[md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
|
||||
[wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
|
||||
|
||||
NSError *e = nil;
|
||||
((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
|
||||
BOOL compiled = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
|
||||
if (!compiled && !g_fp16_io) {
|
||||
printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
|
||||
g_fp16_io = 1;
|
||||
[fm removeItemAtPath:td error:nil];
|
||||
goto retry_compile;
|
||||
}
|
||||
((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
|
||||
|
||||
int ioBytes = CH * SP * 4;
|
||||
int ioBytes = CH * SP * (g_fp16_io ? 2 : 4);
|
||||
IOSurfaceRef ioIn = make_surface(ioBytes);
|
||||
IOSurfaceRef ioOut = make_surface(ioBytes);
|
||||
|
||||
IOSurfaceLock(ioIn, 0, NULL);
|
||||
float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
|
||||
for (int c = 0; c < CH; c++) for (int s = 0; s < SP; s++) inp[c*SP+s] = (float)(s+1) * 0.1f;
|
||||
if (g_fp16_io) {
|
||||
_Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn);
|
||||
for (int c = 0; c < CH; c++) for (int s = 0; s < SP; s++) inp[c*SP+s] = (_Float16)((float)(s+1) * 0.1f);
|
||||
} else {
|
||||
float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
|
||||
for (int c = 0; c < CH; c++) for (int s = 0; s < SP; s++) inp[c*SP+s] = (float)(s+1) * 0.1f;
|
||||
}
|
||||
IOSurfaceUnlock(ioIn, 0, NULL);
|
||||
|
||||
// Baseline eval
|
||||
|
|
@ -165,9 +192,16 @@ int main() {
|
|||
printf(" Baseline eval (weightsBuffer=nil, procIdx=0): %s\n", ok ? "OK" : "FAIL");
|
||||
|
||||
IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL);
|
||||
float *out0 = (float*)IOSurfaceGetBaseAddress(ioOut);
|
||||
float baseline_0 = out0[0], baseline_1 = out0[1];
|
||||
printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", out0[0], out0[1], out0[2], out0[3]);
|
||||
float baseline_0, baseline_1;
|
||||
if (g_fp16_io) {
|
||||
_Float16 *out0 = (_Float16*)IOSurfaceGetBaseAddress(ioOut);
|
||||
baseline_0 = (float)out0[0]; baseline_1 = (float)out0[1];
|
||||
printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", (float)out0[0], (float)out0[1], (float)out0[2], (float)out0[3]);
|
||||
} else {
|
||||
float *out0 = (float*)IOSurfaceGetBaseAddress(ioOut);
|
||||
baseline_0 = out0[0]; baseline_1 = out0[1];
|
||||
printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", out0[0], out0[1], out0[2], out0[3]);
|
||||
}
|
||||
IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL);
|
||||
|
||||
// Test weightsBuffer: IOSurface with 3x identity weights
|
||||
|
|
@ -194,10 +228,18 @@ int main() {
|
|||
printf(" Eval with weightsBuffer: %s\n", ok ? "OK" : e ? [[e description] UTF8String] : "FAIL");
|
||||
if (ok) {
|
||||
IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL);
|
||||
float *outW = (float*)IOSurfaceGetBaseAddress(ioOut);
|
||||
printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outW[0], outW[1], outW[2], outW[3]);
|
||||
bool changed = fabsf(outW[0] - baseline_0) > 0.001f;
|
||||
bool is_3x = fabsf(outW[0] - baseline_0 * 3.0f) < 0.1f;
|
||||
float outW_0;
|
||||
if (g_fp16_io) {
|
||||
_Float16 *outW = (_Float16*)IOSurfaceGetBaseAddress(ioOut);
|
||||
outW_0 = (float)outW[0];
|
||||
printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", (float)outW[0], (float)outW[1], (float)outW[2], (float)outW[3]);
|
||||
} else {
|
||||
float *outW = (float*)IOSurfaceGetBaseAddress(ioOut);
|
||||
outW_0 = outW[0];
|
||||
printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outW[0], outW[1], outW[2], outW[3]);
|
||||
}
|
||||
bool changed = fabsf(outW_0 - baseline_0) > 0.001f;
|
||||
bool is_3x = fabsf(outW_0 - baseline_0 * 3.0f) < 0.1f;
|
||||
printf(" weightsBuffer: output %s", changed ? "CHANGED" : "unchanged");
|
||||
if (changed) printf(" (%s)", is_3x ? "matches 3x — WORKS!" : "but not 3x as expected");
|
||||
printf("\n");
|
||||
|
|
|
|||
|
|
@ -81,13 +81,11 @@ int main() {
|
|||
// === Approach 1: Non-causal SDPA (baseline) ===
|
||||
printf("=== Non-causal SDPA (baseline) ===\n");
|
||||
NSString *sdpa_mil = [NSString stringWithFormat:
|
||||
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
|
||||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
|
||||
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
|
||||
" func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
|
||||
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
|
||||
" func main<ios16>(tensor<fp16, [1, %d, %d, %d]> q, "
|
||||
"tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v) {\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
|
||||
"query = q, key = k, value = v)[name = string(\"sdpa\")];\n"
|
||||
"query = q, key = k, value = v)[name = tensor<string, []>(\"sdpa\")];\n"
|
||||
" } -> (att);\n}\n",
|
||||
HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD];
|
||||
Kern kSDPA = compile_mil(sdpa_mil);
|
||||
|
|
@ -100,13 +98,11 @@ int main() {
|
|||
// scores = Q @ K^T → [1, HEADS, SEQ, SEQ]
|
||||
printf("\n=== Decomposed causal attention ===\n");
|
||||
NSString *qkt_mil = [NSString stringWithFormat:
|
||||
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
|
||||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
|
||||
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
|
||||
" func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
|
||||
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
|
||||
" func main<ios16>(tensor<fp16, [1, %d, %d, %d]> q, "
|
||||
"tensor<fp16, [1, %d, %d, %d]> k) {\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> scores = matmul("
|
||||
"x = q, y = k, transpose_y = true)[name = string(\"qkt\")];\n"
|
||||
"x = q, y = k, transpose_y = true)[name = tensor<string, []>(\"qkt\")];\n"
|
||||
" } -> (scores);\n}\n",
|
||||
HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, SEQ];
|
||||
Kern kQKT = compile_mil(qkt_mil);
|
||||
|
|
@ -114,13 +110,11 @@ int main() {
|
|||
|
||||
// Step 3: scores_softmax @ V → output [1, HEADS, SEQ, HD]
|
||||
NSString *sv_mil = [NSString stringWithFormat:
|
||||
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
|
||||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
|
||||
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
|
||||
" func main<ios18>(tensor<fp16, [1, %d, %d, %d]> s, "
|
||||
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
|
||||
" func main<ios16>(tensor<fp16, [1, %d, %d, %d]> s, "
|
||||
"tensor<fp16, [1, %d, %d, %d]> v) {\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> out = matmul("
|
||||
"x = s, y = v)[name = string(\"sv\")];\n"
|
||||
"x = s, y = v)[name = tensor<string, []>(\"sv\")];\n"
|
||||
" } -> (out);\n}\n",
|
||||
HEADS, SEQ, SEQ, HEADS, SEQ, HD, HEADS, SEQ, HD];
|
||||
Kern kSV = compile_mil(sv_mil);
|
||||
|
|
|
|||
|
|
@ -187,13 +187,11 @@ int main() {
|
|||
printf("Test 1: no mask\n");
|
||||
{
|
||||
NSString *mil = [NSString stringWithFormat:
|
||||
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
|
||||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
|
||||
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
|
||||
" func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
|
||||
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
|
||||
" func main<ios16>(tensor<fp16, [1, %d, %d, %d]> q, "
|
||||
"tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v) {\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
|
||||
"query = q, key = k, value = v)[name = string(\"sdpa\")];\n"
|
||||
"query = q, key = k, value = v)[name = tensor<string, []>(\"sdpa\")];\n"
|
||||
" } -> (att);\n}\n",
|
||||
HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD];
|
||||
Model m = compile_model(mil, nil);
|
||||
|
|
@ -209,14 +207,12 @@ int main() {
|
|||
{
|
||||
NSString *maskStr = build_inline_causal_mask(SEQ);
|
||||
NSString *mil = [NSString stringWithFormat:
|
||||
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
|
||||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
|
||||
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
|
||||
" func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
|
||||
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
|
||||
" func main<ios16>(tensor<fp16, [1, %d, %d, %d]> q, "
|
||||
"tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v) {\n"
|
||||
" %@ mask = const()[name = string(\"mask\"), val = %@];\n"
|
||||
" %@ mask = const()[name = tensor<string, []>(\"mask\"), val = %@];\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
|
||||
"query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n"
|
||||
"query = q, key = k, value = v, attn_mask = mask)[name = tensor<string, []>(\"sdpa\")];\n"
|
||||
" } -> (att);\n}\n",
|
||||
HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD,
|
||||
[NSString stringWithFormat:@"tensor<fp16, [1, 1, %d, %d]>", SEQ, SEQ], maskStr,
|
||||
|
|
@ -233,15 +229,13 @@ int main() {
|
|||
printf("\nTest 3: BLOBFILE causal mask\n");
|
||||
{
|
||||
NSString *mil = [NSString stringWithFormat:
|
||||
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
|
||||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
|
||||
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
|
||||
" func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
|
||||
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
|
||||
" func main<ios16>(tensor<fp16, [1, %d, %d, %d]> q, "
|
||||
"tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v) {\n"
|
||||
" tensor<fp16, [1, 1, %d, %d]> mask = const()[name = string(\"mask\"), "
|
||||
"val = tensor<fp16, [1, 1, %d, %d]>(BLOBFILE(path = string(\"@model_path/weights/mask.bin\"), offset = uint64(64)))];\n"
|
||||
" tensor<fp16, [1, 1, %d, %d]> mask = const()[name = tensor<string, []>(\"mask\"), "
|
||||
"val = tensor<fp16, [1, 1, %d, %d]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/mask.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
|
||||
"query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n"
|
||||
"query = q, key = k, value = v, attn_mask = mask)[name = tensor<string, []>(\"sdpa\")];\n"
|
||||
" } -> (att);\n}\n",
|
||||
HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD,
|
||||
SEQ, SEQ, SEQ, SEQ, HEADS, SEQ, HD];
|
||||
|
|
@ -258,14 +252,12 @@ int main() {
|
|||
printf("\nTest 4: mask as runtime input\n");
|
||||
{
|
||||
NSString *mil = [NSString stringWithFormat:
|
||||
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
|
||||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
|
||||
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
|
||||
" func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
|
||||
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
|
||||
" func main<ios16>(tensor<fp16, [1, %d, %d, %d]> q, "
|
||||
"tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v, "
|
||||
"tensor<fp16, [1, 1, %d, %d]> mask) {\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
|
||||
"query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n"
|
||||
"query = q, key = k, value = v, attn_mask = mask)[name = tensor<string, []>(\"sdpa\")];\n"
|
||||
" } -> (att);\n}\n",
|
||||
HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD,
|
||||
SEQ, SEQ, HEADS, SEQ, HD];
|
||||
|
|
|
|||
|
|
@ -82,19 +82,17 @@ static void cleanup_kern(Kern *k) {
|
|||
|
||||
// Builds the MIL program text for a single fp16 `conv` op and returns it as an NSString.
//
// The format arguments are consumed in this order: ic, sp, oc, icg, oc, icg, groups, oc, sp.
//   ic, sp  -> shape of the input tensor x:  [1, ic, 1, sp]
//   oc, icg -> shape of the weight const W:  [oc, icg, 1, 1] (used twice: declared type and val type);
//              icg is presumably "input channels per group" — confirm with the caller
//   groups  -> value of the `gr` const passed to conv as `groups`
//   oc, sp  -> shape of the output tensor y: [1, oc, 1, sp]
//
// W is read via BLOBFILE from "@model_path/weights/w.bin" at offset 64. The conv uses
// pad_type "valid", strides [1, 1], pad [0, 0, 0, 0] and dilations [1, 1].
//
// NOTE(review): this span looks like rendered-diff text. Both the program(1.3)/<ios18> shorthand
// lines (string(...), int32(...)) and the program(1.0)/<ios16> tensor<..., []>(...) lines are
// present, separated by "|" lines. Only the program(1.0) set has nine %d placeholders matching the
// nine arguments, so presumably that is the version meant to remain — confirm against the repository.
static NSString *gen_conv_mil(int ic, int oc, int icg, int groups, int sp) {
|
||||
return [NSString stringWithFormat:
|
||||
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
|
||||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
|
||||
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
|
||||
" func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w.bin\"), offset = uint64(64)))];\n"
|
||||
" string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" int32 gr = const()[name = string(\"gr\"), val = int32(%d)];\n"
|
||||
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
|
||||
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(%d)];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> y = conv(dilations = dl, groups = gr, pad = pd, "
|
||||
"pad_type = pt, strides = st, weight = W, x = x)[name = string(\"cv\")];\n"
|
||||
"pad_type = pt, strides = st, weight = W, x = x)[name = tensor<string, []>(\"cv\")];\n"
|
||||
" } -> (y);\n}\n", ic, sp, oc, icg, oc, icg, groups, oc, sp];
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -130,64 +130,62 @@ int main() {
|
|||
float scale_val = 1.0f / sqrtf((float)HD);
|
||||
|
||||
NSString *mil = [NSString stringWithFormat:
|
||||
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
|
||||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
|
||||
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
|
||||
" func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
|
||||
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
|
||||
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
|
||||
// Conv boilerplate
|
||||
" string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" int32 gr1 = const()[name = string(\"g1\"), val = int32(1)];\n"
|
||||
" tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, []> gr1 = const()[name = tensor<string, []>(\"g1\"), val = tensor<int32, []>(1)];\n"
|
||||
// QKV weights
|
||||
" tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = string(\"Wq\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wq.bin\"), offset = uint64(64)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = string(\"Wk\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wk.bin\"), offset = uint64(64)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = string(\"Wv\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wv.bin\"), offset = uint64(64)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> Wout = const()[name = string(\"Wo\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wo.bin\"), offset = uint64(64)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = tensor<string, []>(\"Wq\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wq.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = tensor<string, []>(\"Wk\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wk.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = tensor<string, []>(\"Wv\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wv.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> Wout = const()[name = tensor<string, []>(\"Wo\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wo.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
// QKV projections
|
||||
" tensor<fp16, [1, %d, 1, %d]> q_flat = conv(dilations = dl, groups = gr1, pad = pd, "
|
||||
"pad_type = pt, strides = st, weight = Wq, x = x)[name = string(\"cq\")];\n"
|
||||
"pad_type = pt, strides = st, weight = Wq, x = x)[name = tensor<string, []>(\"cq\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> k_flat = conv(dilations = dl, groups = gr1, pad = pd, "
|
||||
"pad_type = pt, strides = st, weight = Wk, x = x)[name = string(\"ck\")];\n"
|
||||
"pad_type = pt, strides = st, weight = Wk, x = x)[name = tensor<string, []>(\"ck\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> v_flat = conv(dilations = dl, groups = gr1, pad = pd, "
|
||||
"pad_type = pt, strides = st, weight = Wv, x = x)[name = string(\"cv\")];\n"
|
||||
"pad_type = pt, strides = st, weight = Wv, x = x)[name = tensor<string, []>(\"cv\")];\n"
|
||||
// Reshape: [1, DIM, 1, SEQ] → [1, HEADS, HD, SEQ] → transpose → [1, HEADS, SEQ, HD]
|
||||
" tensor<int32, [4]> qsh = const()[name = string(\"qsh\"), val = tensor<int32, [4]>([1, %d, %d, %d])];\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> q_4d = reshape(shape = qsh, x = q_flat)[name = string(\"rq\")];\n"
|
||||
" tensor<int32, [4]> perm = const()[name = string(\"pm\"), val = tensor<int32, [4]>([0, 1, 3, 2])];\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> q = transpose(perm = perm, x = q_4d)[name = string(\"tq\")];\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> k_4d = reshape(shape = qsh, x = k_flat)[name = string(\"rk\")];\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> k = transpose(perm = perm, x = k_4d)[name = string(\"tk\")];\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> v_4d = reshape(shape = qsh, x = v_flat)[name = string(\"rv\")];\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> v = transpose(perm = perm, x = v_4d)[name = string(\"tv\")];\n"
|
||||
" tensor<int32, [4]> qsh = const()[name = tensor<string, []>(\"qsh\"), val = tensor<int32, [4]>([1, %d, %d, %d])];\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> q_4d = reshape(shape = qsh, x = q_flat)[name = tensor<string, []>(\"rq\")];\n"
|
||||
" tensor<int32, [4]> perm = const()[name = tensor<string, []>(\"pm\"), val = tensor<int32, [4]>([0, 1, 3, 2])];\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> q = transpose(perm = perm, x = q_4d)[name = tensor<string, []>(\"tq\")];\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> k_4d = reshape(shape = qsh, x = k_flat)[name = tensor<string, []>(\"rk\")];\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> k = transpose(perm = perm, x = k_4d)[name = tensor<string, []>(\"tk\")];\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> v_4d = reshape(shape = qsh, x = v_flat)[name = tensor<string, []>(\"rv\")];\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> v = transpose(perm = perm, x = v_4d)[name = tensor<string, []>(\"tv\")];\n"
|
||||
// Q @ K^T
|
||||
" bool ty = const()[name = string(\"ty\"), val = bool(true)];\n"
|
||||
" bool tx = const()[name = string(\"tx\"), val = bool(false)];\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> scores = matmul(transpose_x = tx, transpose_y = ty, x = q, y = k)[name = string(\"mm1\")];\n"
|
||||
" tensor<bool, []> ty = const()[name = tensor<string, []>(\"ty\"), val = tensor<bool, []>(true)];\n"
|
||||
" tensor<bool, []> tx = const()[name = tensor<string, []>(\"tx\"), val = tensor<bool, []>(false)];\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> scores = matmul(transpose_x = tx, transpose_y = ty, x = q, y = k)[name = tensor<string, []>(\"mm1\")];\n"
|
||||
// Scale
|
||||
" fp16 sc = const()[name = string(\"sc\"), val = fp16(%f)];\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> scaled = mul(x = scores, y = sc)[name = string(\"scl\")];\n"
|
||||
" tensor<fp16, []> sc = const()[name = tensor<string, []>(\"sc\"), val = fp16(%f)];\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> scaled = mul(x = scores, y = sc)[name = tensor<string, []>(\"scl\")];\n"
|
||||
// Causal mask
|
||||
" tensor<fp16, [1, 1, %d, %d]> cmask = const()[name = string(\"cm\"), "
|
||||
"val = tensor<fp16, [1, 1, %d, %d]>(BLOBFILE(path = string(\"@model_path/weights/mask.bin\"), offset = uint64(64)))];\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> masked = add(x = scaled, y = cmask)[name = string(\"msk\")];\n"
|
||||
" tensor<fp16, [1, 1, %d, %d]> cmask = const()[name = tensor<string, []>(\"cm\"), "
|
||||
"val = tensor<fp16, [1, 1, %d, %d]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/mask.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> masked = add(x = scaled, y = cmask)[name = tensor<string, []>(\"msk\")];\n"
|
||||
// Softmax
|
||||
" int32 sax = const()[name = string(\"sax\"), val = int32(-1)];\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> attn_w = softmax(axis = sax, x = masked)[name = string(\"sm\")];\n"
|
||||
" tensor<int32, []> sax = const()[name = tensor<string, []>(\"sax\"), val = tensor<int32, []>(-1)];\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> attn_w = softmax(axis = sax, x = masked)[name = tensor<string, []>(\"sm\")];\n"
|
||||
// scores @ V
|
||||
" tensor<fp16, [1, %d, %d, %d]> attn_4d = matmul(transpose_x = tx, transpose_y = tx, x = attn_w, y = v)[name = string(\"mm2\")];\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> attn_4d = matmul(transpose_x = tx, transpose_y = tx, x = attn_w, y = v)[name = tensor<string, []>(\"mm2\")];\n"
|
||||
// Reshape back: [1, HEADS, SEQ, HD] → transpose → [1, HEADS, HD, SEQ] → reshape → [1, DIM, 1, SEQ]
|
||||
" tensor<fp16, [1, %d, %d, %d]> attn_t = transpose(perm = perm, x = attn_4d)[name = string(\"ta\")];\n"
|
||||
" tensor<int32, [4]> osh = const()[name = string(\"osh\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> attn_flat = reshape(shape = osh, x = attn_t)[name = string(\"ra\")];\n"
|
||||
" tensor<fp16, [1, %d, %d, %d]> attn_t = transpose(perm = perm, x = attn_4d)[name = tensor<string, []>(\"ta\")];\n"
|
||||
" tensor<int32, [4]> osh = const()[name = tensor<string, []>(\"osh\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> attn_flat = reshape(shape = osh, x = attn_t)[name = tensor<string, []>(\"ra\")];\n"
|
||||
// Wo projection
|
||||
" tensor<fp16, [1, %d, 1, %d]> out = conv(dilations = dl, groups = gr1, pad = pd, "
|
||||
"pad_type = pt, strides = st, weight = Wout, x = attn_flat)[name = string(\"co\")];\n"
|
||||
"pad_type = pt, strides = st, weight = Wout, x = attn_flat)[name = tensor<string, []>(\"co\")];\n"
|
||||
" } -> (out);\n}\n",
|
||||
DIM, SEQ, // input
|
||||
DIM,DIM,DIM,DIM, DIM,DIM,DIM,DIM, // Wq, Wk
|
||||
|
|
@ -317,30 +315,28 @@ int main() {
|
|||
printf("\n=== Test 2: Fused FFN benchmark ===\n");
|
||||
{
|
||||
NSString *mil = [NSString stringWithFormat:
|
||||
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
|
||||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
|
||||
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
|
||||
" func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
|
||||
" string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W1 = const()[name = string(\"W1\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w1.bin\"), offset = uint64(64)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W3 = const()[name = string(\"W3\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w3.bin\"), offset = uint64(64)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W2 = const()[name = string(\"W2\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w2.bin\"), offset = uint64(64)))];\n"
|
||||
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
|
||||
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
|
||||
" tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W1 = const()[name = tensor<string, []>(\"W1\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w1.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W3 = const()[name = tensor<string, []>(\"W3\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w3.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W2 = const()[name = tensor<string, []>(\"W2\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w2.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> h1 = conv(dilations = dl, groups = gr, pad = pd, "
|
||||
"pad_type = pt, strides = st, weight = W1, x = x)[name = string(\"c1\")];\n"
|
||||
"pad_type = pt, strides = st, weight = W1, x = x)[name = tensor<string, []>(\"c1\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> h3 = conv(dilations = dl, groups = gr, pad = pd, "
|
||||
"pad_type = pt, strides = st, weight = W3, x = x)[name = string(\"c3\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> sig = sigmoid(x = h1)[name = string(\"sg\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> silu = mul(x = h1, y = sig)[name = string(\"si\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> gate = mul(x = silu, y = h3)[name = string(\"gt\")];\n"
|
||||
"pad_type = pt, strides = st, weight = W3, x = x)[name = tensor<string, []>(\"c3\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> sig = sigmoid(x = h1)[name = tensor<string, []>(\"sg\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> silu = mul(x = h1, y = sig)[name = tensor<string, []>(\"si\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> gate = mul(x = silu, y = h3)[name = tensor<string, []>(\"gt\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> out = conv(dilations = dl, groups = gr, pad = pd, "
|
||||
"pad_type = pt, strides = st, weight = W2, x = gate)[name = string(\"c2\")];\n"
|
||||
"pad_type = pt, strides = st, weight = W2, x = gate)[name = tensor<string, []>(\"c2\")];\n"
|
||||
" } -> (out);\n}\n",
|
||||
DIM, SEQ,
|
||||
HIDDEN,DIM,HIDDEN,DIM, HIDDEN,DIM,HIDDEN,DIM, DIM,HIDDEN,DIM,HIDDEN,
|
||||
|
|
|
|||
|
|
@ -15,6 +15,8 @@
|
|||
#define HIDDEN 2048
|
||||
#define SEQ 64
|
||||
|
||||
static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly
|
||||
|
||||
static Class g_D, g_I, g_AR, g_AIO;
|
||||
static void ane_init(void) {
|
||||
dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
|
||||
|
|
@ -58,47 +60,77 @@ int main() {
|
|||
// MIL: slice input → 2 convs → add
|
||||
printf("=== Fused W1b+W3b backward (slice+conv+add) ===\n");
|
||||
|
||||
NSString *mil = [NSString stringWithFormat:
|
||||
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
|
||||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
|
||||
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
|
||||
" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n" // [1, HIDDEN*2, 1, SEQ]
|
||||
" string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
|
||||
// Slice: dh1 = x16[:, 0:HIDDEN, :, :], dh3 = x16[:, HIDDEN:2*HIDDEN, :, :]
|
||||
" tensor<int32, [4]> b1 = const()[name = string(\"b1\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [4]> s1 = const()[name = string(\"s1\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> dh1 = slice_by_size(x = x16, begin = b1, size = s1)[name = string(\"sl1\")];\n"
|
||||
" tensor<int32, [4]> b3 = const()[name = string(\"b3\"), val = tensor<int32, [4]>([0, %d, 0, 0])];\n"
|
||||
" tensor<int32, [4]> s3 = const()[name = string(\"s3\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> dh3 = slice_by_size(x = x16, begin = b3, size = s3)[name = string(\"sl3\")];\n"
|
||||
// Conv: W1^T @ dh1, W3^T @ dh3
|
||||
" string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
|
||||
// W1^T: [DIM, HIDDEN, 1, 1] (transposed from [HIDDEN, DIM])
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W1t = const()[name = string(\"W1t\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w1t.bin\"), offset = uint64(64)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W3t = const()[name = string(\"W3t\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w3t.bin\"), offset = uint64(64)))];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> dx1 = conv(dilations = dl, groups = gr, pad = pd, "
|
||||
"pad_type = pt, strides = st, weight = W1t, x = dh1)[name = string(\"cv1\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> dx3 = conv(dilations = dl, groups = gr, pad = pd, "
|
||||
"pad_type = pt, strides = st, weight = W3t, x = dh3)[name = string(\"cv3\")];\n"
|
||||
// Add
|
||||
" tensor<fp16, [1, %d, 1, %d]> sum = add(x = dx1, y = dx3)[name = string(\"ad\")];\n"
|
||||
" string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
|
||||
" tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = sum)[name = string(\"co\")];\n"
|
||||
" } -> (y);\n}\n",
|
||||
HIDDEN*2, SEQ, HIDDEN*2, SEQ,
|
||||
HIDDEN, SEQ, HIDDEN, SEQ, // slice1
|
||||
HIDDEN, HIDDEN, SEQ, HIDDEN, SEQ, // slice3
|
||||
DIM, HIDDEN, DIM, HIDDEN, // W1t
|
||||
DIM, HIDDEN, DIM, HIDDEN, // W3t
|
||||
DIM, SEQ, DIM, SEQ, // dx1, dx3
|
||||
DIM, SEQ, DIM, SEQ]; // sum, y
|
||||
retry_compile:;
|
||||
NSString *mil;
|
||||
if (g_fp16_io) {
|
||||
mil = [NSString stringWithFormat:
|
||||
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
|
||||
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
|
||||
" tensor<int32, [4]> b1 = const()[name = tensor<string, []>(\"b1\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [4]> s1 = const()[name = tensor<string, []>(\"s1\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> dh1 = slice_by_size(x = x, begin = b1, size = s1)[name = tensor<string, []>(\"sl1\")];\n"
|
||||
" tensor<int32, [4]> b3 = const()[name = tensor<string, []>(\"b3\"), val = tensor<int32, [4]>([0, %d, 0, 0])];\n"
|
||||
" tensor<int32, [4]> s3 = const()[name = tensor<string, []>(\"s3\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> dh3 = slice_by_size(x = x, begin = b3, size = s3)[name = tensor<string, []>(\"sl3\")];\n"
|
||||
" tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W1t = const()[name = tensor<string, []>(\"W1t\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w1t.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W3t = const()[name = tensor<string, []>(\"W3t\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w3t.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> dx1 = conv(dilations = dl, groups = gr, pad = pd, "
|
||||
"pad_type = pt, strides = st, weight = W1t, x = dh1)[name = tensor<string, []>(\"cv1\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> dx3 = conv(dilations = dl, groups = gr, pad = pd, "
|
||||
"pad_type = pt, strides = st, weight = W3t, x = dh3)[name = tensor<string, []>(\"cv3\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> y = add(x = dx1, y = dx3)[name = tensor<string, []>(\"ad\")];\n"
|
||||
" } -> (y);\n}\n",
|
||||
HIDDEN*2, SEQ,
|
||||
HIDDEN, SEQ, HIDDEN, SEQ,
|
||||
HIDDEN, HIDDEN, SEQ, HIDDEN, SEQ,
|
||||
DIM, HIDDEN, DIM, HIDDEN,
|
||||
DIM, HIDDEN, DIM, HIDDEN,
|
||||
DIM, SEQ, DIM, SEQ,
|
||||
DIM, SEQ];
|
||||
} else {
|
||||
mil = [NSString stringWithFormat:
|
||||
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
|
||||
" func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
|
||||
" tensor<string, []> d1 = const()[name = tensor<string, []>(\"d1\"), val = tensor<string, []>(\"fp16\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = tensor<string, []>(\"cx\")];\n"
|
||||
" tensor<int32, [4]> b1 = const()[name = tensor<string, []>(\"b1\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [4]> s1 = const()[name = tensor<string, []>(\"s1\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> dh1 = slice_by_size(x = x16, begin = b1, size = s1)[name = tensor<string, []>(\"sl1\")];\n"
|
||||
" tensor<int32, [4]> b3 = const()[name = tensor<string, []>(\"b3\"), val = tensor<int32, [4]>([0, %d, 0, 0])];\n"
|
||||
" tensor<int32, [4]> s3 = const()[name = tensor<string, []>(\"s3\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> dh3 = slice_by_size(x = x16, begin = b3, size = s3)[name = tensor<string, []>(\"sl3\")];\n"
|
||||
" tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W1t = const()[name = tensor<string, []>(\"W1t\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w1t.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W3t = const()[name = tensor<string, []>(\"W3t\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w3t.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> dx1 = conv(dilations = dl, groups = gr, pad = pd, "
|
||||
"pad_type = pt, strides = st, weight = W1t, x = dh1)[name = tensor<string, []>(\"cv1\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> dx3 = conv(dilations = dl, groups = gr, pad = pd, "
|
||||
"pad_type = pt, strides = st, weight = W3t, x = dh3)[name = tensor<string, []>(\"cv3\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> sum = add(x = dx1, y = dx3)[name = tensor<string, []>(\"ad\")];\n"
|
||||
" tensor<string, []> d2 = const()[name = tensor<string, []>(\"d2\"), val = tensor<string, []>(\"fp32\")];\n"
|
||||
" tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = sum)[name = tensor<string, []>(\"co\")];\n"
|
||||
" } -> (y);\n}\n",
|
||||
HIDDEN*2, SEQ, HIDDEN*2, SEQ,
|
||||
HIDDEN, SEQ, HIDDEN, SEQ,
|
||||
HIDDEN, HIDDEN, SEQ, HIDDEN, SEQ,
|
||||
DIM, HIDDEN, DIM, HIDDEN,
|
||||
DIM, HIDDEN, DIM, HIDDEN,
|
||||
DIM, SEQ, DIM, SEQ,
|
||||
DIM, SEQ, DIM, SEQ];
|
||||
}
|
||||
|
||||
NSDictionary *wd = @{
|
||||
@"@model_path/weights/w1t.bin": @{@"offset":@0, @"data":build_blob_t(W1, HIDDEN, DIM)},
|
||||
|
|
@ -119,6 +151,12 @@ int main() {
|
|||
|
||||
NSError *e = nil;
|
||||
BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
|
||||
if (!ok && !g_fp16_io) {
|
||||
printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
|
||||
g_fp16_io = 1;
|
||||
[[NSFileManager defaultManager] removeItemAtPath:td error:nil];
|
||||
goto retry_compile;
|
||||
}
|
||||
printf("Compile: %s\n", ok?"OK":"FAIL");
|
||||
if (!ok) { printf(" %s\n", e?[[e description] UTF8String]:""); return 1; }
|
||||
ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
|
||||
|
|
@ -130,13 +168,21 @@ int main() {
|
|||
float *dh3 = (float*)malloc(SEQ*HIDDEN*sizeof(float));
|
||||
for (int i = 0; i < SEQ*HIDDEN; i++) { dh1[i]=0.01f*sinf(i*0.007f); dh3[i]=0.01f*cosf(i*0.011f); }
|
||||
|
||||
IOSurfaceRef ioI = make_surface(HIDDEN*2*SEQ*4), ioO = make_surface(DIM*SEQ*4);
|
||||
size_t bpe = g_fp16_io ? 2 : 4;
|
||||
IOSurfaceRef ioI = make_surface(HIDDEN*2*SEQ*bpe), ioO = make_surface(DIM*SEQ*bpe);
|
||||
IOSurfaceLock(ioI, 0, NULL);
|
||||
float *dst = (float*)IOSurfaceGetBaseAddress(ioI);
|
||||
// Channel-first: channels 0..HIDDEN-1 = dh1, channels HIDDEN..2*HIDDEN-1 = dh3
|
||||
for (int t = 0; t < SEQ; t++) {
|
||||
for (int c = 0; c < HIDDEN; c++) dst[c*SEQ+t] = dh1[t*HIDDEN+c];
|
||||
for (int c = 0; c < HIDDEN; c++) dst[(HIDDEN+c)*SEQ+t] = dh3[t*HIDDEN+c];
|
||||
if (g_fp16_io) {
|
||||
_Float16 *dst = (_Float16*)IOSurfaceGetBaseAddress(ioI);
|
||||
for (int t = 0; t < SEQ; t++) {
|
||||
for (int c = 0; c < HIDDEN; c++) dst[c*SEQ+t] = (_Float16)dh1[t*HIDDEN+c];
|
||||
for (int c = 0; c < HIDDEN; c++) dst[(HIDDEN+c)*SEQ+t] = (_Float16)dh3[t*HIDDEN+c];
|
||||
}
|
||||
} else {
|
||||
float *dst = (float*)IOSurfaceGetBaseAddress(ioI);
|
||||
for (int t = 0; t < SEQ; t++) {
|
||||
for (int c = 0; c < HIDDEN; c++) dst[c*SEQ+t] = dh1[t*HIDDEN+c];
|
||||
for (int c = 0; c < HIDDEN; c++) dst[(HIDDEN+c)*SEQ+t] = dh3[t*HIDDEN+c];
|
||||
}
|
||||
}
|
||||
IOSurfaceUnlock(ioI, 0, NULL);
|
||||
|
||||
|
|
@ -164,13 +210,22 @@ int main() {
|
|||
}
|
||||
|
||||
IOSurfaceLock(ioO, kIOSurfaceLockReadOnly, NULL);
|
||||
float *src = (float*)IOSurfaceGetBaseAddress(ioO);
|
||||
float maxd = 0;
|
||||
for (int t = 0; t < SEQ; t++)
|
||||
for (int c = 0; c < DIM; c++) {
|
||||
float d = fabsf(src[c*SEQ+t] - ref[t*DIM+c]);
|
||||
if (d > maxd) maxd = d;
|
||||
}
|
||||
if (g_fp16_io) {
|
||||
_Float16 *src = (_Float16*)IOSurfaceGetBaseAddress(ioO);
|
||||
for (int t = 0; t < SEQ; t++)
|
||||
for (int c = 0; c < DIM; c++) {
|
||||
float d = fabsf((float)src[c*SEQ+t] - ref[t*DIM+c]);
|
||||
if (d > maxd) maxd = d;
|
||||
}
|
||||
} else {
|
||||
float *src = (float*)IOSurfaceGetBaseAddress(ioO);
|
||||
for (int t = 0; t < SEQ; t++)
|
||||
for (int c = 0; c < DIM; c++) {
|
||||
float d = fabsf(src[c*SEQ+t] - ref[t*DIM+c]);
|
||||
if (d > maxd) maxd = d;
|
||||
}
|
||||
}
|
||||
IOSurfaceUnlock(ioO, kIOSurfaceLockReadOnly, NULL);
|
||||
printf("dx max diff: %.6f\n", maxd);
|
||||
|
||||
|
|
|
|||
|
|
@ -12,6 +12,8 @@
|
|||
#define DIM 768
|
||||
#define SEQ 64
|
||||
|
||||
static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly
|
||||
|
||||
static Class g_D, g_I, g_AR, g_AIO;
|
||||
static mach_timebase_info_data_t g_tb;
|
||||
static void ane_init(void) {
|
||||
|
|
@ -56,7 +58,10 @@ static Kern compile_mil(NSString *mil, NSDictionary *wd) {
|
|||
}
|
||||
NSError *e = nil;
|
||||
if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
|
||||
printf("compile FAIL: %s\n", e?[[e localizedDescription] UTF8String]:""); return k;
|
||||
printf("compile %s: %s\n", g_fp16_io ? "FAIL" : "failed (will retry)",
|
||||
e ? [[e localizedDescription] UTF8String] : "");
|
||||
[[NSFileManager defaultManager] removeItemAtPath:td error:nil];
|
||||
return k;
|
||||
}
|
||||
((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
|
||||
k.model = mdl; k.td = td;
|
||||
|
|
@ -85,67 +90,108 @@ static void cleanup_kern(Kern *k) {
|
|||
|
||||
// Fused QKV: 3 convs + concat in one MIL
|
||||
static NSString *gen_fused_qkv_mil(void) {
|
||||
if (g_fp16_io) {
|
||||
return [NSString stringWithFormat:
|
||||
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
|
||||
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
|
||||
" tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = tensor<string, []>(\"Wq\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wq.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = tensor<string, []>(\"Wk\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wk.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = tensor<string, []>(\"Wv\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wv.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> q = conv(dilations = dl, groups = gr, pad = pd, "
|
||||
"pad_type = pt, strides = st, weight = Wq, x = x)[name = tensor<string, []>(\"cq\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> k = conv(dilations = dl, groups = gr, pad = pd, "
|
||||
"pad_type = pt, strides = st, weight = Wk, x = x)[name = tensor<string, []>(\"ck\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> v = conv(dilations = dl, groups = gr, pad = pd, "
|
||||
"pad_type = pt, strides = st, weight = Wv, x = x)[name = tensor<string, []>(\"cv\")];\n"
|
||||
" tensor<int32, []> ax = const()[name = tensor<string, []>(\"ax\"), val = tensor<int32, []>(1)];\n"
|
||||
" tensor<bool, []> inter = const()[name = tensor<string, []>(\"il\"), val = tensor<bool, []>(false)];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> y = concat(axis = ax, interleave = inter, values = (q, k, v))[name = tensor<string, []>(\"cat\")];\n"
|
||||
" } -> (y);\n}\n",
|
||||
DIM, SEQ,
|
||||
DIM, DIM, DIM, DIM,
|
||||
DIM, DIM, DIM, DIM,
|
||||
DIM, DIM, DIM, DIM,
|
||||
DIM, SEQ, DIM, SEQ, DIM, SEQ,
|
||||
DIM*3, SEQ];
|
||||
}
|
||||
return [NSString stringWithFormat:
|
||||
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
|
||||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
|
||||
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
|
||||
" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
|
||||
" string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
|
||||
" string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = string(\"Wq\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wq.bin\"), offset = uint64(64)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = string(\"Wk\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wk.bin\"), offset = uint64(64)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = string(\"Wv\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wv.bin\"), offset = uint64(64)))];\n"
|
||||
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
|
||||
" func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
|
||||
" tensor<string, []> d1 = const()[name = tensor<string, []>(\"d1\"), val = tensor<string, []>(\"fp16\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = tensor<string, []>(\"cx\")];\n"
|
||||
" tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = tensor<string, []>(\"Wq\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wq.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = tensor<string, []>(\"Wk\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wk.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = tensor<string, []>(\"Wv\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wv.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> q = conv(dilations = dl, groups = gr, pad = pd, "
|
||||
"pad_type = pt, strides = st, weight = Wq, x = x16)[name = string(\"cq\")];\n"
|
||||
"pad_type = pt, strides = st, weight = Wq, x = x16)[name = tensor<string, []>(\"cq\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> k = conv(dilations = dl, groups = gr, pad = pd, "
|
||||
"pad_type = pt, strides = st, weight = Wk, x = x16)[name = string(\"ck\")];\n"
|
||||
"pad_type = pt, strides = st, weight = Wk, x = x16)[name = tensor<string, []>(\"ck\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> v = conv(dilations = dl, groups = gr, pad = pd, "
|
||||
"pad_type = pt, strides = st, weight = Wv, x = x16)[name = string(\"cv\")];\n"
|
||||
" int32 ax = const()[name = string(\"ax\"), val = int32(1)];\n"
|
||||
" bool inter = const()[name = string(\"il\"), val = bool(false)];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> qkv = concat(axis = ax, interleave = inter, values = (q, k, v))[name = string(\"cat\")];\n"
|
||||
" string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
|
||||
" tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = qkv)[name = string(\"co\")];\n"
|
||||
"pad_type = pt, strides = st, weight = Wv, x = x16)[name = tensor<string, []>(\"cv\")];\n"
|
||||
" tensor<int32, []> ax = const()[name = tensor<string, []>(\"ax\"), val = tensor<int32, []>(1)];\n"
|
||||
" tensor<bool, []> inter = const()[name = tensor<string, []>(\"il\"), val = tensor<bool, []>(false)];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> qkv = concat(axis = ax, interleave = inter, values = (q, k, v))[name = tensor<string, []>(\"cat\")];\n"
|
||||
" tensor<string, []> d2 = const()[name = tensor<string, []>(\"d2\"), val = tensor<string, []>(\"fp32\")];\n"
|
||||
" tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = qkv)[name = tensor<string, []>(\"co\")];\n"
|
||||
" } -> (y);\n}\n",
|
||||
DIM, SEQ, DIM, SEQ,
|
||||
DIM, DIM, DIM, DIM, // Wq
|
||||
DIM, DIM, DIM, DIM, // Wk
|
||||
DIM, DIM, DIM, DIM, // Wv
|
||||
DIM, SEQ, // q
|
||||
DIM, SEQ, // k
|
||||
DIM, SEQ, // v
|
||||
DIM*3, SEQ, // concat
|
||||
DIM*3, SEQ]; // output
|
||||
DIM, DIM, DIM, DIM,
|
||||
DIM, DIM, DIM, DIM,
|
||||
DIM, DIM, DIM, DIM,
|
||||
DIM, SEQ, DIM, SEQ, DIM, SEQ,
|
||||
DIM*3, SEQ, DIM*3, SEQ];
|
||||
}
|
||||
|
||||
// Single conv MIL for comparison
|
||||
static NSString *gen_single_mil(void) {
|
||||
if (g_fp16_io) {
|
||||
return [NSString stringWithFormat:
|
||||
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
|
||||
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> y = conv(dilations = dl, groups = gr, pad = pd, "
|
||||
"pad_type = pt, strides = st, weight = W, x = x)[name = tensor<string, []>(\"cv\")];\n"
|
||||
" } -> (y);\n}\n",
|
||||
DIM, SEQ, DIM, DIM, DIM, DIM, DIM, SEQ];
|
||||
}
|
||||
return [NSString stringWithFormat:
|
||||
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
|
||||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
|
||||
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
|
||||
" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
|
||||
" string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w.bin\"), offset = uint64(64)))];\n"
|
||||
" string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
|
||||
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
|
||||
" func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
|
||||
" tensor<string, []> d1 = const()[name = tensor<string, []>(\"d1\"), val = tensor<string, []>(\"fp16\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = tensor<string, []>(\"cx\")];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> y16 = conv(dilations = dl, groups = gr, pad = pd, "
|
||||
"pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n"
|
||||
" string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
|
||||
" tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n"
|
||||
"pad_type = pt, strides = st, weight = W, x = x16)[name = tensor<string, []>(\"cv\")];\n"
|
||||
" tensor<string, []> d2 = const()[name = tensor<string, []>(\"d2\"), val = tensor<string, []>(\"fp32\")];\n"
|
||||
" tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = y16)[name = tensor<string, []>(\"co\")];\n"
|
||||
" } -> (y);\n}\n",
|
||||
DIM, SEQ, DIM, SEQ, DIM, DIM, DIM, DIM, DIM, SEQ, DIM, SEQ];
|
||||
}
|
||||
|
|
@ -170,12 +216,18 @@ int main() {
|
|||
for (int i = 0; i < SEQ*DIM; i++) x[i] = 0.1f*(2*drand48()-1);
|
||||
|
||||
// === Compile fused QKV ===
|
||||
retry_compile:;
|
||||
NSDictionary *fused_wd = @{
|
||||
@"@model_path/weights/wq.bin": @{@"offset":@0, @"data":build_blob(Wq, DIM, DIM)},
|
||||
@"@model_path/weights/wk.bin": @{@"offset":@0, @"data":build_blob(Wk, DIM, DIM)},
|
||||
@"@model_path/weights/wv.bin": @{@"offset":@0, @"data":build_blob(Wv, DIM, DIM)},
|
||||
};
|
||||
Kern kFused = compile_mil(gen_fused_qkv_mil(), fused_wd);
|
||||
if (!kFused.model && !g_fp16_io) {
|
||||
printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
|
||||
g_fp16_io = 1;
|
||||
goto retry_compile;
|
||||
}
|
||||
printf("Fused QKV: %s\n", kFused.model ? "OK" : "FAIL");
|
||||
|
||||
// === Compile 3 separate ===
|
||||
|
|
@ -187,16 +239,24 @@ int main() {
|
|||
if (!kFused.model || !kQ.model) goto done;
|
||||
|
||||
// IOSurfaces
|
||||
size_t in_bytes = DIM*SEQ*4, out1_bytes = DIM*SEQ*4, out3_bytes = DIM*3*SEQ*4;
|
||||
size_t bpe = g_fp16_io ? 2 : 4;
|
||||
size_t in_bytes = DIM*SEQ*bpe, out1_bytes = DIM*SEQ*bpe, out3_bytes = DIM*3*SEQ*bpe;
|
||||
IOSurfaceRef ioIn = make_surface(in_bytes);
|
||||
IOSurfaceRef ioFused = make_surface(out3_bytes);
|
||||
IOSurfaceRef ioQ = make_surface(out1_bytes), ioK = make_surface(out1_bytes), ioV = make_surface(out1_bytes);
|
||||
|
||||
IOSurfaceLock(ioIn, 0, NULL);
|
||||
float *dst = (float*)IOSurfaceGetBaseAddress(ioIn);
|
||||
for (int t = 0; t < SEQ; t++)
|
||||
for (int c = 0; c < DIM; c++)
|
||||
dst[c*SEQ+t] = x[t*DIM+c];
|
||||
if (g_fp16_io) {
|
||||
_Float16 *dst = (_Float16*)IOSurfaceGetBaseAddress(ioIn);
|
||||
for (int t = 0; t < SEQ; t++)
|
||||
for (int c = 0; c < DIM; c++)
|
||||
dst[c*SEQ+t] = (_Float16)x[t*DIM+c];
|
||||
} else {
|
||||
float *dst = (float*)IOSurfaceGetBaseAddress(ioIn);
|
||||
for (int t = 0; t < SEQ; t++)
|
||||
for (int c = 0; c < DIM; c++)
|
||||
dst[c*SEQ+t] = x[t*DIM+c];
|
||||
}
|
||||
IOSurfaceUnlock(ioIn, 0, NULL);
|
||||
|
||||
// Eval fused
|
||||
|
|
@ -212,17 +272,30 @@ int main() {
|
|||
IOSurfaceLock(ioQ, kIOSurfaceLockReadOnly, NULL);
|
||||
IOSurfaceLock(ioK, kIOSurfaceLockReadOnly, NULL);
|
||||
IOSurfaceLock(ioV, kIOSurfaceLockReadOnly, NULL);
|
||||
float *fo = (float*)IOSurfaceGetBaseAddress(ioFused);
|
||||
float *qo = (float*)IOSurfaceGetBaseAddress(ioQ);
|
||||
float *ko = (float*)IOSurfaceGetBaseAddress(ioK);
|
||||
float *vo = (float*)IOSurfaceGetBaseAddress(ioV);
|
||||
float dq=0, dk=0, dv=0;
|
||||
for (int c = 0; c < DIM; c++)
|
||||
for (int t = 0; t < SEQ; t++) {
|
||||
float d1 = fabsf(fo[c*SEQ+t] - qo[c*SEQ+t]); if(d1>dq) dq=d1;
|
||||
float d2 = fabsf(fo[(DIM+c)*SEQ+t] - ko[c*SEQ+t]); if(d2>dk) dk=d2;
|
||||
float d3 = fabsf(fo[(DIM*2+c)*SEQ+t] - vo[c*SEQ+t]); if(d3>dv) dv=d3;
|
||||
}
|
||||
if (g_fp16_io) {
|
||||
_Float16 *fo = (_Float16*)IOSurfaceGetBaseAddress(ioFused);
|
||||
_Float16 *qo = (_Float16*)IOSurfaceGetBaseAddress(ioQ);
|
||||
_Float16 *ko = (_Float16*)IOSurfaceGetBaseAddress(ioK);
|
||||
_Float16 *vo = (_Float16*)IOSurfaceGetBaseAddress(ioV);
|
||||
for (int c = 0; c < DIM; c++)
|
||||
for (int t = 0; t < SEQ; t++) {
|
||||
float d1 = fabsf((float)fo[c*SEQ+t] - (float)qo[c*SEQ+t]); if(d1>dq) dq=d1;
|
||||
float d2 = fabsf((float)fo[(DIM+c)*SEQ+t] - (float)ko[c*SEQ+t]); if(d2>dk) dk=d2;
|
||||
float d3 = fabsf((float)fo[(DIM*2+c)*SEQ+t] - (float)vo[c*SEQ+t]); if(d3>dv) dv=d3;
|
||||
}
|
||||
} else {
|
||||
float *fo = (float*)IOSurfaceGetBaseAddress(ioFused);
|
||||
float *qo = (float*)IOSurfaceGetBaseAddress(ioQ);
|
||||
float *ko = (float*)IOSurfaceGetBaseAddress(ioK);
|
||||
float *vo = (float*)IOSurfaceGetBaseAddress(ioV);
|
||||
for (int c = 0; c < DIM; c++)
|
||||
for (int t = 0; t < SEQ; t++) {
|
||||
float d1 = fabsf(fo[c*SEQ+t] - qo[c*SEQ+t]); if(d1>dq) dq=d1;
|
||||
float d2 = fabsf(fo[(DIM+c)*SEQ+t] - ko[c*SEQ+t]); if(d2>dk) dk=d2;
|
||||
float d3 = fabsf(fo[(DIM*2+c)*SEQ+t] - vo[c*SEQ+t]); if(d3>dv) dv=d3;
|
||||
}
|
||||
}
|
||||
IOSurfaceUnlock(ioFused, kIOSurfaceLockReadOnly, NULL);
|
||||
IOSurfaceUnlock(ioQ, kIOSurfaceLockReadOnly, NULL);
|
||||
IOSurfaceUnlock(ioK, kIOSurfaceLockReadOnly, NULL);
|
||||
|
|
|
|||
|
|
@ -10,6 +10,8 @@
|
|||
static mach_timebase_info_data_t g_tb;
|
||||
static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }
|
||||
|
||||
static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly
|
||||
|
||||
static void dump_class(const char *name) {
|
||||
Class cls = NSClassFromString([NSString stringWithUTF8String:name]);
|
||||
if (!cls) { printf(" %s: NOT FOUND\n", name); return; }
|
||||
|
|
@ -118,28 +120,43 @@ int main() {
|
|||
NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES];
|
||||
free(w);
|
||||
|
||||
NSString *mil = [NSString stringWithFormat:
|
||||
@"program(1.3)\n"
|
||||
"[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
|
||||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
|
||||
"{\"coremltools-version\", \"9.0\"}})]\n"
|
||||
"{\n"
|
||||
" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
|
||||
" string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
|
||||
" int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
|
||||
" string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n"
|
||||
" tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n"
|
||||
" tensor<fp16, [%d,%d,1,1]> W = const()[name=string(\"W\"), "
|
||||
"val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n"
|
||||
" tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
|
||||
"[name=string(\"conv\")];\n"
|
||||
" string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n"
|
||||
" tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n"
|
||||
" } -> (y);\n"
|
||||
"}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP];
|
||||
retry_compile:;
|
||||
NSString *mil;
|
||||
if (g_fp16_io) {
|
||||
mil = [NSString stringWithFormat:
|
||||
@"program(1.0)\n"
|
||||
"[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
|
||||
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
|
||||
" tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
|
||||
" tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"
|
||||
" tensor<fp16, [%d,%d,1,1]> W = const()[name=tensor<string, []>(\"W\"), "
|
||||
"val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)"
|
||||
"[name=tensor<string, []>(\"conv\")];\n"
|
||||
" } -> (y);\n}\n", CH, SP, CH, CH, CH, CH, CH, SP];
|
||||
} else {
|
||||
mil = [NSString stringWithFormat:
|
||||
@"program(1.0)\n"
|
||||
"[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
|
||||
" func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
|
||||
" tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
|
||||
" tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"
|
||||
" tensor<string, []> to16 = const()[name=tensor<string, []>(\"to16\"), val=tensor<string, []>(\"fp16\")];\n"
|
||||
" tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=tensor<string, []>(\"cin\")];\n"
|
||||
" tensor<fp16, [%d,%d,1,1]> W = const()[name=tensor<string, []>(\"W\"), "
|
||||
"val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
|
||||
"[name=tensor<string, []>(\"conv\")];\n"
|
||||
" tensor<string, []> to32 = const()[name=tensor<string, []>(\"to32\"), val=tensor<string, []>(\"fp32\")];\n"
|
||||
" tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=tensor<string, []>(\"cout\")];\n"
|
||||
" } -> (y);\n}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP];
|
||||
}
|
||||
|
||||
NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
|
||||
id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:),
|
||||
|
|
@ -153,10 +170,15 @@ int main() {
|
|||
[wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
|
||||
|
||||
NSError *e = nil;
|
||||
((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
|
||||
BOOL compiled = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
|
||||
if (!compiled && !g_fp16_io) {
|
||||
printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
|
||||
g_fp16_io = 1;
|
||||
goto retry_compile;
|
||||
}
|
||||
((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
|
||||
|
||||
int ioBytes = CH * SP * 4; // fp32
|
||||
int ioBytes = CH * SP * (g_fp16_io ? 2 : 4);
|
||||
IOSurfaceRef ioIn = make_surface(ioBytes);
|
||||
IOSurfaceRef ioOut = make_surface(ioBytes);
|
||||
id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn);
|
||||
|
|
@ -174,8 +196,13 @@ int main() {
|
|||
|
||||
if (req) {
|
||||
IOSurfaceLock(ioIn, 0, NULL);
|
||||
float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
|
||||
for (int i = 0; i < CH*SP; i++) inp[i] = 1.0f;
|
||||
if (g_fp16_io) {
|
||||
_Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn);
|
||||
for (int i = 0; i < CH*SP; i++) inp[i] = (_Float16)1.0f;
|
||||
} else {
|
||||
float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
|
||||
for (int i = 0; i < CH*SP; i++) inp[i] = 1.0f;
|
||||
}
|
||||
IOSurfaceUnlock(ioIn, 0, NULL);
|
||||
|
||||
BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
|
||||
|
|
|
|||
|
|
@ -10,6 +10,8 @@
|
|||
static mach_timebase_info_data_t g_tb;
|
||||
// Scale a raw tick count by the `g_tb` timebase (numer/denom) and return the
// result in milliseconds.
// NOTE(review): assumes g_tb was populated (e.g. via mach_timebase_info) before
// use; a zero denom would produce inf/nan — verify initialization in main().
static double tb_ms(uint64_t t) {
    double elapsed = (double)t;      // widen first so the scaling happens in double
    elapsed = elapsed * g_tb.numer;  // ticks -> numerator-scaled
    elapsed = elapsed / g_tb.denom;  // -> nanoseconds
    return elapsed / 1e6;            // -> milliseconds
}
|
||||
|
||||
static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly
|
||||
|
||||
static IOSurfaceRef make_surface(size_t bytes) {
|
||||
return IOSurfaceCreate((__bridge CFDictionaryRef)@{
|
||||
(id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
|
||||
|
|
@ -38,37 +40,49 @@ int main() {
|
|||
for (int i = 0; i < CH*CH; i++) wp[i] = (_Float16)(0.01f * (i % 100 - 50));
|
||||
NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES];
|
||||
|
||||
NSString *mil = [NSString stringWithFormat:
|
||||
@"program(1.3)\n"
|
||||
"[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
|
||||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
|
||||
"{\"coremltools-version\", \"9.0\"}})]\n"
|
||||
"{\n"
|
||||
" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
|
||||
" string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
|
||||
" int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
|
||||
" string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n"
|
||||
" tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n"
|
||||
" tensor<fp16, [%d,%d,1,1]> W = const()[name=string(\"W\"), "
|
||||
"val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n"
|
||||
" tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
|
||||
"[name=string(\"conv\")];\n"
|
||||
" string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n"
|
||||
" tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n"
|
||||
" } -> (y);\n"
|
||||
"}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP];
|
||||
|
||||
NSDictionary *weights = @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}};
|
||||
NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding];
|
||||
NSFileManager *fm = [NSFileManager defaultManager];
|
||||
|
||||
printf("=== QoS Sweep: compile/load/eval with varying QoS ===\n");
|
||||
printf("Kernel: %dx%d conv, spatial=%d (%.1f MFLOPS)\n", CH, CH, SP, 2.0*CH*CH*SP/1e6);
|
||||
printf("%4s %10s %10s %10s %10s %s\n", "QoS", "Compile", "Load", "Eval(1)", "Eval(avg10)", "Status");
|
||||
|
||||
retry_mil:;
|
||||
NSString *mil;
|
||||
if (g_fp16_io) {
|
||||
mil = [NSString stringWithFormat:
|
||||
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
|
||||
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
|
||||
" tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
|
||||
" tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"
|
||||
" tensor<fp16, [%d,%d,1,1]> W = const()[name=tensor<string, []>(\"W\"), "
|
||||
"val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)"
|
||||
"[name=tensor<string, []>(\"conv\")];\n"
|
||||
" } -> (y);\n}\n", CH, SP, CH, CH, CH, CH, CH, SP];
|
||||
} else {
|
||||
mil = [NSString stringWithFormat:
|
||||
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
|
||||
" func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
|
||||
" tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
|
||||
" tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"
|
||||
" tensor<string, []> to16 = const()[name=tensor<string, []>(\"to16\"), val=tensor<string, []>(\"fp16\")];\n"
|
||||
" tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=tensor<string, []>(\"cin\")];\n"
|
||||
" tensor<fp16, [%d,%d,1,1]> W = const()[name=tensor<string, []>(\"W\"), "
|
||||
"val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n"
|
||||
" tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
|
||||
"[name=tensor<string, []>(\"conv\")];\n"
|
||||
" tensor<string, []> to32 = const()[name=tensor<string, []>(\"to32\"), val=tensor<string, []>(\"fp32\")];\n"
|
||||
" tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=tensor<string, []>(\"cout\")];\n"
|
||||
" } -> (y);\n}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP];
|
||||
}
|
||||
NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding];
|
||||
|
||||
unsigned int qos_values[] = {0, 1, 5, 10, 15, 17, 19, 21, 25, 31, 33, 40, 47, 50, 55, 60, 63};
|
||||
int n_qos = sizeof(qos_values)/sizeof(qos_values[0]);
|
||||
|
||||
|
|
@ -98,6 +112,12 @@ int main() {
|
|||
double cms = tb_ms(mach_absolute_time() - t0);
|
||||
|
||||
if (!cok) {
|
||||
if (!g_fp16_io) {
|
||||
printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
|
||||
g_fp16_io = 1;
|
||||
[fm removeItemAtPath:td error:nil];
|
||||
goto retry_mil;
|
||||
}
|
||||
printf("%4u %10s %10s %10s %10s COMPILE_FAIL\n", qos, "-", "-", "-", "-");
|
||||
[fm removeItemAtPath:td error:nil];
|
||||
continue;
|
||||
|
|
@ -115,7 +135,7 @@ int main() {
|
|||
continue;
|
||||
}
|
||||
|
||||
int ioBytes = CH * SP * 4;
|
||||
int ioBytes = CH * SP * (g_fp16_io ? 2 : 4);
|
||||
IOSurfaceRef ioIn = make_surface(ioBytes);
|
||||
IOSurfaceRef ioOut = make_surface(ioBytes);
|
||||
id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn);
|
||||
|
|
@ -125,8 +145,13 @@ int main() {
|
|||
@[wI], @[@0], @[wO], @[@0], nil, nil, @0);
|
||||
|
||||
IOSurfaceLock(ioIn, 0, NULL);
|
||||
float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
|
||||
for (int i = 0; i < CH*SP; i++) inp[i] = 0.5f;
|
||||
if (g_fp16_io) {
|
||||
_Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn);
|
||||
for (int i = 0; i < CH*SP; i++) inp[i] = (_Float16)0.5f;
|
||||
} else {
|
||||
float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
|
||||
for (int i = 0; i < CH*SP; i++) inp[i] = 0.5f;
|
||||
}
|
||||
IOSurfaceUnlock(ioIn, 0, NULL);
|
||||
|
||||
t0 = mach_absolute_time();
|
||||
|
|
|
|||
|
|
@ -34,30 +34,42 @@ static NSData *build_weight_blob(_Float16 *w, int rows, int cols) {
|
|||
return [NSData dataWithBytesNoCopy:b length:tot freeWhenDone:YES];
|
||||
}
|
||||
|
||||
// Generate MIL for a simple conv: fp32 in → cast fp16 → conv → cast fp32 out
|
||||
static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly
|
||||
|
||||
// Generate MIL for a simple conv (fp16 I/O when g_fp16_io, else fp32 with casts)
|
||||
// Build the MIL program text for a single 1x1 convolution kernel.
//
//   ch  number of input AND output channels (weight tensor is [ch, ch, 1, 1])
//   sp  size of the last (spatial) dimension; I/O tensors are [1, ch, 1, sp]
//
// Returns the program as an NSString. The weight constant is read from
// "@model_path/weights/weight.bin" starting at byte offset 64.
//
// The I/O element type is selected by the file-level g_fp16_io flag:
//   g_fp16_io != 0  fp16 in -> conv -> fp16 out (no cast ops; M1/M2 fallback)
//   g_fp16_io == 0  fp32 in -> cast fp16 -> conv -> cast fp32 -> fp32 out
//
// Both variants use the verbose scalar spelling (tensor<string, []>("x"),
// tensor<int32, []>(1)) with program(1.0)/<ios16>; the shorthand
// string("x")/int32(1) form with program(1.3)/<ios18> is intentionally not
// emitted because it is only accepted on M4.
static NSString *gen_mil(int ch, int sp) {
    if (g_fp16_io) {
        // 8 format arguments: input shape (2), W type (2), W value type (2), output shape (2).
        return [NSString stringWithFormat:
            @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
            " func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
            " tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n"
            " tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
            " tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
            " tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
            " tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"
            " tensor<fp16, [%d,%d,1,1]> W = const()[name=tensor<string, []>(\"W\"), "
            "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n"
            " tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)"
            "[name=tensor<string, []>(\"conv\")];\n"
            " } -> (y);\n}\n", ch, sp, ch, ch, ch, ch, ch, sp];
    }

    // 12 format arguments: input shape (2), x16 shape (2), W type (2),
    // W value type (2), y16 shape (2), output shape (2).
    return [NSString stringWithFormat:
        @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
        " func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
        " tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n"
        " tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
        " tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
        " tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
        " tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"
        " tensor<string, []> to16 = const()[name=tensor<string, []>(\"to16\"), val=tensor<string, []>(\"fp16\")];\n"
        " tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=tensor<string, []>(\"cin\")];\n"
        " tensor<fp16, [%d,%d,1,1]> W = const()[name=tensor<string, []>(\"W\"), "
        "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n"
        " tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
        "[name=tensor<string, []>(\"conv\")];\n"
        " tensor<string, []> to32 = const()[name=tensor<string, []>(\"to32\"), val=tensor<string, []>(\"fp32\")];\n"
        " tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=tensor<string, []>(\"cout\")];\n"
        " } -> (y);\n}\n", ch, sp, ch, sp, ch, ch, ch, ch, ch, sp, ch, sp];
}
|
||||
|
||||
int main() {
|
||||
|
|
@ -88,6 +100,9 @@ int main() {
|
|||
for (int i = 0; i < CH; i++) weightsB[i*CH+i] = (_Float16)3.0f;
|
||||
|
||||
NSData *wdataA = build_weight_blob(weightsA, CH, CH);
|
||||
NSFileManager *fm = [NSFileManager defaultManager];
|
||||
|
||||
retry_compile:;
|
||||
NSString *mil = gen_mil(CH, SP);
|
||||
NSDictionary *weights = @{
|
||||
@"@model_path/weights/weight.bin": @{@"offset": @0, @"data": wdataA}
|
||||
|
|
@ -103,13 +118,18 @@ int main() {
|
|||
id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
|
||||
id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
|
||||
NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
|
||||
NSFileManager *fm = [NSFileManager defaultManager];
|
||||
[fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
|
||||
[milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
|
||||
[wdataA writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
|
||||
|
||||
NSError *e = nil;
|
||||
BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
|
||||
if (!ok && !g_fp16_io) {
|
||||
printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
|
||||
g_fp16_io = 1;
|
||||
[fm removeItemAtPath:td error:nil];
|
||||
goto retry_compile;
|
||||
}
|
||||
if (!ok) { printf("FAIL: compile: %s\n", [[e description] UTF8String]); return 1; }
|
||||
ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
|
||||
if (!ok) { printf("FAIL: load: %s\n", [[e description] UTF8String]); return 1; }
|
||||
|
|
@ -117,9 +137,10 @@ int main() {
|
|||
printf(" Compile+load: %.1fms\n", compile_ms);
|
||||
printf(" tmpDir: %s\n", [td UTF8String]);
|
||||
|
||||
// Build request and IOSurfaces (fp32 I/O)
|
||||
int inBytes = CH * SP * 4; // fp32
|
||||
int outBytes = CH * SP * 4;
|
||||
// Build request and IOSurfaces
|
||||
size_t bpe = g_fp16_io ? 2 : 4;
|
||||
int inBytes = CH * SP * bpe;
|
||||
int outBytes = CH * SP * bpe;
|
||||
IOSurfaceRef ioIn = make_surface(inBytes);
|
||||
IOSurfaceRef ioOut = make_surface(outBytes);
|
||||
id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn);
|
||||
|
|
@ -130,10 +151,17 @@ int main() {
|
|||
|
||||
// Write input: channel c, spatial s = (c*SP + s + 1) * 0.01
|
||||
IOSurfaceLock(ioIn, 0, NULL);
|
||||
float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
|
||||
for (int c = 0; c < CH; c++)
|
||||
for (int s = 0; s < SP; s++)
|
||||
inp[c*SP+s] = (float)(c*SP + s + 1) * 0.01f;
|
||||
if (g_fp16_io) {
|
||||
_Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn);
|
||||
for (int c = 0; c < CH; c++)
|
||||
for (int s = 0; s < SP; s++)
|
||||
inp[c*SP+s] = (_Float16)((float)(c*SP + s + 1) * 0.01f);
|
||||
} else {
|
||||
float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
|
||||
for (int c = 0; c < CH; c++)
|
||||
for (int s = 0; s < SP; s++)
|
||||
inp[c*SP+s] = (float)(c*SP + s + 1) * 0.01f;
|
||||
}
|
||||
IOSurfaceUnlock(ioIn, 0, NULL);
|
||||
|
||||
// Eval with weights A
|
||||
|
|
@ -142,13 +170,17 @@ int main() {
|
|||
if (!ok) { printf("FAIL: eval: %s\n", e ? [[e description] UTF8String] : "?"); return 1; }
|
||||
|
||||
IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL);
|
||||
float *outA = (float*)IOSurfaceGetBaseAddress(ioOut);
|
||||
printf(" Output A[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outA[0], outA[1], outA[2], outA[3]);
|
||||
float *outA_copy = (float*)malloc(CH * SP * sizeof(float));
|
||||
if (g_fp16_io) {
|
||||
_Float16 *outA = (_Float16*)IOSurfaceGetBaseAddress(ioOut);
|
||||
for (int i = 0; i < CH*SP; i++) outA_copy[i] = (float)outA[i];
|
||||
} else {
|
||||
float *outA = (float*)IOSurfaceGetBaseAddress(ioOut);
|
||||
memcpy(outA_copy, outA, CH * SP * sizeof(float));
|
||||
}
|
||||
printf(" Output A[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outA_copy[0], outA_copy[1], outA_copy[2], outA_copy[3]);
|
||||
printf(" Output A[%d..%d]: [%.4f, %.4f, %.4f, %.4f]\n", CH*SP-4, CH*SP-1,
|
||||
outA[CH*SP-4], outA[CH*SP-3], outA[CH*SP-2], outA[CH*SP-1]);
|
||||
// Save copy
|
||||
float *outA_copy = (float*)malloc(outBytes);
|
||||
memcpy(outA_copy, outA, outBytes);
|
||||
outA_copy[CH*SP-4], outA_copy[CH*SP-3], outA_copy[CH*SP-2], outA_copy[CH*SP-1]);
|
||||
IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL);
|
||||
|
||||
// === Step 3: Overwrite weight file with B, unload+load ===
|
||||
|
|
@ -189,10 +221,17 @@ int main() {
|
|||
|
||||
// Re-write same input
|
||||
IOSurfaceLock(ioIn, 0, NULL);
|
||||
inp = (float*)IOSurfaceGetBaseAddress(ioIn);
|
||||
for (int c = 0; c < CH; c++)
|
||||
for (int s = 0; s < SP; s++)
|
||||
inp[c*SP+s] = (float)(c*SP + s + 1) * 0.01f;
|
||||
if (g_fp16_io) {
|
||||
_Float16 *inp2 = (_Float16*)IOSurfaceGetBaseAddress(ioIn);
|
||||
for (int c = 0; c < CH; c++)
|
||||
for (int s = 0; s < SP; s++)
|
||||
inp2[c*SP+s] = (_Float16)((float)(c*SP + s + 1) * 0.01f);
|
||||
} else {
|
||||
float *inp2 = (float*)IOSurfaceGetBaseAddress(ioIn);
|
||||
for (int c = 0; c < CH; c++)
|
||||
for (int s = 0; s < SP; s++)
|
||||
inp2[c*SP+s] = (float)(c*SP + s + 1) * 0.01f;
|
||||
}
|
||||
IOSurfaceUnlock(ioIn, 0, NULL);
|
||||
|
||||
// Eval with (possibly reloaded) weights B
|
||||
|
|
@ -201,16 +240,23 @@ int main() {
|
|||
if (!ok) { printf("FAIL: eval after reload: %s\n", e ? [[e description] UTF8String] : "?"); return 1; }
|
||||
|
||||
IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL);
|
||||
float *outB = (float*)IOSurfaceGetBaseAddress(ioOut);
|
||||
printf(" Output B[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outB[0], outB[1], outB[2], outB[3]);
|
||||
float *outB_f = (float*)malloc(CH * SP * sizeof(float));
|
||||
if (g_fp16_io) {
|
||||
_Float16 *outB = (_Float16*)IOSurfaceGetBaseAddress(ioOut);
|
||||
for (int i = 0; i < CH*SP; i++) outB_f[i] = (float)outB[i];
|
||||
} else {
|
||||
float *outB = (float*)IOSurfaceGetBaseAddress(ioOut);
|
||||
memcpy(outB_f, outB, CH * SP * sizeof(float));
|
||||
}
|
||||
printf(" Output B[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outB_f[0], outB_f[1], outB_f[2], outB_f[3]);
|
||||
printf(" Output B[%d..%d]: [%.4f, %.4f, %.4f, %.4f]\n", CH*SP-4, CH*SP-1,
|
||||
outB[CH*SP-4], outB[CH*SP-3], outB[CH*SP-2], outB[CH*SP-1]);
|
||||
outB_f[CH*SP-4], outB_f[CH*SP-3], outB_f[CH*SP-2], outB_f[CH*SP-1]);
|
||||
|
||||
// Check: did the output change?
|
||||
bool changed = false;
|
||||
float max_diff = 0;
|
||||
for (int i = 0; i < CH*SP; i++) {
|
||||
float d = fabsf(outB[i] - outA_copy[i]);
|
||||
float d = fabsf(outB_f[i] - outA_copy[i]);
|
||||
if (d > max_diff) max_diff = d;
|
||||
if (d > 0.001f) changed = true;
|
||||
}
|
||||
|
|
@ -219,11 +265,12 @@ int main() {
|
|||
float max_3x_err = 0;
|
||||
for (int i = 0; i < CH*SP; i++) {
|
||||
float expected = outA_copy[i] * 3.0f;
|
||||
float err = fabsf(outB[i] - expected);
|
||||
float err = fabsf(outB_f[i] - expected);
|
||||
if (err > max_3x_err) max_3x_err = err;
|
||||
if (err > 0.1f) correct_3x = false;
|
||||
}
|
||||
IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL);
|
||||
free(outB_f);
|
||||
|
||||
printf("\n=== RESULT ===\n");
|
||||
printf(" Max A-B diff: %.6f\n", max_diff);
|
||||
|
|
|
|||
|
|
@ -59,25 +59,43 @@ static NSData *build_blob_transposed(const float *w, int rows, int cols) {
|
|||
return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
|
||||
}
|
||||
|
||||
static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly
|
||||
|
||||
// Build the MIL program text for a 1x1 convolution mapping in_ch -> out_ch channels.
//
//   in_ch   input channel count;  input tensor is  [1, in_ch, 1, sp]
//   out_ch  output channel count; output tensor is [1, out_ch, 1, sp]
//   sp      size of the last (spatial) dimension
//
// Returns the program as an NSString. The weight constant has shape
// [out_ch, in_ch, 1, 1] and is read from "@model_path/weights/weight.bin"
// starting at byte offset 64.
//
// The I/O element type follows the file-level g_fp16_io flag:
//   g_fp16_io != 0  fp16 I/O, no cast ops (M1/M2 compatible)
//   g_fp16_io == 0  fp32 I/O, cast to/from fp16 around the conv (M4+ native)
//
// Both variants use program(1.0)/<ios16> and the verbose tensor<T, []> scalar
// spelling; the program(1.3)/<ios18> shorthand (string("x"), int32(1)) is not
// emitted because it only works on M4.
static NSString *gen_conv_mil(int in_ch, int out_ch, int sp) {
    if (g_fp16_io) {
        // fp16 I/O path — no cast ops (M1/M2 compatible).
        // 8 format arguments: input shape (2), W type (2), W value type (2), output shape (2).
        return [NSString stringWithFormat:
            @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
            " func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
            " tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
            "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
            " tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
            " tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
            " tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
            " tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
            " tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
            " tensor<fp16, [1, %d, 1, %d]> y = conv(dilations = dl, groups = gr, pad = pd, "
            "pad_type = pt, strides = st, weight = W, x = x)[name = tensor<string, []>(\"cv\")];\n"
            " } -> (y);\n}\n",
            in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp];
    }

    // fp32 I/O path — cast to/from fp16 internally (M4+ native).
    // 12 format arguments: input shape (2), x16 shape (2), W type (2),
    // W value type (2), y16 shape (2), output shape (2).
    return [NSString stringWithFormat:
        @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
        " func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
        " tensor<string, []> d1 = const()[name = tensor<string, []>(\"d1\"), val = tensor<string, []>(\"fp16\")];\n"
        " tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = tensor<string, []>(\"cx\")];\n"
        " tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
        " tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
        " tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
        " tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
        " tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
        " tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
        " tensor<fp16, [1, %d, 1, %d]> y16 = conv(dilations = dl, groups = gr, pad = pd, "
        "pad_type = pt, strides = st, weight = W, x = x16)[name = tensor<string, []>(\"cv\")];\n"
        " tensor<string, []> d2 = const()[name = tensor<string, []>(\"d2\"), val = tensor<string, []>(\"fp32\")];\n"
        " tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = y16)[name = tensor<string, []>(\"co\")];\n"
        " } -> (y);\n}\n",
        in_ch, sp, in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp, out_ch, sp];
}
|
||||
|
|
@ -106,10 +124,19 @@ static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp)
|
|||
[milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
|
||||
[blob writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
|
||||
NSError *e = nil;
|
||||
if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) return NULL;
|
||||
if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
|
||||
if (!g_fp16_io) {
|
||||
// M1/M2 ANE doesn't support cast op — retry with fp16 I/O
|
||||
printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
|
||||
g_fp16_io = 1;
|
||||
return compile_kern_with_blob(blob, in_ch, out_ch, sp);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) return NULL;
|
||||
__sync_fetch_and_add(&g_compile_count, 1);
|
||||
size_t inB = in_ch * sp * 4, outB = out_ch * sp * 4;
|
||||
size_t bpe = g_fp16_io ? 2 : 4;
|
||||
size_t inB = in_ch * sp * bpe, outB = out_ch * sp * bpe;
|
||||
IOSurfaceRef ioI = make_surface(inB), ioO = make_surface(outB);
|
||||
id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI);
|
||||
id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO);
|
||||
|
|
@ -140,27 +167,43 @@ static void free_kern(Kern *k) {
|
|||
}
|
||||
|
||||
static void ane_eval_k(Kern *k, const float *in, float *out, int in_ch, int out_ch, int sp) {
|
||||
float *tmp = (float*)malloc(in_ch * sp * sizeof(float));
|
||||
for (int t = 0; t < sp; t++)
|
||||
for (int c = 0; c < in_ch; c++)
|
||||
tmp[c*sp + t] = in[t*in_ch + c];
|
||||
// Transpose [S,C] -> [C,S] and write to IOSurface
|
||||
IOSurfaceLock(k->ioIn, 0, NULL);
|
||||
memcpy(IOSurfaceGetBaseAddress(k->ioIn), tmp, in_ch * sp * sizeof(float));
|
||||
void *base_in = IOSurfaceGetBaseAddress(k->ioIn);
|
||||
if (g_fp16_io) {
|
||||
_Float16 *dst = (_Float16*)base_in;
|
||||
for (int t = 0; t < sp; t++)
|
||||
for (int c = 0; c < in_ch; c++)
|
||||
dst[c*sp + t] = (_Float16)in[t*in_ch + c];
|
||||
} else {
|
||||
float *dst = (float*)base_in;
|
||||
for (int t = 0; t < sp; t++)
|
||||
for (int c = 0; c < in_ch; c++)
|
||||
dst[c*sp + t] = in[t*in_ch + c];
|
||||
}
|
||||
IOSurfaceUnlock(k->ioIn, 0, NULL);
|
||||
free(tmp);
|
||||
|
||||
NSError *e = nil;
|
||||
id mdl = (__bridge id)k->model;
|
||||
id req = (__bridge id)k->request;
|
||||
((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
|
||||
mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
|
||||
float *tmp2 = (float*)malloc(out_ch * sp * sizeof(float));
|
||||
|
||||
// Read output, transpose [C,S] -> [S,C]
|
||||
IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||||
memcpy(tmp2, IOSurfaceGetBaseAddress(k->ioOut), out_ch * sp * sizeof(float));
|
||||
void *base_out = IOSurfaceGetBaseAddress(k->ioOut);
|
||||
if (g_fp16_io) {
|
||||
_Float16 *src = (_Float16*)base_out;
|
||||
for (int t = 0; t < sp; t++)
|
||||
for (int c = 0; c < out_ch; c++)
|
||||
out[t*out_ch + c] = (float)src[c*sp + t];
|
||||
} else {
|
||||
float *src = (float*)base_out;
|
||||
for (int t = 0; t < sp; t++)
|
||||
for (int c = 0; c < out_ch; c++)
|
||||
out[t*out_ch + c] = src[c*sp + t];
|
||||
}
|
||||
IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||||
for (int t = 0; t < sp; t++)
|
||||
for (int c = 0; c < out_ch; c++)
|
||||
out[t*out_ch + c] = tmp2[c*sp + t];
|
||||
free(tmp2);
|
||||
}
|
||||
|
||||
// === Checkpoint: save/restore training state for exec() restart ===
|
||||
|
|
@ -173,6 +216,7 @@ typedef struct {
|
|||
float lr;
|
||||
double cum_compile_ms, cum_train_ms, cum_wall_ms;
|
||||
int cum_steps, cum_batches;
|
||||
int fp16_io; // persisted: 1 if ANE needs fp16 I/O (M1/M2)
|
||||
} CkptHeader;
|
||||
|
||||
static void save_checkpoint(const char *path, int step, float loss,
|
||||
|
|
@ -180,7 +224,7 @@ static void save_checkpoint(const char *path, int step, float loss,
|
|||
const float *W1, const float *W2,
|
||||
double cc, double ct, double cw, int cs, int cb) {
|
||||
FILE *f = fopen(path, "wb");
|
||||
CkptHeader hdr = {step, loss, D, H, S, total_steps, lr, cc, ct, cw, cs, cb};
|
||||
CkptHeader hdr = {step, loss, D, H, S, total_steps, lr, cc, ct, cw, cs, cb, g_fp16_io};
|
||||
fwrite(&hdr, sizeof(hdr), 1, f);
|
||||
fwrite(W1, sizeof(float), H * D, f);
|
||||
fwrite(W2, sizeof(float), D * H, f);
|
||||
|
|
@ -241,8 +285,9 @@ int main(int argc, char *argv[]) {
|
|||
start_step = hdr.step;
|
||||
total_steps = hdr.total_steps;
|
||||
lr = hdr.lr;
|
||||
g_fp16_io = hdr.fp16_io;
|
||||
resuming = true;
|
||||
printf("[RESUMED at step %d, loss=%.6f, compiles reset]\n", start_step, hdr.loss);
|
||||
printf("[RESUMED at step %d, loss=%.6f, fp16_io=%d, compiles reset]\n", start_step, hdr.loss, g_fp16_io);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -59,34 +59,50 @@ static NSData *build_blob_transposed(const float *w, int rows, int cols) {
|
|||
return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
|
||||
}
|
||||
|
||||
static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly
|
||||
|
||||
static NSString *gen_conv_mil(int in_ch, int out_ch, int sp) {
|
||||
if (g_fp16_io) {
|
||||
return [NSString stringWithFormat:
|
||||
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
|
||||
" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> y = conv(dilations = dl, groups = gr, pad = pd, "
|
||||
"pad_type = pt, strides = st, weight = W, x = x)[name = tensor<string, []>(\"cv\")];\n"
|
||||
" } -> (y);\n}\n",
|
||||
in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp];
|
||||
}
|
||||
return [NSString stringWithFormat:
|
||||
@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
|
||||
"{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
|
||||
"{\"coremltools-version\", \"9.0\"}})]\n{\n"
|
||||
" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
|
||||
" string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
|
||||
" string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
|
||||
@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
|
||||
" func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
|
||||
" tensor<string, []> d1 = const()[name = tensor<string, []>(\"d1\"), val = tensor<string, []>(\"fp16\")];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = tensor<string, []>(\"cx\")];\n"
|
||||
" tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
|
||||
"val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
|
||||
" tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
|
||||
" tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
|
||||
" tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
|
||||
" tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
|
||||
" tensor<fp16, [1, %d, 1, %d]> y16 = conv(dilations = dl, groups = gr, pad = pd, "
|
||||
"pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n"
|
||||
" string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
|
||||
" tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n"
|
||||
"pad_type = pt, strides = st, weight = W, x = x16)[name = tensor<string, []>(\"cv\")];\n"
|
||||
" tensor<string, []> d2 = const()[name = tensor<string, []>(\"d2\"), val = tensor<string, []>(\"fp32\")];\n"
|
||||
" tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = y16)[name = tensor<string, []>(\"co\")];\n"
|
||||
" } -> (y);\n}\n",
|
||||
in_ch, sp, in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp, out_ch, sp];
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
id model;
|
||||
void *model; // CFBridgingRetain'd _ANEInMemoryModel
|
||||
IOSurfaceRef ioIn, ioOut;
|
||||
id request;
|
||||
NSString *tmpDir;
|
||||
void *request; // CFBridgingRetain'd _ANERequest
|
||||
void *tmpDir; // CFBridgingRetain'd NSString
|
||||
} Kern;
|
||||
|
||||
static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp) {
|
||||
|
|
@ -103,9 +119,17 @@ static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp)
|
|||
[milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
|
||||
[blob writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
|
||||
NSError *e = nil;
|
||||
if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) return NULL;
|
||||
if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
|
||||
if (!g_fp16_io) {
|
||||
printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
|
||||
g_fp16_io = 1;
|
||||
return compile_kern_with_blob(blob, in_ch, out_ch, sp);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) return NULL;
|
||||
size_t inB = in_ch * sp * 4, outB = out_ch * sp * 4;
|
||||
size_t bpe = g_fp16_io ? 2 : 4;
|
||||
size_t inB = in_ch * sp * bpe, outB = out_ch * sp * bpe;
|
||||
IOSurfaceRef ioI = make_surface(inB), ioO = make_surface(outB);
|
||||
id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI);
|
||||
id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO);
|
||||
|
|
@ -113,40 +137,60 @@ static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp)
|
|||
@selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
|
||||
@[wI], @[@0], @[wO], @[@0], nil, nil, @0);
|
||||
Kern *k = calloc(1, sizeof(Kern));
|
||||
k->model = mdl; k->ioIn = ioI; k->ioOut = ioO; k->request = req; k->tmpDir = td;
|
||||
k->model = (void*)CFBridgingRetain(mdl);
|
||||
k->ioIn = ioI; k->ioOut = ioO;
|
||||
k->request = (void*)CFBridgingRetain(req);
|
||||
k->tmpDir = (void*)CFBridgingRetain(td);
|
||||
return k;
|
||||
}
|
||||
|
||||
static void free_kern(Kern *k) {
|
||||
if (!k) return;
|
||||
id mdl = (__bridge id)k->model;
|
||||
NSError *e = nil;
|
||||
((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(k->model, @selector(unloadWithQoS:error:), 21, &e);
|
||||
((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
|
||||
CFRelease(k->ioIn); CFRelease(k->ioOut);
|
||||
[[NSFileManager defaultManager] removeItemAtPath:k->tmpDir error:nil];
|
||||
NSString *td = (__bridge id)k->tmpDir;
|
||||
[[NSFileManager defaultManager] removeItemAtPath:td error:nil];
|
||||
CFRelease(k->model); CFRelease(k->request); CFRelease(k->tmpDir);
|
||||
free(k);
|
||||
}
|
||||
|
||||
// ANE eval: input [S, in_ch] row-major ↔ [in_ch, S] channels-first
|
||||
static void ane_eval(Kern *k, const float *in, float *out, int in_ch, int out_ch, int sp) {
|
||||
float *tmp = (float*)malloc(in_ch * sp * sizeof(float));
|
||||
for (int t = 0; t < sp; t++)
|
||||
for (int c = 0; c < in_ch; c++)
|
||||
tmp[c*sp + t] = in[t*in_ch + c];
|
||||
IOSurfaceLock(k->ioIn, 0, NULL);
|
||||
memcpy(IOSurfaceGetBaseAddress(k->ioIn), tmp, in_ch * sp * sizeof(float));
|
||||
void *base_in = IOSurfaceGetBaseAddress(k->ioIn);
|
||||
if (g_fp16_io) {
|
||||
_Float16 *dst = (_Float16*)base_in;
|
||||
for (int t = 0; t < sp; t++)
|
||||
for (int c = 0; c < in_ch; c++)
|
||||
dst[c*sp + t] = (_Float16)in[t*in_ch + c];
|
||||
} else {
|
||||
float *dst = (float*)base_in;
|
||||
for (int t = 0; t < sp; t++)
|
||||
for (int c = 0; c < in_ch; c++)
|
||||
dst[c*sp + t] = in[t*in_ch + c];
|
||||
}
|
||||
IOSurfaceUnlock(k->ioIn, 0, NULL);
|
||||
free(tmp);
|
||||
NSError *e = nil;
|
||||
id mdl = (__bridge id)k->model;
|
||||
id req = (__bridge id)k->request;
|
||||
((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
|
||||
k->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, k->request, &e);
|
||||
float *tmp2 = (float*)malloc(out_ch * sp * sizeof(float));
|
||||
mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
|
||||
IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||||
memcpy(tmp2, IOSurfaceGetBaseAddress(k->ioOut), out_ch * sp * sizeof(float));
|
||||
void *base_out = IOSurfaceGetBaseAddress(k->ioOut);
|
||||
if (g_fp16_io) {
|
||||
_Float16 *src = (_Float16*)base_out;
|
||||
for (int t = 0; t < sp; t++)
|
||||
for (int c = 0; c < out_ch; c++)
|
||||
out[t*out_ch + c] = (float)src[c*sp + t];
|
||||
} else {
|
||||
float *src = (float*)base_out;
|
||||
for (int t = 0; t < sp; t++)
|
||||
for (int c = 0; c < out_ch; c++)
|
||||
out[t*out_ch + c] = src[c*sp + t];
|
||||
}
|
||||
IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
|
||||
for (int t = 0; t < sp; t++)
|
||||
for (int c = 0; c < out_ch; c++)
|
||||
out[t*out_ch + c] = tmp2[c*sp + t];
|
||||
free(tmp2);
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue