diff --git a/inmem_peak.m b/inmem_peak.m index 87b8163..3334d01 100644 --- a/inmem_peak.m +++ b/inmem_peak.m @@ -8,6 +8,7 @@ static mach_timebase_info_data_t g_tb; static double ticksToMs(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } +static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly NSData *buildWeightBlob(int ch, int depth) { NSUInteger wsize = ch * ch * 2; @@ -27,28 +28,45 @@ NSData *buildWeightBlob(int ch, int depth) { NSString *genMIL(int ch, int sp, int depth) { NSMutableString *m = [NSMutableString string]; - [m appendString:@"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, {\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, {\"coremltools-version\", \"9.0\"}})]\n{\n"]; - [m appendFormat:@" func main(tensor x) {\n", ch, sp]; - [m appendString:@" string c_pad_type_0 = const()[name = string(\"c_pad_type_0\"), val = string(\"valid\")];\n" - @" tensor c_strides_0 = const()[name = string(\"c_strides_0\"), val = tensor([1, 1])];\n" - @" tensor c_pad_0 = const()[name = string(\"c_pad_0\"), val = tensor([0, 0, 0, 0])];\n" - @" tensor c_dilations_0 = const()[name = string(\"c_dilations_0\"), val = tensor([1, 1])];\n" - @" int32 c_groups_0 = const()[name = string(\"c_groups_0\"), val = int32(1)];\n" - @" string x_to_fp16_dtype_0 = const()[name = string(\"x_to_fp16_dtype_0\"), val = string(\"fp16\")];\n"]; - [m appendFormat:@" tensor x_to_fp16 = cast(dtype = x_to_fp16_dtype_0, x = x)[name = string(\"cast_in\")];\n", ch, sp]; + [m appendString:@"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"]; + if (g_fp16_io) { + // fp16 I/O path — no cast ops (M1/M2 compatible) + [m appendFormat:@" func main(tensor x) {\n", ch, sp]; + } else { + // fp32 I/O path — cast to/from fp16 internally (M4+ native) + [m appendFormat:@" func main(tensor x) {\n", ch, sp]; + } + [m appendString: + @" tensor c_pad_type_0 = const()[name = tensor(\"c_pad_type_0\"), val = tensor(\"valid\")];\n" + @" tensor c_strides_0 = const()[name = tensor(\"c_strides_0\"), val = tensor([1, 1])];\n" + @" tensor c_pad_0 = const()[name = tensor(\"c_pad_0\"), val = tensor([0, 0, 0, 0])];\n" + @" tensor c_dilations_0 = const()[name = tensor(\"c_dilations_0\"), val = tensor([1, 1])];\n" + @" tensor c_groups_0 = const()[name = tensor(\"c_groups_0\"), val = tensor(1)];\n"]; + NSString *prev; + if (g_fp16_io) { + prev = @"x"; + } else { + [m appendString:@" tensor x_to_fp16_dtype_0 = const()[name = tensor(\"x_to_fp16_dtype_0\"), val = tensor(\"fp16\")];\n"]; + [m appendFormat:@" tensor x_to_fp16 = cast(dtype = x_to_fp16_dtype_0, x = x)[name = tensor(\"cast_in\")];\n", ch, sp]; + prev = @"x_to_fp16"; + } NSUInteger cs = 64 + ch*ch*2; - NSString *prev = @"x_to_fp16"; for (int i = 0; i < depth; i++) { - [m appendFormat:@" tensor W%d = const()[name = string(\"W%d\"), val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n", + [m appendFormat:@" tensor W%d = const()[name = tensor(\"W%d\"), val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(%lu)))];\n", ch, ch, i, i, ch, ch, (unsigned long)(64 + i*cs)]; NSString *out = [NSString stringWithFormat:@"c%d", i]; - [m appendFormat:@" tensor %@ = conv(dilations = c_dilations_0, groups = c_groups_0, pad = c_pad_0, pad_type = c_pad_type_0, strides = c_strides_0, weight = W%d, x = %@)[name = string(\"%@\")];\n", + [m appendFormat:@" tensor %@ = conv(dilations = c_dilations_0, groups = c_groups_0, pad = c_pad_0, pad_type = c_pad_type_0, strides = c_strides_0, weight = W%d, x = %@)[name = tensor(\"%@\")];\n", ch, sp, out, i, prev, out]; prev = out; } - [m appendString:@" string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"]; - [m appendFormat:@" tensor c = cast(dtype = to_fp32, x = %@)[name = string(\"cast_out\")];\n", ch, sp, prev]; - [m appendString:@" } -> (c);\n}\n"]; + if (g_fp16_io) { + [m appendFormat:@" tensor c = identity(x = %@)[name = tensor(\"out\")];\n", ch, sp, prev]; + [m appendString:@" } -> (c);\n}\n"]; + } else { + [m appendString:@" tensor to_fp32 = const()[name = tensor(\"to_fp32\"), val = tensor(\"fp32\")];\n"]; + [m appendFormat:@" tensor c = cast(dtype = to_fp32, x = %@)[name = tensor(\"cast_out\")];\n", ch, sp, prev]; + [m appendString:@" } -> (c);\n}\n"]; + } return m; } @@ -68,9 +86,18 @@ double bench(int ch, int sp, int depth) { [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil]; [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; [wb writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; - if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(compileWithQoS:options:error:),21,@{},&e)){[fm removeItemAtPath:td error:nil];return -3;} + if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(compileWithQoS:options:error:),21,@{},&e)){ + [fm removeItemAtPath:td error:nil]; + if (!g_fp16_io) { + printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n"); + g_fp16_io = 1; + return bench(ch, sp, depth); + } + return -3; + } if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(loadWithQoS:options:error:),21,@{},&e)){[fm removeItemAtPath:td error:nil];return -4;} - NSUInteger bytes=ch*sp*4; + size_t bpe = g_fp16_io ? 2 : 4; + NSUInteger bytes=ch*sp*bpe; IOSurfaceRef ioI=IOSurfaceCreate((__bridge CFDictionaryRef)@{(id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,(id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),(id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0}); IOSurfaceRef ioO=IOSurfaceCreate((__bridge CFDictionaryRef)@{(id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,(id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),(id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0}); id wI=((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO,@selector(objectWithIOSurface:),ioI);