// io.h — IOSurface helpers, NEON conversion, kernel compile/eval
#pragma once
#include "config.h"

static IOSurfaceRef make_surface(size_t bytes) {
    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
        (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes),
        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0});
}

// Blob builders for const weights (mask, rms)
static NSData *build_blob(const float *w, int rows, int cols) {
    int ws=rows*cols*2, tot=128+ws;
    uint8_t *b=(uint8_t*)calloc(tot,1);
    b[0]=1;b[4]=2;b[64]=0xEF;b[65]=0xBE;b[66]=0xAD;b[67]=0xDE;b[68]=1;
    *(uint32_t*)(b+72)=ws;*(uint32_t*)(b+80)=128;
    _Float16 *fp16=(_Float16*)(b+128);
    for(int i=0;i<rows*cols;i++) fp16[i]=(_Float16)w[i];
    return [NSData dataWithBytesNoCopy:b length:tot freeWhenDone:YES];
}
static NSData *build_blob_fp16(_Float16 *d, int cnt) {
    int ws=cnt*2, tot=128+ws;
    uint8_t *b=(uint8_t*)calloc(tot,1);
    b[0]=1;b[4]=2;b[64]=0xEF;b[65]=0xBE;b[66]=0xAD;b[67]=0xDE;b[68]=1;
    *(uint32_t*)(b+72)=ws;*(uint32_t*)(b+80)=128;
    memcpy(b+128,d,ws);
    return [NSData dataWithBytesNoCopy:b length:tot freeWhenDone:YES];
}

// NEON vectorized conversion
static void cvt_f16_f32(float *dst, const _Float16 *src, int n) {
    int i = 0;
    for (; i+7 < n; i += 8) {
        float16x8_t h = vld1q_f16((const __fp16*)(src+i));
        vst1q_f32(dst+i,   vcvt_f32_f16(vget_low_f16(h)));
        vst1q_f32(dst+i+4, vcvt_f32_f16(vget_high_f16(h)));
    }
    for (; i < n; i++) dst[i] = (float)src[i];
}
static void cvt_f32_f16(_Float16 *dst, const float *src, int n) {
    int i = 0;
    for (; i+7 < n; i += 8) {
        float16x8_t h = vcombine_f16(vcvt_f16_f32(vld1q_f32(src+i)),
                                      vcvt_f16_f32(vld1q_f32(src+i+4)));
        vst1q_f16((__fp16*)(dst+i), h);
    }
    for (; i < n; i++) dst[i] = (_Float16)src[i];
}

// IOSurface I/O (channel-first [C,S] layout, fp16 on surface)
static void io_write_fp16(IOSurfaceRef s, const float *data, int channels, int sp) {
    IOSurfaceLock(s, 0, NULL);
    cvt_f32_f16((_Float16*)IOSurfaceGetBaseAddress(s), data, channels * sp);
    IOSurfaceUnlock(s, 0, NULL);
}
static void io_read_fp16(IOSurfaceRef s, float *data, int ch_off, int channels, int sp) {
    IOSurfaceLock(s, kIOSurfaceLockReadOnly, NULL);
    cvt_f16_f32(data, (_Float16*)IOSurfaceGetBaseAddress(s) + ch_off * sp, channels * sp);
    IOSurfaceUnlock(s, kIOSurfaceLockReadOnly, NULL);
}
static void io_copy(IOSurfaceRef dst, int dst_ch, IOSurfaceRef src, int src_ch, int channels, int sp) {
    IOSurfaceLock(dst, 0, NULL);
    IOSurfaceLock(src, kIOSurfaceLockReadOnly, NULL);
    memcpy((_Float16*)IOSurfaceGetBaseAddress(dst) + dst_ch*sp,
           (_Float16*)IOSurfaceGetBaseAddress(src) + src_ch*sp,
           channels * sp * sizeof(_Float16));
    IOSurfaceUnlock(src, kIOSurfaceLockReadOnly, NULL);
    IOSurfaceUnlock(dst, 0, NULL);
}
static void io_write_fp16_at(IOSurfaceRef s, int ch_off, const float *data, int channels, int sp) {
    IOSurfaceLock(s, 0, NULL);
    cvt_f32_f16((_Float16*)IOSurfaceGetBaseAddress(s) + ch_off * sp, data, channels * sp);
    IOSurfaceUnlock(s, 0, NULL);
}

// fp16 IOSurface I/O (for dynamic matmul kernels with fp16 input/output)
// Layout: [1, IC, 1, SP] where SP = SEQ + OC
// Write activations at sp[0:SEQ] and weights at sp[SEQ:SEQ+OC]
static void io_write_dyn(IOSurfaceRef s, const float *act, int ic, int seq,
                         const float *W, int oc) {
    int sp = seq + oc;
    IOSurfaceLock(s, 0, NULL);
    _Float16 *buf = (_Float16*)IOSurfaceGetBaseAddress(s);
    for (int d = 0; d < ic; d++) {
        cvt_f32_f16(buf + d*sp, act + d*seq, seq);
        cvt_f32_f16(buf + d*sp + seq, W + d*oc, oc);
    }
    IOSurfaceUnlock(s, 0, NULL);
}

// Read output from dynamic matmul kernel: [1, OC, 1, SEQ]
static void io_read_dyn(IOSurfaceRef s, float *out, int oc, int seq) {
    IOSurfaceLock(s, kIOSurfaceLockReadOnly, NULL);
    cvt_f16_f32(out, (_Float16*)IOSurfaceGetBaseAddress(s), oc * seq);
    IOSurfaceUnlock(s, kIOSurfaceLockReadOnly, NULL);
}

// Compile MIL to ANE kernel
static Kern *compile_kern_mil_w(NSString *mil, NSDictionary *weights, int ic_bytes, int oc_bytes) {
    @autoreleasepool {
    NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
    id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, weights, nil);
    if (!desc) { printf("  [compile] desc=NULL\n"); return NULL; }
    id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
    id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
    NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
    [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
    [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
    for (NSString *path in weights) {
        NSString *rel = [path stringByReplacingOccurrencesOfString:@"@model_path/" withString:@""];
        [weights[path][@"data"] writeToFile:[td stringByAppendingPathComponent:rel] atomically:YES];
    }
    NSError *e = nil;
    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
        printf("  [compile] FAIL: %s\n", e ? [[e description] UTF8String] : "no error"); return NULL;
    }
    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) {
        printf("  [compile] load FAIL\n"); return NULL;
    }
    __sync_fetch_and_add(&g_compile_count, 1);
    Kern *k = (Kern*)calloc(1, sizeof(Kern));
    k->model = (void*)CFBridgingRetain(mdl);
    k->ioIn = make_surface(ic_bytes);
    k->ioOut = make_surface(oc_bytes);
    id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), k->ioIn);
    id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), k->ioOut);
    k->request = (void*)CFBridgingRetain(((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
        @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
        @[wI], @[@0], @[wO], @[@0], nil, nil, @0));
    k->tmpDir = (void*)CFBridgingRetain(td);
    return k;
    }
}
static void free_kern(Kern *k) {
    if (!k) return;
    id mdl = (__bridge id)k->model; NSError *e = nil;
    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
    CFRelease(k->ioIn); CFRelease(k->ioOut);
    [[NSFileManager defaultManager] removeItemAtPath:(__bridge id)k->tmpDir error:nil];
    CFRelease(k->model); CFRelease(k->request); CFRelease(k->tmpDir);
    free(k);
}
static void ane_eval(Kern *k) {
    id mdl = (__bridge id)k->model; id req = (__bridge id)k->request; NSError *e = nil;
    ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
}

// Evaluate with a per-layer request (different ioIn, same model)
static void ane_eval_req(Kern *k, void *request) {
    id mdl = (__bridge id)k->model; id req = (__bridge id)request; NSError *e = nil;
    ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
}

// Create an ANE request binding a custom ioIn to a kernel's model+ioOut
static void *make_request(Kern *k, IOSurfaceRef ioIn) {
    id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn);
    id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), k->ioOut);
    id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
        @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
        @[wI], @[@0], @[wO], @[@0], nil, nil, @0);
    return (void*)CFBridgingRetain(req);
}

// ===== Per-layer weight staging (write once, reuse across steps) =====
// All surfaces are now fp16 — staging converts fp32 weights to fp16

// sdpaFwd: [1, DIM, 1, SEQ+4*DIM] fp16 — weights at sp[SEQ:]
static void stage_sdpa_fwd_weights(IOSurfaceRef s, const float *Wq, const float *Wk,
                                    const float *Wv, const float *Wo) {
    IOSurfaceLock(s, 0, NULL);
    _Float16 *buf = (_Float16*)IOSurfaceGetBaseAddress(s);
    int sp = SEQ + 4*DIM;
    for (int d = 0; d < DIM; d++) {
        cvt_f32_f16(buf + d*sp + SEQ,       Wq + d*DIM, DIM);
        cvt_f32_f16(buf + d*sp + SEQ+DIM,   Wk + d*DIM, DIM);
        cvt_f32_f16(buf + d*sp + SEQ+2*DIM, Wv + d*DIM, DIM);
        cvt_f32_f16(buf + d*sp + SEQ+3*DIM, Wo + d*DIM, DIM);
    }
    IOSurfaceUnlock(s, 0, NULL);
}
static void write_sdpa_fwd_acts(IOSurfaceRef s, const float *xnorm) {
    IOSurfaceLock(s, 0, NULL);
    _Float16 *buf = (_Float16*)IOSurfaceGetBaseAddress(s);
    int sp = SEQ + 4*DIM;
    for (int d = 0; d < DIM; d++)
        cvt_f32_f16(buf + d*sp, xnorm + d*SEQ, SEQ);
    IOSurfaceUnlock(s, 0, NULL);
}

// ffnFused: [1, DIM, 1, 2*SEQ+3*HIDDEN] fp16
static void stage_ffn_fused_weights(IOSurfaceRef s,
                                     const float *W1t, const float *W3t, const float *W2_orig) {
    IOSurfaceLock(s, 0, NULL);
    _Float16 *buf = (_Float16*)IOSurfaceGetBaseAddress(s);
    int sp = 2*SEQ + 3*HIDDEN;
    for (int d = 0; d < DIM; d++) {
        cvt_f32_f16(buf + d*sp + 2*SEQ,          W1t + d*HIDDEN, HIDDEN);
        cvt_f32_f16(buf + d*sp + 2*SEQ+HIDDEN,   W3t + d*HIDDEN, HIDDEN);
        cvt_f32_f16(buf + d*sp + 2*SEQ+2*HIDDEN, W2_orig + d*HIDDEN, HIDDEN);
    }
    IOSurfaceUnlock(s, 0, NULL);
}
static void write_ffn_fused_acts(IOSurfaceRef s, const float *x2norm, const float *x2) {
    IOSurfaceLock(s, 0, NULL);
    _Float16 *buf = (_Float16*)IOSurfaceGetBaseAddress(s);
    int sp = 2*SEQ + 3*HIDDEN;
    for (int d = 0; d < DIM; d++) {
        cvt_f32_f16(buf + d*sp,       x2norm + d*SEQ, SEQ);
        cvt_f32_f16(buf + d*sp + SEQ, x2 + d*SEQ, SEQ);
    }
    IOSurfaceUnlock(s, 0, NULL);
}

// ffnW13: [1, DIM, 1, SEQ+2*HIDDEN] fp16
static void stage_ffn_w13_weights(IOSurfaceRef s, const float *W1, const float *W3) {
    IOSurfaceLock(s, 0, NULL);
    _Float16 *buf = (_Float16*)IOSurfaceGetBaseAddress(s);
    int sp = SEQ + 2*HIDDEN;
    for (int d = 0; d < DIM; d++) {
        cvt_f32_f16(buf + d*sp + SEQ,        W1 + d*HIDDEN, HIDDEN);
        cvt_f32_f16(buf + d*sp + SEQ+HIDDEN, W3 + d*HIDDEN, HIDDEN);
    }
    IOSurfaceUnlock(s, 0, NULL);
}
static void write_ffn_w13_acts(IOSurfaceRef s, const float *xnorm) {
    IOSurfaceLock(s, 0, NULL);
    _Float16 *buf = (_Float16*)IOSurfaceGetBaseAddress(s);
    int sp = SEQ + 2*HIDDEN;
    for (int d = 0; d < DIM; d++)
        cvt_f32_f16(buf + d*sp, xnorm + d*SEQ, SEQ);
    IOSurfaceUnlock(s, 0, NULL);
}

// ffnW2: [1, HIDDEN, 1, SEQ+DIM] fp16
static void stage_ffn_w2_weights(IOSurfaceRef s, const float *W2) {
    IOSurfaceLock(s, 0, NULL);
    _Float16 *buf = (_Float16*)IOSurfaceGetBaseAddress(s);
    int sp = SEQ + DIM;
    for (int d = 0; d < HIDDEN; d++)
        cvt_f32_f16(buf + d*sp + SEQ, W2 + d*DIM, DIM);
    IOSurfaceUnlock(s, 0, NULL);
}
static void write_ffn_w2_acts(IOSurfaceRef s, const float *gate) {
    IOSurfaceLock(s, 0, NULL);
    _Float16 *buf = (_Float16*)IOSurfaceGetBaseAddress(s);
    int sp = SEQ + DIM;
    for (int d = 0; d < HIDDEN; d++)
        cvt_f32_f16(buf + d*sp, gate + d*SEQ, SEQ);
    IOSurfaceUnlock(s, 0, NULL);
}

// ffnBwdW2t: [1, DIM, 1, SEQ+HIDDEN] fp16
static void stage_ffn_bwd_w2t_weights(IOSurfaceRef s, const float *W2) {
    IOSurfaceLock(s, 0, NULL);
    _Float16 *buf = (_Float16*)IOSurfaceGetBaseAddress(s);
    int sp = SEQ + HIDDEN;
    for (int d = 0; d < DIM; d++)
        cvt_f32_f16(buf + d*sp + SEQ, W2 + d*HIDDEN, HIDDEN);
    IOSurfaceUnlock(s, 0, NULL);
}
static void write_ffn_bwd_w2t_acts(IOSurfaceRef s, const float *dffn) {
    IOSurfaceLock(s, 0, NULL);
    _Float16 *buf = (_Float16*)IOSurfaceGetBaseAddress(s);
    int sp = SEQ + HIDDEN;
    for (int d = 0; d < DIM; d++)
        cvt_f32_f16(buf + d*sp, dffn + d*SEQ, SEQ);
    IOSurfaceUnlock(s, 0, NULL);
}

// ffnBwdW13t: [1, HIDDEN, 1, 2*SEQ+2*DIM] fp16
static void stage_ffn_bwd_w13t_weights(IOSurfaceRef s, const float *W1, const float *W3) {
    IOSurfaceLock(s, 0, NULL);
    _Float16 *buf = (_Float16*)IOSurfaceGetBaseAddress(s);
    int sp = 2*SEQ + 2*DIM;
    for (int d = 0; d < HIDDEN; d++) {
        cvt_f32_f16(buf + d*sp + 2*SEQ,       W1 + d*DIM, DIM);
        cvt_f32_f16(buf + d*sp + 2*SEQ + DIM, W3 + d*DIM, DIM);
    }
    IOSurfaceUnlock(s, 0, NULL);
}
static void write_ffn_bwd_w13t_acts(IOSurfaceRef s, const float *dh1, const float *dh3) {
    IOSurfaceLock(s, 0, NULL);
    _Float16 *buf = (_Float16*)IOSurfaceGetBaseAddress(s);
    int sp = 2*SEQ + 2*DIM;
    for (int d = 0; d < HIDDEN; d++) {
        cvt_f32_f16(buf + d*sp,       dh1 + d*SEQ, SEQ);
        cvt_f32_f16(buf + d*sp + SEQ, dh3 + d*SEQ, SEQ);
    }
    IOSurfaceUnlock(s, 0, NULL);
}

// wotBwd: [1, DIM, 1, SEQ+DIM] fp16
static void stage_wot_bwd_weights(IOSurfaceRef s, const float *Wo) {
    IOSurfaceLock(s, 0, NULL);
    _Float16 *buf = (_Float16*)IOSurfaceGetBaseAddress(s);
    int sp = SEQ + DIM;
    for (int d = 0; d < DIM; d++)
        cvt_f32_f16(buf + d*sp + SEQ, Wo + d*DIM, DIM);
    IOSurfaceUnlock(s, 0, NULL);
}
static void write_wot_bwd_acts(IOSurfaceRef s, const float *dx2) {
    IOSurfaceLock(s, 0, NULL);
    _Float16 *buf = (_Float16*)IOSurfaceGetBaseAddress(s);
    int sp = SEQ + DIM;
    for (int d = 0; d < DIM; d++)
        cvt_f32_f16(buf + d*sp, dx2 + d*SEQ, SEQ);
    IOSurfaceUnlock(s, 0, NULL);
}

// qkvBwd: [1, DIM, 1, 3*SEQ+3*DIM] fp16
static void stage_qkv_bwd_weights(IOSurfaceRef s, const float *Wq, const float *Wk, const float *Wv) {
    IOSurfaceLock(s, 0, NULL);
    _Float16 *buf = (_Float16*)IOSurfaceGetBaseAddress(s);
    int sp = 3*SEQ + 3*DIM;
    for (int d = 0; d < DIM; d++) {
        cvt_f32_f16(buf + d*sp + 3*SEQ,         Wq + d*DIM, DIM);
        cvt_f32_f16(buf + d*sp + 3*SEQ + DIM,   Wk + d*DIM, DIM);
        cvt_f32_f16(buf + d*sp + 3*SEQ + 2*DIM, Wv + d*DIM, DIM);
    }
    IOSurfaceUnlock(s, 0, NULL);
}
static void write_qkv_bwd_acts(IOSurfaceRef s, const float *dq, const float *dk, const float *dv) {
    IOSurfaceLock(s, 0, NULL);
    _Float16 *buf = (_Float16*)IOSurfaceGetBaseAddress(s);
    int sp = 3*SEQ + 3*DIM;
    for (int d = 0; d < DIM; d++) {
        cvt_f32_f16(buf + d*sp,       dq + d*SEQ, SEQ);
        cvt_f32_f16(buf + d*sp + SEQ, dk + d*SEQ, SEQ);
        cvt_f32_f16(buf + d*sp + 2*SEQ, dv + d*SEQ, SEQ);
    }
    IOSurfaceUnlock(s, 0, NULL);
}

// Free per-layer surfaces and requests
static void free_per_layer(PerLayerSurfaces *pls, PerLayerRequests *plr) {
    for (int L = 0; L < NLAYERS; L++) {
        CFRelease(pls[L].sdpaFwd_in); CFRelease(pls[L].ffnFused_in);
        CFRelease(pls[L].ffnBwdW2t_in); CFRelease(pls[L].ffnBwdW13t_in);
        CFRelease(pls[L].wotBwd_in); CFRelease(pls[L].qkvBwd_in);
        CFRelease(plr[L].sdpaFwd); CFRelease(plr[L].ffnFused);
        CFRelease(plr[L].ffnBwdW2t); CFRelease(plr[L].ffnBwdW13t);
        CFRelease(plr[L].wotBwd); CFRelease(plr[L].qkvBwd);
    }
}