mirror of https://github.com/maderix/ANE.git
Python Bridge+Memory leak fix+More functions
This commit is contained in:
parent
1b792fce34
commit
ebac5dd73f
|
|
@ -0,0 +1,17 @@
|
|||
# Build rules for the ANE bridge: one dynamic library plus a small test driver.
CC         = xcrun clang
CFLAGS     = -O2 -Wall -Wno-deprecated-declarations -fobjc-arc -fPIC
FRAMEWORKS = -framework Foundation -framework IOSurface -ldl
TARGET     = libane_bridge.dylib

.PHONY: all clean test

# Default goal: build the dynamic library.
all: $(TARGET)

$(TARGET): ane_bridge.m ane_bridge.h
	$(CC) $(CFLAGS) -dynamiclib -o $@ ane_bridge.m $(FRAMEWORKS)

# Builds test_bridge, linked against the library in the current directory (-L.).
test: test_bridge.m ane_bridge.h $(TARGET)
	$(CC) $(CFLAGS) -o test_bridge test_bridge.m -L. -lane_bridge $(FRAMEWORKS)

# Removes both build products.
clean:
	rm -f $(TARGET) test_bridge
|
||||
|
|
@ -0,0 +1,87 @@
|
|||
// ane_bridge.h — C-callable bridge to ANE private APIs for Python ctypes
|
||||
// Wraps _ANEInMemoryModel via private AppleNeuralEngine.framework
|
||||
|
||||
#ifndef ANE_BRIDGE_H
|
||||
#define ANE_BRIDGE_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Opaque kernel handle
|
||||
typedef struct ANEKernelHandle ANEKernelHandle;
|
||||
|
||||
// Initialize ANE runtime (load private framework, resolve classes)
|
||||
// Returns 0 on success, -1 on failure
|
||||
int ane_bridge_init(void);
|
||||
|
||||
// Compile a MIL program with weight blobs into an ANE kernel
|
||||
// mil_text: UTF-8 MIL program text
|
||||
// mil_len: length of MIL text
|
||||
// weight_data: raw weight blob (can be NULL)
|
||||
// weight_len: length of weight blob
|
||||
// n_inputs: number of input tensors
|
||||
// input_sizes: array of byte sizes for each input
|
||||
// n_outputs: number of output tensors
|
||||
// output_sizes: array of byte sizes for each output
|
||||
// Returns kernel handle or NULL on failure
|
||||
ANEKernelHandle *ane_bridge_compile(const char *mil_text, size_t mil_len,
|
||||
const uint8_t *weight_data, size_t weight_len,
|
||||
int n_inputs, const size_t *input_sizes,
|
||||
int n_outputs, const size_t *output_sizes);
|
||||
|
||||
// Compile with multiple named weight files (for transformer kernels)
|
||||
// weight_names: array of weight file paths (e.g. "@model_path/weights/wq.bin")
|
||||
// weight_datas: array of weight data pointers
|
||||
// weight_lens: array of weight data lengths
|
||||
// n_weights: number of weight files
|
||||
ANEKernelHandle *ane_bridge_compile_multi_weights(
|
||||
const char *mil_text, size_t mil_len,
|
||||
const char **weight_names, const uint8_t **weight_datas,
|
||||
const size_t *weight_lens, int n_weights,
|
||||
int n_inputs, const size_t *input_sizes,
|
||||
int n_outputs, const size_t *output_sizes);
|
||||
|
||||
// Evaluate (run) a compiled kernel on ANE
|
||||
// Returns true on success
|
||||
bool ane_bridge_eval(ANEKernelHandle *kernel);
|
||||
|
||||
// Write data to kernel input tensor
|
||||
void ane_bridge_write_input(ANEKernelHandle *kernel, int idx,
|
||||
const void *data, size_t bytes);
|
||||
|
||||
// Read data from kernel output tensor
|
||||
void ane_bridge_read_output(ANEKernelHandle *kernel, int idx,
|
||||
void *data, size_t bytes);
|
||||
|
||||
// Free a compiled kernel and all associated resources
|
||||
void ane_bridge_free(ANEKernelHandle *kernel);
|
||||
|
||||
// Get compile count (for exec() restart budgeting)
|
||||
int ane_bridge_get_compile_count(void);
|
||||
|
||||
// Reset compile count
|
||||
void ane_bridge_reset_compile_count(void);
|
||||
|
||||
// Build a weight blob in ANE format (128-byte header + fp16 data)
|
||||
// src: float32 weights [rows x cols]
|
||||
// Returns a heap-allocated buffer and stores its byte length in *out_len.
// The caller owns the buffer and must release it with free() or ane_bridge_free_blob().
|
||||
uint8_t *ane_bridge_build_weight_blob(const float *src, int rows, int cols,
|
||||
size_t *out_len);
|
||||
|
||||
// Build a transposed weight blob in ANE format
|
||||
uint8_t *ane_bridge_build_weight_blob_transposed(const float *src, int rows, int cols,
|
||||
size_t *out_len);
|
||||
|
||||
// Free a blob allocated by ane_bridge_build_weight_blob*
|
||||
void ane_bridge_free_blob(void *ptr);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // ANE_BRIDGE_H
|
||||
|
|
@ -0,0 +1,328 @@
|
|||
// ane_bridge.m — Objective-C implementation of ANE bridge for Python ctypes
|
||||
// Wraps _ANEInMemoryModel private APIs into C-callable functions
|
||||
|
||||
#import <Foundation/Foundation.h>
|
||||
#import <objc/runtime.h>
|
||||
#import <objc/message.h>
|
||||
#import <dlfcn.h>
|
||||
#import <IOSurface/IOSurface.h>
|
||||
#include "ane_bridge.h"
|
||||
|
||||
// --- Private class references ---
|
||||
static Class g_ANEDesc = nil;
|
||||
static Class g_ANEInMem = nil;
|
||||
static Class g_ANEReq = nil;
|
||||
static Class g_ANEIO = nil;
|
||||
static bool g_initialized = false;
|
||||
static int g_compile_count = 0;
|
||||
|
||||
// --- Kernel handle struct ---
|
||||
// Concrete layout behind the opaque ANEKernelHandle declared in ane_bridge.h.
// One instance ties a loaded model to the surfaces and request used to run it.
struct ANEKernelHandle {
    id model;                 // _ANEInMemoryModel
    IOSurfaceRef *ioInputs;   // nInputs surfaces, one per input tensor
    IOSurfaceRef *ioOutputs;  // nOutputs surfaces, one per output tensor
    id request;               // _ANERequest
    NSString *tmpDir;         // per-model temp directory; removed by ane_bridge_free
    int nInputs;
    int nOutputs;
    size_t *inputBytes;       // caller-supplied byte size of each input
    size_t *outputBytes;      // caller-supplied byte size of each output
};
|
||||
|
||||
// --- Public API ---
|
||||
|
||||
// Loads the private AppleNeuralEngine framework and caches the four classes the
// bridge needs. Safe to call repeatedly: once initialization has succeeded,
// later calls return 0 immediately.
// Returns 0 on success, -1 if the framework or any class cannot be resolved.
int ane_bridge_init(void) {
    static const char *const kFrameworkPath =
        "/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine";

    if (g_initialized) {
        return 0;
    }

    // The dlopen handle is not kept; the framework stays loaded after this call.
    if (dlopen(kFrameworkPath, RTLD_NOW) == NULL) {
        fprintf(stderr, "ane_bridge: Failed to load AppleNeuralEngine.framework\n");
        return -1;
    }

    // The classes only become visible to the runtime once the framework is loaded.
    g_ANEDesc  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
    g_ANEInMem = NSClassFromString(@"_ANEInMemoryModel");
    g_ANEReq   = NSClassFromString(@"_ANERequest");
    g_ANEIO    = NSClassFromString(@"_ANEIOSurfaceObject");

    const bool allResolved =
        g_ANEDesc != nil && g_ANEInMem != nil && g_ANEReq != nil && g_ANEIO != nil;
    if (!allResolved) {
        fprintf(stderr, "ane_bridge: Failed to resolve ANE private classes\n");
        return -1;
    }

    g_compile_count = 0;
    g_initialized = true;
    return 0;
}
|
||||
|
||||
// Creates an IOSurface described as a single row of `bytes` one-byte elements
// (width = bytes, height = 1, pixel format 0).
// Returns the result of IOSurfaceCreate unchanged; ane_bridge_free releases
// the surfaces stored in a handle with CFRelease.
static IOSurfaceRef create_surface(size_t bytes) {
    NSNumber *byteCount = @(bytes);
    NSDictionary *properties = @{
        (id)kIOSurfaceWidth: byteCount,
        (id)kIOSurfaceHeight: @1,
        (id)kIOSurfaceBytesPerElement: @1,
        (id)kIOSurfaceBytesPerRow: byteCount,
        (id)kIOSurfaceAllocSize: byteCount,
        (id)kIOSurfacePixelFormat: @0
    };
    return IOSurfaceCreate((__bridge CFDictionaryRef)properties);
}
|
||||
|
||||
// Best-effort teardown for a model that was loaded but never made it into a
// handle: unloads it and deletes its temp directory so nothing is left behind.
static void ane_bridge_discard_loaded_model(id mdl, NSString *td) {
    NSError *e = nil;
    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(
        mdl, @selector(unloadWithQoS:error:), 21, &e);
    [[NSFileManager defaultManager] removeItemAtPath:td error:nil];
}

// Wraps each surface in an _ANEIOSurfaceObject and records its positional index.
// Returns NO as soon as a wrapper cannot be created, because adding nil to an
// NSMutableArray raises an exception.
static BOOL ane_bridge_wrap_surfaces(IOSurfaceRef *surfaces, int count,
                                     NSMutableArray *wrapped, NSMutableArray *indices) {
    for (int i = 0; i < count; i++) {
        id obj = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(
            g_ANEIO, @selector(objectWithIOSurface:), surfaces[i]);
        if (!obj) {
            return NO;
        }
        [wrapped addObject:obj];
        [indices addObject:@(i)];
    }
    return YES;
}

// Compiles and loads a MIL program with any number of named weight blobs, then
// builds the IOSurfaces and request needed to evaluate it.
//
//   mil_text / mil_len        MIL program bytes (must not be NULL)
//   weight_names              n_weights paths; a leading "@model_path/" is
//                             stripped to get the file location in the temp dir
//   weight_datas/weight_lens  n_weights blobs and their byte lengths
//   n_inputs / input_sizes    input tensor count and byte size of each
//   n_outputs / output_sizes  output tensor count and byte size of each
//
// Returns a handle to release with ane_bridge_free(), or NULL on any failure.
// On failure the model is unloaded and its temp directory removed.
ANEKernelHandle *ane_bridge_compile_multi_weights(
    const char *mil_text, size_t mil_len,
    const char **weight_names, const uint8_t **weight_datas,
    const size_t *weight_lens, int n_weights,
    int n_inputs, const size_t *input_sizes,
    int n_outputs, const size_t *output_sizes)
{
    @autoreleasepool {
        if (!g_initialized) {
            fprintf(stderr, "ane_bridge: Not initialized\n");
            return NULL;
        }

        // Reject inputs that would dereference NULL or turn a negative count
        // into an enormous allocation size.
        if (!mil_text || n_weights < 0 || n_inputs < 0 || n_outputs < 0 ||
            (n_weights > 0 && (!weight_names || !weight_datas || !weight_lens)) ||
            (n_inputs > 0 && !input_sizes) ||
            (n_outputs > 0 && !output_sizes)) {
            fprintf(stderr, "ane_bridge: Invalid arguments to compile\n");
            return NULL;
        }

        NSData *milData = [NSData dataWithBytes:mil_text length:mil_len];
        NSError *e = nil;

        // Build the weight dictionary. Names and blobs are kept so the temp
        // directory can be populated later without copying the bytes again.
        NSMutableArray *names = [NSMutableArray arrayWithCapacity:(NSUInteger)n_weights];
        NSMutableArray *blobs = [NSMutableArray arrayWithCapacity:(NSUInteger)n_weights];
        NSMutableDictionary *wdict = [NSMutableDictionary dictionary];
        for (int i = 0; i < n_weights; i++) {
            // stringWithUTF8String: returns nil for invalid UTF-8 and must not be given NULL.
            NSString *name = weight_names[i]
                ? [NSString stringWithUTF8String:weight_names[i]] : nil;
            if (!name || (!weight_datas[i] && weight_lens[i] > 0)) {
                fprintf(stderr, "ane_bridge: Invalid weight entry %d\n", i);
                return NULL;
            }
            NSData *data = [NSData dataWithBytes:weight_datas[i] length:weight_lens[i]];
            [names addObject:name];
            [blobs addObject:data];
            wdict[name] = @{@"offset": @0, @"data": data};
        }

        id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(
            g_ANEDesc, @selector(modelWithMILText:weights:optionsPlist:),
            milData, wdict.count > 0 ? wdict : nil, nil);
        if (!desc) {
            fprintf(stderr, "ane_bridge: modelWithMILText failed\n");
            return NULL;
        }

        id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(
            g_ANEInMem, @selector(inMemoryModelWithDescriptor:), desc);
        if (!mdl) {
            fprintf(stderr, "ane_bridge: inMemoryModelWithDescriptor failed\n");
            return NULL;
        }

        // Pre-populate the temp dir. An empty identifier would make `td` the
        // temp root itself, which the failure paths below would then delete.
        id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
        NSString *hexId = [hx isKindOfClass:[NSString class]] ? (NSString *)hx : nil;
        if (hexId.length == 0) {
            fprintf(stderr, "ane_bridge: model has no hexStringIdentifier\n");
            return NULL;
        }
        NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hexId];
        NSFileManager *fm = [NSFileManager defaultManager];

        // File-system failures are reported but not fatal here: if the files
        // are really needed, the compile step below fails and reports it.
        if (![fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"]
           withIntermediateDirectories:YES attributes:nil error:nil]) {
            fprintf(stderr, "ane_bridge: warning: could not create %s/weights\n",
                    [td UTF8String]);
        }
        if (![milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"]
                       atomically:YES]) {
            fprintf(stderr, "ane_bridge: warning: could not write model.mil\n");
        }

        for (int i = 0; i < n_weights; i++) {
            NSString *name = names[(NSUInteger)i];
            // "@model_path/weights/wq.bin" -> "weights/wq.bin"
            NSString *relPath = name;
            if ([name hasPrefix:@"@model_path/"]) {
                relPath = [name substringFromIndex:12];
            }
            NSString *fullPath = [td stringByAppendingPathComponent:relPath];
            NSString *dir = [fullPath stringByDeletingLastPathComponent];
            [fm createDirectoryAtPath:dir withIntermediateDirectories:YES
                           attributes:nil error:nil];
            if (![(NSData *)blobs[(NSUInteger)i] writeToFile:fullPath atomically:YES]) {
                fprintf(stderr, "ane_bridge: warning: could not write weight file %s\n",
                        [fullPath UTF8String]);
            }
        }

        // Compile
        if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
                mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
            fprintf(stderr, "ane_bridge: ANE compile failed: %s\n",
                    e ? [[e description] UTF8String] : "unknown");
            [fm removeItemAtPath:td error:nil];
            return NULL;
        }

        // Load (with one retry after a brief pause for ANE slot reclamation)
        e = nil;
        BOOL loaded = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
            mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
        if (!loaded) {
            fprintf(stderr, "ane_bridge: ANE load failed (retrying in 100ms): %s\n",
                    e ? [[e description] UTF8String] : "unknown");
            usleep(100000); // 100ms
            e = nil;
            loaded = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
                mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
        }
        if (!loaded) {
            fprintf(stderr, "ane_bridge: ANE load failed after retry: %s\n",
                    e ? [[e description] UTF8String] : "unknown");
            [fm removeItemAtPath:td error:nil];
            return NULL;
        }

        // Counted as soon as the model is compiled and loaded, whether or not
        // the handle setup below succeeds.
        g_compile_count++;

        // Create kernel handle. calloc leaves every field zero/nil.
        ANEKernelHandle *k = calloc(1, sizeof *k);
        if (!k) {
            fprintf(stderr, "ane_bridge: Out of memory creating kernel handle\n");
            ane_bridge_discard_loaded_model(mdl, td);
            return NULL;
        }
        k->model = mdl;
        k->tmpDir = td;

        // Allocate at least one element so NULL always means failure, and
        // zero-fill so surfaces not yet created read as NULL during cleanup.
        const size_t nIn = (size_t)n_inputs;
        const size_t nOut = (size_t)n_outputs;
        k->inputBytes  = calloc(nIn ? nIn : 1, sizeof *k->inputBytes);
        k->outputBytes = calloc(nOut ? nOut : 1, sizeof *k->outputBytes);
        k->ioInputs    = calloc(nIn ? nIn : 1, sizeof *k->ioInputs);
        k->ioOutputs   = calloc(nOut ? nOut : 1, sizeof *k->ioOutputs);
        if (!k->inputBytes || !k->outputBytes || !k->ioInputs || !k->ioOutputs) {
            fprintf(stderr, "ane_bridge: Out of memory creating kernel handle\n");
            // nInputs/nOutputs are still 0, so ane_bridge_free reads no array element.
            ane_bridge_free(k);
            return NULL;
        }
        k->nInputs = n_inputs;
        k->nOutputs = n_outputs;
        if (nIn > 0) {
            memcpy(k->inputBytes, input_sizes, nIn * sizeof *k->inputBytes);
        }
        if (nOut > 0) {
            memcpy(k->outputBytes, output_sizes, nOut * sizeof *k->outputBytes);
        }

        // Create IOSurfaces
        for (int i = 0; i < n_inputs; i++) {
            k->ioInputs[i] = create_surface(input_sizes[i]);
            if (!k->ioInputs[i]) {
                fprintf(stderr, "ane_bridge: IOSurfaceCreate failed for input %d (%zu bytes)\n",
                        i, input_sizes[i]);
                ane_bridge_free(k);
                return NULL;
            }
        }
        for (int i = 0; i < n_outputs; i++) {
            k->ioOutputs[i] = create_surface(output_sizes[i]);
            if (!k->ioOutputs[i]) {
                fprintf(stderr, "ane_bridge: IOSurfaceCreate failed for output %d (%zu bytes)\n",
                        i, output_sizes[i]);
                ane_bridge_free(k);
                return NULL;
            }
        }

        // Build request
        NSMutableArray *wIns = [NSMutableArray arrayWithCapacity:(NSUInteger)n_inputs];
        NSMutableArray *iIdx = [NSMutableArray arrayWithCapacity:(NSUInteger)n_inputs];
        NSMutableArray *wOuts = [NSMutableArray arrayWithCapacity:(NSUInteger)n_outputs];
        NSMutableArray *oIdx = [NSMutableArray arrayWithCapacity:(NSUInteger)n_outputs];
        if (!ane_bridge_wrap_surfaces(k->ioInputs, n_inputs, wIns, iIdx) ||
            !ane_bridge_wrap_surfaces(k->ioOutputs, n_outputs, wOuts, oIdx)) {
            fprintf(stderr, "ane_bridge: objectWithIOSurface failed\n");
            ane_bridge_free(k);
            return NULL;
        }

        k->request = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(
            g_ANEReq,
            @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
            wIns, iIdx, wOuts, oIdx, nil, nil, @0);
        if (!k->request) {
            fprintf(stderr, "ane_bridge: request creation failed\n");
            ane_bridge_free(k);
            return NULL;
        }

        return k;
    }
}
|
||||
|
||||
// Single-blob convenience wrapper around ane_bridge_compile_multi_weights.
// A non-NULL, non-empty weight blob is registered under the fixed name
// "@model_path/weights/weight.bin"; otherwise the program is compiled with no
// weights. Returns whatever the multi-weight call returns.
ANEKernelHandle *ane_bridge_compile(const char *mil_text, size_t mil_len,
                                    const uint8_t *weight_data, size_t weight_len,
                                    int n_inputs, const size_t *input_sizes,
                                    int n_outputs, const size_t *output_sizes) {
    const char *name = "@model_path/weights/weight.bin";
    const bool hasWeights = (weight_data != NULL) && (weight_len > 0);

    // The one-element "arrays" are simply the addresses of the locals/parameters.
    const char **names = hasWeights ? &name : NULL;
    const uint8_t **datas = hasWeights ? &weight_data : NULL;
    const size_t *lens = hasWeights ? &weight_len : NULL;
    const int count = hasWeights ? 1 : 0;

    return ane_bridge_compile_multi_weights(mil_text, mil_len,
                                            names, datas, lens, count,
                                            n_inputs, input_sizes,
                                            n_outputs, output_sizes);
}
|
||||
|
||||
// Runs the kernel once using the request built at compile time.
// Returns true on success. Returns false for a NULL or incomplete handle, or
// when the evaluation itself fails; in that case the error is logged to stderr.
bool ane_bridge_eval(ANEKernelHandle *kernel) {
    @autoreleasepool {
        if (!kernel || !kernel->model || !kernel->request) return false;

        NSError *e = nil;
        BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
            kernel->model, @selector(evaluateWithQoS:options:request:error:),
            21, @{}, kernel->request, &e);
        if (!ok) {
            // Same reporting convention as the compile/load paths in this file.
            fprintf(stderr, "ane_bridge: ANE eval failed: %s\n",
                    e ? [[e description] UTF8String] : "unknown");
        }
        return ok ? true : false;
    }
}
|
||||
|
||||
// Copies `bytes` bytes from `data` into input surface `idx`.
// Does nothing for a NULL handle, an out-of-range index, NULL data, or a
// missing surface. A request larger than the size recorded for that input at
// compile time is rejected with a message, because it would write past the
// surface.
void ane_bridge_write_input(ANEKernelHandle *kernel, int idx,
                            const void *data, size_t bytes) {
    if (!kernel || idx < 0 || idx >= kernel->nInputs) return;
    if (!data || !kernel->ioInputs || !kernel->inputBytes) return;

    IOSurfaceRef surface = kernel->ioInputs[idx];
    if (!surface) return;

    if (bytes > kernel->inputBytes[idx]) {
        fprintf(stderr, "ane_bridge: write_input %d: %zu bytes exceeds surface size %zu\n",
                idx, bytes, kernel->inputBytes[idx]);
        return;
    }

    if (IOSurfaceLock(surface, 0, NULL) != KERN_SUCCESS) {
        fprintf(stderr, "ane_bridge: write_input %d: IOSurfaceLock failed\n", idx);
        return;
    }
    void *dst = IOSurfaceGetBaseAddress(surface);
    if (dst) {
        memcpy(dst, data, bytes);
    }
    IOSurfaceUnlock(surface, 0, NULL);
}
|
||||
|
||||
// Copies `bytes` bytes from output surface `idx` into `data`.
// Does nothing for a NULL handle, an out-of-range index, NULL data, or a
// missing surface. A request larger than the size recorded for that output at
// compile time is rejected with a message, because it would read past the
// surface.
void ane_bridge_read_output(ANEKernelHandle *kernel, int idx,
                            void *data, size_t bytes) {
    if (!kernel || idx < 0 || idx >= kernel->nOutputs) return;
    if (!data || !kernel->ioOutputs || !kernel->outputBytes) return;

    IOSurfaceRef surface = kernel->ioOutputs[idx];
    if (!surface) return;

    if (bytes > kernel->outputBytes[idx]) {
        fprintf(stderr, "ane_bridge: read_output %d: %zu bytes exceeds surface size %zu\n",
                idx, bytes, kernel->outputBytes[idx]);
        return;
    }

    // Read-only lock: the CPU only reads the surface here.
    if (IOSurfaceLock(surface, kIOSurfaceLockReadOnly, NULL) != KERN_SUCCESS) {
        fprintf(stderr, "ane_bridge: read_output %d: IOSurfaceLock failed\n", idx);
        return;
    }
    const void *src = IOSurfaceGetBaseAddress(surface);
    if (src) {
        memcpy(data, src, bytes);
    }
    IOSurfaceUnlock(surface, kIOSurfaceLockReadOnly, NULL);
}
|
||||
|
||||
// Releases everything owned by a kernel handle: unloads the model, releases the
// IOSurfaces, deletes the temp directory, frees the bookkeeping arrays and
// finally the handle itself. NULL is accepted and ignored. A partially
// constructed handle (NULL arrays or NULL surface slots) is tolerated.
void ane_bridge_free(ANEKernelHandle *kernel) {
    @autoreleasepool {
        if (!kernel) return;

        if (kernel->model) {
            NSError *e = nil;
            BOOL unloaded = ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(
                kernel->model, @selector(unloadWithQoS:error:), 21, &e);
            if (!unloaded) {
                // Keep tearing down: nothing below depends on the unload result.
                fprintf(stderr, "ane_bridge: ANE unload failed: %s\n",
                        e ? [[e description] UTF8String] : "unknown");
            }
        }

        if (kernel->ioInputs) {
            for (int i = 0; i < kernel->nInputs; i++) {
                if (kernel->ioInputs[i]) CFRelease(kernel->ioInputs[i]);
            }
        }
        if (kernel->ioOutputs) {
            for (int i = 0; i < kernel->nOutputs; i++) {
                if (kernel->ioOutputs[i]) CFRelease(kernel->ioOutputs[i]);
            }
        }

        if (kernel->tmpDir) {
            [[NSFileManager defaultManager] removeItemAtPath:kernel->tmpDir error:nil];
        }

        free(kernel->ioInputs);
        free(kernel->ioOutputs);
        free(kernel->inputBytes);
        free(kernel->outputBytes);

        // Explicitly nil Objective-C objects to trigger ARC release before freeing struct
        kernel->model = nil;
        kernel->request = nil;
        kernel->tmpDir = nil;

        free(kernel);
    }
}
|
||||
|
||||
int ane_bridge_get_compile_count(void) {
|
||||
return g_compile_count;
|
||||
}
|
||||
|
||||
void ane_bridge_reset_compile_count(void) {
|
||||
g_compile_count = 0;
|
||||
}
|
||||
|
||||
uint8_t *ane_bridge_build_weight_blob(const float *src, int rows, int cols,
|
||||
size_t *out_len) {
|
||||
int wsize = rows * cols * 2; // fp16
|
||||
int total = 128 + wsize;
|
||||
uint8_t *buf = (uint8_t *)calloc(total, 1);
|
||||
|
||||
// ANE blob header
|
||||
buf[0] = 0x01; buf[4] = 0x02;
|
||||
buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE;
|
||||
buf[68] = 0x01;
|
||||
*(uint32_t*)(buf + 72) = wsize;
|
||||
*(uint32_t*)(buf + 80) = 128;
|
||||
|
||||
// Convert float32 -> float16
|
||||
_Float16 *fp16 = (_Float16 *)(buf + 128);
|
||||
for (int i = 0; i < rows * cols; i++) {
|
||||
fp16[i] = (_Float16)src[i];
|
||||
}
|
||||
|
||||
*out_len = total;
|
||||
return buf;
|
||||
}
|
||||
|
||||
uint8_t *ane_bridge_build_weight_blob_transposed(const float *src, int rows, int cols,
|
||||
size_t *out_len) {
|
||||
int wsize = rows * cols * 2;
|
||||
int total = 128 + wsize;
|
||||
uint8_t *buf = (uint8_t *)calloc(total, 1);
|
||||
|
||||
buf[0] = 0x01; buf[4] = 0x02;
|
||||
buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE;
|
||||
buf[68] = 0x01;
|
||||
*(uint32_t*)(buf + 72) = wsize;
|
||||
*(uint32_t*)(buf + 80) = 128;
|
||||
|
||||
_Float16 *fp16 = (_Float16 *)(buf + 128);
|
||||
for (int i = 0; i < rows; i++)
|
||||
for (int j = 0; j < cols; j++)
|
||||
fp16[j * rows + i] = (_Float16)src[i * cols + j];
|
||||
|
||||
*out_len = total;
|
||||
return buf;
|
||||
}
|
||||
Binary file not shown.
|
|
@ -5,14 +5,25 @@ LDFLAGS = $(FRAMEWORKS) -ldl
|
|||
|
||||
HEADERS_LARGE = stories_config.h stories_io.h stories_mil.h stories_cpu_ops.h
|
||||
|
||||
HEADERS_ANE = $(HEADERS_LARGE) ane_rmsnorm_bwd.h ane_classifier.h
|
||||
|
||||
train: train.m ane_runtime.h ane_mil_gen.h model.h forward.h backward.h
|
||||
$(CC) $(CFLAGS) -o $@ train.m $(LDFLAGS)
|
||||
|
||||
train_large: train_large.m $(HEADERS_LARGE)
|
||||
$(CC) $(CFLAGS) -o $@ train_large.m $(LDFLAGS) -framework Accelerate
|
||||
|
||||
train_large_ane: train_large_ane.m $(HEADERS_ANE)
|
||||
$(CC) $(CFLAGS) -o $@ train_large_ane.m $(LDFLAGS) -framework Accelerate
|
||||
|
||||
PROBES = test_weight_reload test_perf_stats test_qos_sweep test_ane_advanced
|
||||
|
||||
test_rmsnorm_bwd: test_rmsnorm_bwd.m $(HEADERS_ANE)
|
||||
$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) -framework Accelerate
|
||||
|
||||
test_classifier: test_classifier.m $(HEADERS_ANE)
|
||||
$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) -framework Accelerate
|
||||
|
||||
test_weight_reload: test_weight_reload.m
|
||||
$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
|
||||
|
||||
|
|
@ -31,6 +42,7 @@ tokenize:
|
|||
python3 tokenize.py
|
||||
|
||||
clean:
|
||||
rm -f train train_large $(PROBES)
|
||||
rm -f train train_large train_large_ane $(PROBES) test_rmsnorm_bwd test_classifier
|
||||
|
||||
.PHONY: clean tokenize probes
|
||||
|
||||
|
|
|
|||
|
|
@ -47,18 +47,70 @@ Training a 109M-parameter Llama2-architecture transformer (Stories110M) directly
|
|||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
# Extract tokenized data
|
||||
python3 tokenize.py
|
||||
### 1. Download Training Data
|
||||
|
||||
# Build and train
|
||||
```bash
|
||||
bash download_data.sh
|
||||
```
|
||||
|
||||
Downloads pretokenized TinyStories (Llama 2 BPE, 32K vocab) from [enio/TinyStories](https://huggingface.co/datasets/enio/TinyStories) on HuggingFace. Produces `tinystories_data00.bin` (~41 MB, ~20M tokens).
|
||||
|
||||
### 2. Build & Train
|
||||
|
||||
```bash
|
||||
# Baseline: classifier + softmax on CPU
|
||||
make train_large
|
||||
./train_large # fresh start
|
||||
./train_large --steps 100 # quick test
|
||||
./train_large # full 10k steps
|
||||
./train_large --resume # resume from checkpoint
|
||||
|
||||
# Monitor with dashboard
|
||||
# ANE-offloaded: classifier + softmax on ANE (faster)
|
||||
make train_large_ane
|
||||
./train_large_ane --steps 100
|
||||
```
|
||||
|
||||
**CLI flags:** `--steps N` (default 10000), `--lr F` (default 3e-4), `--resume`.
|
||||
|
||||
### 3. Monitor with Dashboard
|
||||
|
||||
```bash
|
||||
pip install blessed psutil numpy
|
||||
python3 dashboard.py --resume # needs sudo for powermetrics
|
||||
sudo python3 dashboard.py # live mode (needs powermetrics)
|
||||
sudo python3 dashboard.py --resume # attach to resumed training
|
||||
```
|
||||
|
||||
### 4. Benchmarking
|
||||
|
||||
Both programs print an **Efficiency Report** at completion:
|
||||
|
||||
```
|
||||
=== Efficiency Report ===
|
||||
Total steps: 100
|
||||
Avg train: 107.0 ms/step
|
||||
ANE TFLOPS: 2.45 sustained
|
||||
ANE utilization: 15.5% of 15.8 TFLOPS
|
||||
```
|
||||
|
||||
Per-batch timing breakdown during training:
|
||||
|
||||
```
|
||||
ane=9.6 io=4.1 cls=9.1 elem=14.4 rms=0.1 cblas_wait=2.3 ms/step
|
||||
```
|
||||
|
||||
| Metric | What it measures |
|
||||
|--------|-----------------|
|
||||
| `ane` | ANE kernel evaluation |
|
||||
| `io` | fp16↔fp32 IOSurface transfer |
|
||||
| `cls` | Classifier matmul (CPU cblas) |
|
||||
| `elem` | Embedding, residual adds, cross-entropy |
|
||||
| `rms` | RMSNorm forward/backward |
|
||||
| `cblas_wait` | Waiting for async dW gradient sgemms |
|
||||
|
||||
Compare baseline vs ANE-offloaded:
|
||||
|
||||
```bash
|
||||
make train_large && ./train_large --steps 100
|
||||
make train_large_ane && ./train_large_ane --steps 100
|
||||
```
|
||||
|
||||
## Key techniques
|
||||
|
|
|
|||
|
|
@ -0,0 +1,102 @@
|
|||
// ane_classifier.h — MIL generators for classifier matmul and softmax on ANE
|
||||
// Replaces classifier cblas_sgemm and cross-entropy softmax from CPU
|
||||
#pragma once
|
||||
#include "stories_mil.h"
|
||||
|
||||
// ============================================================
|
||||
// Classifier forward: logits = embed @ x_final
|
||||
// embed: [VOCAB, DIM] baked as conv weight [VOCAB, DIM, 1, 1]
|
||||
// x: [1, DIM, 1, SEQ] input
|
||||
// out: [1, VOCAB, 1, SEQ] logits
|
||||
//
|
||||
// VOCAB=32000 output channels — this is the largest conv we've attempted.
|
||||
// If it fails, we'll need to tile into smaller chunks.
|
||||
// ============================================================
|
||||
// Returns the MIL text for the classifier forward pass: one conv over input x
// using weight We read from weights/embed.bin.
static NSString *gen_classifier_fwd(void) {
    NSMutableString *mil = [NSMutableString string];
    [mil appendString:MIL_HDR];

    // Signature: x is [1, DIM, 1, SEQ].
    [mil appendFormat:@" func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM, SEQ];

    // CONV_CONST is defined outside this file; presumably it declares the
    // dl/gr/pd/pt/st constants that the conv call below refers to.
    [mil appendString:@CONV_CONST];

    // Weight tensor [VOCAB, DIM, 1, 1].
    [mil appendFormat:@" tensor<fp16, [%d,%d,1,1]> We = const()[name=string(\"We\"), "
        "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/embed.bin\"), offset=uint64(64)))];\n",
        VOCAB, DIM, VOCAB, DIM];

    // Output is [1, VOCAB, 1, SEQ].
    [mil appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=We,x=x)[name=string(\"cls\")];\n", VOCAB, SEQ];

    [mil appendString:@" } -> (out);\n}\n"];
    return mil;
}
|
||||
|
||||
// ============================================================
|
||||
// Classifier backward: dx = embed^T @ dlogits
|
||||
// ANE rejects conv with 32000 input channels.
|
||||
// Use matmul instead: reshape dlogits to [1, VOCAB, SEQ],
|
||||
// bake embed^T as [1, DIM, VOCAB], matmul → [1, DIM, SEQ],
|
||||
// reshape back to [1, DIM, 1, SEQ].
|
||||
// ============================================================
|
||||
// Returns the MIL text for the classifier backward pass, expressed as
// reshape -> matmul -> reshape with the weight read from weights/embed_t.bin.
static NSString *gen_classifier_bwd(void) {
    NSMutableString *mil = [NSMutableString string];
    [mil appendString:MIL_HDR];

    // Signature: dl is [1, VOCAB, 1, SEQ].
    [mil appendFormat:@" func main<ios18>(tensor<fp16, [1, %d, 1, %d]> dl) {\n", VOCAB, SEQ];

    // Drop the unit axis: [1, VOCAB, 1, SEQ] -> [1, VOCAB, SEQ].
    [mil appendFormat:@" tensor<int32, [3]> sh3 = const()[name=string(\"sh3\"), val=tensor<int32, [3]>([1,%d,%d])];\n", VOCAB, SEQ];
    [mil appendFormat:@" tensor<fp16, [1,%d,%d]> dl3 = reshape(shape=sh3,x=dl)[name=string(\"rdl\")];\n", VOCAB, SEQ];

    // Baked constant Wet, shape [1, DIM, VOCAB].
    [mil appendFormat:@" tensor<fp16, [1,%d,%d]> Wet = const()[name=string(\"Wet\"), "
        "val=tensor<fp16, [1,%d,%d]>(BLOBFILE(path=string(\"@model_path/weights/embed_t.bin\"), offset=uint64(64)))];\n",
        DIM, VOCAB, DIM, VOCAB];

    // [1, DIM, VOCAB] @ [1, VOCAB, SEQ] -> [1, DIM, SEQ], neither operand transposed.
    [mil appendString:@" bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"];
    [mil appendFormat:@" tensor<fp16, [1,%d,%d]> dx3 = matmul(transpose_x=bF,transpose_y=bF,x=Wet,y=dl3)[name=string(\"mm\")];\n", DIM, SEQ];

    // Restore the unit axis: [1, DIM, SEQ] -> [1, DIM, 1, SEQ].
    [mil appendFormat:@" tensor<int32, [4]> sh4 = const()[name=string(\"sh4\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
    [mil appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = reshape(shape=sh4,x=dx3)[name=string(\"out\")];\n", DIM, SEQ];

    [mil appendString:@" } -> (out);\n}\n"];
    return mil;
}
|
||||
|
||||
// ============================================================
|
||||
// Softmax over VOCAB dimension (channel axis) for cross-entropy
|
||||
// Input: logits [1, VOCAB, 1, SEQ]
|
||||
// Output: probs [1, VOCAB, 1, SEQ]
|
||||
//
|
||||
// softmax(x, axis=1) = exp(x - max(x)) / sum(exp(x - max(x)))
|
||||
//
|
||||
// Note: After getting probs from ANE, the NLL loss + gradient
|
||||
// (prob[target] -= 1.0) are done on CPU since they need target indexing.
|
||||
// ============================================================
|
||||
// Returns the MIL text for a softmax along axis 1 of a [1, VOCAB, 1, SEQ] tensor.
static NSString *gen_softmax_vocab(void) {
    NSMutableString *mil = [NSMutableString string];
    [mil appendString:MIL_HDR];

    [mil appendFormat:@" func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", VOCAB, SEQ];
    // Axis 1 is the VOCAB dimension of the input declared above.
    [mil appendString:@" int32 ax = const()[name=string(\"ax\"), val=int32(1)];\n"];
    [mil appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = softmax(axis=ax,x=x)[name=string(\"sm\")];\n", VOCAB, SEQ];
    [mil appendString:@" } -> (out);\n}\n"];

    return mil;
}
|
||||
|
||||
// ============================================================
|
||||
// Final RMSNorm on ANE (replaces CPU rmsnorm for final layer)
|
||||
// Input: x [1, DIM, 1, SEQ]
|
||||
// Baked: rms_final weights [DIM]
|
||||
// Output: xn [1, DIM, 1, SEQ]
|
||||
// ============================================================
|
||||
// Returns the MIL text for the final RMSNorm:
//   out = x * (sum(x*x over axis 1) * (1/DIM) + eps)^(-0.5) * rw
// with rw read from weights/rms_w.bin.
static NSString *gen_final_rmsnorm(void) {
    const float inv_dim = 1.0f/(float)DIM;

    NSMutableString *mil = [NSMutableString string];
    [mil appendString:MIL_HDR];
    [mil appendFormat:@" func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM, SEQ];

    // Sum of squares over axis 1, kept as [1,1,1,SEQ].
    [mil appendFormat:@" tensor<fp16, [1,%d,1,%d]> sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ];
    [mil appendString:@" tensor<int32, [1]> rax = const()[name=string(\"rax\"), val=tensor<int32, [1]>([1])];\n"];
    [mil appendString:@" bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"];
    [mil appendFormat:@" tensor<fp16, [1,1,1,%d]> ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ];

    // Mean (sum * 1/DIM) plus eps, then raise to -0.5.
    [mil appendFormat:@" fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", inv_dim];
    [mil appendFormat:@" tensor<fp16, [1,1,1,%d]> ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ];
    [mil appendString:@" fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"];
    [mil appendFormat:@" tensor<fp16, [1,1,1,%d]> ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ];
    [mil appendString:@" fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"];
    [mil appendFormat:@" tensor<fp16, [1,1,1,%d]> rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ];

    // Normalize, then scale by the baked weights rw [1, DIM, 1, 1].
    [mil appendFormat:@" tensor<fp16, [1,%d,1,%d]> xr = mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ];
    [mil appendFormat:@" tensor<fp16, [1,%d,1,1]> rw = const()[name=string(\"rw\"), val=tensor<fp16, [1,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/rms_w.bin\"), offset=uint64(64)))];\n", DIM, DIM];
    [mil appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = mul(x=xr,y=rw)[name=string(\"out\")];\n", DIM, SEQ];

    [mil appendString:@" } -> (out);\n}\n"];
    return mil;
}
|
||||
|
|
@ -0,0 +1,78 @@
|
|||
// ane_rmsnorm_bwd.h — MIL generator for RMSNorm backward on ANE
|
||||
// Replaces CPU rmsnorm_bwd() from stories_cpu_ops.h
|
||||
//
|
||||
// RMSNorm forward: xn = x * rrms * w, where rrms = 1/sqrt(mean(x²) + eps)
|
||||
// RMSNorm backward: dx = rrms * (dy*w - x * sum(dy*w*x) * invd * rrms²)
|
||||
//
|
||||
// Input: concat(dy, x) as [1, 2*DIM, 1, SEQ]
|
||||
// Baked: RMSNorm weights w [1, DIM, 1, 1] as BLOBFILE
|
||||
// Output: dx [1, DIM, 1, SEQ]
|
||||
//
|
||||
// Note: dw (weight gradient) stays on CPU — it requires reduce_sum over SEQ
|
||||
// and accumulation across steps, which is cheap and better done on CPU.
|
||||
#pragma once
|
||||
#include "stories_mil.h"
|
||||
|
||||
// Generate MIL for RMSNorm backward
|
||||
// Input: concat(dy, x) [1, 2*DIM, 1, SEQ]
|
||||
// Baked weights: rms_w [DIM] — the RMSNorm scale weights
|
||||
// Output: dx [1, DIM, 1, SEQ]
|
||||
// Returns the MIL text for RMSNorm backward. The single input packs dy and x
// along axis 1 ([1, 2*DIM, 1, SEQ]); the program computes
//   out = (dy*w - x * coeff) * rrms,  coeff = sum(dy*w*x) * (1/DIM) * rrms^2
// with w read from weights/rms_w.bin.
static NSString *gen_rmsnorm_bwd(void) {
    const float inv_dim = 1.0f / (float)DIM;

    NSMutableString *mil = [NSMutableString string];
    [mil appendString:MIL_HDR];
    [mil appendFormat:@" func main<ios18>(tensor<fp16, [1, %d, 1, %d]> inp) {\n", 2*DIM, SEQ];

    // --- Unpack: dy is channels [0, DIM), x is channels [DIM, 2*DIM). ---
    [mil appendFormat:@" tensor<int32, [4]> sz = const()[name=string(\"sz\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
    [mil appendString:@" tensor<int32, [4]> b0 = const()[name=string(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
    [mil appendFormat:@" tensor<fp16, [1,%d,1,%d]> dy = slice_by_size(x=inp,begin=b0,size=sz)[name=string(\"sdy\")];\n", DIM, SEQ];
    [mil appendFormat:@" tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
    [mil appendFormat:@" tensor<fp16, [1,%d,1,%d]> x = slice_by_size(x=inp,begin=b1,size=sz)[name=string(\"sx\")];\n", DIM, SEQ];

    // --- rrms = (sum(x*x over axis 1) * invd + eps)^(-0.5), shape [1,1,1,SEQ]. ---
    [mil appendFormat:@" tensor<fp16, [1,%d,1,%d]> sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ];
    [mil appendString:@" tensor<int32, [1]> rax = const()[name=string(\"rax\"), val=tensor<int32, [1]>([1])];\n"];
    [mil appendString:@" bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"];
    [mil appendFormat:@" tensor<fp16, [1,1,1,%d]> ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ];
    [mil appendFormat:@" fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", inv_dim];
    [mil appendFormat:@" tensor<fp16, [1,1,1,%d]> ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ];
    [mil appendString:@" fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"];
    [mil appendFormat:@" tensor<fp16, [1,1,1,%d]> ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ];
    [mil appendString:@" fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"];
    [mil appendFormat:@" tensor<fp16, [1,1,1,%d]> rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ];

    // --- Baked scale weights w, shape [1, DIM, 1, 1]. ---
    [mil appendFormat:@" tensor<fp16, [1,%d,1,1]> w = const()[name=string(\"w\"), val=tensor<fp16, [1,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/rms_w.bin\"), offset=uint64(64)))];\n", DIM, DIM];

    // --- coeff = sum(dy*w*x over axis 1) * invd * rrms^2, shape [1,1,1,SEQ]. ---
    [mil appendFormat:@" tensor<fp16, [1,%d,1,%d]> dyw = mul(x=dy,y=w)[name=string(\"dyw\")];\n", DIM, SEQ];
    [mil appendFormat:@" tensor<fp16, [1,%d,1,%d]> dywx = mul(x=dyw,y=x)[name=string(\"dywx\")];\n", DIM, SEQ];
    [mil appendFormat:@" tensor<fp16, [1,1,1,%d]> dot_sum = reduce_sum(x=dywx,axes=rax,keep_dims=kd)[name=string(\"ds\")];\n", SEQ];
    [mil appendFormat:@" tensor<fp16, [1,1,1,%d]> dot_sc = mul(x=dot_sum,y=invd)[name=string(\"dsc\")];\n", SEQ];
    [mil appendFormat:@" tensor<fp16, [1,1,1,%d]> rrms2 = mul(x=rrms,y=rrms)[name=string(\"rr2\")];\n", SEQ];
    [mil appendFormat:@" tensor<fp16, [1,1,1,%d]> coeff = mul(x=dot_sc,y=rrms2)[name=string(\"cof\")];\n", SEQ];

    // --- out = (dyw - x*coeff) * rrms, shape [1, DIM, 1, SEQ]. ---
    [mil appendFormat:@" tensor<fp16, [1,%d,1,%d]> xc = mul(x=x,y=coeff)[name=string(\"xc\")];\n", DIM, SEQ];
    [mil appendFormat:@" tensor<fp16, [1,%d,1,%d]> diff = sub(x=dyw,y=xc)[name=string(\"dif\")];\n", DIM, SEQ];
    [mil appendFormat:@" tensor<fp16, [1,%d,1,%d]> out = mul(x=diff,y=rrms)[name=string(\"out\")];\n", DIM, SEQ];

    [mil appendString:@" } -> (out);\n}\n"];
    return mil;
}
|
||||
|
|
@ -0,0 +1,91 @@
|
|||
#!/bin/bash
# Download pretokenized TinyStories data for ANE training
# Format: flat uint16 token IDs (Llama2 BPE, 32K vocab)
# Source: enio/TinyStories on HuggingFace (pretokenized with karpathy/llama2.c)
#
# The tar.gz contains data00.bin..data49.bin (50 shards).
# We extract only data00.bin and rename it to tinystories_data00.bin.

set -e

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
OUTPUT="$SCRIPT_DIR/tinystories_data00.bin"

# Print the size of file $1 in bytes (BSD/macOS stat first, GNU stat as fallback).
file_size() {
    stat -f%z "$1" 2>/dev/null || stat -c%s "$1" 2>/dev/null
}

if [ -f "$OUTPUT" ]; then
    SIZE=$(file_size "$OUTPUT")
    TOKENS=$((SIZE / 2))
    echo "$OUTPUT already exists ($TOKENS tokens, $(echo "scale=1; $SIZE/1000000" | bc) MB)"
    exit 0
fi

TAR_URL="https://huggingface.co/datasets/enio/TinyStories/resolve/main/tok32000/TinyStories_tok32000.tar.gz?download=true"
TAR_FILE="$SCRIPT_DIR/TinyStories_tok32000.tar.gz"

echo "=== TinyStories Data Download ==="
echo "Downloading pretokenized TinyStories (32K vocab, ~993 MB)..."
echo " Source: enio/TinyStories on HuggingFace"
echo " This will take a few minutes depending on your connection."
echo ""

# Download the tar.gz
if [ ! -f "$TAR_FILE" ]; then
    # Download under a temporary name and rename only on success, so an
    # interrupted transfer is never mistaken for a complete archive by the
    # "already downloaded" shortcut on the next run.
    TAR_PART="$TAR_FILE.part"
    rm -f "$TAR_PART"
    if command -v curl &>/dev/null; then
        # -f: fail on HTTP errors instead of saving the error page as the archive.
        curl -fL --progress-bar -o "$TAR_PART" "$TAR_URL"
    elif command -v wget &>/dev/null; then
        wget --show-progress -O "$TAR_PART" "$TAR_URL"
    else
        echo "Error: need curl or wget"
        exit 1
    fi
    mv "$TAR_PART" "$TAR_FILE"
else
    echo "Tar file already downloaded, skipping..."
fi

# Verify it's actually a gzip file (not an error page)
if ! file "$TAR_FILE" | grep -q "gzip"; then
    echo "Error: Downloaded file is not a valid gzip archive."
    echo "Content: $(head -c 100 "$TAR_FILE")"
    rm -f "$TAR_FILE"
    exit 1
fi

echo ""
echo "Extracting data00.bin from archive..."

# List what's in the archive to find the right path. The pattern is anchored
# so that only a member whose file name is exactly data00.bin matches.
DATA_FILE=$(tar tzf "$TAR_FILE" 2>/dev/null | grep -E '(^|/)data00\.bin$' | head -1)
if [ -z "$DATA_FILE" ]; then
    echo "Error: data00.bin not found in archive. Contents:"
    tar tzf "$TAR_FILE" | head -20
    exit 1
fi
echo " Found: $DATA_FILE"

# Extract just data00.bin
tar xzf "$TAR_FILE" -C "$SCRIPT_DIR" "$DATA_FILE"

# Move to expected location (might be in a subdirectory)
EXTRACTED="$SCRIPT_DIR/$DATA_FILE"
if [ "$EXTRACTED" != "$OUTPUT" ]; then
    mv "$EXTRACTED" "$OUTPUT"
    # Clean up any extracted subdirectories (best effort: fails harmlessly
    # when the directory is not empty).
    rmdir "$(dirname "$EXTRACTED")" 2>/dev/null || true
fi

# Clean up tar.gz to save disk space
echo "Cleaning up archive..."
rm -f "$TAR_FILE"

SIZE=$(file_size "$OUTPUT")
TOKENS=$((SIZE / 2))
echo ""
echo "Done: $OUTPUT"
echo " $TOKENS tokens ($(echo "scale=1; $SIZE/1000000" | bc) MB)"

# Sanity check (best effort). The path is passed as an argument rather than
# spliced into the Python source, so quotes in the path cannot break the code.
python3 - "$OUTPUT" <<'PY' 2>/dev/null || true
import struct
import sys
with open(sys.argv[1], 'rb') as f:
    tokens = struct.unpack('<10H', f.read(20))
print(f'First 10 tokens: {tokens}')
PY
|
||||
|
|
@ -0,0 +1,255 @@
|
|||
// test_classifier.m — Test classifier matmul (32000 channels) and softmax on ANE
|
||||
// This tests the riskiest operations: VOCAB-sized conv and softmax
|
||||
// Build: xcrun clang -O2 -framework Foundation -framework IOSurface \
|
||||
// -framework CoreML -framework Accelerate -ldl -lobjc \
|
||||
// -o test_classifier test_classifier.m
|
||||
#include "ane_classifier.h"
|
||||
#include "stories_cpu_ops.h"
|
||||
|
||||
// Fold one absolute error into a running maximum. A NaN error is sticky:
// once stored, every later `max_err < threshold` comparison is false, so NaN
// output from the ANE can never be reported as PASS.
static inline void tc_track_max_err(float *max_err, float e) {
    if (isnan(e) || e > *max_err) *max_err = e;
}

// Runs four ANE-vs-CPU comparisons (final RMSNorm, classifier forward,
// softmax over VOCAB, classifier backward), printing PASS/FAIL for each.
// Returns 0 only if every kernel compiled and met its error threshold,
// and 1 otherwise, so the result is usable from scripts.
int main(void) {
    @autoreleasepool {
        setbuf(stdout, NULL);
        ane_init();
        mach_timebase_info(&g_tb);

        int failures = 0;   // number of tests that failed to compile or exceeded tolerance

        printf("=== Test: Classifier + Softmax on ANE ===\n");
        printf("DIM=%d SEQ=%d VOCAB=%d\n\n", DIM, SEQ, VOCAB);

        // ======== Test 1: Final RMSNorm ========
        printf("--- Test 1: Final RMSNorm on ANE ---\n");
        {
            float *x = (float*)malloc(DIM * SEQ * 4);
            float *w = (float*)malloc(DIM * 4);
            float *out_cpu = (float*)malloc(DIM * SEQ * 4);
            float *out_ane = (float*)malloc(DIM * SEQ * 4);
            if (!x || !w || !out_cpu || !out_ane) { printf("FAIL: out of memory\n"); return 1; }
            srand48(42);
            for (int i = 0; i < DIM * SEQ; i++) x[i] = (float)(drand48() * 2 - 1);
            for (int i = 0; i < DIM; i++) w[i] = (float)(drand48() * 0.5 + 0.75);

            rmsnorm(out_cpu, x, w, DIM, SEQ);

            Kern *kern = compile_kern_mil_w(gen_final_rmsnorm(), (@{
                @"@model_path/weights/rms_w.bin": @{@"offset":@0, @"data":build_blob(w, 1, DIM)},
            }), DIM*SEQ*2, DIM*SEQ*2);

            if (!kern) { printf("FAIL: Final RMSNorm compile failed\n"); return 1; }
            printf("Compile OK\n");

            io_write_fp16(kern->ioIn, x, DIM, SEQ);
            ane_eval(kern);
            io_read_fp16(kern->ioOut, out_ane, 0, DIM, SEQ);

            float max_err = 0;
            for (int i = 0; i < DIM*SEQ; i++) {
                tc_track_max_err(&max_err, fabsf(out_cpu[i] - out_ane[i]));
            }
            bool ok = max_err < 0.05;
            if (!ok) failures++;
            printf("Max error: %.6f %s\n\n", max_err, ok ? "PASS ✅" : "FAIL ❌");
            free_kern(kern);
            free(x); free(w); free(out_cpu); free(out_ane);
        }

        // ======== Test 2: Classifier forward (32000-channel conv) ========
        printf("--- Test 2: Classifier Forward (VOCAB=%d channel conv) ---\n", VOCAB);
        {
            float *x_final = (float*)malloc(DIM * SEQ * 4);
            float *embed = (float*)malloc((size_t)VOCAB * DIM * 4);
            float *logits_cpu = (float*)malloc((size_t)VOCAB * SEQ * 4);
            float *logits_ane = (float*)malloc((size_t)VOCAB * SEQ * 4);
            if (!x_final || !embed || !logits_cpu || !logits_ane) { printf("FAIL: out of memory\n"); return 1; }

            srand48(123);
            for (int i = 0; i < DIM * SEQ; i++) x_final[i] = (float)(drand48() * 2 - 1) * 0.1f;
            for (size_t i = 0; i < (size_t)VOCAB * DIM; i++) embed[i] = (float)(drand48() * 2 - 1) * 0.02f;

            // CPU reference: logits = embed @ x_final
            // logits[v, t] = sum_d embed[v,d] * x_final[d,t]
            // embed is [VOCAB, DIM] row-major, x_final is [DIM, SEQ] channel-first
            uint64_t t0 = mach_absolute_time();
            cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                        VOCAB, SEQ, DIM, 1.0f,
                        embed, DIM, x_final, SEQ, 0.0f, logits_cpu, SEQ);
            uint64_t t1 = mach_absolute_time();
            printf("CPU cblas_sgemm: %.2f ms\n", tb_ms(t1-t0));

            // ANE: build weight blob for embed [VOCAB, DIM]
            printf("Building embed blob (%.1f MB fp16)...\n", (float)VOCAB*DIM*2/1e6);
            NSData *embed_blob = build_blob(embed, VOCAB, DIM);

            printf("Compiling classifier kernel...\n");
            t0 = mach_absolute_time();
            Kern *cls = compile_kern_mil_w(gen_classifier_fwd(), (@{
                @"@model_path/weights/embed.bin": @{@"offset":@0, @"data":embed_blob},
            }), DIM*SEQ*2, VOCAB*SEQ*2);
            t1 = mach_absolute_time();

            if (!cls) {
                failures++;
                printf("FAIL: Classifier compile failed (32000 channels too large for ANE)\n");
                printf("This confirms tiling is needed.\n\n");
            } else {
                printf("Compile OK in %.0f ms (compiles=%d)\n", tb_ms(t1-t0), g_compile_count);

                io_write_fp16(cls->ioIn, x_final, DIM, SEQ);
                t0 = mach_absolute_time();
                ane_eval(cls);
                t1 = mach_absolute_time();
                printf("ANE eval: %.2f ms\n", tb_ms(t1-t0));

                // Read back the full [VOCAB, SEQ] logits and compare every element.
                io_read_fp16(cls->ioOut, logits_ane, 0, VOCAB, SEQ);

                float max_err = 0, sum_err = 0;
                int cnt = 0;
                for (int v = 0; v < VOCAB; v++) {
                    for (int t = 0; t < SEQ; t++) {
                        int idx = v*SEQ + t;
                        float e = fabsf(logits_cpu[idx] - logits_ane[idx]);
                        sum_err += e;
                        cnt++;
                        tc_track_max_err(&max_err, e);
                    }
                }
                bool ok = max_err < 1.0;
                if (!ok) failures++;
                printf("Max error: %.6f Mean error: %.6f %s\n",
                       max_err, sum_err/cnt, ok ? "PASS ✅" : "FAIL ❌");

                // Benchmark
                int N = 10;
                t0 = mach_absolute_time();
                for (int i = 0; i < N; i++) ane_eval(cls);
                t1 = mach_absolute_time();
                printf("Benchmark: %d evals in %.2f ms (%.2f ms/eval)\n\n", N, tb_ms(t1-t0), tb_ms(t1-t0)/N);
                free_kern(cls);
            }
            free(x_final); free(embed); free(logits_cpu); free(logits_ane);
        }

        // ======== Test 3: Softmax over VOCAB dimension ========
        printf("--- Test 3: Softmax over VOCAB=%d ---\n", VOCAB);
        {
            float *logits = (float*)malloc((size_t)VOCAB * SEQ * 4);
            float *probs_cpu = (float*)malloc((size_t)VOCAB * SEQ * 4);
            float *probs_ane = (float*)malloc((size_t)VOCAB * SEQ * 4);
            if (!logits || !probs_cpu || !probs_ane) { printf("FAIL: out of memory\n"); return 1; }

            srand48(999);
            for (size_t i = 0; i < (size_t)VOCAB * SEQ; i++)
                logits[i] = (float)(drand48() * 10 - 5);

            // CPU reference softmax (per position, over vocab)
            // logits is [VOCAB, SEQ] channel-first
            uint64_t t0 = mach_absolute_time();
            for (int t = 0; t < SEQ; t++) {
                // Subtract the per-position maximum before exponentiating for numerical stability.
                float maxv = -1e30f;
                for (int v = 0; v < VOCAB; v++) {
                    float val = logits[v*SEQ+t];
                    if (val > maxv) maxv = val;
                }
                float sum = 0;
                for (int v = 0; v < VOCAB; v++) {
                    probs_cpu[v*SEQ+t] = expf(logits[v*SEQ+t] - maxv);
                    sum += probs_cpu[v*SEQ+t];
                }
                for (int v = 0; v < VOCAB; v++) probs_cpu[v*SEQ+t] /= sum;
            }
            uint64_t t1 = mach_absolute_time();
            printf("CPU softmax: %.2f ms\n", tb_ms(t1-t0));

            printf("Compiling softmax kernel...\n");
            int sm_bytes = VOCAB * SEQ * 2;
            Kern *sm = compile_kern_mil_w(gen_softmax_vocab(), @{}, sm_bytes, sm_bytes);

            if (!sm) {
                failures++;
                printf("FAIL: Softmax compile failed\n\n");
            } else {
                printf("Compile OK\n");

                io_write_fp16(sm->ioIn, logits, VOCAB, SEQ);
                t0 = mach_absolute_time();
                ane_eval(sm);
                t1 = mach_absolute_time();
                printf("ANE eval: %.2f ms\n", tb_ms(t1-t0));

                io_read_fp16(sm->ioOut, probs_ane, 0, VOCAB, SEQ);

                // Check: probs should sum to ~1.0 per position (only the first
                // four sequence positions are inspected).
                float max_err = 0;
                for (int t = 0; t < 4 && t < SEQ; t++) {
                    float sum_cpu = 0, sum_ane = 0;
                    for (int v = 0; v < VOCAB; v++) {
                        sum_cpu += probs_cpu[v*SEQ+t];
                        sum_ane += probs_ane[v*SEQ+t];
                        tc_track_max_err(&max_err, fabsf(probs_cpu[v*SEQ+t] - probs_ane[v*SEQ+t]));
                    }
                    printf(" pos %d: CPU sum=%.4f ANE sum=%.4f\n", t, sum_cpu, sum_ane);
                }
                bool ok = max_err < 0.01;
                if (!ok) failures++;
                printf("Max error (first 4 positions): %.6f %s\n",
                       max_err, ok ? "PASS ✅" : "FAIL ❌");

                int N = 10;
                t0 = mach_absolute_time();
                for (int i = 0; i < N; i++) ane_eval(sm);
                t1 = mach_absolute_time();
                printf("Benchmark: %d evals in %.2f ms (%.2f ms/eval)\n\n", N, tb_ms(t1-t0), tb_ms(t1-t0)/N);
                free_kern(sm);
            }
            free(logits); free(probs_cpu); free(probs_ane);
        }

        // ======== Test 4: Classifier backward ========
        printf("--- Test 4: Classifier Backward (DIM=%d from VOCAB=%d) ---\n", DIM, VOCAB);
        {
            float *dlogits = (float*)malloc((size_t)VOCAB * SEQ * 4);
            float *embed = (float*)malloc((size_t)VOCAB * DIM * 4);
            float *dx_cpu = (float*)malloc(DIM * SEQ * 4);
            float *dx_ane = (float*)malloc(DIM * SEQ * 4);
            if (!dlogits || !embed || !dx_cpu || !dx_ane) { printf("FAIL: out of memory\n"); return 1; }

            srand48(456);
            for (size_t i = 0; i < (size_t)VOCAB * SEQ; i++) dlogits[i] = (float)(drand48() * 2 - 1) * 0.01f;
            for (size_t i = 0; i < (size_t)VOCAB * DIM; i++) embed[i] = (float)(drand48() * 2 - 1) * 0.02f;

            // CPU: dx = embed^T @ dlogits
            uint64_t t0 = mach_absolute_time();
            cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                        DIM, SEQ, VOCAB, 1.0f,
                        embed, DIM, dlogits, SEQ, 0.0f, dx_cpu, SEQ);
            uint64_t t1 = mach_absolute_time();
            printf("CPU cblas_sgemm: %.2f ms\n", tb_ms(t1-t0));

            // Build transposed embed blob
            NSData *embed_t_blob = build_blob_t(embed, VOCAB, DIM);

            printf("Compiling classifier backward...\n");
            Kern *clsb = compile_kern_mil_w(gen_classifier_bwd(), (@{
                @"@model_path/weights/embed_t.bin": @{@"offset":@0, @"data":embed_t_blob},
            }), VOCAB*SEQ*2, DIM*SEQ*2);

            if (!clsb) {
                failures++;
                printf("FAIL: Classifier backward compile failed\n\n");
            } else {
                printf("Compile OK\n");

                io_write_fp16(clsb->ioIn, dlogits, VOCAB, SEQ);
                t0 = mach_absolute_time();
                ane_eval(clsb);
                t1 = mach_absolute_time();
                printf("ANE eval: %.2f ms\n", tb_ms(t1-t0));

                io_read_fp16(clsb->ioOut, dx_ane, 0, DIM, SEQ);

                float max_err = 0, sum_err = 0;
                for (int i = 0; i < DIM*SEQ; i++) {
                    float e = fabsf(dx_cpu[i] - dx_ane[i]);
                    sum_err += e;
                    tc_track_max_err(&max_err, e);
                }
                bool ok = max_err < 1.0;
                if (!ok) failures++;
                printf("Max error: %.6f Mean error: %.6f %s\n\n",
                       max_err, sum_err/(DIM*SEQ), ok ? "PASS ✅" : "FAIL ❌");
                free_kern(clsb);
            }
            free(dlogits); free(embed); free(dx_cpu); free(dx_ane);
        }

        printf("=== All tests complete ===\n");
        printf("Total ANE compiles used: %d\n", g_compile_count);
        if (failures) printf("%d test(s) FAILED\n", failures);
        return failures ? 1 : 0;
    }
}
|
||||
|
|
@ -0,0 +1,123 @@
|
|||
// test_rmsnorm_bwd.m — Test RMSNorm backward ANE kernel vs CPU reference
|
||||
// Build: xcrun clang -O2 -framework Foundation -framework IOSurface \
|
||||
// -framework CoreML -framework Accelerate -ldl -lobjc \
|
||||
// -o test_rmsnorm_bwd test_rmsnorm_bwd.m
|
||||
#include "ane_rmsnorm_bwd.h"
|
||||
#include "stories_cpu_ops.h"
|
||||
|
||||
// Compares the ANE RMSNorm-backward kernel against the CPU reference
// rmsnorm_bwd() on random data, prints error statistics and a small
// benchmark, and returns 0 when both error thresholds are met (1 otherwise).
int main(void) {
    @autoreleasepool {
        setbuf(stdout, NULL);
        ane_init();
        mach_timebase_info(&g_tb);

        printf("=== Test: RMSNorm Backward on ANE ===\n");
        printf("DIM=%d SEQ=%d\n\n", DIM, SEQ);

        // Host buffers; activations and gradients are indexed as [channel*SEQ + position].
        const int n_elems = DIM * SEQ;
        float *act      = (float*)malloc(n_elems * 4);
        float *grad_out = (float*)malloc(n_elems * 4);
        float *gain     = (float*)malloc(DIM * 4);
        float *ref_dx   = (float*)calloc(n_elems, 4);
        float *ref_dw   = (float*)calloc(DIM, 4);
        float *ane_dx   = (float*)malloc(n_elems * 4);

        // Deterministic random inputs; the two draws per element stay interleaved
        // so the generated sequence is unchanged.
        srand48(42);
        for (int k = 0; k < n_elems; k++) {
            act[k]      = (float)(drand48() * 2 - 1) * 0.5f;
            grad_out[k] = (float)(drand48() * 2 - 1) * 0.1f;
        }
        for (int c = 0; c < DIM; c++) {
            gain[c] = (float)(drand48() * 0.5 + 0.75);  // close to 1.0
        }

        // --- CPU reference ---
        uint64_t tic = mach_absolute_time();
        rmsnorm_bwd(ref_dx, ref_dw, grad_out, act, gain, DIM, SEQ);
        uint64_t toc = mach_absolute_time();
        printf("CPU rmsnorm_bwd: %.2f ms\n", tb_ms(toc - tic));

        // --- ANE kernel ---
        printf("Compiling ANE rmsnorm_bwd kernel...\n");
        NSString *mil_src = gen_rmsnorm_bwd();
        NSData *gain_blob = build_blob(gain, 1, DIM);

        const int in_bytes  = 2 * DIM * SEQ * 2;  // concat(dy, x) in fp16
        const int out_bytes = DIM * SEQ * 2;      // dx in fp16

        NSDictionary *weights = @{
            @"@model_path/weights/rms_w.bin": @{@"offset":@0, @"data":gain_blob},
        };
        Kern *kern = compile_kern_mil_w(mil_src, weights, in_bytes, out_bytes);
        if (kern == NULL) {
            printf("FAIL: ANE kernel compilation failed!\n");
            return 1;
        }
        printf("Compile OK (compiles=%d)\n", g_compile_count);

        // Input layout: dy occupies channels [0, DIM), x occupies [DIM, 2*DIM).
        io_write_fp16_at(kern->ioIn, 0, grad_out, DIM, SEQ);
        io_write_fp16_at(kern->ioIn, DIM, act, DIM, SEQ);

        tic = mach_absolute_time();
        ane_eval(kern);
        toc = mach_absolute_time();
        printf("ANE eval: %.3f ms\n", tb_ms(toc - tic));

        io_read_fp16(kern->ioOut, ane_dx, 0, DIM, SEQ);

        // --- Compare: one flat pass in the same element order as [i*SEQ + j] ---
        float max_err = 0, sum_err = 0;
        int worst = 0;
        for (int k = 0; k < n_elems; k++) {
            float err = fabsf(ref_dx[k] - ane_dx[k]);
            sum_err += err;
            if (err > max_err) {
                max_err = err;
                worst = k;
            }
        }
        const int max_i = worst / SEQ;
        const int max_j = worst % SEQ;
        float mean_err = sum_err / (DIM * SEQ);

        printf("\n=== Results ===\n");
        printf("Max absolute error: %.6f at [%d,%d] (CPU=%.6f ANE=%.6f)\n",
               max_err, max_i, max_j, ref_dx[worst], ane_dx[worst]);
        printf("Mean absolute error: %.6f\n", mean_err);

        // Sample outputs
        printf("\nSample dx values (first 4 channels, first 4 positions):\n");
        printf("%-6s %-12s %-12s %-10s\n", "Idx", "CPU", "ANE", "Error");
        const int show_rows = DIM < 4 ? DIM : 4;
        const int show_cols = SEQ < 4 ? SEQ : 4;
        for (int r = 0; r < show_rows; r++) {
            for (int c = 0; c < show_cols; c++) {
                const int at = r * SEQ + c;
                printf("[%d,%d] %-12.6f %-12.6f %-10.6f\n",
                       r, c, ref_dx[at], ane_dx[at], fabsf(ref_dx[at] - ane_dx[at]));
            }
        }

        // Benchmark: repeated evaluations of the already-compiled kernel
        int N = 100;
        tic = mach_absolute_time();
        for (int rep = 0; rep < N; rep++) ane_eval(kern);
        toc = mach_absolute_time();
        printf("\nBenchmark: %d evals in %.2f ms (%.3f ms/eval)\n",
               N, tb_ms(toc-tic), tb_ms(toc-tic)/N);

        // Pass/fail
        bool pass = max_err < 0.05f && mean_err < 0.01f;
        printf("\n%s (threshold: max<0.05, mean<0.01)\n", pass ? "PASS ✅" : "FAIL ❌");

        free_kern(kern);
        free(act); free(grad_out); free(gain); free(ref_dx); free(ref_dw); free(ane_dx);
        if (pass) return 0;
        return 1;
    }
}
|
||||
|
|
@ -0,0 +1,695 @@
|
|||
// train_large_ane.m — Stories110M training with CPU ops offloaded to ANE
|
||||
// Based on train_large.m but moves these operations from CPU to ANE:
|
||||
// 1. Final RMSNorm (was CPU vDSP) → ANE kernel
|
||||
// 2. Classifier forward embed@x (was CPU cblas) → ANE 32000-ch conv
|
||||
// 3. Cross-entropy softmax (was CPU vDSP) → ANE softmax kernel
|
||||
// 4. RMSNorm backward (was CPU vDSP) → ANE kernel
|
||||
// Still on CPU: dW gradients (parallel via GCD), Adam optimizer (needs weight mutation),
|
||||
// classifier backward (ANE matmul slower than cblas for this shape),
|
||||
// NLL loss + gradient (needs target indexing)
|
||||
//
|
||||
// Build: make train_large_ane
|
||||
// Run: ./train_large_ane [--resume] [--steps N] [--lr F]
|
||||
#include "stories_io.h"
|
||||
#include "stories_mil.h"
|
||||
#include "stories_cpu_ops.h"
|
||||
#include "ane_rmsnorm_bwd.h"
|
||||
#include "ane_classifier.h"
|
||||
|
||||
#define CKPT_PATH "ane_stories110M_ckpt.bin"
|
||||
#define MODEL_PATH "../../assets/models/stories110M.bin"
|
||||
#define DATA_PATH "tinystories_data00.bin"
|
||||
|
||||
// ===== Weight loading from llama2.c format =====
|
||||
// Read exactly `count` 4-byte elements into dst; false on a short read or I/O error.
static bool pretrained_read_f32(FILE *f, void *dst, size_t count) {
    return fread(dst, 4, count, f) == count;
}

// Load pretrained weights from a llama2.c-format file.
//   lw        : array of NLAYERS layer-weight structs whose buffers are filled in
//   rms_final : receives DIM final-RMSNorm weights
//   embed     : receives VOCAB*DIM embedding weights
//   path      : file to read
// Returns true only if the header matches the compiled configuration
// (DIM, HIDDEN, NLAYERS, VOCAB) and every tensor was read completely.
// On failure the destination buffers may be partially overwritten.
static bool load_pretrained(LayerWeights *lw, float *rms_final, float *embed, const char *path) {
    FILE *f = fopen(path, "rb");
    if (!f) { printf("Cannot open %s\n", path); return false; }
    Llama2Config cfg;
    if (fread(&cfg, sizeof(cfg), 1, f) != 1) {
        printf(" ERROR: Truncated header in %s\n", path); fclose(f); return false;
    }
    printf(" Model config: dim=%d hidden=%d layers=%d heads=%d vocab=%d seq=%d\n",
           cfg.dim, cfg.hidden_dim, cfg.n_layers, cfg.n_heads, abs(cfg.vocab_size), cfg.seq_len);
    // A negative vocab_size is used by the format as a flag (see `shared` below);
    // the magnitude is the actual vocabulary size.
    int V = abs(cfg.vocab_size);
    // The vocab check matters for memory safety: the caller sizes `embed`
    // for VOCAB*DIM floats, so a larger V would write past the buffer.
    if (cfg.dim != DIM || cfg.hidden_dim != HIDDEN || cfg.n_layers != NLAYERS || V != VOCAB) {
        printf(" ERROR: Config mismatch!\n"); fclose(f); return false;
    }
    bool shared = cfg.vocab_size > 0;

    // Tensors are stored grouped by kind (all layers' rms_att, then all Wq, ...).
    bool ok = pretrained_read_f32(f, embed, (size_t)V * DIM);
    for (int L = 0; ok && L < NLAYERS; L++) ok = pretrained_read_f32(f, lw[L].rms_att, DIM);
    for (int L = 0; ok && L < NLAYERS; L++) ok = pretrained_read_f32(f, lw[L].Wq, WQ_SZ);
    for (int L = 0; ok && L < NLAYERS; L++) ok = pretrained_read_f32(f, lw[L].Wk, WQ_SZ);
    for (int L = 0; ok && L < NLAYERS; L++) ok = pretrained_read_f32(f, lw[L].Wv, WQ_SZ);
    for (int L = 0; ok && L < NLAYERS; L++) ok = pretrained_read_f32(f, lw[L].Wo, WO_SZ);
    for (int L = 0; ok && L < NLAYERS; L++) ok = pretrained_read_f32(f, lw[L].rms_ffn, DIM);
    for (int L = 0; ok && L < NLAYERS; L++) ok = pretrained_read_f32(f, lw[L].W1, W1_SZ);
    for (int L = 0; ok && L < NLAYERS; L++) ok = pretrained_read_f32(f, lw[L].W2, W2_SZ);
    for (int L = 0; ok && L < NLAYERS; L++) ok = pretrained_read_f32(f, lw[L].W3, W3_SZ);
    ok = ok && pretrained_read_f32(f, rms_final, DIM);
    fclose(f);
    if (!ok) {
        printf(" ERROR: %s is truncated or unreadable\n", path);
        return false;
    }
    printf(" Loaded pretrained weights (%s)\n", shared ? "shared embed/cls" : "separate cls");
    return true;
}
|
||||
|
||||
// ===== Compile one layer's kernels =====
|
||||
// Wrap a weight blob in the {offset, data} entry used by the weight dictionaries.
static NSDictionary *lk_blob_entry(id blob) {
    return @{@"offset":@0, @"data":blob};
}

// Compile the five per-layer ANE kernels (attention fwd, FFN fwd, FFN bwd,
// SDPA bwd stage 1, QKV bwd) from the weights in `w`, storing the handles in `lk`.
// All five compilations are always attempted; returns true only if every one
// produced a non-NULL kernel.
static bool compile_layer_kernels(LayerKernels *lk, LayerWeights *w) {
    // Attention forward
    lk->fwdAttn = compile_kern_mil_w(gen_sdpa_fwd_taps(), (@{
        @"@model_path/weights/rms1.bin": lk_blob_entry(build_blob(w->rms_att,1,DIM)),
        @"@model_path/weights/wq.bin":   lk_blob_entry(build_blob(w->Wq,DIM,DIM)),
        @"@model_path/weights/wk.bin":   lk_blob_entry(build_blob(w->Wk,DIM,DIM)),
        @"@model_path/weights/wv.bin":   lk_blob_entry(build_blob(w->Wv,DIM,DIM)),
        @"@model_path/weights/wo.bin":   lk_blob_entry(build_blob(w->Wo,DIM,DIM)),
        @"@model_path/weights/mask.bin": lk_blob_entry(get_mask_blob()),
    }), DIM*SEQ*2, 6*DIM*SEQ*2);

    // Feed-forward forward
    lk->fwdFFN = compile_kern_mil_w(gen_ffn_fwd_taps(), (@{
        @"@model_path/weights/rms2.bin": lk_blob_entry(build_blob(w->rms_ffn,1,DIM)),
        @"@model_path/weights/w1.bin":   lk_blob_entry(build_blob(w->W1,HIDDEN,DIM)),
        @"@model_path/weights/w3.bin":   lk_blob_entry(build_blob(w->W3,HIDDEN,DIM)),
        @"@model_path/weights/w2.bin":   lk_blob_entry(build_blob(w->W2,DIM,HIDDEN)),
    }), DIM*SEQ*2, (2*DIM+3*HIDDEN)*SEQ*2);

    // Feed-forward backward (transposed weights)
    lk->ffnBwd = compile_kern_mil_w(gen_ffn_bwd(), (@{
        @"@model_path/weights/w2t.bin": lk_blob_entry(build_blob_t(w->W2,DIM,HIDDEN)),
        @"@model_path/weights/w1t.bin": lk_blob_entry(build_blob_t(w->W1,HIDDEN,DIM)),
        @"@model_path/weights/w3t.bin": lk_blob_entry(build_blob_t(w->W3,HIDDEN,DIM)),
    }), (DIM+2*HIDDEN)*SEQ*2, (DIM+2*HIDDEN)*SEQ*2);

    // SDPA backward, stage 1
    lk->sdpaBwd1 = compile_kern_mil_w(gen_sdpa_bwd1(), (@{
        @"@model_path/weights/mask.bin": lk_blob_entry(get_mask_blob()),
        @"@model_path/weights/wot.bin":  lk_blob_entry(build_blob_t(w->Wo,DIM,DIM)),
    }), 4*DIM*SEQ*2, (DIM+2*SCORE_CH)*SEQ*2);

    // QKV projection backward (transposed weights)
    lk->qkvBwd = compile_kern_mil_w(gen_qkvb(), (@{
        @"@model_path/weights/wqt.bin": lk_blob_entry(build_blob_t(w->Wq,DIM,DIM)),
        @"@model_path/weights/wkt.bin": lk_blob_entry(build_blob_t(w->Wk,DIM,DIM)),
        @"@model_path/weights/wvt.bin": lk_blob_entry(build_blob_t(w->Wv,DIM,DIM)),
    }), 3*DIM*SEQ*2, DIM*SEQ*2);

    if (lk->fwdAttn == NULL) return false;
    if (lk->fwdFFN == NULL) return false;
    if (lk->ffnBwd == NULL) return false;
    if (lk->sdpaBwd1 == NULL) return false;
    return lk->qkvBwd != NULL;
}
|
||||
|
||||
static Kern *compile_sdpa_bwd2(void) {
    // Weight-free kernel: only the input/output byte sizes are needed
    // (the trailing *2 is bytes per fp16 element).
    const int in_bytes  = (2*SCORE_CH+2*DIM)*SEQ*2;
    const int out_bytes = 2*DIM*SEQ*2;
    Kern *k = compile_kern_mil_w(gen_sdpa_bwd2(), @{}, in_bytes, out_bytes);
    return k;
}
|
||||
|
||||
// NEW: Compile RMSNorm backward kernels (one per layer pair: attn + ffn)
|
||||
static Kern *compile_rmsnorm_bwd_kern(const float *rms_w) {
    // rms_w is packed as a 1 x DIM blob under the name the MIL program references.
    NSData *w_blob = build_blob(rms_w, 1, DIM);
    NSDictionary *weights = @{
        @"@model_path/weights/rms_w.bin": @{@"offset":@0, @"data":w_blob},
    };
    // Input is twice the size of the output: 2*DIM*SEQ fp16 in, DIM*SEQ fp16 out.
    return compile_kern_mil_w(gen_rmsnorm_bwd(), weights, 2*DIM*SEQ*2, DIM*SEQ*2);
}
|
||||
|
||||
// NEW: Compile classifier forward kernel
|
||||
static Kern *compile_classifier_fwd(const float *embed) {
    // The embedding table is packed as a VOCAB x DIM blob for the classifier kernel.
    NSData *embed_blob = build_blob(embed, VOCAB, DIM);
    NSDictionary *weights = @{
        @"@model_path/weights/embed.bin": @{@"offset":@0, @"data":embed_blob},
    };
    // DIM*SEQ fp16 values in, VOCAB*SEQ fp16 values out.
    return compile_kern_mil_w(gen_classifier_fwd(), weights, DIM*SEQ*2, VOCAB*SEQ*2);
}
|
||||
|
||||
// NEW: Compile final RMSNorm kernel
|
||||
static Kern *compile_final_rmsnorm_kern(const float *rms_w) {
    // rms_w is packed as a 1 x DIM blob under the name the MIL program references.
    NSData *w_blob = build_blob(rms_w, 1, DIM);
    NSDictionary *weights = @{
        @"@model_path/weights/rms_w.bin": @{@"offset":@0, @"data":w_blob},
    };
    // Input and output are the same size: DIM*SEQ fp16 values.
    const int io_bytes = DIM*SEQ*2;
    return compile_kern_mil_w(gen_final_rmsnorm(), weights, io_bytes, io_bytes);
}
|
||||
|
||||
// NEW: Compile softmax kernel (no weights)
|
||||
static Kern *compile_softmax_kern(void) {
    // No weights; input and output are both VOCAB*SEQ fp16 values.
    const int io_bytes = VOCAB*SEQ*2;
    return compile_kern_mil_w(gen_softmax_vocab(), @{}, io_bytes, io_bytes);
}
|
||||
|
||||
// Release every kernel held by `lk` and reset the handles to NULL so the
// struct can be reused (or freed again) safely.
static void free_layer_kernels(LayerKernels *lk) {
    free_kern(lk->fwdAttn);
    lk->fwdAttn = NULL;

    free_kern(lk->fwdFFN);
    lk->fwdFFN = NULL;

    free_kern(lk->ffnBwd);
    lk->ffnBwd = NULL;

    free_kern(lk->sdpaBwd1);
    lk->sdpaBwd1 = NULL;

    free_kern(lk->qkvBwd);
    lk->qkvBwd = NULL;
}
|
||||
|
||||
// ===== Checkpoint save/load (same as train_large.m) =====
|
||||
// Write exactly `count` 4-byte elements from src; false on a short write or I/O error.
static bool ckpt_write_f32(FILE *f, const void *src, size_t count) {
    return fwrite(src, 4, count, f) == count;
}

// Write a version-2 training checkpoint to `path`.
// Layout: CkptHdr, then for each layer the weights (Wq, Wk, Wv, Wo, W1, W2, W3,
// rms_att, rms_ffn) followed by the Adam m/v pair for each of them in the same
// order, then rms_final with its Adam state, then embed with its Adam state.
// The function returns nothing; if the file cannot be opened or any write
// fails, a message is printed and the caller continues without a checkpoint.
static void save_checkpoint(const char *path, int step, int total_steps, float lr, float loss,
                            double cc, double ct, double cw, int cs, int cb, int adam_t,
                            LayerWeights *lw, LayerAdam *la, float *rms_final, AdamState *arms_final,
                            float *embed, AdamState *aembed) {
    FILE *f = fopen(path, "wb");
    if (!f) {
        // Without this check every fwrite below would be handed a NULL stream.
        printf("Cannot open %s for writing; checkpoint not saved\n", path);
        return;
    }
    CkptHdr h = {0};
    h.magic = 0x424C5A54; h.version = 2;
    h.step = step; h.total_steps = total_steps;
    h.n_layers = NLAYERS; h.vocab_size = VOCAB; h.dim = DIM;
    h.hidden_dim = HIDDEN; h.n_heads = HEADS; h.seq_len = SEQ;
    h.lr = lr; h.loss = loss;
    h.cum_compile = cc; h.cum_train = ct; h.cum_wall = cw;
    h.cum_steps = cs; h.cum_batches = cb; h.adam_t = adam_t;

    // `&&` short-circuits left to right, so the on-disk order is exactly the
    // order written here and writing stops at the first failure.
    bool ok = fwrite(&h, sizeof(h), 1, f) == 1;
    for (int L = 0; ok && L < NLAYERS; L++) {
        ok = ckpt_write_f32(f, lw[L].Wq, WQ_SZ) && ckpt_write_f32(f, lw[L].Wk, WQ_SZ)
          && ckpt_write_f32(f, lw[L].Wv, WQ_SZ) && ckpt_write_f32(f, lw[L].Wo, WO_SZ)
          && ckpt_write_f32(f, lw[L].W1, W1_SZ) && ckpt_write_f32(f, lw[L].W2, W2_SZ)
          && ckpt_write_f32(f, lw[L].W3, W3_SZ)
          && ckpt_write_f32(f, lw[L].rms_att, DIM) && ckpt_write_f32(f, lw[L].rms_ffn, DIM)
          && ckpt_write_f32(f, la[L].Wq.m, WQ_SZ) && ckpt_write_f32(f, la[L].Wq.v, WQ_SZ)
          && ckpt_write_f32(f, la[L].Wk.m, WQ_SZ) && ckpt_write_f32(f, la[L].Wk.v, WQ_SZ)
          && ckpt_write_f32(f, la[L].Wv.m, WQ_SZ) && ckpt_write_f32(f, la[L].Wv.v, WQ_SZ)
          && ckpt_write_f32(f, la[L].Wo.m, WO_SZ) && ckpt_write_f32(f, la[L].Wo.v, WO_SZ)
          && ckpt_write_f32(f, la[L].W1.m, W1_SZ) && ckpt_write_f32(f, la[L].W1.v, W1_SZ)
          && ckpt_write_f32(f, la[L].W2.m, W2_SZ) && ckpt_write_f32(f, la[L].W2.v, W2_SZ)
          && ckpt_write_f32(f, la[L].W3.m, W3_SZ) && ckpt_write_f32(f, la[L].W3.v, W3_SZ)
          && ckpt_write_f32(f, la[L].rms_att.m, DIM) && ckpt_write_f32(f, la[L].rms_att.v, DIM)
          && ckpt_write_f32(f, la[L].rms_ffn.m, DIM) && ckpt_write_f32(f, la[L].rms_ffn.v, DIM);
    }
    const size_t embed_n = (size_t)VOCAB * DIM;
    ok = ok && ckpt_write_f32(f, rms_final, DIM)
            && ckpt_write_f32(f, arms_final->m, DIM) && ckpt_write_f32(f, arms_final->v, DIM)
            && ckpt_write_f32(f, embed, embed_n)
            && ckpt_write_f32(f, aembed->m, embed_n) && ckpt_write_f32(f, aembed->v, embed_n);

    // fclose flushes buffered data, so its result is part of the write status.
    if (fclose(f) != 0) ok = false;
    if (!ok) printf("Checkpoint write to %s failed; file is incomplete\n", path);
}
|
||||
|
||||
// Read exactly `count` 4-byte elements into dst; false on a short read or I/O error.
static bool ckpt_read_f32(FILE *f, void *dst, size_t count) {
    return fread(dst, 4, count, f) == count;
}

// Load a version-2 training checkpoint from `path`.
// Returns false if the file cannot be opened, the header is missing or has the
// wrong magic/version, the stored model dimensions differ from the compiled
// configuration (NLAYERS, VOCAB, DIM, HIDDEN), or any tensor is truncated.
// The scalar outputs (*step, *lr, ...) are written only on success. The weight
// and Adam buffers may be partially overwritten when a read fails midway.
static bool load_checkpoint(const char *path, int *step, int *total_steps, float *lr, float *loss,
                            double *cc, double *ct, double *cw, int *cs, int *cb, int *adam_t,
                            LayerWeights *lw, LayerAdam *la, float *rms_final, AdamState *arms_final,
                            float *embed, AdamState *aembed) {
    FILE *f = fopen(path, "rb");
    if (!f) return false;
    CkptHdr h;
    // A short header read would leave `h` uninitialised; reject it before looking at any field.
    if (fread(&h, sizeof(h), 1, f) != 1) { fclose(f); return false; }
    if (h.magic != 0x424C5A54 || h.version != 2) { fclose(f); return false; }
    // Tensor sizes below come from compile-time constants, so a checkpoint
    // written for another configuration would be read at the wrong offsets.
    if (h.n_layers != NLAYERS || h.vocab_size != VOCAB || h.dim != DIM || h.hidden_dim != HIDDEN) {
        printf("Checkpoint %s was written for a different model configuration\n", path);
        fclose(f);
        return false;
    }

    // `&&` short-circuits left to right, so tensors are read in file order
    // and reading stops at the first failure.
    bool ok = true;
    for (int L = 0; ok && L < NLAYERS; L++) {
        ok = ckpt_read_f32(f, lw[L].Wq, WQ_SZ) && ckpt_read_f32(f, lw[L].Wk, WQ_SZ)
          && ckpt_read_f32(f, lw[L].Wv, WQ_SZ) && ckpt_read_f32(f, lw[L].Wo, WO_SZ)
          && ckpt_read_f32(f, lw[L].W1, W1_SZ) && ckpt_read_f32(f, lw[L].W2, W2_SZ)
          && ckpt_read_f32(f, lw[L].W3, W3_SZ)
          && ckpt_read_f32(f, lw[L].rms_att, DIM) && ckpt_read_f32(f, lw[L].rms_ffn, DIM)
          && ckpt_read_f32(f, la[L].Wq.m, WQ_SZ) && ckpt_read_f32(f, la[L].Wq.v, WQ_SZ)
          && ckpt_read_f32(f, la[L].Wk.m, WQ_SZ) && ckpt_read_f32(f, la[L].Wk.v, WQ_SZ)
          && ckpt_read_f32(f, la[L].Wv.m, WQ_SZ) && ckpt_read_f32(f, la[L].Wv.v, WQ_SZ)
          && ckpt_read_f32(f, la[L].Wo.m, WO_SZ) && ckpt_read_f32(f, la[L].Wo.v, WO_SZ)
          && ckpt_read_f32(f, la[L].W1.m, W1_SZ) && ckpt_read_f32(f, la[L].W1.v, W1_SZ)
          && ckpt_read_f32(f, la[L].W2.m, W2_SZ) && ckpt_read_f32(f, la[L].W2.v, W2_SZ)
          && ckpt_read_f32(f, la[L].W3.m, W3_SZ) && ckpt_read_f32(f, la[L].W3.v, W3_SZ)
          && ckpt_read_f32(f, la[L].rms_att.m, DIM) && ckpt_read_f32(f, la[L].rms_att.v, DIM)
          && ckpt_read_f32(f, la[L].rms_ffn.m, DIM) && ckpt_read_f32(f, la[L].rms_ffn.v, DIM);
    }
    const size_t embed_n = (size_t)VOCAB * DIM;
    ok = ok && ckpt_read_f32(f, rms_final, DIM)
            && ckpt_read_f32(f, arms_final->m, DIM) && ckpt_read_f32(f, arms_final->v, DIM)
            && ckpt_read_f32(f, embed, embed_n)
            && ckpt_read_f32(f, aembed->m, embed_n) && ckpt_read_f32(f, aembed->v, embed_n);
    fclose(f);
    if (!ok) {
        printf("Checkpoint %s is truncated or unreadable\n", path);
        return false;
    }

    // Publish the training state only once the whole file has been read.
    *step = h.step; *total_steps = h.total_steps; *lr = h.lr; *loss = h.loss;
    *cc = h.cum_compile; *ct = h.cum_train; *cw = h.cum_wall;
    *cs = h.cum_steps; *cb = h.cum_batches; *adam_t = h.adam_t;
    return true;
}
|
||||
|
||||
// ===== Main =====
|
||||
int main(int argc, char *argv[]) {
|
||||
@autoreleasepool {
|
||||
setbuf(stdout, NULL);
|
||||
ane_init();
|
||||
mach_timebase_info(&g_tb);
|
||||
|
||||
int total_steps = 10000;
|
||||
float lr = 3e-4f;
|
||||
float adam_b1=0.9f, adam_b2=0.999f, adam_eps=1e-8f;
|
||||
int adam_t = 0, start_step = 0;
|
||||
bool do_resume = false;
|
||||
for (int i=1; i<argc; i++) {
|
||||
if (strcmp(argv[i], "--resume") == 0) do_resume = true;
|
||||
else if (strcmp(argv[i], "--steps") == 0 && i+1<argc) total_steps = atoi(argv[++i]);
|
||||
else if (strcmp(argv[i], "--lr") == 0 && i+1<argc) lr = atof(argv[++i]);
|
||||
}
|
||||
|
||||
LayerWeights lw[NLAYERS]; LayerAdam la[NLAYERS];
|
||||
LayerActs acts[NLAYERS]; LayerGrads grads[NLAYERS]; LayerKernels kern[NLAYERS];
|
||||
for (int L=0; L<NLAYERS; L++) {
|
||||
lw[L] = layer_weights_alloc(); la[L] = layer_adam_alloc();
|
||||
acts[L] = layer_acts_alloc(); grads[L] = layer_grads_alloc();
|
||||
memset(&kern[L], 0, sizeof(LayerKernels));
|
||||
}
|
||||
float *rms_final = (float*)malloc(DIM*4);
|
||||
float *embed = (float*)malloc(VOCAB*DIM*4);
|
||||
float *grms_final = (float*)calloc(DIM, 4);
|
||||
float *gembed = (float*)calloc(VOCAB*DIM, 4);
|
||||
AdamState arms_final = adam_alloc(DIM);
|
||||
AdamState aembed = adam_alloc((size_t)VOCAB*DIM);
|
||||
double cum_compile=0, cum_train=0, cum_wall=0;
|
||||
int cum_steps=0, cum_batches=0;
|
||||
|
||||
float resume_loss = 0;
|
||||
bool resuming = false;
|
||||
if (do_resume) {
|
||||
resuming = load_checkpoint(CKPT_PATH, &start_step, &total_steps, &lr, &resume_loss,
|
||||
&cum_compile, &cum_train, &cum_wall, &cum_steps, &cum_batches, &adam_t,
|
||||
lw, la, rms_final, &arms_final, embed, &aembed);
|
||||
if (resuming) printf("[RESUMED step %d, loss=%.4f]\n", start_step, resume_loss);
|
||||
}
|
||||
if (!resuming) {
|
||||
printf("=== ANE Training: Stories110M (ANE-offloaded) ===\n");
|
||||
printf("dim=%d hidden=%d heads=%d seq=%d vocab=%d layers=%d\n", DIM, HIDDEN, HEADS, SEQ, VOCAB, NLAYERS);
|
||||
printf("NEW: final_rmsnorm, classifier_fwd, softmax, rmsnorm_bwd on ANE\n");
|
||||
if (!load_pretrained(lw, rms_final, embed, MODEL_PATH)) {
|
||||
printf("Pretrained load failed, using random init\n");
|
||||
srand48(42);
|
||||
float scale_d=1.0f/sqrtf(DIM), scale_h=1.0f/sqrtf(HIDDEN);
|
||||
for (int L=0; L<NLAYERS; L++) {
|
||||
for(size_t i=0;i<WQ_SZ;i++){lw[L].Wq[i]=scale_d*(2*drand48()-1);lw[L].Wk[i]=scale_d*(2*drand48()-1);}
|
||||
for(size_t i=0;i<WQ_SZ;i++){lw[L].Wv[i]=scale_d*(2*drand48()-1);lw[L].Wo[i]=scale_d*(2*drand48()-1);}
|
||||
for(size_t i=0;i<W1_SZ;i++) lw[L].W1[i]=scale_h*(2*drand48()-1);
|
||||
for(size_t i=0;i<W2_SZ;i++) lw[L].W2[i]=scale_d*(2*drand48()-1);
|
||||
for(size_t i=0;i<W3_SZ;i++) lw[L].W3[i]=scale_h*(2*drand48()-1);
|
||||
for(int i=0;i<DIM;i++){lw[L].rms_att[i]=1.0f; lw[L].rms_ffn[i]=1.0f;}
|
||||
}
|
||||
for(int i=0;i<DIM;i++) rms_final[i]=1.0f;
|
||||
float escale = 0.02f;
|
||||
for(size_t i=0;i<(size_t)VOCAB*DIM;i++) embed[i]=escale*(2*drand48()-1);
|
||||
}
|
||||
}
|
||||
|
||||
// mmap token data
|
||||
int data_fd = open(DATA_PATH, O_RDONLY);
|
||||
if (data_fd < 0) { printf("Cannot open %s\n", DATA_PATH); return 1; }
|
||||
struct stat st; fstat(data_fd, &st);
|
||||
size_t data_len = st.st_size;
|
||||
uint16_t *token_data = (uint16_t*)mmap(NULL, data_len, PROT_READ, MAP_PRIVATE, data_fd, 0);
|
||||
if (token_data == MAP_FAILED) { printf("mmap failed\n"); return 1; }
|
||||
size_t n_tokens = data_len / 2;
|
||||
printf("Token data: %zu tokens (%.1f MB)\n", n_tokens, data_len/1e6);
|
||||
|
||||
// Gradient buffers
|
||||
float *dy = (float*)malloc(SEQ*DIM*4);
|
||||
float *dffn = (float*)malloc(SEQ*DIM*4);
|
||||
float *dh1 = (float*)malloc(SEQ*HIDDEN*4);
|
||||
float *dh3 = (float*)malloc(SEQ*HIDDEN*4);
|
||||
float *dx_ffn = (float*)malloc(SEQ*DIM*4);
|
||||
float *dx2 = (float*)malloc(SEQ*DIM*4);
|
||||
float *do_out_buf = (float*)malloc(SEQ*DIM*4);
|
||||
float *dq = (float*)malloc(SEQ*DIM*4);
|
||||
float *dk = (float*)malloc(SEQ*DIM*4);
|
||||
float *dv = (float*)malloc(SEQ*DIM*4);
|
||||
float *dx_attn = (float*)malloc(SEQ*DIM*4);
|
||||
float *x_cur = (float*)malloc(SEQ*DIM*4);
|
||||
float *x_final = (float*)malloc(SEQ*DIM*4);
|
||||
float *logits = (float*)malloc(SEQ*VOCAB*4);
|
||||
float *dlogits = (float*)malloc(SEQ*VOCAB*4);
|
||||
float *probs = (float*)malloc(SEQ*VOCAB*4); // NEW: for ANE softmax output
|
||||
|
||||
// Compile static sdpaBwd2 kernels
|
||||
Kern *sdpaBwd2[NLAYERS];
|
||||
for (int L=0; L<NLAYERS; L++) {
|
||||
sdpaBwd2[L] = compile_sdpa_bwd2();
|
||||
if (!sdpaBwd2[L]) { printf("sdpaBwd2 compile failed\n"); return 1; }
|
||||
}
|
||||
|
||||
// NEW: Compile ANE-offloaded kernels (static — no per-batch recompile needed)
|
||||
// These kernels either carry no weights or have static weights that don't change per batch
|
||||
|
||||
// RMSNorm backward kernels — one per layer for attn and ffn
|
||||
// These DO have baked weights (rms_att, rms_ffn) so they need recompile per batch
|
||||
// But they're small weights, and we compile them alongside the layer kernels
|
||||
Kern *rmsAttBwd[NLAYERS], *rmsFFNBwd[NLAYERS];
|
||||
memset(rmsAttBwd, 0, sizeof(rmsAttBwd));
|
||||
memset(rmsFFNBwd, 0, sizeof(rmsFFNBwd));
|
||||
|
||||
// Softmax kernel (no weights — compile once)
|
||||
Kern *softmaxKern = compile_softmax_kern();
|
||||
if (!softmaxKern) { printf("softmax compile failed\n"); return 1; }
|
||||
printf("Softmax kernel compiled (no weights)\n");
|
||||
|
||||
// Final RMSNorm and classifier are recompiled per batch since they have baked weights
|
||||
Kern *finalRmsKern = NULL, *classifierKern = NULL;
|
||||
|
||||
dispatch_queue_t dw_q = dispatch_queue_create("dw_cblas", DISPATCH_QUEUE_SERIAL);
|
||||
dispatch_group_t dw_grp = dispatch_group_create();
|
||||
|
||||
float last_loss = 999.0f;
|
||||
double total_compile_ms=0, total_train_ms=0;
|
||||
int total_steps_done=0, total_batches=0;
|
||||
uint64_t t_wall_start = mach_absolute_time();
|
||||
srand48(42 + start_step);
|
||||
|
||||
int step = start_step;
|
||||
while (step < total_steps) {
|
||||
// Check compile budget — account for new kernels
|
||||
// Per batch: 60 layer kernels + 24 rmsnorm_bwd + 1 classifier + 1 final_rms = 86
|
||||
int kernels_needed = TOTAL_WEIGHT_KERNELS + 2*NLAYERS + 2;
|
||||
if (g_compile_count + kernels_needed > MAX_COMPILES) {
|
||||
for (int L=0; L<NLAYERS; L++) {
|
||||
free_layer_kernels(&kern[L]); free_kern(sdpaBwd2[L]);
|
||||
free_kern(rmsAttBwd[L]); free_kern(rmsFFNBwd[L]);
|
||||
}
|
||||
free_kern(softmaxKern); free_kern(finalRmsKern); free_kern(classifierKern);
|
||||
double wall = tb_ms(mach_absolute_time() - t_wall_start);
|
||||
save_checkpoint(CKPT_PATH, step, total_steps, lr, last_loss,
|
||||
total_compile_ms+cum_compile, total_train_ms+cum_train, wall+cum_wall,
|
||||
total_steps_done+cum_steps, total_batches+cum_batches, adam_t,
|
||||
lw, la, rms_final, &arms_final, embed, &aembed);
|
||||
printf("[exec() restart step %d, %d compiles, loss=%.4f]\n", step, g_compile_count, last_loss);
|
||||
fflush(stdout);
|
||||
execl(argv[0], argv[0], "--resume", NULL);
|
||||
perror("execl"); return 1;
|
||||
}
|
||||
|
||||
// Compile all layer kernels
|
||||
uint64_t tc = mach_absolute_time();
|
||||
for (int L=0; L<NLAYERS; L++) free_layer_kernels(&kern[L]);
|
||||
bool compile_ok = true;
|
||||
for (int L=0; L<NLAYERS; L++) {
|
||||
printf(" Compiling layer %d/%d... (%d compiles)\r", L+1, NLAYERS, g_compile_count);
|
||||
fflush(stdout);
|
||||
if (!compile_layer_kernels(&kern[L], &lw[L])) {
|
||||
printf("\nCompile failed at layer %d\n", L);
|
||||
compile_ok = false; break;
|
||||
}
|
||||
// NEW: Compile RMSNorm backward kernels for this layer
|
||||
free_kern(rmsAttBwd[L]); free_kern(rmsFFNBwd[L]);
|
||||
rmsAttBwd[L] = compile_rmsnorm_bwd_kern(lw[L].rms_att);
|
||||
rmsFFNBwd[L] = compile_rmsnorm_bwd_kern(lw[L].rms_ffn);
|
||||
if (!rmsAttBwd[L] || !rmsFFNBwd[L]) {
|
||||
printf("\nrmsnorm_bwd compile failed at layer %d\n", L);
|
||||
compile_ok = false; break;
|
||||
}
|
||||
}
|
||||
if (!compile_ok) { g_compile_count = MAX_COMPILES; continue; }
|
||||
|
||||
// Re-compile sdpaBwd2 if needed
|
||||
for (int L=0; L<NLAYERS; L++) {
|
||||
if (!sdpaBwd2[L]) {
|
||||
sdpaBwd2[L] = compile_sdpa_bwd2();
|
||||
if (!sdpaBwd2[L]) { printf("sdpaBwd2 recompile failed\n"); return 1; }
|
||||
}
|
||||
}
|
||||
|
||||
// NEW: Compile final RMSNorm and classifier with current weights
|
||||
free_kern(finalRmsKern); free_kern(classifierKern);
|
||||
finalRmsKern = compile_final_rmsnorm_kern(rms_final);
|
||||
classifierKern = compile_classifier_fwd(embed);
|
||||
if (!finalRmsKern || !classifierKern) {
|
||||
printf("finalRms or classifier compile failed\n");
|
||||
g_compile_count = MAX_COMPILES; continue;
|
||||
}
|
||||
// Re-compile softmax if needed
|
||||
if (!softmaxKern) {
|
||||
softmaxKern = compile_softmax_kern();
|
||||
if (!softmaxKern) { printf("softmax recompile failed\n"); return 1; }
|
||||
}
|
||||
|
||||
double cms = tb_ms(mach_absolute_time() - tc);
|
||||
total_compile_ms += cms;
|
||||
printf(" Compiled %d kernels in %.0fms \n", kernels_needed, cms);
|
||||
|
||||
// Zero gradient accumulators
|
||||
for (int L=0; L<NLAYERS; L++) layer_grads_zero(&grads[L]);
|
||||
memset(grms_final, 0, DIM*4);
|
||||
memset(gembed, 0, (size_t)VOCAB*DIM*4);
|
||||
|
||||
int steps_batch = 0;
|
||||
uint64_t tt = mach_absolute_time();
|
||||
double t_ane=0,t_io=0,t_elem=0,t_rms=0,t_cblas_wait=0,t_cls=0;
|
||||
|
||||
for (int a=0; a<ACCUM_STEPS && step<total_steps; a++, step++) {
|
||||
uint64_t t0,t1;
|
||||
size_t max_pos = n_tokens - SEQ - 1;
|
||||
size_t pos = (size_t)(drand48() * max_pos);
|
||||
uint16_t *input_tokens = token_data + pos;
|
||||
uint16_t *target_tokens = token_data + pos + 1;
|
||||
|
||||
// Embedding lookup
|
||||
t0=mach_absolute_time();
|
||||
embed_lookup(x_cur, embed, input_tokens, DIM, SEQ);
|
||||
t1=mach_absolute_time(); t_elem+=tb_ms(t1-t0);
|
||||
|
||||
// ===== FORWARD (12 layers) =====
|
||||
for (int L=0; L<NLAYERS; L++) {
|
||||
LayerActs *ac = &acts[L];
|
||||
memcpy(ac->layer_in, x_cur, SEQ*DIM*4);
|
||||
|
||||
t0=mach_absolute_time();
|
||||
dispatch_group_wait(dw_grp, DISPATCH_TIME_FOREVER);
|
||||
t1=mach_absolute_time(); t_cblas_wait+=tb_ms(t1-t0); t0=t1;
|
||||
|
||||
io_write_fp16(kern[L].fwdAttn->ioIn, x_cur, DIM, SEQ);
|
||||
t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
|
||||
ane_eval(kern[L].fwdAttn);
|
||||
t1=mach_absolute_time(); t_ane+=tb_ms(t1-t0); t0=t1;
|
||||
io_read_fp16(kern[L].fwdAttn->ioOut, ac->o_out, 0, DIM, SEQ);
|
||||
io_read_fp16(kern[L].fwdAttn->ioOut, ac->attn_out, 4*DIM, DIM, SEQ);
|
||||
io_read_fp16(kern[L].fwdAttn->ioOut, ac->xnorm, 5*DIM, DIM, SEQ);
|
||||
t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
|
||||
|
||||
vDSP_vadd(x_cur, 1, ac->o_out, 1, ac->x2, 1, (vDSP_Length)(SEQ*DIM));
|
||||
t1=mach_absolute_time(); t_elem+=tb_ms(t1-t0); t0=t1;
|
||||
|
||||
io_write_fp16(kern[L].fwdFFN->ioIn, ac->x2, DIM, SEQ);
|
||||
t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
|
||||
ane_eval(kern[L].fwdFFN);
|
||||
t1=mach_absolute_time(); t_ane+=tb_ms(t1-t0); t0=t1;
|
||||
io_read_fp16(kern[L].fwdFFN->ioOut, ac->ffn_out, 0, DIM, SEQ);
|
||||
io_read_fp16(kern[L].fwdFFN->ioOut, ac->h1, DIM, HIDDEN, SEQ);
|
||||
io_read_fp16(kern[L].fwdFFN->ioOut, ac->h3, DIM+HIDDEN, HIDDEN, SEQ);
|
||||
io_read_fp16(kern[L].fwdFFN->ioOut, ac->silu_out, DIM+2*HIDDEN, HIDDEN, SEQ);
|
||||
io_read_fp16(kern[L].fwdFFN->ioOut, ac->x2norm, DIM+3*HIDDEN, DIM, SEQ);
|
||||
t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
|
||||
|
||||
vDSP_vadd(ac->x2, 1, ac->ffn_out, 1, x_cur, 1, (vDSP_Length)(SEQ*DIM));
|
||||
t1=mach_absolute_time(); t_elem+=tb_ms(t1-t0);
|
||||
}
|
||||
|
||||
// CHANGED: Final RMSNorm on ANE (was CPU)
|
||||
t0=mach_absolute_time();
|
||||
io_write_fp16(finalRmsKern->ioIn, x_cur, DIM, SEQ);
|
||||
ane_eval(finalRmsKern);
|
||||
io_read_fp16(finalRmsKern->ioOut, x_final, 0, DIM, SEQ);
|
||||
t1=mach_absolute_time(); t_ane+=tb_ms(t1-t0); t0=t1;
|
||||
|
||||
// CHANGED: Classifier on ANE (was CPU cblas)
|
||||
io_write_fp16(classifierKern->ioIn, x_final, DIM, SEQ);
|
||||
ane_eval(classifierKern);
|
||||
t1=mach_absolute_time(); t_ane+=tb_ms(t1-t0); t0=t1;
|
||||
|
||||
// CHANGED: Softmax on ANE, then read probs back for NLL on CPU
|
||||
io_copy(softmaxKern->ioIn, 0, classifierKern->ioOut, 0, VOCAB, SEQ);
|
||||
ane_eval(softmaxKern);
|
||||
t1=mach_absolute_time(); t_ane+=tb_ms(t1-t0); t0=t1;
|
||||
|
||||
// Read probs back for NLL loss + gradient (needs target indexing — CPU)
|
||||
io_read_fp16(softmaxKern->ioOut, probs, 0, VOCAB, SEQ);
|
||||
t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
|
||||
|
||||
// NLL loss + gradient on CPU: dlogits = probs - one_hot(targets)
|
||||
float total_loss = 0;
|
||||
float invS = 1.0f / SEQ;
|
||||
memcpy(dlogits, probs, (size_t)VOCAB*SEQ*4);
|
||||
for (int t = 0; t < SEQ; t++) {
|
||||
int tgt = target_tokens[t];
|
||||
total_loss -= logf(probs[tgt*SEQ+t] + 1e-10f);
|
||||
dlogits[tgt*SEQ+t] -= 1.0f; // subtract one_hot
|
||||
}
|
||||
// Scale by 1/S
|
||||
vDSP_vsmul(dlogits, 1, &invS, dlogits, 1, (vDSP_Length)((size_t)VOCAB*SEQ));
|
||||
float loss = total_loss / SEQ;
|
||||
last_loss = loss;
|
||||
t1=mach_absolute_time(); t_elem+=tb_ms(t1-t0); t0=t1;
|
||||
|
||||
// ===== BACKWARD =====
|
||||
// Classifier backward: dx_final = embed^T @ dlogits (CPU — ANE is slower)
|
||||
cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
|
||||
DIM, SEQ, VOCAB, 1.0f,
|
||||
embed, DIM, dlogits, SEQ, 0.0f, dy, SEQ);
|
||||
// dembed async on CPU
|
||||
dispatch_group_async(dw_grp, dw_q, ^{
|
||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
VOCAB, DIM, SEQ, 1.0f,
|
||||
dlogits, SEQ, x_final, SEQ, 1.0f, gembed, DIM);
|
||||
});
|
||||
|
||||
// Final RMSNorm backward (CPU — just one call, not worth ANE overhead)
|
||||
{
|
||||
float *dx_rms_final = (float*)calloc(SEQ*DIM, 4);
|
||||
rmsnorm_bwd(dx_rms_final, grms_final, dy, x_cur, rms_final, DIM, SEQ);
|
||||
memcpy(dy, dx_rms_final, SEQ*DIM*4);
|
||||
free(dx_rms_final);
|
||||
}
|
||||
t1=mach_absolute_time(); t_rms+=tb_ms(t1-t0);
|
||||
|
||||
// ===== BACKWARD (12 layers, reverse) =====
|
||||
for (int L=NLAYERS-1; L>=0; L--) {
|
||||
LayerActs *ac = &acts[L];
|
||||
LayerGrads *gr = &grads[L];
|
||||
memcpy(dffn, dy, SEQ*DIM*4);
|
||||
|
||||
// FFN backward (ANE) — same as original
|
||||
io_write_fp16_at(kern[L].ffnBwd->ioIn, 0, dffn, DIM, SEQ);
|
||||
io_copy(kern[L].ffnBwd->ioIn, DIM, kern[L].fwdFFN->ioOut, DIM, 2*HIDDEN, SEQ);
|
||||
ane_eval(kern[L].ffnBwd);
|
||||
io_read_fp16(kern[L].ffnBwd->ioOut, dx_ffn, 0, DIM, SEQ);
|
||||
io_read_fp16(kern[L].ffnBwd->ioOut, dh1, DIM, HIDDEN, SEQ);
|
||||
io_read_fp16(kern[L].ffnBwd->ioOut, dh3, DIM+HIDDEN, HIDDEN, SEQ);
|
||||
|
||||
// dW FFN async (CPU — parallel with ANE)
|
||||
float *capt_dffn = (float*)malloc(SEQ*DIM*4); memcpy(capt_dffn, dffn, SEQ*DIM*4);
|
||||
float *capt_silu = (float*)malloc(SEQ*HIDDEN*4); memcpy(capt_silu, ac->silu_out, SEQ*HIDDEN*4);
|
||||
float *capt_dh1 = (float*)malloc(SEQ*HIDDEN*4); memcpy(capt_dh1, dh1, SEQ*HIDDEN*4);
|
||||
float *capt_dh3 = (float*)malloc(SEQ*HIDDEN*4); memcpy(capt_dh3, dh3, SEQ*HIDDEN*4);
|
||||
float *capt_x2n = (float*)malloc(SEQ*DIM*4); memcpy(capt_x2n, ac->x2norm, SEQ*DIM*4);
|
||||
dispatch_group_async(dw_grp, dw_q, ^{
|
||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, HIDDEN, SEQ,
|
||||
1.0f, capt_dffn, SEQ, capt_silu, SEQ, 1.0f, gr->W2, HIDDEN);
|
||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, HIDDEN, DIM, SEQ,
|
||||
1.0f, capt_dh1, SEQ, capt_x2n, SEQ, 1.0f, gr->W1, DIM);
|
||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, HIDDEN, DIM, SEQ,
|
||||
1.0f, capt_dh3, SEQ, capt_x2n, SEQ, 1.0f, gr->W3, DIM);
|
||||
free(capt_dffn); free(capt_silu); free(capt_dh1); free(capt_dh3); free(capt_x2n);
|
||||
});
|
||||
|
||||
// CHANGED: RMSNorm2 backward on ANE
|
||||
// Write concat(dx_ffn, x2) into rmsnorm_bwd kernel
|
||||
io_write_fp16_at(rmsFFNBwd[L]->ioIn, 0, dx_ffn, DIM, SEQ);
|
||||
io_write_fp16_at(rmsFFNBwd[L]->ioIn, DIM, ac->x2, DIM, SEQ);
|
||||
ane_eval(rmsFFNBwd[L]);
|
||||
io_read_fp16(rmsFFNBwd[L]->ioOut, dx2, 0, DIM, SEQ);
|
||||
// dw for rmsnorm_ffn still on CPU (accumulate per step)
|
||||
{
|
||||
float *dw_tmp = (float*)calloc(DIM, 4);
|
||||
float *dx_scratch = (float*)malloc(SEQ*DIM*4);
|
||||
rmsnorm_bwd(dx_scratch, dw_tmp, dx_ffn, ac->x2, lw[L].rms_ffn, DIM, SEQ);
|
||||
for(int i=0;i<DIM;i++) gr->rms_ffn[i] += dw_tmp[i];
|
||||
free(dx_scratch); free(dw_tmp);
|
||||
}
|
||||
// Add residual: dx2 += dy
|
||||
for(int i=0;i<SEQ*DIM;i++) dx2[i] += dy[i];
|
||||
|
||||
// dWo async (CPU)
|
||||
memcpy(do_out_buf, dx2, SEQ*DIM*4);
|
||||
float *capt_do = (float*)malloc(SEQ*DIM*4); memcpy(capt_do, do_out_buf, SEQ*DIM*4);
|
||||
float *capt_attn = (float*)malloc(SEQ*DIM*4); memcpy(capt_attn, ac->attn_out, SEQ*DIM*4);
|
||||
dispatch_group_async(dw_grp, dw_q, ^{
|
||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, DIM, SEQ,
|
||||
1.0f, capt_do, SEQ, capt_attn, SEQ, 1.0f, gr->Wo, DIM);
|
||||
free(capt_do); free(capt_attn);
|
||||
});
|
||||
|
||||
// SDPA backward (ANE) — same as original
|
||||
io_copy(kern[L].sdpaBwd1->ioIn, 0, kern[L].fwdAttn->ioOut, DIM, 3*DIM, SEQ);
|
||||
io_write_fp16_at(kern[L].sdpaBwd1->ioIn, 3*DIM, dx2, DIM, SEQ);
|
||||
ane_eval(kern[L].sdpaBwd1);
|
||||
io_copy(sdpaBwd2[L]->ioIn, 0, kern[L].sdpaBwd1->ioOut, DIM, 2*SCORE_CH, SEQ);
|
||||
io_copy(sdpaBwd2[L]->ioIn, 2*SCORE_CH, kern[L].fwdAttn->ioOut, DIM, 2*DIM, SEQ);
|
||||
ane_eval(sdpaBwd2[L]);
|
||||
|
||||
io_read_fp16(sdpaBwd2[L]->ioOut, dq, 0, DIM, SEQ);
|
||||
io_read_fp16(sdpaBwd2[L]->ioOut, dk, DIM, DIM, SEQ);
|
||||
io_read_fp16(kern[L].sdpaBwd1->ioOut, dv, 0, DIM, SEQ);
|
||||
|
||||
// dWq/dWk/dWv async (CPU)
|
||||
float *capt_dq = (float*)malloc(SEQ*DIM*4); memcpy(capt_dq, dq, SEQ*DIM*4);
|
||||
float *capt_dk = (float*)malloc(SEQ*DIM*4); memcpy(capt_dk, dk, SEQ*DIM*4);
|
||||
float *capt_dv = (float*)malloc(SEQ*DIM*4); memcpy(capt_dv, dv, SEQ*DIM*4);
|
||||
float *capt_xn = (float*)malloc(SEQ*DIM*4); memcpy(capt_xn, ac->xnorm, SEQ*DIM*4);
|
||||
dispatch_group_async(dw_grp, dw_q, ^{
|
||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, DIM, SEQ,
|
||||
1.0f, capt_dq, SEQ, capt_xn, SEQ, 1.0f, gr->Wq, DIM);
|
||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, DIM, SEQ,
|
||||
1.0f, capt_dk, SEQ, capt_xn, SEQ, 1.0f, gr->Wk, DIM);
|
||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, DIM, SEQ,
|
||||
1.0f, capt_dv, SEQ, capt_xn, SEQ, 1.0f, gr->Wv, DIM);
|
||||
free(capt_dq); free(capt_dk); free(capt_dv); free(capt_xn);
|
||||
});
|
||||
|
||||
// QKV backward (ANE) — same as original
|
||||
io_copy(kern[L].qkvBwd->ioIn, 0, sdpaBwd2[L]->ioOut, 0, 2*DIM, SEQ);
|
||||
io_copy(kern[L].qkvBwd->ioIn, 2*DIM, kern[L].sdpaBwd1->ioOut, 0, DIM, SEQ);
|
||||
ane_eval(kern[L].qkvBwd);
|
||||
io_read_fp16(kern[L].qkvBwd->ioOut, dx_attn, 0, DIM, SEQ);
|
||||
|
||||
// CHANGED: RMSNorm1 backward on ANE
|
||||
io_write_fp16_at(rmsAttBwd[L]->ioIn, 0, dx_attn, DIM, SEQ);
|
||||
io_write_fp16_at(rmsAttBwd[L]->ioIn, DIM, ac->layer_in, DIM, SEQ);
|
||||
ane_eval(rmsAttBwd[L]);
|
||||
float *dx_rms1 = (float*)malloc(SEQ*DIM*4);
|
||||
io_read_fp16(rmsAttBwd[L]->ioOut, dx_rms1, 0, DIM, SEQ);
|
||||
// dw for rmsnorm_att still on CPU
|
||||
{
|
||||
float *dw_tmp = (float*)calloc(DIM, 4);
|
||||
float *dx_scratch = (float*)malloc(SEQ*DIM*4);
|
||||
rmsnorm_bwd(dx_scratch, dw_tmp, dx_attn, ac->layer_in, lw[L].rms_att, DIM, SEQ);
|
||||
for(int i=0;i<DIM;i++) gr->rms_att[i] += dw_tmp[i];
|
||||
free(dx_scratch); free(dw_tmp);
|
||||
}
|
||||
|
||||
for(int i=0;i<SEQ*DIM;i++) dy[i] = dx_rms1[i] + dx2[i];
|
||||
free(dx_rms1);
|
||||
}
|
||||
|
||||
// Embedding backward
|
||||
dispatch_group_wait(dw_grp, DISPATCH_TIME_FOREVER);
|
||||
embed_backward(gembed, dy, input_tokens, DIM, SEQ);
|
||||
|
||||
steps_batch++;
|
||||
if (step % 10 == 0 || step == start_step)
|
||||
printf("step %-4d loss=%.4f\n", step, loss);
|
||||
}
|
||||
double tms = tb_ms(mach_absolute_time() - tt);
|
||||
total_train_ms += tms;
|
||||
total_steps_done += steps_batch;
|
||||
total_batches++;
|
||||
|
||||
dispatch_group_wait(dw_grp, DISPATCH_TIME_FOREVER);
|
||||
|
||||
// Adam update
|
||||
float gsc = 1.0f / steps_batch;
|
||||
adam_t++;
|
||||
for (int L=0; L<NLAYERS; L++) {
|
||||
LayerGrads *g = &grads[L];
|
||||
for(size_t i=0;i<WQ_SZ;i++){g->Wq[i]*=gsc;g->Wk[i]*=gsc;g->Wv[i]*=gsc;g->Wo[i]*=gsc;}
|
||||
for(size_t i=0;i<W1_SZ;i++) g->W1[i]*=gsc;
|
||||
for(size_t i=0;i<W2_SZ;i++) g->W2[i]*=gsc;
|
||||
for(size_t i=0;i<W3_SZ;i++) g->W3[i]*=gsc;
|
||||
for(int i=0;i<DIM;i++){g->rms_att[i]*=gsc; g->rms_ffn[i]*=gsc;}
|
||||
adam_update(lw[L].Wq, g->Wq, &la[L].Wq, adam_t, lr, adam_b1, adam_b2, adam_eps);
|
||||
adam_update(lw[L].Wk, g->Wk, &la[L].Wk, adam_t, lr, adam_b1, adam_b2, adam_eps);
|
||||
adam_update(lw[L].Wv, g->Wv, &la[L].Wv, adam_t, lr, adam_b1, adam_b2, adam_eps);
|
||||
adam_update(lw[L].Wo, g->Wo, &la[L].Wo, adam_t, lr, adam_b1, adam_b2, adam_eps);
|
||||
adam_update(lw[L].W1, g->W1, &la[L].W1, adam_t, lr, adam_b1, adam_b2, adam_eps);
|
||||
adam_update(lw[L].W2, g->W2, &la[L].W2, adam_t, lr, adam_b1, adam_b2, adam_eps);
|
||||
adam_update(lw[L].W3, g->W3, &la[L].W3, adam_t, lr, adam_b1, adam_b2, adam_eps);
|
||||
adam_update(lw[L].rms_att, g->rms_att, &la[L].rms_att, adam_t, lr, adam_b1, adam_b2, adam_eps);
|
||||
adam_update(lw[L].rms_ffn, g->rms_ffn, &la[L].rms_ffn, adam_t, lr, adam_b1, adam_b2, adam_eps);
|
||||
}
|
||||
for(int i=0;i<DIM;i++) grms_final[i]*=gsc;
|
||||
adam_update(rms_final, grms_final, &arms_final, adam_t, lr, adam_b1, adam_b2, adam_eps);
|
||||
for(size_t i=0;i<(size_t)VOCAB*DIM;i++) gembed[i]*=gsc;
|
||||
adam_update(embed, gembed, &aembed, adam_t, lr, adam_b1, adam_b2, adam_eps);
|
||||
|
||||
printf(" [batch %d: compile=%.0fms train=%.1fms (%.1fms/step) compiles=%d]\n",
|
||||
steps_batch, cms, tms, tms/steps_batch, g_compile_count);
|
||||
}
|
||||
|
||||
// Efficiency report
|
||||
double wall = tb_ms(mach_absolute_time() - t_wall_start);
|
||||
total_compile_ms += cum_compile; total_train_ms += cum_train;
|
||||
wall += cum_wall; total_steps_done += cum_steps; total_batches += cum_batches;
|
||||
|
||||
// FLOP accounting — same as train_large.m but classifier+softmax now on ANE
|
||||
double fwd_flops = NLAYERS * (4.0*2*DIM*DIM*SEQ + 2.0*2*DIM*HIDDEN*SEQ + 2.0*HIDDEN*DIM*SEQ);
|
||||
double sdpa_flops = NLAYERS * 2.0*HEADS*5*SEQ*SEQ*HD;
|
||||
double cls_flops = 2.0*VOCAB*DIM*SEQ;
|
||||
double total_flops = (fwd_flops*3 + sdpa_flops + cls_flops*3) * total_steps_done;
|
||||
// In train_large_ane: classifier fwd + softmax run on ANE (not CPU)
|
||||
double ane_flops = (fwd_flops*2 + sdpa_flops + cls_flops) * total_steps_done;
|
||||
|
||||
printf("\n=== NEW Efficiency Report ===\n");
|
||||
printf("Total steps: %d\n", total_steps_done);
|
||||
printf("Wall time: %.0f ms (%.1f s)\n", wall, wall/1000);
|
||||
printf("Compile time: %.0f ms (%.1f%%)\n", total_compile_ms, 100*total_compile_ms/wall);
|
||||
printf("Train time: %.0f ms (%.1f%%)\n", total_train_ms, 100*total_train_ms/wall);
|
||||
printf("Avg train: %.1f ms/step\n", total_train_ms/total_steps_done);
|
||||
printf("ANE TFLOPS: %.2f sustained\n", ane_flops / (total_train_ms * 1e9));
|
||||
printf("Total TFLOPS: %.2f (ANE+CPU)\n", total_flops / (total_train_ms * 1e9));
|
||||
printf("ANE utilization: %.1f%% of 15.8 TFLOPS\n", 100*ane_flops/(total_train_ms*1e9)/15.8);
|
||||
// Cleanup
|
||||
for (int L=0; L<NLAYERS; L++) {
|
||||
free_layer_kernels(&kern[L]); free_kern(sdpaBwd2[L]);
|
||||
free_kern(rmsAttBwd[L]); free_kern(rmsFFNBwd[L]);
|
||||
layer_weights_free(&lw[L]); layer_adam_free(&la[L]);
|
||||
layer_acts_free(&acts[L]); layer_grads_free(&grads[L]);
|
||||
}
|
||||
free_kern(softmaxKern); free_kern(finalRmsKern); free_kern(classifierKern);
|
||||
munmap(token_data, data_len); close(data_fd);
|
||||
free(rms_final); free(embed); free(grms_final); free(gembed);
|
||||
adam_free(&arms_final); adam_free(&aembed);
|
||||
free(dy); free(dffn); free(dh1); free(dh3); free(dx_ffn); free(dx2);
|
||||
free(do_out_buf); free(dq); free(dk); free(dv); free(dx_attn);
|
||||
free(x_cur); free(x_final); free(logits); free(dlogits); free(probs);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
Loading…
Reference in New Issue