Harden token dataset validation across all training pipelines

This commit is contained in:
nabbilkhan 2026-03-03 19:36:51 +00:00
parent 443194bca4
commit 991bf4d618
7 changed files with 280 additions and 44 deletions

View File

@ -1,9 +1,11 @@
CC = xcrun clang
CFLAGS = -O2 -Wall -Wno-deprecated-declarations -fobjc-arc
FRAMEWORKS = -framework Foundation -framework CoreML -framework IOSurface
LDFLAGS = $(FRAMEWORKS) -ldl
HEADERS_LARGE = stories_config.h stories_io.h stories_mil.h stories_cpu_ops.h
CC = xcrun clang
CFLAGS = -O2 -Wall -Wno-deprecated-declarations -fobjc-arc
CC_C = xcrun clang
CFLAGS_C = -O2 -Wall -Wextra -Werror -std=c11
FRAMEWORKS = -framework Foundation -framework CoreML -framework IOSurface
LDFLAGS = $(FRAMEWORKS) -ldl
HEADERS_LARGE = stories_config.h stories_io.h stories_mil.h stories_cpu_ops.h data_validation.h
HEADERS_ANE = $(HEADERS_LARGE) ane_rmsnorm_bwd.h ane_classifier.h
@ -33,16 +35,21 @@ test_perf_stats: test_perf_stats.m
test_qos_sweep: test_qos_sweep.m
$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
test_ane_advanced: test_ane_advanced.m
$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
probes: $(PROBES)
tokenize:
python3 tokenize.py
clean:
rm -f train train_large train_large_ane $(PROBES) test_rmsnorm_bwd test_classifier
.PHONY: clean tokenize probes
test_ane_advanced: test_ane_advanced.m
$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
test_data_validation: test_data_validation.c data_validation.h
$(CC_C) $(CFLAGS_C) -o $@ $<
probes: $(PROBES)
security-tests: test_data_validation
tokenize:
python3 tokenize.py
clean:
rm -f train train_large train_large_ane $(PROBES) test_rmsnorm_bwd test_classifier test_data_validation
.PHONY: clean tokenize probes security-tests

View File

@ -78,7 +78,11 @@ Weights passed via IOSurface spatial dimension — compile 9 kernels once at sta
bash download_data.sh
```
Downloads pretokenized TinyStories (Llama 2 BPE, 32K vocab) from HuggingFace. Produces `tinystories_data00.bin` (~41 MB, ~20M tokens).
Downloads pretokenized TinyStories (Llama 2 BPE, 32K vocab) from HuggingFace. Produces `tinystories_data00.bin` (~41 MB, ~20M tokens).
All training pipelines perform token-data validation at startup:
- token file must contain at least `SEQ+1` tokens
- every token id must be within `[0, vocab_size)`
### 2. Build & Train

View File

@ -0,0 +1,65 @@
// data_validation.h — Shared token-data validation helpers
#pragma once
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
// Result codes produced by token_data_validate().
typedef enum {
    TOKEN_DATA_VALID         = 0,  // dataset passed both the length and the range check
    TOKEN_DATA_ERR_TOO_SHORT = 1,  // the minimum-token-count check failed
    TOKEN_DATA_ERR_OOB_TOKEN = 2   // the out-of-bounds token scan reported a token
} TokenDataValidationCode;
// Diagnostic details filled in by token_data_validate().
// All fields are reset to 0 at the start of validation; only the fields
// relevant to the returned error code are then overwritten.
typedef struct {
    size_t   required_tokens;  // set on TOKEN_DATA_ERR_TOO_SHORT: minimum token count needed
    size_t   bad_index;        // set on TOKEN_DATA_ERR_OOB_TOKEN: index of the offending token
    uint16_t bad_token;        // set on TOKEN_DATA_ERR_OOB_TOKEN: value of the offending token
} TokenDataValidationError;
// Checks that a dataset holds at least seq + 1 tokens.
//
// n_tokens:        number of tokens available.
// seq:             sequence length; a negative value makes the check fail
//                  immediately without touching *required_tokens.
// required_tokens: optional out-parameter (may be NULL); receives seq + 1
//                  whenever seq is non-negative, regardless of the result.
//
// Returns true when n_tokens >= seq + 1, false otherwise.
static inline bool token_data_has_min_tokens(size_t n_tokens, int seq, size_t *required_tokens) {
    if (seq >= 0) {
        const size_t min_count = (size_t)seq + 1;
        if (required_tokens != NULL) {
            *required_tokens = min_count;
        }
        return min_count <= n_tokens;
    }
    return false;
}
// Scans a token buffer for the first id outside the valid range [0, vocab).
//
// token_data: buffer of n_tokens token ids; may be NULL.
// n_tokens:   number of entries in token_data.
// vocab:      exclusive upper bound for a valid token id. When vocab <= 0 the
//             valid range is empty, so the first token of a non-empty buffer
//             is reported as out of bounds (the check fails closed instead of
//             silently accepting the data).
// bad_index:  optional out-parameter (may be NULL); index of the offending token.
// bad_token:  optional out-parameter (may be NULL); value of the offending token.
//
// Returns true when an out-of-range token was found (out-parameters written),
// false when every token is in range or when there is nothing to scan
// (token_data is NULL or n_tokens is 0); the out-parameters are left untouched
// in the false case.
static inline bool token_data_find_oob_token(const uint16_t *token_data, size_t n_tokens, int vocab,
                                             size_t *bad_index, uint16_t *bad_token) {
    if (!token_data || n_tokens == 0) {
        return false;
    }
    for (size_t i = 0; i < n_tokens; i++) {
        // Token ids are unsigned, so only the upper bound needs checking; a
        // non-positive vocab makes this comparison true for every token.
        if ((int)token_data[i] >= vocab) {
            if (bad_index) {
                *bad_index = i;
            }
            if (bad_token) {
                *bad_token = token_data[i];
            }
            return true;
        }
    }
    return false;
}
// Runs the full token-data validation: length check first, then range check.
//
// token_data / n_tokens: the token buffer and its element count.
// seq:   sequence length forwarded to token_data_has_min_tokens().
// vocab: vocabulary size forwarded to token_data_find_oob_token().
// err:   optional diagnostics (may be NULL). It is zeroed on entry; on
//        TOKEN_DATA_ERR_TOO_SHORT required_tokens is filled in, on
//        TOKEN_DATA_ERR_OOB_TOKEN bad_index and bad_token are filled in.
//
// Returns TOKEN_DATA_VALID when both checks pass, otherwise the code of the
// first check that failed.
static inline TokenDataValidationCode token_data_validate(const uint16_t *token_data, size_t n_tokens,
                                                          int seq, int vocab,
                                                          TokenDataValidationError *err) {
    // Start from a clean report so callers never see stale values.
    if (err != NULL) {
        *err = (TokenDataValidationError){ .required_tokens = 0, .bad_index = 0, .bad_token = 0 };
    }

    size_t min_count = 0;
    const bool long_enough = token_data_has_min_tokens(n_tokens, seq, &min_count);
    if (!long_enough) {
        if (err != NULL) {
            err->required_tokens = min_count;
        }
        return TOKEN_DATA_ERR_TOO_SHORT;
    }

    size_t oob_at = 0;
    uint16_t oob_value = 0;
    if (!token_data_find_oob_token(token_data, n_tokens, vocab, &oob_at, &oob_value)) {
        return TOKEN_DATA_VALID;
    }

    if (err != NULL) {
        err->bad_index = oob_at;
        err->bad_token = oob_value;
    }
    return TOKEN_DATA_ERR_OOB_TOKEN;
}

View File

@ -0,0 +1,112 @@
// test_data_validation.c — Unit tests for token-data hardening helpers
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "data_validation.h"
// Running tally of test outcomes, shared by every test function.
typedef struct {
    int passed;  // incremented by a test once all of its checks succeeded
    int failed;  // incremented by the CHECK_* macros on the first failing check
} TestStats;
// CHECK_TRUE(stats, cond, msg) — assert-style check for use inside test functions.
// If cond is false: prints "FAIL: <msg> (<file>:<line>)" to stderr, increments
// (stats)->failed and executes a bare `return;`, i.e. it leaves the ENCLOSING
// function immediately. It can therefore only be used in functions returning
// void, and any statement after a failed check (including stats->passed++) is
// skipped. If cond is true the macro does nothing.
#define CHECK_TRUE(stats, cond, msg) \
do { \
if (!(cond)) { \
fprintf(stderr, "FAIL: %s (%s:%d)\n", msg, __FILE__, __LINE__); \
(stats)->failed++; \
return; \
} \
} while (0)
// Equality checks built on CHECK_TRUE. Both expand to the same `(got) == (want)`
// comparison; the two names only document at the call site whether the operands
// are int-like or size_t values. On mismatch they behave exactly like CHECK_TRUE
// (record the failure and return from the enclosing function).
#define CHECK_EQ_INT(stats, got, want, msg) CHECK_TRUE((stats), (got) == (want), msg)
#define CHECK_EQ_SIZE(stats, got, want, msg) CHECK_TRUE((stats), (got) == (want), msg)
// Boundary case: exactly seq + 1 tokens must be accepted, and the required
// count must be reported as seq + 1.
static void test_min_tokens_boundary(TestStats *stats) {
    size_t needed = 0;
    const bool enough = token_data_has_min_tokens(257, 256, &needed);

    CHECK_TRUE(stats, enough, "257 tokens should satisfy seq=256");
    CHECK_EQ_SIZE(stats, needed, 257, "required tokens should be seq+1");

    stats->passed += 1;
}
// One token short of seq + 1 must be rejected, while the required count is
// still reported through the out-parameter.
static void test_min_tokens_short(TestStats *stats) {
    size_t needed = 0;
    const bool enough = token_data_has_min_tokens(256, 256, &needed);

    CHECK_TRUE(stats, !enough, "256 tokens should fail seq=256");
    CHECK_EQ_SIZE(stats, needed, 257, "required tokens should still be seq+1");

    stats->passed += 1;
}
// A 2-token dataset with seq=4 must fail validation as too short and report
// that 5 tokens are required.
static void test_validate_too_short(TestStats *stats) {
    uint16_t data[] = {1, 2};
    TokenDataValidationError report = {0};

    const TokenDataValidationCode rc = token_data_validate(data, 2, 4, 32000, &report);

    CHECK_EQ_INT(stats, rc, TOKEN_DATA_ERR_TOO_SHORT, "too-short dataset should fail");
    CHECK_EQ_SIZE(stats, report.required_tokens, 5, "required token count should be reported");

    stats->passed += 1;
}
// A token equal to the vocab size (32000) in the first slot must be flagged
// as out of bounds, with index 0 and the offending value reported.
static void test_validate_oob_first(TestStats *stats) {
    uint16_t data[] = {32000, 1, 2, 3, 4, 5};
    TokenDataValidationError report = {0};

    const TokenDataValidationCode rc = token_data_validate(data, 6, 4, 32000, &report);

    CHECK_EQ_INT(stats, rc, TOKEN_DATA_ERR_OOB_TOKEN, "first token OOB should fail");
    CHECK_EQ_SIZE(stats, report.bad_index, 0, "bad index should point to first token");
    CHECK_EQ_INT(stats, report.bad_token, 32000, "bad token value should be reported");

    stats->passed += 1;
}
// The largest 16-bit token value (65535) in the middle of the data must be
// flagged, with its index (3) and value reported.
static void test_validate_oob_middle(TestStats *stats) {
    uint16_t data[] = {1, 2, 3, 65535, 4, 5, 6};
    TokenDataValidationError report = {0};

    const TokenDataValidationCode rc = token_data_validate(data, 7, 4, 32000, &report);

    CHECK_EQ_INT(stats, rc, TOKEN_DATA_ERR_OOB_TOKEN, "middle token OOB should fail");
    CHECK_EQ_SIZE(stats, report.bad_index, 3, "bad index should point to middle token");
    CHECK_EQ_INT(stats, report.bad_token, 65535, "bad token value should be reported");

    stats->passed += 1;
}
// An out-of-range token in the final slot must still be found: the scan has
// to cover the whole buffer, not stop early.
static void test_validate_oob_last(TestStats *stats) {
    uint16_t data[] = {1, 2, 3, 4, 5, 40000};
    TokenDataValidationError report = {0};

    const TokenDataValidationCode rc = token_data_validate(data, 6, 4, 32000, &report);

    CHECK_EQ_INT(stats, rc, TOKEN_DATA_ERR_OOB_TOKEN, "last token OOB should fail");
    CHECK_EQ_SIZE(stats, report.bad_index, 5, "bad index should point to last token");
    CHECK_EQ_INT(stats, report.bad_token, 40000, "bad token value should be reported");

    stats->passed += 1;
}
// A dataset that is long enough and whose ids all lie in [0, 32000) —
// including the extremes 0 and 31999 — must validate successfully, and the
// validator must leave the error report fully cleared.
static void test_validate_ok(TestStats *stats) {
    uint16_t tokens[8] = {0, 1, 2, 3, 4, 5, 31998, 31999};
    TokenDataValidationError err;
    // Poison the report so that any field the validator fails to reset shows
    // up as a non-zero value in the checks below.
    memset(&err, 0xA5, sizeof(err));
    TokenDataValidationCode code = token_data_validate(tokens, 8, 4, 32000, &err);
    CHECK_EQ_INT(stats, code, TOKEN_DATA_VALID, "valid dataset should pass");
    CHECK_EQ_SIZE(stats, err.required_tokens, 0, "required tokens should be cleared on success");
    CHECK_EQ_SIZE(stats, err.bad_index, 0, "bad index should be cleared on success");
    CHECK_EQ_INT(stats, err.bad_token, 0, "bad token should be cleared on success");
    stats->passed++;
}
// An empty input (NULL buffer, zero tokens) must report "no OOB token" and
// must not write to either out-parameter.
static void test_find_oob_empty(TestStats *stats) {
    size_t idx_sentinel = 123;
    uint16_t tok_sentinel = 456;

    const bool found = token_data_find_oob_token(NULL, 0, 32000, &idx_sentinel, &tok_sentinel);

    CHECK_TRUE(stats, !found,
               "empty dataset should not report OOB token");
    CHECK_EQ_SIZE(stats, idx_sentinel, 123, "bad index should remain unchanged for empty input");
    CHECK_EQ_INT(stats, tok_sentinel, 456, "bad token should remain unchanged for empty input");

    stats->passed += 1;
}
int main(void) {
TestStats stats = {0, 0};
test_min_tokens_boundary(&stats);
test_min_tokens_short(&stats);
test_validate_too_short(&stats);
test_validate_oob_first(&stats);
test_validate_oob_middle(&stats);
test_validate_oob_last(&stats);
test_validate_ok(&stats);
test_find_oob_empty(&stats);
printf("test_data_validation: %d passed, %d failed\n", stats.passed, stats.failed);
return stats.failed == 0 ? 0 : 1;
}

View File

@ -1,9 +1,10 @@
// train_large.m Train stories110M (12 layers, 768dim, 3072hidden) on ANE
// Uses pretokenized TinyStories data with cross-entropy loss
// 5 weight-bearing ANE kernels per layer × 12 layers = 60 per compile batch
#include "stories_io.h"
#include "stories_mil.h"
#include "stories_cpu_ops.h"
#include "stories_io.h"
#include "stories_mil.h"
#include "stories_cpu_ops.h"
#include "data_validation.h"
#define CKPT_PATH_DEFAULT "ane_stories110M_ckpt.bin"
#define MODEL_PATH_DEFAULT "stories110M.bin"
@ -283,14 +284,29 @@ int main(int argc, char *argv[]) {
}
// mmap token data
int data_fd = open(DATA_PATH, O_RDONLY);
if (data_fd < 0) { printf("Cannot open %s\n", DATA_PATH); return 1; }
struct stat st; fstat(data_fd, &st);
size_t data_len = st.st_size;
uint16_t *token_data = (uint16_t*)mmap(NULL, data_len, PROT_READ, MAP_PRIVATE, data_fd, 0);
if (token_data == MAP_FAILED) { printf("mmap failed\n"); return 1; }
size_t n_tokens = data_len / 2;
printf("Token data: %zu tokens (%.1f MB)\n", n_tokens, data_len/1e6);
int data_fd = open(DATA_PATH, O_RDONLY);
if (data_fd < 0) { printf("Cannot open %s\n", DATA_PATH); return 1; }
struct stat st; fstat(data_fd, &st);
size_t data_len = st.st_size;
uint16_t *token_data = (uint16_t*)mmap(NULL, data_len, PROT_READ, MAP_PRIVATE, data_fd, 0);
if (token_data == MAP_FAILED) { printf("mmap failed\n"); close(data_fd); return 1; }
size_t n_tokens = data_len / 2;
printf("Token data: %zu tokens (%.1f MB)\n", n_tokens, data_len/1e6);
TokenDataValidationError data_err = {0};
TokenDataValidationCode data_code = token_data_validate(token_data, n_tokens, SEQ, VOCAB, &data_err);
if (data_code == TOKEN_DATA_ERR_TOO_SHORT) {
fprintf(stderr, "Token data validation failed: need at least %zu tokens (SEQ+1), got %zu\n",
data_err.required_tokens, n_tokens);
munmap(token_data, data_len); close(data_fd);
return 1;
}
if (data_code == TOKEN_DATA_ERR_OOB_TOKEN) {
fprintf(stderr, "Token data validation failed: token %u at index %zu is outside vocab [0, %d)\n",
data_err.bad_token, data_err.bad_index, VOCAB);
munmap(token_data, data_len); close(data_fd);
return 1;
}
// Gradient buffers shared across layers (reused each step)
float *dy = (float*)malloc(SEQ*DIM*4); // gradient flowing backward

View File

@ -13,6 +13,7 @@
#include "stories_io.h"
#include "stories_mil.h"
#include "stories_cpu_ops.h"
#include "data_validation.h"
#include "ane_rmsnorm_bwd.h"
#include "ane_classifier.h"
@ -276,10 +277,25 @@ int main(int argc, char *argv[]) {
struct stat st; fstat(data_fd, &st);
size_t data_len = st.st_size;
uint16_t *token_data = (uint16_t*)mmap(NULL, data_len, PROT_READ, MAP_PRIVATE, data_fd, 0);
if (token_data == MAP_FAILED) { printf("mmap failed\n"); return 1; }
if (token_data == MAP_FAILED) { printf("mmap failed\n"); close(data_fd); return 1; }
size_t n_tokens = data_len / 2;
printf("Token data: %zu tokens (%.1f MB)\n", n_tokens, data_len/1e6);
TokenDataValidationError data_err = {0};
TokenDataValidationCode data_code = token_data_validate(token_data, n_tokens, SEQ, VOCAB, &data_err);
if (data_code == TOKEN_DATA_ERR_TOO_SHORT) {
fprintf(stderr, "Token data validation failed: need at least %zu tokens (SEQ+1), got %zu\n",
data_err.required_tokens, n_tokens);
munmap(token_data, data_len); close(data_fd);
return 1;
}
if (data_code == TOKEN_DATA_ERR_OOB_TOKEN) {
fprintf(stderr, "Token data validation failed: token %u at index %zu is outside vocab [0, %d)\n",
data_err.bad_token, data_err.bad_index, VOCAB);
munmap(token_data, data_len); close(data_fd);
return 1;
}
// Gradient buffers
float *dy = (float*)malloc(SEQ*DIM*4);
float *dffn = (float*)malloc(SEQ*DIM*4);

View File

@ -1,8 +1,9 @@
// train.m Dynamic weight ANE training for Stories110M
// Compile kernels ONCE at startup, update weights via IOSurface every step.
// No exec() restart needed eliminates 76% compile overhead.
#include "mil_dynamic.h"
#include "cpu_ops.h"
#include "mil_dynamic.h"
#include "cpu_ops.h"
#include "../data_validation.h"
#define CKPT_PATH "ane_stories110M_dyn_ckpt.bin"
#define MODEL_PATH "../../../assets/models/stories110M.bin"
@ -333,17 +334,32 @@ int main(int argc, char *argv[]) {
}
// mmap token data
int data_fd = open(DATA_PATH, O_RDONLY);
if (data_fd < 0) { printf("Cannot open %s\n", DATA_PATH); return 1; }
struct stat st; fstat(data_fd, &st);
size_t data_len = st.st_size;
uint16_t *token_data = (uint16_t*)mmap(NULL, data_len, PROT_READ, MAP_PRIVATE, data_fd, 0);
if (token_data == MAP_FAILED) { printf("mmap failed\n"); return 1; }
size_t n_tokens = data_len / 2;
printf("Token data: %zu tokens (%.1f MB)\n", n_tokens, data_len/1e6);
// Vocab compaction: map 32K sparse vocab ~9K compact
VocabMap vm = vocab_map_build(token_data, n_tokens, VOCAB);
int data_fd = open(DATA_PATH, O_RDONLY);
if (data_fd < 0) { printf("Cannot open %s\n", DATA_PATH); return 1; }
struct stat st; fstat(data_fd, &st);
size_t data_len = st.st_size;
uint16_t *token_data = (uint16_t*)mmap(NULL, data_len, PROT_READ, MAP_PRIVATE, data_fd, 0);
if (token_data == MAP_FAILED) { printf("mmap failed\n"); close(data_fd); return 1; }
size_t n_tokens = data_len / 2;
printf("Token data: %zu tokens (%.1f MB)\n", n_tokens, data_len/1e6);
TokenDataValidationError data_err = {0};
TokenDataValidationCode data_code = token_data_validate(token_data, n_tokens, SEQ, VOCAB, &data_err);
if (data_code == TOKEN_DATA_ERR_TOO_SHORT) {
fprintf(stderr, "Token data validation failed: need at least %zu tokens (SEQ+1), got %zu\n",
data_err.required_tokens, n_tokens);
munmap(token_data, data_len); close(data_fd);
return 1;
}
if (data_code == TOKEN_DATA_ERR_OOB_TOKEN) {
fprintf(stderr, "Token data validation failed: token %u at index %zu is outside vocab [0, %d)\n",
data_err.bad_token, data_err.bad_index, VOCAB);
munmap(token_data, data_len); close(data_fd);
return 1;
}
// Vocab compaction: map 32K sparse vocab ~9K compact
VocabMap vm = vocab_map_build(token_data, n_tokens, VOCAB);
int CV = vm.compact_vocab;
printf("Vocab compaction: %d → %d active tokens (%.1fx reduction)\n", VOCAB, CV, (float)VOCAB/CV);