ANE/training/training_dynamic/models/qwen3_06b.h

20 lines
596 B
C

// qwen3_06b.h — Qwen3-0.6B (28 layers, GQA 16q/8kv, head_dim=128)
//
// Compile-time configuration for this model. Everything here is a plain
// preprocessor macro; the derived sizes (GQA_RATIO, Q_DIM, KV_DIM) are
// computed from the base values instead of being hard-coded.
#pragma once

#define MODEL_NAME "Qwen3-0.6B"

// Base sizes.
#define DIM 1024     // NOTE(review): presumably the model/embedding width — confirm
#define HIDDEN 3072  // NOTE(review): presumably the FFN intermediate width — confirm
#define HEADS 16     // query heads (the "16q" in the header line)
#define KV_HEADS 8   // key/value heads (the "8kv" in the header line)
#define HD 128 // explicit head_dim (NOT DIM/HEADS, which would be 64)

// GQA_RATIO is an integer division, so a head count that is not a multiple
// of KV_HEADS would be truncated silently. Fail the build instead.
#if (HEADS % KV_HEADS) != 0
#error "HEADS must be a multiple of KV_HEADS (GQA_RATIO would be truncated)"
#endif

// Derived sizes.
#define GQA_RATIO (HEADS / KV_HEADS) // = 2
#define Q_DIM (HEADS * HD) // = 2048 (differs from DIM because HD is explicit)
#define KV_DIM (KV_HEADS * HD) // = 1024 (= DIM for this model)

#define SEQ 256      // NOTE(review): presumably the sequence length — confirm
#define NLAYERS 28   // layer count (the "28 layers" in the header line)
#define VOCAB 151936 // NOTE(review): presumably the vocabulary size — confirm

// File paths; the data path is relative.
// NOTE(review): presumably resolved against the working directory — confirm.
#define CKPT_PATH "ane_qwen3_06b_dyn_ckpt.bin"
#define DEFAULT_DATA_PATH "../tinystories_data00.bin"