// Source: mirror of https://github.com/maderix/ANE.git (20 lines, 596 B, C)
// qwen3_06b.h — Qwen3-0.6B (28 layers, GQA 16q/8kv, head_dim=128)
//
// Compile-time model configuration. Every value is a preprocessor constant,
// so it can be used in array bounds and #if expressions. The derived macros
// (GQA_RATIO, Q_DIM, KV_DIM) are computed from the base ones and must not be
// edited independently.
#pragma once

// Human-readable model identifier.
#define MODEL_NAME "Qwen3-0.6B"

// --- Base dimensions -------------------------------------------------------
// NOTE(review): DIM is presumably the model (embedding) width and HIDDEN the
// feed-forward intermediate width — confirm against the code that uses them.
#define DIM 1024
#define HIDDEN 3072

// --- Attention layout (grouped-query attention: 16 query / 8 KV heads) ------
#define HEADS 16    // query heads
#define KV_HEADS 8  // key/value heads
#define HD 128      // explicit head_dim (NOT DIM/HEADS, which would be 64)

// --- Derived attention sizes ------------------------------------------------
#define GQA_RATIO (HEADS / KV_HEADS)  // = 2
#define Q_DIM (HEADS * HD)            // = 2048
#define KV_DIM (KV_HEADS * HD)        // = 1024 (= DIM for this model)

// --- Sequence / depth / vocabulary -------------------------------------------
// NOTE(review): SEQ is presumably the sequence length in tokens and VOCAB the
// tokenizer vocabulary size — confirm against the code that uses them.
#define SEQ 256
#define NLAYERS 28  // 28 layers, per the model summary at the top of this file
#define VOCAB 151936

// --- File locations -----------------------------------------------------------
// NOTE(review): presumably the checkpoint file and the default training-data
// file; the relative path is resolved by whoever opens it (not visible here).
#define CKPT_PATH "ane_qwen3_06b_dyn_ckpt.bin"
#define DEFAULT_DATA_PATH "../tinystories_data00.bin"