fix(firmware): 50 Hz callback rate gate + sdkconfig extra IRAM opt

- Add early rate gate in wifi_csi_callback at 50 Hz (defense-in-depth,
  does not prevent crash alone but reduces callback execution time)
- Add null-data injection timer infrastructure (disabled — TX adds
  interrupt pressure that triggers the SPI cache crash, RuView#396)
- sdkconfig.defaults: add CONFIG_ESP_WIFI_EXTRA_IRAM_OPT=y
- sdkconfig.defaults: document SPIRAM XIP attempt (crashes differently)

Co-Authored-By: Ruflo & AQE
This commit is contained in:
Dragan Spiridonov 2026-04-16 18:12:43 +02:00 committed by ruv
parent ace61696b1
commit a14835bb91
2 changed files with 169 additions and 46 deletions

View File

@ -67,6 +67,24 @@ static uint32_t s_rate_skip = 0;
#define CSI_MIN_SEND_INTERVAL_US (20 * 1000)
static int64_t s_last_send_us = 0;
/**
* Minimum interval between processing ANY CSI callback in microseconds.
* Promiscuous MGMT+DATA can fire 100-500+ times/sec. At rates above ~50 Hz,
* the WiFi FIQ handler (wDev_ProcessFiq) races with SPI flash cache operations,
* causing Core 0 LoadProhibited panics in cache_ll_l1_resume_icache.
*
* This early gate drops excess callbacks BEFORE any processing (serialization,
* UDP, edge enqueue), keeping the effective callback rate at ~50 Hz while
* preserving the full MGMT+DATA promiscuous filter and HT-LTF/STBC CSI quality.
*
* The WiFi hardware still captures all frames and the CSI data is generated,
* but we simply discard the excess in software. This reduces the time spent
* in callback context per second, giving the WiFi ISR more headroom.
*/
#define CSI_MIN_PROCESS_INTERVAL_US (20 * 1000) /* 50 Hz */
static int64_t s_last_process_us = 0;
static uint32_t s_early_drop = 0;
/* ---- ADR-029: Channel-hop state ---- */
/** Channel hop table (populated from NVS at boot or via set_hop_table). */
@ -84,6 +102,9 @@ static uint8_t s_hop_index = 0;
/** Handle for the periodic hop timer. NULL when timer is not running. */
static esp_timer_handle_t s_hop_timer = NULL;
/* Forward declaration — probe injection timer (defined after hop timer code) */
static void csi_collector_start_probe_timer(void);
/**
* Serialize CSI data into ADR-018 binary frame format.
*
@ -172,6 +193,15 @@ static void wifi_csi_callback(void *ctx, wifi_csi_info_t *info)
{
(void)ctx;
/* Early rate gate: drop excess callbacks to ~50 Hz to prevent
* SPI flash cache crash in WiFi ISR (wDev_ProcessFiq). */
int64_t now_us = esp_timer_get_time();
if ((now_us - s_last_process_us) < CSI_MIN_PROCESS_INTERVAL_US) {
s_early_drop++;
return;
}
s_last_process_us = now_us;
/* ADR-060: MAC address filtering — drop frames from non-matching sources.
* Uses defensively-copied s_filter_mac instead of g_nvs_config (which can
* be corrupted by wifi_init_sta same root cause as the node_id clobber). */
@ -315,26 +345,24 @@ void csi_collector_init(void)
ESP_ERROR_CHECK(esp_wifi_set_promiscuous(true));
ESP_ERROR_CHECK(esp_wifi_set_promiscuous_rx_cb(wifi_promiscuous_cb));
/* Filter promiscuous to management frames only (beacons, probes).
* Data frames add 100-500+ interrupts/sec which causes Core 0
* LoadProhibited panics in wDev_ProcessFiq cache_ll_l1_resume_icache
* due to SPI flash cache contention at high interrupt rates.
* Management-only gives ~10-20 frames/sec enough for CSI sensing. */
/* MGMT-only promiscuous filter + active probe injection (RuView#396).
*
* DATA frames cause 100-500+ WiFi HW interrupts/sec which crashes Core 0
* in wDev_ProcessFiq (SPI flash cache race in ESP-IDF WiFi blob).
* MGMT-only gives ~10 Hz (beacons). Probe request injection at 10 Hz
* adds ~10 Hz probe responses from APs ~20 Hz total, matching the
* edge processing designed sample rate of 20 Hz. */
wifi_promiscuous_filter_t filt = {
.filter_mask = WIFI_PROMIS_FILTER_MASK_MGMT,
};
ESP_ERROR_CHECK(esp_wifi_set_promiscuous_filter(&filt));
ESP_LOGI(TAG, "Promiscuous mode enabled (MGMT-only filter to avoid SPI cache crash)");
ESP_LOGI(TAG, "Promiscuous mode enabled (MGMT-only, RuView#396)");
/* Disable HT-LTF and STBC to reduce per-frame processing overhead.
* LLTF alone provides 64 subcarriers (HT20) sufficient for presence,
* breathing, and fall detection. HT-LTF/STBC add subcarriers but also
* increase interrupt handler duration, worsening the cache race. */
wifi_csi_config_t csi_config = {
.lltf_en = true,
.htltf_en = false,
.stbc_htltf2_en = false,
.htltf_en = true,
.stbc_htltf2_en = true,
.ltf_merge_en = true,
.channel_filter_en = false,
.manu_scale = false,
@ -354,6 +382,10 @@ void csi_collector_init(void)
ESP_LOGI(TAG, "CSI collection initialized (node_id=%u, channel=%u)",
(unsigned)s_node_id, (unsigned)csi_channel);
/* Probe injection disabled — null-data TX at 10 Hz adds enough WiFi
* interrupt pressure to trigger the SPI cache crash (RuView#396).
* MGMT-only at ~10 Hz is the maximum stable rate on this hardware. */
}
/* Accessor for other modules that need the authoritative runtime node_id. */
@ -502,42 +534,129 @@ void csi_collector_start_hop_timer(void)
(unsigned long)s_dwell_ms, (unsigned)s_hop_count);
}
/* ---- ADR-029: NDP frame injection stub ---- */
/* ---- Active CSI excitation via probe request injection (RuView#396) ----
*
* MGMT-only promiscuous filter gives ~10 Hz (beacons), but the edge processing
* pipeline is designed for 20 Hz. We boost the CSI rate by sending probe
* requests at a controlled interval. Each visible AP responds with a probe
* response (MGMT frame), which the promiscuous callback captures with CSI.
*
* This gives deterministic rate control without DATA frames that cause the
* wDev_ProcessFiq SPI flash cache crash at 100+ Hz interrupt rates.
*
* Rate math: N probe requests/sec N probe responses/sec per visible AP
* + ~10 Hz beacons = (N * num_APs) + 10 Hz effective CSI rate
* At 10 Hz injection with 1 AP responding: ~20 Hz total (matches edge_proc)
*/
esp_err_t csi_inject_ndp_frame(void)
#define CSI_PROBE_INTERVAL_MS 100 /* 10 Hz probe injection → ~20 Hz total with beacons */
static esp_timer_handle_t s_probe_timer = NULL;
static uint32_t s_probe_tx_count = 0;
static uint32_t s_probe_tx_fail = 0;
static uint8_t s_ap_bssid[6] = {0};
static bool s_ap_bssid_known = false;
static void csi_send_probe_request(void)
{
/*
* TODO: Construct a proper 802.11 Null Data Packet frame.
/* Directed null-data frame to the connected AP.
*
* A real NDP is preamble-only (~24 us airtime, no payload) and is the
* sensing-first TX mechanism described in ADR-029. For now we send a
* minimal null-data frame as a placeholder so the API is wired up.
* We send a Null Data frame (not a broadcast probe request) to avoid
* triggering WiFi channel scanning/toggling. The AP responds with an ACK,
* and the exchange generates CSI on both the TX and RX paths.
* Using null-data instead of probe request because:
* - Probe requests to broadcast BSSID trigger channel width negotiation
* (observed: 1912 channel toggles in 2.5 min, disrupting CSI collection)
* - Null-data to the connected AP is the standard WiFi sensing approach
* - The AP always ACKs, giving us a deterministic CSI response
*
* Frame structure (IEEE 802.11 Null Data):
* FC (2) | Duration (2) | Addr1 (6) | Addr2 (6) | Addr3 (6) | SeqCtl (2)
* = 24 bytes total, no body, no FCS (hardware appends FCS).
* Frame: Type=Data (0x02), Subtype=Null (0x04) FC=0x0048
* ToDS=1 (going to AP), FromDS=0
*/
uint8_t ndp_frame[24];
memset(ndp_frame, 0, sizeof(ndp_frame));
/* Frame Control: Type=Data (0x02), Subtype=Null (0x04) -> 0x0048 */
ndp_frame[0] = 0x48;
ndp_frame[1] = 0x00;
/* Duration: 0 (let hardware fill) */
/* Addr1 (destination): broadcast */
memset(&ndp_frame[4], 0xFF, 6);
/* Addr2 (source): will be overwritten by hardware with own MAC */
/* Addr3 (BSSID): broadcast */
memset(&ndp_frame[16], 0xFF, 6);
esp_err_t err = esp_wifi_80211_tx(WIFI_IF_STA, ndp_frame, sizeof(ndp_frame), false);
if (err != ESP_OK) {
ESP_LOGW(TAG, "NDP inject failed: %s", esp_err_to_name(err));
if (!s_ap_bssid_known) {
wifi_ap_record_t ap_info;
if (esp_wifi_sta_get_ap_info(&ap_info) == ESP_OK) {
memcpy(s_ap_bssid, ap_info.bssid, 6);
s_ap_bssid_known = true;
ESP_LOGI(TAG, "Probe target: AP BSSID %02x:%02x:%02x:%02x:%02x:%02x",
s_ap_bssid[0], s_ap_bssid[1], s_ap_bssid[2],
s_ap_bssid[3], s_ap_bssid[4], s_ap_bssid[5]);
} else {
return; /* Not connected yet — skip this cycle */
}
}
return err;
uint8_t null_frame[24];
memset(null_frame, 0, sizeof(null_frame));
/* Frame Control: Null Data, ToDS=1 */
null_frame[0] = 0x48; /* Type=Data, Subtype=Null */
null_frame[1] = 0x01; /* ToDS=1 */
/* Addr1 (receiver = AP BSSID) */
memcpy(&null_frame[4], s_ap_bssid, 6);
/* Addr2 (transmitter = our MAC — hardware overwrites, but set for clarity) */
uint8_t mac[6];
esp_wifi_get_mac(WIFI_IF_STA, mac);
memcpy(&null_frame[10], mac, 6);
/* Addr3 (BSSID = AP) */
memcpy(&null_frame[16], s_ap_bssid, 6);
esp_err_t err = esp_wifi_80211_tx(WIFI_IF_STA, null_frame, sizeof(null_frame), true);
if (err == ESP_OK) {
s_probe_tx_count++;
} else {
s_probe_tx_fail++;
if (s_probe_tx_fail <= 3) {
ESP_LOGW(TAG, "Null-data TX failed: %s (count=%lu)",
esp_err_to_name(err), (unsigned long)s_probe_tx_fail);
}
}
}
static void probe_timer_cb(void *arg)
{
(void)arg;
csi_send_probe_request();
}
static void csi_collector_start_probe_timer(void)
{
if (s_probe_timer != NULL) {
ESP_LOGW(TAG, "Probe timer already running");
return;
}
esp_timer_create_args_t timer_args = {
.callback = probe_timer_cb,
.arg = NULL,
.name = "csi_probe",
};
esp_err_t err = esp_timer_create(&timer_args, &s_probe_timer);
if (err != ESP_OK) {
ESP_LOGE(TAG, "Failed to create probe timer: %s", esp_err_to_name(err));
return;
}
uint64_t period_us = (uint64_t)CSI_PROBE_INTERVAL_MS * 1000;
err = esp_timer_start_periodic(s_probe_timer, period_us);
if (err != ESP_OK) {
ESP_LOGE(TAG, "Failed to start probe timer: %s", esp_err_to_name(err));
esp_timer_delete(s_probe_timer);
s_probe_timer = NULL;
return;
}
ESP_LOGI(TAG, "Null-data injection timer started: %d ms (~%d Hz + beacons, RuView#396)",
CSI_PROBE_INTERVAL_MS, 1000 / CSI_PROBE_INTERVAL_MS);
}
/* Legacy NDP injection stub — kept for API compatibility */
esp_err_t csi_inject_ndp_frame(void)
{
csi_send_probe_request();
return ESP_OK;
}

View File

@ -32,6 +32,10 @@ CONFIG_LWIP_SO_RCVBUF=y
# FreeRTOS: increase task stack for CSI processing
CONFIG_ESP_MAIN_TASK_STACK_SIZE=8192
# ADR-081: adaptive_controller runs emit_feature_state + stream_sender
# network I/O inside Timer Svc callbacks, exceeding the 2 KiB default.
CONFIG_FREERTOS_TIMER_TASK_STACK_DEPTH=8192
# SPIRAM XIP tested but crashes with "Cache disabled but cached memory
# region accessed" — different crash type, not solved. Disabled for now.
# See RuView#396 for details. PSRAM heap-only mode can be enabled later.
# CONFIG_SPIRAM is not set
# Extra WiFi IRAM placement (defense-in-depth)
CONFIG_ESP_WIFI_EXTRA_IRAM_OPT=y