Compare commits
11 Commits
ffeaa46bc6
...
d3c58145a4
| Author | SHA1 | Date |
|---|---|---|
|
|
d3c58145a4 | |
|
|
b41681e079 | |
|
|
0f13a55f52 | |
|
|
71f9597f58 | |
|
|
bfe5cbc83a | |
|
|
1e0af686a0 | |
|
|
21ec163941 | |
|
|
a8f5276d9b | |
|
|
e574cbe129 | |
|
|
1dbea4e9fb | |
|
|
fb2d1afb0c |
|
|
@ -7,6 +7,9 @@ on:
|
|||
- 'scripts/qemu-esp32s3-test.sh'
|
||||
- 'scripts/validate_qemu_output.py'
|
||||
- 'scripts/generate_nvs_matrix.py'
|
||||
- 'scripts/qemu_swarm.py'
|
||||
- 'scripts/swarm_health.py'
|
||||
- 'scripts/swarm_presets/**'
|
||||
- '.github/workflows/firmware-qemu.yml'
|
||||
pull_request:
|
||||
paths:
|
||||
|
|
@ -14,6 +17,9 @@ on:
|
|||
- 'scripts/qemu-esp32s3-test.sh'
|
||||
- 'scripts/validate_qemu_output.py'
|
||||
- 'scripts/generate_nvs_matrix.py'
|
||||
- 'scripts/qemu_swarm.py'
|
||||
- 'scripts/swarm_health.py'
|
||||
- 'scripts/swarm_presets/**'
|
||||
- '.github/workflows/firmware-qemu.yml'
|
||||
|
||||
env:
|
||||
|
|
@ -31,7 +37,10 @@ jobs:
|
|||
uses: actions/cache@v4
|
||||
with:
|
||||
path: /opt/qemu-esp32
|
||||
key: qemu-esp32s3-${{ env.QEMU_BRANCH }}-v2
|
||||
# Bump the -vN suffix below to force a fresh QEMU build when the branch updates
|
||||
key: qemu-esp32s3-${{ env.QEMU_BRANCH }}-v4
|
||||
restore-keys: |
|
||||
qemu-esp32s3-${{ env.QEMU_BRANCH }}-
|
||||
|
||||
- name: Install QEMU build dependencies
|
||||
if: steps.cache-qemu.outputs.cache-hit != 'true'
|
||||
|
|
@ -58,8 +67,9 @@ jobs:
|
|||
|
||||
- name: Verify QEMU binary
|
||||
run: |
|
||||
file_size() { stat -c%s "$1" 2>/dev/null || stat -f%z "$1" 2>/dev/null || wc -c < "$1"; }
|
||||
/opt/qemu-esp32/bin/qemu-system-xtensa --version
|
||||
echo "QEMU binary size: $(stat -c%s /opt/qemu-esp32/bin/qemu-system-xtensa) bytes"
|
||||
echo "QEMU binary size: $(file_size /opt/qemu-esp32/bin/qemu-system-xtensa) bytes"
|
||||
|
||||
- name: Upload QEMU artifact
|
||||
uses: actions/upload-artifact@v4
|
||||
|
|
@ -73,7 +83,7 @@ jobs:
|
|||
needs: build-qemu
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: espressif/idf:${{ env.IDF_VERSION }}
|
||||
image: espressif/idf:v5.4
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
|
|
@ -82,7 +92,10 @@ jobs:
|
|||
- default
|
||||
- full-adr060
|
||||
- edge-tier0
|
||||
- edge-tier1
|
||||
- tdm-3node
|
||||
- boundary-max
|
||||
- boundary-min
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
|
@ -141,7 +154,8 @@ jobs:
|
|||
$OTA_ARGS \
|
||||
0x20000 build/esp32-csi-node.bin
|
||||
|
||||
echo "Flash image size: $(stat -c%s build/qemu_flash.bin) bytes"
|
||||
file_size() { stat -c%s "$1" 2>/dev/null || stat -f%z "$1" 2>/dev/null || wc -c < "$1"; }
|
||||
echo "Flash image size: $(file_size build/qemu_flash.bin) bytes"
|
||||
|
||||
- name: Inject NVS partition
|
||||
if: matrix.nvs_config != 'default'
|
||||
|
|
@ -149,7 +163,8 @@ jobs:
|
|||
run: |
|
||||
NVS_BIN="build/nvs_matrix/nvs_${{ matrix.nvs_config }}.bin"
|
||||
if [ -f "$NVS_BIN" ]; then
|
||||
echo "Injecting NVS: $NVS_BIN ($(stat -c%s "$NVS_BIN") bytes)"
|
||||
file_size() { stat -c%s "$1" 2>/dev/null || stat -f%z "$1" 2>/dev/null || wc -c < "$1"; }
|
||||
echo "Injecting NVS: $NVS_BIN ($(file_size "$NVS_BIN") bytes)"
|
||||
dd if="$NVS_BIN" of=build/qemu_flash.bin \
|
||||
bs=1 seek=$((0x9000)) conv=notrunc 2>/dev/null
|
||||
else
|
||||
|
|
@ -159,9 +174,8 @@ jobs:
|
|||
- name: Run QEMU smoke test
|
||||
env:
|
||||
QEMU_PATH: /opt/qemu-esp32/bin/qemu-system-xtensa
|
||||
QEMU_TIMEOUT: "60"
|
||||
QEMU_TIMEOUT: "90"
|
||||
run: |
|
||||
# Run QEMU with timeout; capture output
|
||||
echo "Starting QEMU (timeout: ${QEMU_TIMEOUT}s)..."
|
||||
|
||||
timeout "$QEMU_TIMEOUT" "$QEMU_PATH" \
|
||||
|
|
@ -169,6 +183,7 @@ jobs:
|
|||
-nographic \
|
||||
-drive file=firmware/esp32-csi-node/build/qemu_flash.bin,if=mtd,format=raw \
|
||||
-serial mon:stdio \
|
||||
-nic user,model=open_eth,net=10.0.2.0/24 \
|
||||
-no-reboot \
|
||||
2>&1 | tee firmware/esp32-csi-node/build/qemu_output.log || true
|
||||
|
||||
|
|
@ -188,3 +203,153 @@ jobs:
|
|||
firmware/esp32-csi-node/build/qemu_output.log
|
||||
firmware/esp32-csi-node/build/nvs_matrix/
|
||||
retention-days: 14
|
||||
|
||||
fuzz-test:
|
||||
name: Fuzz Testing (ADR-061 Layer 6)
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Install clang
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y clang
|
||||
|
||||
- name: Build fuzz targets
|
||||
working-directory: firmware/esp32-csi-node/test
|
||||
run: make all CC=clang
|
||||
|
||||
- name: Run serialize fuzzer (60s)
|
||||
working-directory: firmware/esp32-csi-node/test
|
||||
run: make run_serialize FUZZ_DURATION=60 || echo "FUZZER_CRASH=serialize" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Run edge enqueue fuzzer (60s)
|
||||
working-directory: firmware/esp32-csi-node/test
|
||||
run: make run_edge FUZZ_DURATION=60 || echo "FUZZER_CRASH=edge" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Run NVS config fuzzer (60s)
|
||||
working-directory: firmware/esp32-csi-node/test
|
||||
run: make run_nvs FUZZ_DURATION=60 || echo "FUZZER_CRASH=nvs" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Check for crashes
|
||||
working-directory: firmware/esp32-csi-node/test
|
||||
run: |
|
||||
CRASHES=$(find . -type f \( -name "crash-*" -o -name "oom-*" -o -name "timeout-*" \) 2>/dev/null | wc -l)
|
||||
echo "Crash artifacts found: $CRASHES"
|
||||
if [ "$CRASHES" -gt 0 ] || [ -n "${FUZZER_CRASH:-}" ]; then
|
||||
echo "::error::Fuzzer found $CRASHES crash/oom/timeout artifacts. FUZZER_CRASH=${FUZZER_CRASH:-none}"
|
||||
ls -la crash-* oom-* timeout-* 2>/dev/null
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Upload fuzz artifacts
|
||||
if: failure()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: fuzz-crashes
|
||||
path: |
|
||||
firmware/esp32-csi-node/test/crash-*
|
||||
firmware/esp32-csi-node/test/oom-*
|
||||
firmware/esp32-csi-node/test/timeout-*
|
||||
retention-days: 30
|
||||
|
||||
nvs-matrix-validate:
|
||||
name: NVS Matrix Generation
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Install NVS generator
|
||||
run: pip install esp-idf-nvs-partition-gen
|
||||
|
||||
- name: Generate all 14 NVS configs
|
||||
run: |
|
||||
python3 scripts/generate_nvs_matrix.py \
|
||||
--output-dir build/nvs_matrix
|
||||
|
||||
- name: Verify all binaries generated
|
||||
run: |
|
||||
EXPECTED=14
|
||||
ACTUAL=$(find build/nvs_matrix -type f -name "nvs_*.bin" 2>/dev/null | wc -l)
|
||||
echo "Generated $ACTUAL / $EXPECTED NVS binaries"
|
||||
ls -la build/nvs_matrix/
|
||||
|
||||
if [ "$ACTUAL" -lt "$EXPECTED" ]; then
|
||||
echo "::error::Only $ACTUAL of $EXPECTED NVS binaries generated"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Verify binary sizes
|
||||
run: |
|
||||
file_size() { stat -c%s "$1" 2>/dev/null || stat -f%z "$1" 2>/dev/null || wc -c < "$1"; }
|
||||
for f in build/nvs_matrix/nvs_*.bin; do
|
||||
SIZE=$(file_size "$f")
|
||||
if [ "$SIZE" -ne 24576 ]; then
|
||||
echo "::error::$f has unexpected size $SIZE (expected 24576)"
|
||||
exit 1
|
||||
fi
|
||||
echo " OK: $(basename $f) ($SIZE bytes)"
|
||||
done
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ADR-062: QEMU Swarm Configurator Test
|
||||
#
|
||||
# Runs a lightweight 3-node swarm (ci_matrix preset) under QEMU to validate
|
||||
# multi-node orchestration, TDM slot coordination, and swarm-level health
|
||||
# assertions. Uses the pre-built QEMU binary from the build-qemu job and the
|
||||
# firmware image built within this job (see "Build firmware for swarm").
|
||||
#
|
||||
# The CI runner is non-root, so TAP bridge networking is unavailable.
|
||||
# The orchestrator (qemu_swarm.py) detects this and falls back to SLIRP
|
||||
# user-mode networking, which is sufficient for the ci_matrix preset.
|
||||
# ---------------------------------------------------------------------------
|
||||
swarm-test:
|
||||
name: Swarm Test (ADR-062)
|
||||
needs: [build-qemu]
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: espressif/idf:v5.4
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download QEMU artifact
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: qemu-esp32
|
||||
path: ${{ github.workspace }}/qemu-build
|
||||
|
||||
- name: Make QEMU executable
|
||||
run: chmod +x ${{ github.workspace }}/qemu-build/bin/qemu-system-xtensa
|
||||
|
||||
- name: Install Python dependencies
|
||||
run: pip install pyyaml esptool esp-idf-nvs-partition-gen
|
||||
|
||||
- name: Build firmware for swarm
|
||||
working-directory: firmware/esp32-csi-node
|
||||
run: |
|
||||
. $IDF_PATH/export.sh
|
||||
idf.py set-target esp32s3
|
||||
idf.py -D SDKCONFIG_DEFAULTS="sdkconfig.defaults;sdkconfig.qemu" build
|
||||
python3 -m esptool --chip esp32s3 merge_bin \
|
||||
-o build/qemu_flash.bin \
|
||||
--flash_mode dio --flash_freq 80m --flash_size 8MB \
|
||||
0x0 build/bootloader/bootloader.bin \
|
||||
0x8000 build/partition_table/partition-table.bin \
|
||||
0x20000 build/esp32-csi-node.bin
|
||||
|
||||
- name: Run swarm smoke test
|
||||
run: |
|
||||
python3 scripts/qemu_swarm.py --preset ci_matrix \
|
||||
--qemu-path ${{ github.workspace }}/qemu-build/bin/qemu-system-xtensa \
|
||||
--output-dir build/swarm-results
|
||||
timeout-minutes: 10
|
||||
|
||||
- name: Upload swarm results
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: swarm-results
|
||||
path: |
|
||||
build/swarm-results/
|
||||
retention-days: 14
|
||||
|
|
|
|||
|
|
@ -0,0 +1,49 @@
|
|||
{
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"name": "QEMU ESP32-S3 Debug",
|
||||
"type": "cppdbg",
|
||||
"request": "launch",
|
||||
"program": "${workspaceFolder}/firmware/esp32-csi-node/build/esp32-csi-node.elf",
|
||||
"cwd": "${workspaceFolder}/firmware/esp32-csi-node",
|
||||
"MIMode": "gdb",
|
||||
"miDebuggerPath": "xtensa-esp-elf-gdb",
|
||||
"miDebuggerServerAddress": "localhost:1234",
|
||||
"setupCommands": [
|
||||
{
|
||||
"description": "Set remote hardware breakpoint limit (ESP32-S3 has 2)",
|
||||
"text": "set remote hardware-breakpoint-limit 2",
|
||||
"ignoreFailures": false
|
||||
},
|
||||
{
|
||||
"description": "Set remote hardware watchpoint limit (ESP32-S3 has 2)",
|
||||
"text": "set remote hardware-watchpoint-limit 2",
|
||||
"ignoreFailures": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "QEMU ESP32-S3 Debug (attach)",
|
||||
"type": "cppdbg",
|
||||
"request": "attach",
|
||||
"program": "${workspaceFolder}/firmware/esp32-csi-node/build/esp32-csi-node.elf",
|
||||
"cwd": "${workspaceFolder}/firmware/esp32-csi-node",
|
||||
"MIMode": "gdb",
|
||||
"miDebuggerPath": "xtensa-esp-elf-gdb",
|
||||
"miDebuggerServerAddress": "localhost:1234",
|
||||
"setupCommands": [
|
||||
{
|
||||
"description": "Set remote hardware breakpoint limit (ESP32-S3 has 2)",
|
||||
"text": "set remote hardware-breakpoint-limit 2",
|
||||
"ignoreFailures": false
|
||||
},
|
||||
{
|
||||
"description": "Set remote hardware watchpoint limit (ESP32-S3 has 2)",
|
||||
"text": "set remote hardware-watchpoint-limit 2",
|
||||
"ignoreFailures": false
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
28
CHANGELOG.md
28
CHANGELOG.md
|
|
@ -8,6 +8,34 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
- **QEMU ESP32-S3 testing platform (ADR-061)** — 9-layer firmware testing without hardware
|
||||
- Mock CSI generator with 10 physics-based scenarios (empty room, walking, fall, multi-person, etc.)
|
||||
- Single-node QEMU runner with 16-check UART validation
|
||||
- Multi-node TDM mesh simulation (TAP networking, 2-6 nodes)
|
||||
- GDB remote debugging with VS Code integration
|
||||
- Code coverage via gcov/lcov + apptrace
|
||||
- Fuzz testing (3 libFuzzer targets + ASAN/UBSAN)
|
||||
- NVS provisioning matrix (14 configs)
|
||||
- Snapshot-based regression testing (sub-second VM restore)
|
||||
- Chaos testing with fault injection + health monitoring
|
||||
- **QEMU Swarm Configurator (ADR-062)** — YAML-driven multi-ESP32 test orchestration
|
||||
- 4 topologies: star, mesh, line, ring
|
||||
- 3 node roles: sensor, coordinator, gateway
|
||||
- 9 swarm-level assertions (boot, crashes, TDM, frame rate, fall detection, etc.)
|
||||
- 7 presets: smoke (2n/15s), standard (3n/60s), ci-matrix, large-mesh, line-relay, ring-fault, heterogeneous
|
||||
- Health oracle with cross-node validation
|
||||
- **QEMU installer** (`install-qemu.sh`) — auto-detects OS, installs deps, builds Espressif QEMU fork
|
||||
- **Unified QEMU CLI** (`qemu-cli.sh`) — single entry point for all 11 QEMU test commands
|
||||
- CI: `firmware-qemu.yml` workflow with QEMU test matrix, fuzz testing, NVS validation, and swarm test jobs
|
||||
- User guide: QEMU testing and swarm configurator section with plain-language walkthrough
|
||||
|
||||
### Fixed
|
||||
- Firmware now boots in QEMU: WiFi/UDP/OTA/display guards for mock CSI mode
|
||||
- 9 bugs in mock_csi.c (LFSR bias, MAC filter init, scenario loop, overflow burst timing)
|
||||
- 23 bugs from ADR-061 deep review (inject_fault.py writes, CI cache, snapshot log corruption, etc.)
|
||||
- 16 bugs from ADR-062 deep review (log filename mismatch, SLIRP port collision, heap false positives, etc.)
|
||||
- All scripts: `--help` flags, prerequisite checks with install hints, standardized exit codes
|
||||
|
||||
- **Sensing server UI API completion (ADR-043)** — 14 fully-functional REST endpoints for model management, CSI recording, and training control
|
||||
- Model CRUD: `GET /api/v1/models`, `GET /api/v1/models/active`, `POST /api/v1/models/load`, `POST /api/v1/models/unload`, `DELETE /api/v1/models/:id`, `GET /api/v1/models/lora/profiles`, `POST /api/v1/models/lora/activate`
|
||||
- CSI recording: `GET /api/v1/recording/list`, `POST /api/v1/recording/start`, `POST /api/v1/recording/stop`, `DELETE /api/v1/recording/:id`
|
||||
|
|
|
|||
77
README.md
77
README.md
|
|
@ -75,7 +75,7 @@ docker run -p 3000:3000 ruvnet/wifi-densepose:latest
|
|||
|----------|-------------|
|
||||
| [User Guide](docs/user-guide.md) | Step-by-step guide: installation, first run, API usage, hardware setup, training |
|
||||
| [Build Guide](docs/build-guide.md) | Building from source (Rust and Python) |
|
||||
| [Architecture Decisions](docs/adr/README.md) | 49 ADRs — why each technical choice was made, organized by domain (hardware, signal processing, ML, platform, infrastructure) |
|
||||
| [Architecture Decisions](docs/adr/README.md) | 62 ADRs — why each technical choice was made, organized by domain (hardware, signal processing, ML, platform, infrastructure) |
|
||||
| [Domain Models](docs/ddd/README.md) | 7 DDD models (RuvSense, Signal Processing, Training Pipeline, Hardware Platform, Sensing Server, WiFi-Mat, CHCI) — bounded contexts, aggregates, domain events, and ubiquitous language |
|
||||
| [Desktop App](rust-port/wifi-densepose-rs/crates/wifi-densepose-desktop/README.md) | **WIP** — Tauri v2 desktop app for node management, OTA updates, WASM deployment, and mesh visualization |
|
||||
|
||||
|
|
@ -1697,31 +1697,78 @@ WebSocket: `ws://localhost:3001/ws/sensing` (real-time sensing + vital signs)
|
|||
</details>
|
||||
|
||||
<details>
|
||||
<summary><strong>QEMU Firmware Testing (ADR-061)</strong></summary>
|
||||
<summary><strong>QEMU Firmware Testing (ADR-061) — 9-Layer Platform</strong></summary>
|
||||
|
||||
Test ESP32-S3 firmware without physical hardware using Espressif's QEMU fork.
|
||||
Test ESP32-S3 firmware without physical hardware using Espressif's QEMU fork. The platform provides 9 layers of testing capability:
|
||||
|
||||
| Layer | Capability | Script / Config |
|
||||
|-------|-----------|-----------------|
|
||||
| 1 | Mock CSI generator (10 physics-based scenarios) | `firmware/esp32-csi-node/main/mock_csi.c` |
|
||||
| 2 | Single-node QEMU runner + UART validation (16 checks) | `scripts/qemu-esp32s3-test.sh`, `scripts/validate_qemu_output.py` |
|
||||
| 3 | Multi-node TDM mesh simulation (TAP networking) | `scripts/qemu-mesh-test.sh`, `scripts/validate_mesh_test.py` |
|
||||
| 4 | GDB remote debugging (VS Code integration) | `.vscode/launch.json` |
|
||||
| 5 | Code coverage (gcov/lcov via apptrace) | `firmware/esp32-csi-node/sdkconfig.coverage` |
|
||||
| 6 | Fuzz testing (libFuzzer + ASAN/UBSAN) | `firmware/esp32-csi-node/test/fuzz_*.c` |
|
||||
| 7 | NVS provisioning matrix (14 configs) | `scripts/generate_nvs_matrix.py` |
|
||||
| 8 | Snapshot regression (sub-second VM restore) | `scripts/qemu-snapshot-test.sh` |
|
||||
| 9 | Chaos testing (fault injection + health monitoring) | `scripts/qemu-chaos-test.sh`, `scripts/inject_fault.py`, `scripts/check_health.py` |
|
||||
|
||||
```bash
|
||||
# Build with mock CSI
|
||||
# Quick start: build + run + validate
|
||||
cd firmware/esp32-csi-node
|
||||
idf.py -D SDKCONFIG_DEFAULTS="sdkconfig.defaults;sdkconfig.qemu" build
|
||||
|
||||
# Create flash image
|
||||
esptool.py --chip esp32s3 merge_bin -o build/qemu_flash.bin \
|
||||
--flash_size 8MB 0x0 build/bootloader/bootloader.bin \
|
||||
0x8000 build/partition_table/partition-table.bin \
|
||||
0x20000 build/esp32-csi-node.bin
|
||||
# Single-node test (builds, merges flash, runs QEMU, validates output)
|
||||
bash scripts/qemu-esp32s3-test.sh
|
||||
|
||||
# Run in QEMU
|
||||
qemu-system-xtensa -machine esp32s3 -nographic \
|
||||
-drive file=build/qemu_flash.bin,if=mtd,format=raw
|
||||
# Multi-node mesh test (3 QEMU instances with TDM)
|
||||
sudo bash scripts/qemu-mesh-test.sh 3
|
||||
|
||||
# Fuzz testing (60 seconds per target)
|
||||
cd firmware/esp32-csi-node/test && make all CC=clang && make run_serialize FUZZ_DURATION=60
|
||||
|
||||
# Chaos testing (fault injection resilience)
|
||||
bash scripts/qemu-chaos-test.sh --faults all --duration 120
|
||||
```
|
||||
|
||||
**10 test scenarios**: empty room, static person, walking, fall, multi-person, channel sweep, MAC filter, ring overflow, boundary RSSI, zero-length frames.
|
||||
|
||||
**14 NVS configs**: default, WiFi-only, full ADR-060, edge tiers 0/1/2, TDM mesh, WASM signed/unsigned, 5GHz, boundary values.
|
||||
**14 NVS configs**: default, WiFi-only, full ADR-060, edge tiers 0/1/2, TDM mesh, WASM signed/unsigned, 5GHz, boundary max/min, power-save, empty-strings.
|
||||
|
||||
See [ADR-061](docs/adr/ADR-061-qemu-esp32s3-firmware-testing.md) and [firmware README](firmware/esp32-csi-node/README.md) for full details.
|
||||
**CI**: GitHub Actions workflow runs 7 NVS matrix configs, 3 fuzz targets, and NVS binary validation on every push to `firmware/`.
|
||||
|
||||
See [ADR-061](docs/adr/ADR-061-qemu-esp32s3-firmware-testing.md) for the full architecture.
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary><strong>QEMU Swarm Configurator (ADR-062)</strong></summary>
|
||||
|
||||
Test multiple ESP32-S3 nodes simultaneously using a YAML-driven orchestrator. Define node roles, network topologies, and validation assertions in a config file.
|
||||
|
||||
```bash
|
||||
# Quick smoke test (2 nodes, 15 seconds)
|
||||
python3 scripts/qemu_swarm.py --preset smoke
|
||||
|
||||
# Standard 3-node test (coordinator + 2 sensors)
|
||||
python3 scripts/qemu_swarm.py --preset standard
|
||||
|
||||
# See all presets
|
||||
python3 scripts/qemu_swarm.py --list-presets
|
||||
|
||||
# Preview without running
|
||||
python3 scripts/qemu_swarm.py --preset standard --dry-run
|
||||
```
|
||||
|
||||
**Topologies**: star (sensors → coordinator), mesh (fully connected), line (relay chain), ring (circular).
|
||||
|
||||
**Node roles**: sensor (generates CSI), coordinator (aggregates), gateway (bridges to host).
|
||||
|
||||
**7 presets**: smoke, standard, ci-matrix, large-mesh, line-relay, ring-fault, heterogeneous.
|
||||
|
||||
**9 swarm assertions**: boot check, crash detection, TDM collision, frame production, coordinator reception, fall detection, frame rate, boot time, heap health.
|
||||
|
||||
See [ADR-062](docs/adr/ADR-062-qemu-swarm-configurator.md) and the [User Guide](docs/user-guide.md#testing-firmware-without-hardware-qemu) for step-by-step instructions.
|
||||
|
||||
</details>
|
||||
|
||||
|
|
@ -1744,7 +1791,9 @@ wifi-densepose tasks list # List background tasks
|
|||
<details>
|
||||
<summary><strong>Documentation Links</strong></summary>
|
||||
|
||||
- [User Guide](docs/user-guide.md) — installation, first run, API, hardware setup, QEMU testing
|
||||
- [WiFi-Mat User Guide](docs/wifi-mat-user-guide.md) | [Domain Model](docs/ddd/wifi-mat-domain-model.md)
|
||||
- [ADR-061](docs/adr/ADR-061-qemu-esp32s3-firmware-testing.md) QEMU platform | [ADR-062](docs/adr/ADR-062-qemu-swarm-configurator.md) Swarm configurator
|
||||
- [ADR-021](docs/adr/ADR-021-vital-sign-detection-rvdna-pipeline.md) | [ADR-022](docs/adr/ADR-022-windows-wifi-enhanced-fidelity-ruvector.md) | [ADR-023](docs/adr/ADR-023-trained-densepose-model-ruvector-pipeline.md)
|
||||
|
||||
</details>
|
||||
|
|
|
|||
|
|
@ -2,8 +2,8 @@
|
|||
|
||||
| Field | Value |
|
||||
|-------------|------------------------------------------------|
|
||||
| **Status** | Proposed |
|
||||
| **Date** | 2026-03-13 |
|
||||
| **Status** | Accepted |
|
||||
| **Date** | 2026-03-13 (updated 2026-03-14) |
|
||||
| **Authors** | RuView Team |
|
||||
| **Relates** | ADR-018 (binary frame), ADR-039 (edge intel), ADR-040 (WASM), ADR-057 (build guard), ADR-060 (channel/MAC filter) |
|
||||
|
||||
|
|
@ -32,6 +32,98 @@ Currently, **every code change requires flashing to physical hardware** on COM7.
|
|||
|
||||
Espressif maintains an official QEMU fork (`github.com/espressif/qemu`) with ESP32-S3 machine support, including dual-core Xtensa LX7, flash mapping, UART, GPIO, timers, and FreeRTOS.
|
||||
|
||||
## Glossary
|
||||
|
||||
| Term | Definition |
|
||||
|------|-----------|
|
||||
| CSI | Channel State Information — per-subcarrier amplitude/phase from WiFi |
|
||||
| NVS | Non-Volatile Storage — ESP-IDF key-value flash partition |
|
||||
| TDM | Time-Division Multiplexing — nodes transmit in assigned time slots |
|
||||
| UART | Universal Asynchronous Receiver-Transmitter — serial console output |
|
||||
| SLIRP | User-mode TCP/IP stack — enables networking without root/TAP |
|
||||
| QEMU | Quick Emulator — runs ESP32-S3 firmware without physical hardware |
|
||||
| QMP | QEMU Machine Protocol — JSON-based control interface |
|
||||
| LFSR | Linear Feedback Shift Register — deterministic pseudo-random generator |
|
||||
| SPSC | Single Producer Single Consumer — lock-free ring buffer pattern |
|
||||
| FreeRTOS | Real-time OS used by ESP-IDF for task scheduling |
|
||||
| gcov/lcov | GCC code coverage tools for line/branch analysis |
|
||||
| libFuzzer | LLVM coverage-guided fuzzer for finding crashes |
|
||||
| ASAN | AddressSanitizer — detects buffer overflows and use-after-free |
|
||||
| UBSAN | UndefinedBehaviorSanitizer — detects undefined C behavior |
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Prerequisites
|
||||
|
||||
Install required tools:
|
||||
|
||||
```bash
|
||||
# QEMU (Espressif fork with ESP32-S3 support)
|
||||
git clone https://github.com/espressif/qemu.git
|
||||
cd qemu && ./configure --target-list=xtensa-softmmu && make -j$(nproc)
|
||||
export QEMU_PATH=/path/to/qemu/build/qemu-system-xtensa
|
||||
|
||||
# ESP-IDF (for building firmware)
|
||||
# See https://docs.espressif.com/projects/esp-idf/en/latest/esp32s3/get-started/
|
||||
|
||||
# Python tools
|
||||
pip install esptool esp-idf-nvs-partition-gen
|
||||
|
||||
# Coverage tools (optional, Layer 5)
|
||||
sudo apt install lcov # Debian/Ubuntu
|
||||
brew install lcov # macOS
|
||||
|
||||
# Fuzz testing (optional, Layer 6)
|
||||
sudo apt install clang # Debian/Ubuntu
|
||||
|
||||
# Mesh testing (optional, Layer 3 — requires root)
|
||||
sudo apt install socat bridge-utils iproute2
|
||||
```
|
||||
|
||||
### Run the Full Test Suite
|
||||
|
||||
```bash
|
||||
# Layer 2: Single-node test (build + run + validate)
|
||||
bash scripts/qemu-esp32s3-test.sh
|
||||
|
||||
# Layer 3: Multi-node mesh (3 nodes, requires root)
|
||||
sudo bash scripts/qemu-mesh-test.sh 3
|
||||
|
||||
# Layer 6: Fuzz testing (60 seconds per target)
|
||||
cd firmware/esp32-csi-node/test && make all CC=clang
|
||||
make run_serialize FUZZ_DURATION=60
|
||||
|
||||
# Layer 7: Generate NVS test matrix
|
||||
python3 scripts/generate_nvs_matrix.py --output-dir build/nvs_matrix
|
||||
|
||||
# Layer 8: Snapshot regression tests
|
||||
bash scripts/qemu-snapshot-test.sh --create
|
||||
bash scripts/qemu-snapshot-test.sh --restore csi-streaming
|
||||
|
||||
# Layer 9: Chaos/fault injection
|
||||
bash scripts/qemu-chaos-test.sh --faults all --duration 120
|
||||
```
|
||||
|
||||
### Environment Variables
|
||||
|
||||
| Variable | Default | Description |
|
||||
|----------|---------|-------------|
|
||||
| `QEMU_PATH` | `qemu-system-xtensa` | Path to Espressif QEMU binary |
|
||||
| `QEMU_TIMEOUT` | `60` (single) / `45` (mesh) / `120` (chaos) | Test timeout in seconds |
|
||||
| `SKIP_BUILD` | unset | Set to `1` to skip firmware build step |
|
||||
| `NVS_BIN` | unset | Path to pre-built NVS partition binary |
|
||||
| `QEMU_NET` | `1` | Set to `0` to disable SLIRP networking |
|
||||
| `CHAOS_SEED` | current time | Seed for reproducible chaos testing |
|
||||
|
||||
### Exit Codes (all scripts)
|
||||
|
||||
| Code | Meaning | Action |
|
||||
|------|---------|--------|
|
||||
| 0 | PASS | All checks passed |
|
||||
| 1 | WARN | Non-critical issues; review output |
|
||||
| 2 | FAIL | Critical checks failed; fix and re-run |
|
||||
| 3 | FATAL | Build error, crash, or missing tool; check prerequisites |
|
||||
|
||||
## Decision
|
||||
|
||||
Introduce a **comprehensive QEMU testing platform** for the ESP32-S3 CSI node firmware with nine capability layers:
|
||||
|
|
@ -145,7 +237,7 @@ This model exercises:
|
|||
| 5 | Channel sweep | 5s | Frames on channels 1, 6, 11 in sequence |
|
||||
| 6 | MAC filter test | 5s | Frames with wrong MAC are dropped (counter check) |
|
||||
| 7 | Ring buffer overflow | 3s | 1000 frames in 100ms burst, graceful drop |
|
||||
| 8 | Boundary RSSI | 5s | RSSI sweeps -127 to 0, no crash |
|
||||
| 8 | Boundary RSSI | 5s | RSSI sweeps -90 to -10 dBm, no crash |
|
||||
| 9 | Zero-length frame | 2s | `iq_len=0` frames, serialize returns 0 |
|
||||
|
||||
---
|
||||
|
|
@ -456,6 +548,53 @@ xtensa-esp-elf-gdb build/esp32-csi-node.elf \
|
|||
-ex "continue"
|
||||
```
|
||||
|
||||
### Debugging Walkthrough
|
||||
|
||||
**1. Start QEMU with GDB stub (paused at reset vector):**
|
||||
|
||||
```bash
|
||||
qemu-system-xtensa \
|
||||
-machine esp32s3 \
|
||||
-nographic \
|
||||
-drive file=build/qemu_flash.bin,if=mtd,format=raw \
|
||||
-serial mon:stdio \
|
||||
-s -S
|
||||
# -s opens GDB server on localhost:1234
|
||||
# -S pauses CPU until GDB sends "continue"
|
||||
```
|
||||
|
||||
**2. Connect from a second terminal:**
|
||||
|
||||
```bash
|
||||
xtensa-esp-elf-gdb build/esp32-csi-node.elf \
|
||||
-ex "target remote :1234" \
|
||||
-ex "b app_main" \
|
||||
-ex "continue"
|
||||
```
|
||||
|
||||
**3. Set a breakpoint on DSP processing and inspect state:**
|
||||
|
||||
```
|
||||
(gdb) b edge_processing.c:dsp_task
|
||||
(gdb) continue
|
||||
# ...breakpoint hit...
|
||||
(gdb) print g_nvs_config
|
||||
(gdb) print ring->head - ring->tail
|
||||
(gdb) continue
|
||||
```
|
||||
|
||||
**4. Connect from VS Code** using the `launch.json` config below (set breakpoints in the editor gutter, then press F5).
|
||||
|
||||
**5. Dump gcov coverage data (requires `sdkconfig.coverage` overlay):**
|
||||
|
||||
```
|
||||
(gdb) monitor gcov dump
|
||||
# Writes .gcda files to the build directory.
|
||||
# Then generate the HTML report on the host:
|
||||
# lcov --capture --directory build --output-file coverage.info
|
||||
# genhtml coverage.info --output-directory build/coverage_report
|
||||
```
|
||||
|
||||
### Key Breakpoint Locations
|
||||
|
||||
| Breakpoint | Purpose |
|
||||
|
|
@ -862,3 +1001,32 @@ Alternative to QEMU with better peripheral modeling for some platforms.
|
|||
- ADR-040: WASM programmable sensing runtime
|
||||
- ADR-057: Build-time CSI guard (`CONFIG_ESP_WIFI_CSI_ENABLED`)
|
||||
- ADR-060: Channel override and MAC address filter
|
||||
|
||||
---
|
||||
|
||||
## Optimization Log (2026-03-14)
|
||||
|
||||
### Bugs Fixed
|
||||
|
||||
1. **LFSR float bias** — `lfsr_float()` used divisor 32767.5 producing range [-1.0, 1.00002]; fixed to 32768.0 for exact [-1.0, +1.0)
|
||||
2. **MAC filter initialization** — `gen_mac_filter()` compared `frame_count == scenario_start_ms` (count vs timestamp); replaced with boolean flag
|
||||
3. **Scenario infinite loop** — `advance_scenario()` looped to scenario 0 when all completed; now sets `s_all_done=true` and timer callback exits early
|
||||
4. **Boot check severity** — `validate_qemu_output.py` reported no-boot as ERROR; upgraded to FATAL (nothing works without boot)
|
||||
5. **NVS boundary-max** — `boundary-max` used `vital_win=65535` which firmware silently rejects (valid: 32-256); fixed to 256
|
||||
6. **NVS boundary-min** — `vital_win=1` also invalid; fixed to 32 (firmware min)
|
||||
7. **edge-tier2-custom** — `vital_win=512` exceeded firmware max of 256; fixed to 256
|
||||
8. **power-save config** — Described as "10% duty cycle" but didn't set `power_duty=10`; fixed
|
||||
9. **wasm-signed/unsigned** — Both configs were identical; signed now includes pubkey blob, unsigned sets `wasm_verify=0`
|
||||
|
||||
### Optimizations Applied
|
||||
|
||||
1. **SLIRP networking** — QEMU runner now passes `-nic user,model=open_eth` for UDP testing
|
||||
2. **Scenario completion tracking** — Validator now checks `All N scenarios complete` log marker (check 15)
|
||||
3. **Frame rate monitoring** — Validator extracts `scenario=N frames=M` counters for rate analysis (check 16)
|
||||
4. **Watchdog tuning** — `sdkconfig.qemu` relaxes WDT to 30s / INT_WDT to 800ms for QEMU timing variance
|
||||
5. **Timer stack depth** — Increased `FREERTOS_TIMER_TASK_STACK_DEPTH=4096` to prevent overflow from math-heavy mock callback
|
||||
6. **Display disabled** — `CONFIG_DISPLAY_ENABLE=n` in QEMU overlay (no I2C hardware)
|
||||
7. **CI fuzz job** — Added `fuzz-test` job running all 3 fuzz targets for 60s each with crash artifact upload
|
||||
8. **CI NVS validation** — Added `nvs-matrix-validate` job that generates all 14 binaries and verifies sizes
|
||||
9. **CI matrix expanded** — Added `edge-tier1`, `boundary-max`, `boundary-min` to QEMU test matrix (4 → 7 configs)
|
||||
10. **QEMU cache key** — Uses a versioned key suffix (`-v4`) with restore-keys fallback; bump the suffix to invalidate a stale QEMU build
|
||||
|
|
|
|||
|
|
@ -0,0 +1,199 @@
|
|||
# ADR-062: QEMU ESP32-S3 Swarm Configurator
|
||||
|
||||
| Field | Value |
|
||||
|-------------|------------------------------------------------|
|
||||
| **Status** | Accepted |
|
||||
| **Date** | 2026-03-14 |
|
||||
| **Authors** | RuView Team |
|
||||
| **Relates** | ADR-061 (QEMU testing platform), ADR-060 (channel/MAC filter), ADR-018 (binary frame), ADR-039 (edge intel) |
|
||||
|
||||
## Glossary
|
||||
|
||||
| Term | Definition |
|
||||
|------|-----------|
|
||||
| Swarm | A group of N QEMU ESP32-S3 instances running simultaneously |
|
||||
| Topology | How nodes are connected: star, mesh, line, ring |
|
||||
| Role | Node function: `sensor` (collects CSI), `coordinator` (aggregates + forwards), `gateway` (bridges to host) |
|
||||
| Scenario matrix | Cross-product of topology × node count × NVS config × mock scenario |
|
||||
| Health oracle | Python process that monitors all node UART logs and declares swarm health |
|
||||
|
||||
## Context
|
||||
|
||||
ADR-061 Layer 3 provides a basic multi-node mesh test: N identical nodes with sequential TDM slots connected via a Linux bridge. This is useful but limited:
|
||||
|
||||
1. **All nodes are identical** — real deployments have heterogeneous roles (sensor, coordinator, gateway)
|
||||
2. **Single topology** — only fully-connected bridge; no star, line, or ring topologies
|
||||
3. **No scenario variation per node** — all nodes run the same mock CSI scenario
|
||||
4. **Manual configuration** — each test requires hand-editing env vars and arguments
|
||||
5. **No swarm-level health monitoring** — validation checks individual nodes, not collective behavior
|
||||
6. **No cross-node timing validation** — TDM slot ordering and inter-frame gaps aren't verified
|
||||
|
||||
Real WiFi-DensePose deployments use 3-8 ESP32-S3 nodes in various topologies. A single coordinator aggregates CSI from multiple sensors. The firmware must handle TDM conflicts, missing nodes, role-based behavior differences, and network partitions — none of which ADR-061 Layer 3 tests.
|
||||
|
||||
## Decision
|
||||
|
||||
Build a **QEMU Swarm Configurator** — a YAML-driven tool that defines multi-node test scenarios declaratively and orchestrates them under QEMU with swarm-level validation.
|
||||
|
||||
### Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────┐
|
||||
│ swarm_config.yaml │
|
||||
│ nodes: [{role: sensor, scenario: 2, channel: 6}] │
|
||||
│ topology: star │
|
||||
│ duration: 60s │
|
||||
│ assertions: [all_nodes_boot, tdm_no_collision, ...] │
|
||||
└──────────────────────┬──────────────────────────────┘
|
||||
│
|
||||
┌────────────▼────────────┐
|
||||
│ qemu_swarm.py │
|
||||
│ (orchestrator) │
|
||||
└───┬────┬────┬───┬──────┘
|
||||
│ │ │ │
|
||||
┌────▼┐ ┌▼──┐ ▼ ┌▼────┐
|
||||
│Node0│ │N1 │... │N(n-1)│ QEMU instances
|
||||
│sens │ │sen│ │coord │
|
||||
└──┬──┘ └─┬─┘ └──┬───┘
|
||||
│ │ │
|
||||
┌──▼──────▼─────────▼──┐
|
||||
│ Virtual Network │ TAP bridge / SLIRP
|
||||
│ (topology-shaped) │
|
||||
└──────────┬───────────┘
|
||||
│
|
||||
┌──────────▼───────────┐
|
||||
│ Aggregator (Rust) │ Collects frames
|
||||
└──────────┬───────────┘
|
||||
│
|
||||
┌──────────▼───────────┐
|
||||
│ Health Oracle │ Swarm-level assertions
|
||||
│ (swarm_health.py) │
|
||||
└──────────────────────┘
|
||||
```
|
||||
|
||||
### YAML Configuration Schema
|
||||
|
||||
```yaml
|
||||
# swarm_config.yaml
|
||||
swarm:
|
||||
name: "3-sensor-star"
|
||||
duration_s: 60
|
||||
topology: star # star | mesh | line | ring
|
||||
aggregator_port: 5005
|
||||
|
||||
nodes:
|
||||
- role: coordinator
|
||||
node_id: 0
|
||||
scenario: 0 # empty room (baseline)
|
||||
channel: 6
|
||||
edge_tier: 2
|
||||
is_gateway: true # receives aggregated frames
|
||||
|
||||
- role: sensor
|
||||
node_id: 1
|
||||
scenario: 2 # walking person
|
||||
channel: 6
|
||||
tdm_slot: 1 # TDM slot index (auto-assigned from node position if omitted)
|
||||
|
||||
- role: sensor
|
||||
node_id: 2
|
||||
scenario: 3 # fall event
|
||||
channel: 6
|
||||
tdm_slot: 2
|
||||
|
||||
assertions:
|
||||
- all_nodes_boot
|
||||
- no_crashes
|
||||
- tdm_no_collision
|
||||
- all_nodes_produce_frames
|
||||
- coordinator_receives_from_all
|
||||
- fall_detected_by_node_2
|
||||
- frame_rate_above: 15 # Hz minimum per node
|
||||
- max_boot_time_s: 10
|
||||
```
|
||||
|
||||
### Topologies
|
||||
|
||||
| Topology | Network | Description |
|
||||
|----------|---------|-------------|
|
||||
| `star` | All sensors connect to coordinator; coordinator has TAP to each sensor | Hub-and-spoke, most common |
|
||||
| `mesh` | All nodes on same bridge (existing Layer 3 behavior) | Every node sees every other |
|
||||
| `line` | Node 0 ↔ Node 1 ↔ Node 2 ↔ ... | Linear chain, tests multi-hop |
|
||||
| `ring` | Like line but last connects to first | Circular, tests routing |
|
||||
|
||||
### Node Roles
|
||||
|
||||
| Role | Behavior | NVS Keys |
|
||||
|------|----------|----------|
|
||||
| `sensor` | Runs mock CSI, sends frames to coordinator | `node_id`, `tdm_slot`, `target_ip` |
|
||||
| `coordinator` | Receives frames from sensors, runs edge aggregation | `node_id`, `tdm_slot=0`, `edge_tier=2` |
|
||||
| `gateway` | Like coordinator but also bridges to host UDP | `node_id`, `target_ip=host`, `is_gateway=1` |
|
||||
|
||||
### Assertions (Swarm-Level)
|
||||
|
||||
| Assertion | What It Checks |
|
||||
|-----------|---------------|
|
||||
| `all_nodes_boot` | Every node's UART log shows boot indicators within timeout |
|
||||
| `no_crashes` | No Guru Meditation, assert, panic in any log |
|
||||
| `tdm_no_collision` | No two nodes transmit in the same TDM slot |
|
||||
| `all_nodes_produce_frames` | Every sensor node's log contains CSI frame output |
|
||||
| `coordinator_receives_from_all` | Coordinator log shows frames from each sensor's node_id |
|
||||
| `fall_detected_by_node_N` | Node N's log reports a fall detection event |
|
||||
| `frame_rate_above` | Each node produces at least N frames/second |
|
||||
| `max_boot_time_s` | All nodes boot within N seconds |
|
||||
| `no_heap_errors` | No OOM or heap corruption in any log |
|
||||
| `network_partitioned_recovery` | After deliberate partition, nodes resume communication (future) |
|
||||
|
||||
### Preset Configurations
|
||||
|
||||
| Preset | Nodes | Topology | Purpose |
|
||||
|--------|-------|----------|---------|
|
||||
| `smoke` | 2 | star | Quick CI smoke test (15s) |
|
||||
| `standard` | 3 | star | Default 3-node (sensor + sensor + coordinator) |
|
||||
| `large_mesh` | 6 | mesh | Scale test with 6 fully-connected nodes |
|
||||
| `line_relay` | 4 | line | Multi-hop relay chain |
|
||||
| `ring_fault` | 4 | ring | Ring with fault injection mid-test |
|
||||
| `heterogeneous` | 5 | star | Mixed scenarios: walk, fall, static, channel-sweep, empty |
|
||||
| `ci_matrix` | 3 | star | CI-optimized preset (30s, minimal assertions) |
|
||||
|
||||
## File Layout
|
||||
|
||||
```
|
||||
scripts/
|
||||
├── qemu_swarm.py # Main orchestrator (CLI entry point)
|
||||
├── swarm_health.py # Swarm-level health oracle
|
||||
└── swarm_presets/
|
||||
├── smoke.yaml
|
||||
├── standard.yaml
|
||||
├── large_mesh.yaml
|
||||
├── line_relay.yaml
|
||||
├── ring_fault.yaml
|
||||
├── heterogeneous.yaml
|
||||
└── ci_matrix.yaml
|
||||
|
||||
.github/workflows/
|
||||
└── firmware-qemu.yml # MODIFIED: add swarm test job
|
||||
```
|
||||
|
||||
## Consequences
|
||||
|
||||
### Benefits
|
||||
|
||||
1. **Declarative testing** — define swarm topology in YAML, not shell scripts
|
||||
2. **Role-based nodes** — test coordinator/sensor/gateway interactions
|
||||
3. **Topology variety** — star/mesh/line/ring match real deployment patterns
|
||||
4. **Swarm-level assertions** — validate collective behavior, not just individual nodes
|
||||
5. **Preset library** — quick CI smoke tests and thorough manual validation
|
||||
6. **Reproducible** — YAML configs are version-controlled and shareable
|
||||
|
||||
### Limitations
|
||||
|
||||
1. **Still requires root** for TAP bridge topologies (star, line, ring); mesh can use SLIRP
|
||||
2. **QEMU resource usage** — 6+ QEMU instances use ~2GB RAM, may slow CI runners
|
||||
3. **No real RF** — inter-node communication is IP-based, not WiFi CSI multipath
|
||||
|
||||
## References
|
||||
|
||||
- ADR-061: QEMU ESP32-S3 firmware testing platform (Layers 1-9)
|
||||
- ADR-060: Channel override and MAC address filter provisioning
|
||||
- ADR-018: Binary CSI frame format (magic `0xC5110001`)
|
||||
- ADR-039: Edge intelligence pipeline (biquad, vitals, fall detection)
|
||||
|
|
@ -38,8 +38,17 @@ WiFi DensePose turns commodity WiFi signals into real-time human pose estimation
|
|||
- [ESP32-S3 Mesh](#esp32-s3-mesh)
|
||||
- [Intel 5300 / Atheros NIC](#intel-5300--atheros-nic)
|
||||
15. [Docker Compose (Multi-Service)](#docker-compose-multi-service)
|
||||
16. [Troubleshooting](#troubleshooting)
|
||||
17. [FAQ](#faq)
|
||||
16. [Testing Firmware Without Hardware (QEMU)](#testing-firmware-without-hardware-qemu)
|
||||
- [What You Need](#what-you-need)
|
||||
- [Your First Test Run](#your-first-test-run)
|
||||
- [Understanding the Test Output](#understanding-the-test-output)
|
||||
- [Testing Multiple Nodes at Once (Swarm)](#testing-multiple-nodes-at-once-swarm)
|
||||
- [Swarm Presets](#swarm-presets)
|
||||
- [Writing Your Own Swarm Config](#writing-your-own-swarm-config)
|
||||
- [Debugging Firmware in QEMU](#debugging-firmware-in-qemu)
|
||||
- [Running the Full Test Suite](#running-the-full-test-suite)
|
||||
17. [Troubleshooting](#troubleshooting)
|
||||
18. [FAQ](#faq)
|
||||
|
||||
---
|
||||
|
||||
|
|
@ -936,6 +945,288 @@ This starts:
|
|||
|
||||
---
|
||||
|
||||
## Testing Firmware Without Hardware (QEMU)
|
||||
|
||||
You can test the ESP32-S3 firmware on your computer without any physical hardware. The project uses **QEMU** — an emulator that pretends to be an ESP32-S3 chip, running the real firmware code inside a virtual machine on your PC.
|
||||
|
||||
This is useful when:
|
||||
- You don't have an ESP32-S3 board yet
|
||||
- You want to test firmware changes before flashing to real hardware
|
||||
- You're running automated tests in CI/CD
|
||||
- You want to simulate multiple ESP32 nodes talking to each other
|
||||
|
||||
### What You Need
|
||||
|
||||
**Required:**
|
||||
- Python 3.8+ (you probably already have this)
|
||||
- QEMU with ESP32-S3 support (Espressif's fork)
|
||||
|
||||
**Install QEMU (one-time setup):**
|
||||
|
||||
```bash
|
||||
# Easiest: use the automated installer (installs QEMU + Python tools)
|
||||
bash scripts/install-qemu.sh
|
||||
|
||||
# Or check what's already installed:
|
||||
bash scripts/install-qemu.sh --check
|
||||
```
|
||||
|
||||
The installer detects your OS (Ubuntu, Fedora, macOS, etc.), installs build dependencies, clones Espressif's QEMU fork, builds it, and adds it to your PATH. It also installs the Python tools (`esptool`, `pyyaml`, `esp-idf-nvs-partition-gen`).
|
||||
|
||||
<details>
|
||||
<summary>Manual installation (if you prefer)</summary>
|
||||
|
||||
```bash
|
||||
# Build from source
|
||||
git clone https://github.com/espressif/qemu.git
|
||||
cd qemu
|
||||
./configure --target-list=xtensa-softmmu --enable-slirp
|
||||
make -j$(nproc)
|
||||
export QEMU_PATH=$(pwd)/build/qemu-system-xtensa
|
||||
|
||||
# Install Python tools
|
||||
pip install esptool pyyaml esp-idf-nvs-partition-gen
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
**For multi-node testing (optional):**
|
||||
|
||||
```bash
|
||||
# Linux only — needed for virtual network bridges
|
||||
sudo apt install socat bridge-utils iproute2
|
||||
```
|
||||
|
||||
### The `qemu-cli.sh` Command
|
||||
|
||||
All QEMU testing is available through a single command:
|
||||
|
||||
```bash
|
||||
bash scripts/qemu-cli.sh <command>
|
||||
```
|
||||
|
||||
| Command | What it does |
|
||||
|---------|-------------|
|
||||
| `install` | Install QEMU (runs the installer above) |
|
||||
| `test` | Run single-node firmware test |
|
||||
| `swarm --preset smoke` | Quick 2-node swarm test |
|
||||
| `swarm --preset standard` | Standard 3-node test |
|
||||
| `mesh 3` | Multi-node mesh test |
|
||||
| `chaos` | Fault injection resilience test |
|
||||
| `fuzz --duration 60` | Run fuzz testing |
|
||||
| `status` | Show what's installed and ready |
|
||||
| `help` | Show all commands |
|
||||
|
||||
### Your First Test Run
|
||||
|
||||
The simplest way to test the firmware:
|
||||
|
||||
```bash
|
||||
# Using the CLI:
|
||||
bash scripts/qemu-cli.sh test
|
||||
|
||||
# Or directly:
|
||||
bash scripts/qemu-esp32s3-test.sh
|
||||
```
|
||||
|
||||
**What happens behind the scenes:**
|
||||
1. The firmware is compiled with a "mock CSI" mode — instead of reading real WiFi signals, it generates synthetic test data that mimics real people walking, falling, or breathing
|
||||
2. The compiled firmware is loaded into QEMU, which boots it like a real ESP32-S3
|
||||
3. The emulator's serial output (what you'd see on a USB cable) is captured
|
||||
4. A validation script checks the output for expected behavior and errors
|
||||
|
||||
If you already built the firmware and want to skip rebuilding:
|
||||
|
||||
```bash
|
||||
SKIP_BUILD=1 bash scripts/qemu-esp32s3-test.sh
|
||||
```
|
||||
|
||||
To give it more time (useful on slower machines):
|
||||
|
||||
```bash
|
||||
QEMU_TIMEOUT=120 bash scripts/qemu-esp32s3-test.sh
|
||||
```
|
||||
|
||||
### Understanding the Test Output
|
||||
|
||||
The test runs 16 checks on the firmware's output. Here's what a successful run looks like:
|
||||
|
||||
```
|
||||
=== QEMU ESP32-S3 Firmware Test (ADR-061) ===
|
||||
|
||||
[PASS] Boot: Firmware booted successfully
|
||||
[PASS] NVS config: Configuration loaded from flash
|
||||
[PASS] Mock CSI: Synthetic WiFi data generator started
|
||||
[PASS] Edge processing: Signal analysis pipeline running
|
||||
[PASS] Frame serialization: Data packets formatted correctly
|
||||
[PASS] No crashes: No error conditions detected
|
||||
...
|
||||
|
||||
16/16 checks passed
|
||||
=== Test Complete (exit code: 0) ===
|
||||
```
|
||||
|
||||
**Exit codes explained:**
|
||||
|
||||
| Code | Meaning | What to do |
|
||||
|------|---------|-----------|
|
||||
| 0 | **PASS** — everything works | Nothing, you're good! |
|
||||
| 1 | **WARN** — minor issues | Review the output; usually safe to continue |
|
||||
| 2 | **FAIL** — something broke | Check the `[FAIL]` lines for what went wrong |
|
||||
| 3 | **FATAL** — can't even start | Usually a missing tool or build failure; check error messages |
|
||||
|
||||
### Testing Multiple Nodes at Once (Swarm)
|
||||
|
||||
Real deployments use 3-8 ESP32 nodes. The **swarm configurator** lets you simulate multiple nodes on your computer, each with a different role:
|
||||
|
||||
- **Sensor nodes** — generate WiFi signal data (like ESP32s placed around a room)
|
||||
- **Coordinator node** — collects data from all sensors and runs analysis
|
||||
- **Gateway node** — bridges data to your computer
|
||||
|
||||
```bash
|
||||
# Quick 2-node smoke test (15 seconds)
|
||||
python3 scripts/qemu_swarm.py --preset smoke
|
||||
|
||||
# Standard 3-node test: 2 sensors + 1 coordinator (60 seconds)
|
||||
python3 scripts/qemu_swarm.py --preset standard
|
||||
|
||||
# See what's available
|
||||
python3 scripts/qemu_swarm.py --list-presets
|
||||
|
||||
# Preview what would run (without actually running)
|
||||
python3 scripts/qemu_swarm.py --preset standard --dry-run
|
||||
```
|
||||
|
||||
**Note:** Multi-node testing with virtual bridges requires Linux and `sudo`. Without them (non-Linux or non-root), the tool falls back to a simpler networking mode in which each node can reach the aggregator on the host, but nodes cannot reach each other.
|
||||
|
||||
### Swarm Presets
|
||||
|
||||
| Preset | Nodes | Duration | Best for |
|
||||
|--------|-------|----------|----------|
|
||||
| `smoke` | 2 | 15s | Quick check that things work |
|
||||
| `standard` | 3 | 60s | Normal development testing |
|
||||
| `ci_matrix` | 3 | 30s | CI/CD pipelines |
|
||||
| `large_mesh` | 6 | 90s | Testing at scale |
|
||||
| `line_relay` | 4 | 60s | Multi-hop relay testing |
|
||||
| `ring_fault` | 4 | 75s | Fault tolerance testing |
|
||||
| `heterogeneous` | 5 | 90s | Mixed scenario testing |
|
||||
|
||||
### Writing Your Own Swarm Config
|
||||
|
||||
Create a YAML file describing your test scenario:
|
||||
|
||||
```yaml
|
||||
# my_test.yaml
|
||||
swarm:
|
||||
name: my-custom-test
|
||||
duration_s: 45
|
||||
topology: star # star, mesh, line, or ring
|
||||
aggregator_port: 5005
|
||||
|
||||
nodes:
|
||||
- role: coordinator
|
||||
node_id: 0
|
||||
scenario: 0 # 0=empty room (baseline)
|
||||
channel: 6
|
||||
edge_tier: 2
|
||||
|
||||
- role: sensor
|
||||
node_id: 1
|
||||
scenario: 2 # 2=walking person
|
||||
channel: 6
|
||||
tdm_slot: 1
|
||||
|
||||
- role: sensor
|
||||
node_id: 2
|
||||
scenario: 3 # 3=fall event
|
||||
channel: 6
|
||||
tdm_slot: 2
|
||||
|
||||
assertions:
|
||||
- all_nodes_boot # Did every node start up?
|
||||
- no_crashes # Any error/panic?
|
||||
- all_nodes_produce_frames # Is each sensor generating data?
|
||||
- fall_detected_by_node_2 # Did node 2 detect the fall?
|
||||
```
|
||||
|
||||
**Available scenarios** (what kind of fake WiFi data to generate):
|
||||
|
||||
| # | Scenario | Description |
|
||||
|---|----------|-------------|
|
||||
| 0 | Empty room | Baseline with just noise |
|
||||
| 1 | Static person | Someone standing still |
|
||||
| 2 | Walking | Someone walking across the room |
|
||||
| 3 | Fall | Someone falling down |
|
||||
| 4 | Multiple people | Two people in the room |
|
||||
| 5 | Channel sweep | Cycling through WiFi channels |
|
||||
| 6 | MAC filter | Testing device filtering |
|
||||
| 7 | Ring overflow | Stress test with burst of data |
|
||||
| 8 | RSSI sweep | Signal strength from weak to strong |
|
||||
| 9 | Zero-length | Edge case: empty data packet |
|
||||
|
||||
**Topology options:**
|
||||
|
||||
| Topology | Shape | When to use |
|
||||
|----------|-------|-------------|
|
||||
| `star` | All sensors connect to one coordinator | Most common setup |
|
||||
| `mesh` | Every node can talk to every other | Testing fully connected networks |
|
||||
| `line` | Nodes in a chain (A → B → C → D) | Testing relay/forwarding |
|
||||
| `ring` | Chain with ends connected | Testing circular routing |
|
||||
|
||||
Run your custom config:
|
||||
|
||||
```bash
|
||||
python3 scripts/qemu_swarm.py --config my_test.yaml
|
||||
```
|
||||
|
||||
### Debugging Firmware in QEMU
|
||||
|
||||
If something goes wrong, you can attach a debugger to the emulated ESP32:
|
||||
|
||||
```bash
|
||||
# Terminal 1: Start QEMU with debug support (paused at boot)
|
||||
qemu-system-xtensa -machine esp32s3 -nographic \
|
||||
-drive file=firmware/esp32-csi-node/build/qemu_flash.bin,if=mtd,format=raw \
|
||||
-s -S
|
||||
|
||||
# Terminal 2: Connect the debugger
|
||||
xtensa-esp-elf-gdb firmware/esp32-csi-node/build/esp32-csi-node.elf \
|
||||
-ex "target remote :1234" \
|
||||
-ex "break app_main" \
|
||||
-ex "continue"
|
||||
```
|
||||
|
||||
Or use VS Code: open the project, press **F5**, and select **"QEMU ESP32-S3 Debug"**.
|
||||
|
||||
### Running the Full Test Suite
|
||||
|
||||
For thorough validation before submitting a pull request:
|
||||
|
||||
```bash
|
||||
# 1. Single-node test (2 minutes)
|
||||
bash scripts/qemu-esp32s3-test.sh
|
||||
|
||||
# 2. Multi-node swarm test (1 minute)
|
||||
python3 scripts/qemu_swarm.py --preset standard
|
||||
|
||||
# 3. Fuzz testing — finds edge-case crashes (1-5 minutes)
|
||||
cd firmware/esp32-csi-node/test
|
||||
make all CC=clang
|
||||
make run_serialize FUZZ_DURATION=60
|
||||
make run_edge FUZZ_DURATION=60
|
||||
make run_nvs FUZZ_DURATION=60
|
||||
|
||||
# 4. NVS configuration matrix — tests 14 config combinations
|
||||
python3 scripts/generate_nvs_matrix.py --output-dir build/nvs_matrix
|
||||
|
||||
# 5. Chaos testing — injects faults to test resilience (2 minutes)
|
||||
bash scripts/qemu-chaos-test.sh
|
||||
```
|
||||
|
||||
All of these also run automatically in CI when you push changes to `firmware/`.
|
||||
|
||||
---
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Docker: "no matching manifest for linux/arm64" on macOS
|
||||
|
|
@ -1015,6 +1306,47 @@ The server applies a 3-stage smoothing pipeline (ADR-048). If readings are still
|
|||
- Hard refresh with Ctrl+Shift+R to clear cached settings
|
||||
- The auto-detect probes `/health` on the same origin — cross-origin won't work
|
||||
|
||||
### QEMU: "qemu-system-xtensa: command not found"
|
||||
|
||||
QEMU for ESP32-S3 must be built from Espressif's fork — it is not in standard package managers:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/espressif/qemu.git
|
||||
cd qemu && ./configure --target-list=xtensa-softmmu --enable-slirp && make -j$(nproc)
|
||||
export QEMU_PATH=$(pwd)/build/qemu-system-xtensa
|
||||
```
|
||||
|
||||
Or point to an existing build: `QEMU_PATH=/path/to/qemu-system-xtensa bash scripts/qemu-esp32s3-test.sh`
|
||||
|
||||
### QEMU: Test times out with no output
|
||||
|
||||
The emulator is slower than real hardware. Increase the timeout:
|
||||
|
||||
```bash
|
||||
QEMU_TIMEOUT=120 bash scripts/qemu-esp32s3-test.sh
|
||||
```
|
||||
|
||||
If there's truly no output at all, the firmware build may have failed. Rebuild without `SKIP_BUILD`:
|
||||
|
||||
```bash
|
||||
bash scripts/qemu-esp32s3-test.sh # without SKIP_BUILD
|
||||
```
|
||||
|
||||
### QEMU: "esptool not found"
|
||||
|
||||
Install it with pip: `pip install esptool`
|
||||
|
||||
### QEMU Swarm: "Must be run as root"
|
||||
|
||||
Multi-node swarm tests with virtual network bridges require root on Linux. Two options:
|
||||
|
||||
1. Run with sudo: `sudo python3 scripts/qemu_swarm.py --preset standard`
|
||||
2. Run without root and skip the bridges: the tool automatically falls back to a simpler networking mode, but nodes can't communicate with each other (only with the aggregator)
|
||||
|
||||
### QEMU Swarm: "yaml module not found"
|
||||
|
||||
Install PyYAML: `pip install pyyaml`
|
||||
|
||||
---
|
||||
|
||||
## FAQ
|
||||
|
|
|
|||
|
|
@ -27,6 +27,9 @@
|
|||
#include "wasm_runtime.h"
|
||||
#include "wasm_upload.h"
|
||||
#include "display_task.h"
|
||||
#ifdef CONFIG_CSI_MOCK_ENABLED
|
||||
#include "mock_csi.h"
|
||||
#endif
|
||||
|
||||
#include "esp_timer.h"
|
||||
|
||||
|
|
@ -134,17 +137,35 @@ void app_main(void)
|
|||
|
||||
ESP_LOGI(TAG, "ESP32-S3 CSI Node (ADR-018) — Node ID: %d", g_nvs_config.node_id);
|
||||
|
||||
/* Initialize WiFi STA */
|
||||
/* Initialize WiFi STA (skip entirely under QEMU mock — no RF hardware) */
|
||||
#ifndef CONFIG_CSI_MOCK_SKIP_WIFI_CONNECT
|
||||
wifi_init_sta();
|
||||
#else
|
||||
ESP_LOGI(TAG, "Mock CSI mode: skipping WiFi init (CONFIG_CSI_MOCK_SKIP_WIFI_CONNECT)");
|
||||
#endif
|
||||
|
||||
/* Initialize UDP sender with runtime target */
|
||||
#ifdef CONFIG_CSI_MOCK_SKIP_WIFI_CONNECT
|
||||
ESP_LOGI(TAG, "Mock CSI mode: skipping UDP sender init (no network)");
|
||||
#else
|
||||
if (stream_sender_init_with(g_nvs_config.target_ip, g_nvs_config.target_port) != 0) {
|
||||
ESP_LOGE(TAG, "Failed to initialize UDP sender");
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Initialize CSI collection */
|
||||
#ifdef CONFIG_CSI_MOCK_ENABLED
|
||||
/* ADR-061: Start mock CSI generator (replaces real WiFi CSI in QEMU) */
|
||||
esp_err_t mock_ret = mock_csi_init(CONFIG_CSI_MOCK_SCENARIO);
|
||||
if (mock_ret != ESP_OK) {
|
||||
ESP_LOGE(TAG, "Mock CSI init failed: %s", esp_err_to_name(mock_ret));
|
||||
} else {
|
||||
ESP_LOGI(TAG, "Mock CSI active (scenario=%d)", CONFIG_CSI_MOCK_SCENARIO);
|
||||
}
|
||||
#else
|
||||
csi_collector_init();
|
||||
#endif
|
||||
|
||||
/* ADR-039: Initialize edge processing pipeline. */
|
||||
edge_config_t edge_cfg = {
|
||||
|
|
@ -162,12 +183,17 @@ void app_main(void)
|
|||
esp_err_to_name(edge_ret));
|
||||
}
|
||||
|
||||
/* Initialize OTA update HTTP server. */
|
||||
/* Initialize OTA update HTTP server (requires network). */
|
||||
httpd_handle_t ota_server = NULL;
|
||||
#ifndef CONFIG_CSI_MOCK_SKIP_WIFI_CONNECT
|
||||
esp_err_t ota_ret = ota_update_init_ex(&ota_server);
|
||||
if (ota_ret != ESP_OK) {
|
||||
ESP_LOGW(TAG, "OTA server init failed: %s", esp_err_to_name(ota_ret));
|
||||
}
|
||||
#else
|
||||
esp_err_t ota_ret = ESP_ERR_NOT_SUPPORTED;
|
||||
ESP_LOGI(TAG, "Mock CSI mode: skipping OTA server (no network)");
|
||||
#endif
|
||||
|
||||
/* ADR-040: Initialize WASM programmable sensing runtime. */
|
||||
esp_err_t wasm_ret = wasm_runtime_init();
|
||||
|
|
@ -205,10 +231,12 @@ void app_main(void)
|
|||
power_mgmt_init(g_nvs_config.power_duty);
|
||||
|
||||
/* ADR-045: Start AMOLED display task (gracefully skips if no display). */
|
||||
#ifdef CONFIG_DISPLAY_ENABLE
|
||||
esp_err_t disp_ret = display_task_start();
|
||||
if (disp_ret != ESP_OK) {
|
||||
ESP_LOGW(TAG, "Display init returned: %s", esp_err_to_name(disp_ret));
|
||||
}
|
||||
#endif
|
||||
|
||||
ESP_LOGI(TAG, "CSI streaming active → %s:%d (edge_tier=%u, OTA=%s, WASM=%s)",
|
||||
g_nvs_config.target_ip, g_nvs_config.target_port,
|
||||
|
|
|
|||
|
|
@ -15,6 +15,8 @@
|
|||
* to nothing on production builds.
|
||||
*/
|
||||
|
||||
#include "sdkconfig.h"
|
||||
|
||||
#ifdef CONFIG_CSI_MOCK_ENABLED
|
||||
|
||||
#include "mock_csi.h"
|
||||
|
|
@ -80,7 +82,7 @@ static const char *TAG = "mock_csi";
|
|||
|
||||
/** Pi constant. */
|
||||
#ifndef M_PI
|
||||
#define M_PI 3.14159265358979323846f
|
||||
#define M_PI 3.14159265358979323846
|
||||
#endif
|
||||
|
||||
/* ---- Channel sweep table ---- */
|
||||
|
|
@ -94,14 +96,14 @@ static const uint8_t s_sweep_channels[] = {1, 6, 11, 36};
|
|||
static const uint8_t s_good_mac[6] = {0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF};
|
||||
|
||||
/** "Wrong" MAC that should be rejected by the filter. */
|
||||
static const uint8_t s_bad_mac[6] = {0x11, 0x22, 0x33, 0x44, 0x55, 0x66};
|
||||
static const uint8_t s_bad_mac[6] __attribute__((unused)) = {0x11, 0x22, 0x33, 0x44, 0x55, 0x66};
|
||||
|
||||
/* ---- LFSR pseudo-random number generator ---- */
|
||||
|
||||
/**
|
||||
* 32-bit Galois LFSR for deterministic pseudo-random noise.
|
||||
* Avoids stdlib rand() which may not be available on ESP32 bare-metal.
|
||||
* Taps: bits 32, 22, 2, 1 (maximal-length polynomial).
|
||||
* Taps: bits 32, 31, 29, 1 (Galois LFSR polynomial 0xD0000001).
|
||||
*/
|
||||
static uint32_t s_lfsr = 0xDEADBEEF;
|
||||
|
||||
|
|
@ -110,7 +112,7 @@ static uint32_t lfsr_next(void)
|
|||
uint32_t lsb = s_lfsr & 1u;
|
||||
s_lfsr >>= 1;
|
||||
if (lsb) {
|
||||
s_lfsr ^= 0xD0000001u; /* x^32 + x^22 + x^2 + x^1 */
|
||||
s_lfsr ^= 0xD0000001u; /* x^32 + x^31 + x^29 + x^1 */
|
||||
}
|
||||
return s_lfsr;
|
||||
}
|
||||
|
|
@ -121,8 +123,8 @@ static uint32_t lfsr_next(void)
|
|||
static float lfsr_float(void)
|
||||
{
|
||||
uint32_t r = lfsr_next();
|
||||
/* Map [0, UINT32_MAX] to [-1.0, +1.0] */
|
||||
return ((float)(r & 0xFFFF) / 32767.5f) - 1.0f;
|
||||
/* Map [0, 65535] to [-1.0, +1.0): divide by 32768 (2^15), then subtract 1 */
|
||||
return ((float)(r & 0xFFFF) / 32768.0f) - 1.0f;
|
||||
}
|
||||
|
||||
/* ---- Module state ---- */
|
||||
|
|
@ -130,6 +132,12 @@ static float lfsr_float(void)
|
|||
static mock_state_t s_state;
|
||||
static esp_timer_handle_t s_timer = NULL;
|
||||
|
||||
/** Tracks whether the MAC filter has been set up in gen_mac_filter. */
|
||||
static bool s_mac_filter_initialized = false;
|
||||
|
||||
/** Tracks whether the overflow burst has fired in gen_ring_overflow. */
|
||||
static bool s_overflow_burst_done = false;
|
||||
|
||||
/* External NVS config (for MAC filter scenario). */
|
||||
extern nvs_config_t g_nvs_config;
|
||||
|
||||
|
|
@ -157,9 +165,9 @@ static float channel_to_lambda(uint8_t channel)
|
|||
|
||||
/* ---- Helper: elapsed ms since scenario start ---- */
|
||||
|
||||
static uint32_t scenario_elapsed_ms(void)
|
||||
static int64_t scenario_elapsed_ms(void)
|
||||
{
|
||||
uint32_t now = (uint32_t)(esp_timer_get_time() / 1000);
|
||||
int64_t now = esp_timer_get_time() / 1000;
|
||||
return now - s_state.scenario_start_ms;
|
||||
}
|
||||
|
||||
|
|
@ -277,7 +285,7 @@ static void gen_walking(uint8_t *iq_buf, uint8_t *channel, int8_t *rssi)
|
|||
*/
|
||||
static void gen_fall(uint8_t *iq_buf, uint8_t *channel, int8_t *rssi)
|
||||
{
|
||||
uint32_t elapsed = scenario_elapsed_ms();
|
||||
int64_t elapsed = scenario_elapsed_ms();
|
||||
uint32_t duration = CONFIG_CSI_MOCK_SCENARIO_DURATION_MS;
|
||||
|
||||
/* Fall occurs at 70% of scenario duration. */
|
||||
|
|
@ -402,11 +410,11 @@ static void gen_channel_sweep(uint8_t *iq_buf, uint8_t *channel, int8_t *rssi)
|
|||
static void gen_mac_filter(uint8_t *iq_buf, uint8_t *channel, int8_t *rssi,
|
||||
bool *skip_inject)
|
||||
{
|
||||
/* Set up the filter MAC to match s_good_mac on first frame. */
|
||||
if (s_state.frame_count == 0 ||
|
||||
(s_state.frame_count == s_state.scenario_start_ms)) {
|
||||
/* Set up the filter MAC to match s_good_mac on first frame of this scenario. */
|
||||
if (!s_mac_filter_initialized) {
|
||||
memcpy(g_nvs_config.filter_mac, s_good_mac, 6);
|
||||
g_nvs_config.filter_mac_set = 1;
|
||||
s_mac_filter_initialized = true;
|
||||
ESP_LOGI(TAG, "MAC filter scenario: filter set to %02X:%02X:%02X:%02X:%02X:%02X",
|
||||
s_good_mac[0], s_good_mac[1], s_good_mac[2],
|
||||
s_good_mac[3], s_good_mac[4], s_good_mac[5]);
|
||||
|
|
@ -438,10 +446,10 @@ static void gen_ring_overflow(uint8_t *iq_buf, uint8_t *channel, int8_t *rssi,
|
|||
*channel = 6;
|
||||
*rssi = -50;
|
||||
|
||||
/* Only burst on the first timer tick of this scenario. */
|
||||
uint32_t elapsed = scenario_elapsed_ms();
|
||||
if (elapsed < MOCK_CSI_INTERVAL_MS + 10) {
|
||||
/* Burst once on the first timer tick of this scenario. */
|
||||
if (!s_overflow_burst_done) {
|
||||
*burst_count = OVERFLOW_BURST_COUNT;
|
||||
s_overflow_burst_done = true;
|
||||
} else {
|
||||
*burst_count = 1;
|
||||
}
|
||||
|
|
@ -453,7 +461,7 @@ static void gen_ring_overflow(uint8_t *iq_buf, uint8_t *channel, int8_t *rssi,
|
|||
*/
|
||||
static void gen_boundary_rssi(uint8_t *iq_buf, uint8_t *channel, int8_t *rssi)
|
||||
{
|
||||
uint32_t elapsed = scenario_elapsed_ms();
|
||||
int64_t elapsed = scenario_elapsed_ms();
|
||||
uint32_t duration = CONFIG_CSI_MOCK_SCENARIO_DURATION_MS;
|
||||
|
||||
/* Linear sweep: -90 to -10 dBm. */
|
||||
|
|
@ -477,17 +485,21 @@ static void gen_boundary_rssi(uint8_t *iq_buf, uint8_t *channel, int8_t *rssi)
|
|||
/**
|
||||
* Advance to the next scenario when running SCENARIO_ALL.
|
||||
*/
|
||||
/** Flag: set when all scenarios are done so timer callback exits early. */
|
||||
static bool s_all_done = false;
|
||||
|
||||
static void advance_scenario(void)
|
||||
{
|
||||
s_state.all_idx++;
|
||||
if (s_state.all_idx >= MOCK_SCENARIO_COUNT) {
|
||||
ESP_LOGI(TAG, "All %d scenarios complete (%lu total frames)",
|
||||
MOCK_SCENARIO_COUNT, (unsigned long)s_state.frame_count);
|
||||
s_state.all_idx = 0; /* Loop. */
|
||||
s_all_done = true;
|
||||
return; /* Stop generating — timer callback will check s_all_done. */
|
||||
}
|
||||
|
||||
s_state.scenario = s_state.all_idx;
|
||||
s_state.scenario_start_ms = (uint32_t)(esp_timer_get_time() / 1000);
|
||||
s_state.scenario_start_ms = esp_timer_get_time() / 1000;
|
||||
|
||||
/* Reset per-scenario state. */
|
||||
s_state.person_x = 1.0f;
|
||||
|
|
@ -507,11 +519,16 @@ static void mock_timer_cb(void *arg)
|
|||
{
|
||||
(void)arg;
|
||||
|
||||
/* All scenarios finished — stop generating. */
|
||||
if (s_all_done) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* Check for scenario timeout in SCENARIO_ALL mode. */
|
||||
if (s_state.scenario == MOCK_SCENARIO_ALL ||
|
||||
(s_state.all_idx > 0 && s_state.all_idx < MOCK_SCENARIO_COUNT)) {
|
||||
/* We're running in sequential mode. */
|
||||
uint32_t elapsed = scenario_elapsed_ms();
|
||||
int64_t elapsed = scenario_elapsed_ms();
|
||||
if (elapsed >= CONFIG_CSI_MOCK_SCENARIO_DURATION_MS) {
|
||||
advance_scenario();
|
||||
}
|
||||
|
|
@ -609,7 +626,10 @@ esp_err_t mock_csi_init(uint8_t scenario)
|
|||
s_state.person_speed = WALK_SPEED_MS;
|
||||
s_state.person2_x = 4.0f;
|
||||
s_state.person2_speed = WALK_SPEED_MS * 0.6f;
|
||||
s_state.scenario_start_ms = (uint32_t)(esp_timer_get_time() / 1000);
|
||||
s_state.scenario_start_ms = esp_timer_get_time() / 1000;
|
||||
s_all_done = false;
|
||||
s_mac_filter_initialized = false;
|
||||
s_overflow_burst_done = false;
|
||||
|
||||
/* Reset LFSR to deterministic seed. */
|
||||
s_lfsr = 0xDEADBEEF;
|
||||
|
|
|
|||
|
|
@ -70,7 +70,7 @@ typedef struct {
|
|||
float person2_speed; /**< Second person movement speed. */
|
||||
uint8_t channel_idx; /**< Index into channel sweep table. */
|
||||
int8_t rssi_sweep; /**< Current RSSI for boundary sweep. */
|
||||
uint32_t scenario_start_ms; /**< Timestamp when current scenario started. */
|
||||
int64_t scenario_start_ms; /**< Timestamp when current scenario started. */
|
||||
uint8_t all_idx; /**< Current scenario index in SCENARIO_ALL mode. */
|
||||
} mock_state_t;
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,54 @@
|
|||
# sdkconfig.coverage -- ESP-IDF sdkconfig overlay for gcov/lcov code coverage
|
||||
#
|
||||
# This overlay enables GCC code coverage instrumentation (gcov) and the
|
||||
# application-level trace (apptrace) channel required to extract .gcda
|
||||
# files from the target via JTAG/QEMU GDB.
|
||||
#
|
||||
# Usage (combine with sdkconfig.defaults as the base):
|
||||
#
|
||||
# idf.py -D SDKCONFIG_DEFAULTS="sdkconfig.defaults;sdkconfig.coverage" build
|
||||
#
|
||||
# After running the firmware under QEMU, dump coverage data through GDB:
|
||||
#
|
||||
#   (gdb) mon esp gcov dump
|
||||
#
|
||||
# Then process the .gcda files on the host with lcov/genhtml:
|
||||
#
|
||||
# lcov --capture --directory build --output-file coverage.info \
|
||||
# --gcov-tool xtensa-esp-elf-gcov
|
||||
# genhtml coverage.info --output-directory coverage_html
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Compiler: disable optimizations so every source line maps 1:1 to object code
|
||||
# ---------------------------------------------------------------------------
|
||||
CONFIG_COMPILER_OPTIMIZATION_NONE=y
|
||||
|
||||
# ---------------------------------------------------------------------------
# Application-level trace: enables the gcov data channel over JTAG
# ---------------------------------------------------------------------------
CONFIG_APPTRACE_ENABLE=y
CONFIG_APPTRACE_DEST_JTAG=y
# Gcov-to-host support: without this option the apptrace channel is enabled
# but the firmware contains no gcov dump hook, so no .gcda data is produced.
CONFIG_APPTRACE_GCOV_ENABLE=y
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CSI mock mode: identical to sdkconfig.qemu so coverage runs use the same
|
||||
# deterministic mock data path (no real WiFi hardware needed)
|
||||
# ---------------------------------------------------------------------------
|
||||
CONFIG_CSI_MOCK_ENABLED=y
|
||||
CONFIG_CSI_MOCK_SKIP_WIFI_CONNECT=y
|
||||
CONFIG_CSI_MOCK_SCENARIO=255
|
||||
CONFIG_CSI_TARGET_IP="10.0.2.2"
|
||||
CONFIG_CSI_MOCK_SCENARIO_DURATION_MS=5000
|
||||
CONFIG_CSI_MOCK_LOG_FRAMES=y
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# FreeRTOS and watchdog: match sdkconfig.qemu for QEMU timing tolerance
|
||||
# ---------------------------------------------------------------------------
|
||||
CONFIG_FREERTOS_TIMER_TASK_STACK_DEPTH=4096
|
||||
CONFIG_ESP_TASK_WDT_TIMEOUT_S=30
|
||||
CONFIG_ESP_INT_WDT_TIMEOUT_MS=800
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Logging and display
|
||||
# ---------------------------------------------------------------------------
|
||||
CONFIG_LOG_DEFAULT_LEVEL_INFO=y
|
||||
CONFIG_DISPLAY_ENABLE=n
|
||||
|
|
@ -1,7 +1,27 @@
|
|||
# QEMU ESP32-S3 sdkconfig overlay (ADR-061)
|
||||
#
|
||||
# Merge with: idf.py -D SDKCONFIG_DEFAULTS="sdkconfig.defaults;sdkconfig.qemu" build
|
||||
|
||||
# ---- Mock CSI generator (replaces real WiFi CSI) ----
|
||||
CONFIG_CSI_MOCK_ENABLED=y
|
||||
CONFIG_CSI_MOCK_SKIP_WIFI_CONNECT=y
|
||||
CONFIG_CSI_MOCK_SCENARIO=255
|
||||
CONFIG_CSI_TARGET_IP="10.0.2.2"
|
||||
CONFIG_CSI_MOCK_SCENARIO_DURATION_MS=5000
|
||||
CONFIG_CSI_MOCK_LOG_FRAMES=y
|
||||
|
||||
# ---- Network (QEMU SLIRP provides 10.0.2.x) ----
|
||||
CONFIG_CSI_TARGET_IP="10.0.2.2"
|
||||
|
||||
# ---- Logging (verbose for validation) ----
|
||||
CONFIG_LOG_DEFAULT_LEVEL_INFO=y
|
||||
|
||||
# ---- FreeRTOS tuning for QEMU ----
|
||||
# Increase timer task stack to prevent overflow from mock_csi timer callback
|
||||
CONFIG_FREERTOS_TIMER_TASK_STACK_DEPTH=4096
|
||||
|
||||
# ---- Watchdog (relaxed for emulation — QEMU timing is not cycle-accurate) ----
|
||||
CONFIG_ESP_TASK_WDT_TIMEOUT_S=30
|
||||
CONFIG_ESP_INT_WDT_TIMEOUT_MS=800
|
||||
|
||||
# ---- Disable hardware-dependent features ----
|
||||
CONFIG_DISPLAY_ENABLE=n
|
||||
|
|
|
|||
|
|
@ -61,19 +61,19 @@ fuzz_nvs: fuzz_nvs_config.c $(STUBS_SRC)
|
|||
|
||||
# --- Run targets ---
|
||||
run_serialize: fuzz_serialize
|
||||
@mkdir -p corpus
|
||||
./fuzz_serialize corpus/ -max_total_time=$(FUZZ_DURATION) -max_len=2048 -jobs=$(FUZZ_JOBS)
|
||||
@mkdir -p corpus_serialize
|
||||
./fuzz_serialize corpus_serialize/ -max_total_time=$(FUZZ_DURATION) -max_len=2048 -jobs=$(FUZZ_JOBS)
|
||||
|
||||
run_edge: fuzz_edge
|
||||
@mkdir -p corpus
|
||||
./fuzz_edge corpus/ -max_total_time=$(FUZZ_DURATION) -max_len=4096 -jobs=$(FUZZ_JOBS)
|
||||
@mkdir -p corpus_edge
|
||||
./fuzz_edge corpus_edge/ -max_total_time=$(FUZZ_DURATION) -max_len=4096 -jobs=$(FUZZ_JOBS)
|
||||
|
||||
run_nvs: fuzz_nvs
|
||||
@mkdir -p corpus
|
||||
./fuzz_nvs corpus/ -max_total_time=$(FUZZ_DURATION) -max_len=256 -jobs=$(FUZZ_JOBS)
|
||||
@mkdir -p corpus_nvs
|
||||
./fuzz_nvs corpus_nvs/ -max_total_time=$(FUZZ_DURATION) -max_len=256 -jobs=$(FUZZ_JOBS)
|
||||
|
||||
run_all: run_serialize run_edge run_nvs
|
||||
|
||||
clean:
|
||||
rm -f fuzz_serialize fuzz_edge fuzz_nvs
|
||||
rm -rf corpus/
|
||||
rm -rf corpus_serialize/ corpus_edge/ corpus_nvs/
|
||||
|
|
|
|||
|
|
@ -0,0 +1,290 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
QEMU Post-Fault Health Checker — ADR-061 Layer 9
|
||||
|
||||
Reads a log segment captured after a fault injection and checks whether
|
||||
the firmware is still healthy. Used by qemu-chaos-test.sh after each
|
||||
fault in the chaos testing loop.
|
||||
|
||||
Health checks:
|
||||
1. No crash patterns (Guru Meditation, assert, panic, abort)
|
||||
2. No heap errors (OOM, heap corruption, alloc failure)
|
||||
3. No stack overflow (FreeRTOS stack overflow hook)
|
||||
4. Firmware still producing frames (CSI frame activity)
|
||||
|
||||
Exit codes:
|
||||
0 HEALTHY — all checks pass
|
||||
1 DEGRADED — no crash, but missing expected activity
|
||||
2 UNHEALTHY — crash, heap error, or stack overflow detected
|
||||
|
||||
Usage:
|
||||
python3 check_health.py --log /path/to/fault_segment.log --after-fault wifi_kill
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import re
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
|
||||
# ANSI colors
|
||||
USE_COLOR = sys.stdout.isatty()
|
||||
|
||||
|
||||
def color(text: str, code: str) -> str:
    """Wrap *text* in the ANSI SGR sequence *code*.

    The text is returned untouched when ``USE_COLOR`` is false, so piped or
    redirected output stays free of escape sequences.
    """
    return f"\033[{code}m{text}\033[0m" if USE_COLOR else text
|
||||
|
||||
|
||||
def green(t: str) -> str:
    """Return *t* rendered with SGR code 32 (green) via ``color``."""
    return color(text=t, code="32")
|
||||
|
||||
|
||||
def yellow(t: str) -> str:
    """Return *t* rendered with SGR code 33 (yellow) via ``color``."""
    return color(text=t, code="33")
|
||||
|
||||
|
||||
def red(t: str) -> str:
    """Return *t* rendered with SGR code 1;31 (bold red) via ``color``."""
    return color(text=t, code="1;31")
|
||||
|
||||
|
||||
@dataclass
class HealthCheck:
    """Outcome of a single health check run over a log segment."""

    # Short label shown in the report line, e.g. "No crash".
    name: str
    # True when the check found nothing wrong.
    passed: bool
    # Human-readable detail printed next to the label.
    message: str
    # 0=pass, 1=degraded, 2=unhealthy
    severity: int
|
||||
|
||||
|
||||
def check_no_crash(lines: List[str]) -> HealthCheck:
    """Scan *lines* for crash signatures (case-sensitive).

    Returns a severity-2 failure quoting the first offending line (truncated
    to 120 characters), or a passing result when none of the signatures occur.
    """
    # One alternation is equivalent to trying each pattern in turn, because
    # only "does any pattern match this line" matters.
    crash_regex = re.compile("|".join((
        r"Guru Meditation",
        r"assert failed",
        r"abort\(\)",
        r"panic",
        r"LoadProhibited",
        r"StoreProhibited",
        r"InstrFetchProhibited",
        r"IllegalInstruction",
        r"Unhandled debug exception",
        r"Fatal exception",
    )))

    offending = next(
        (entry for entry in lines if crash_regex.search(entry)), None
    )
    if offending is None:
        return HealthCheck(
            name="No crash",
            passed=True,
            message="No crash indicators found",
            severity=0,
        )

    return HealthCheck(
        name="No crash",
        passed=False,
        message=f"Crash detected: {offending.strip()[:120]}",
        severity=2,
    )
|
||||
|
||||
|
||||
def check_no_heap_errors(lines: List[str]) -> HealthCheck:
    """Scan *lines* for heap/memory error signatures (case-insensitive).

    Returns a severity-2 failure quoting the first matching line (truncated
    to 120 characters), or a passing result when nothing matches.
    """
    heap_regex = re.compile(
        "|".join((
            r"HEAP_ERROR",
            r"out of memory",
            r"heap_caps_alloc.*failed",
            r"malloc.*fail",
            r"heap corruption",
            r"CORRUPT HEAP",
            r"multi_heap",
            r"heap_lock",
        )),
        re.IGNORECASE,
    )

    offending = next(
        (entry for entry in lines if heap_regex.search(entry)), None
    )
    if offending is None:
        return HealthCheck(
            name="No heap errors",
            passed=True,
            message="No heap errors found",
            severity=0,
        )

    return HealthCheck(
        name="No heap errors",
        passed=False,
        message=f"Heap error: {offending.strip()[:120]}",
        severity=2,
    )
|
||||
|
||||
|
||||
def check_no_stack_overflow(lines: List[str]) -> HealthCheck:
    """Scan *lines* for stack-overflow signatures (case-sensitive).

    Returns a severity-2 failure quoting the first matching line (truncated
    to 120 characters), or a passing result when nothing matches.
    """
    stack_regex = re.compile("|".join((
        r"[Ss]tack overflow",
        r"stack_overflow",
        r"vApplicationStackOverflowHook",
        r"stack smashing",
    )))

    offending = next(
        (entry for entry in lines if stack_regex.search(entry)), None
    )
    if offending is None:
        return HealthCheck(
            name="No stack overflow",
            passed=True,
            message="No stack overflow detected",
            severity=0,
        )

    return HealthCheck(
        name="No stack overflow",
        passed=False,
        message=f"Stack overflow: {offending.strip()[:120]}",
        severity=2,
    )
|
||||
|
||||
|
||||
def check_frame_activity(lines: List[str]) -> HealthCheck:
    """Count log lines that mention frame/CSI-related keywords.

    Matching is case-insensitive and each line counts at most once. No
    matching line at all yields a severity-1 (degraded) result rather than a
    hard failure.
    """
    activity_regex = re.compile(
        "|".join((
            r"frame",
            r"CSI",
            r"mock_csi",
            r"iq_data",
            r"subcarrier",
            r"csi_collector",
            r"enqueue",
            r"presence",
            r"vitals",
            r"breathing",
        )),
        re.IGNORECASE,
    )

    hits = sum(1 for entry in lines if activity_regex.search(entry))

    if not hits:
        return HealthCheck(
            name="Frame activity",
            passed=False,
            message="No frame/CSI activity detected after fault",
            severity=1,  # Degraded, not fatal
        )

    return HealthCheck(
        name="Frame activity",
        passed=True,
        message=f"Firmware producing output ({hits} activity lines)",
        severity=0,
    )
|
||||
|
||||
|
||||
def run_health_checks(
    log_path: Path,
    fault_name: str,
    tail_lines: int = 200,
) -> int:
    """Run all health checks on the tail of a log file and print a report.

    Args:
        log_path: Log file (or log segment) captured after the fault.
        fault_name: Name of the injected fault; used for reporting only.
        tail_lines: Number of trailing lines to analyze. A value of zero or
            less analyzes the whole file.

    Returns:
        0 = healthy, 1 = degraded, 2 = unhealthy. A missing log yields 2 and
        an empty log yields 1.
    """
    # is_file() also rejects directories, which would otherwise pass an
    # exists() check and then crash inside read_text().
    if not log_path.is_file():
        print(f" ERROR: Log file not found: {log_path}", file=sys.stderr)
        return 2

    all_lines = log_path.read_text(encoding="utf-8", errors="replace").splitlines()

    # Use last N lines (most recent, after fault injection). Slicing already
    # copes with N larger than the file; non-positive N must be special-cased
    # because a negative N would otherwise drop the *first* |N| lines.
    lines = all_lines[-tail_lines:] if tail_lines > 0 else all_lines

    if not lines:
        print(" WARNING: Log file is empty (fault may have killed output)")
        # Empty log after fault is degraded, not necessarily unhealthy
        return 1

    print(f" Health check after fault: {fault_name}")
    print(f" Log lines analyzed: {len(lines)} (of {len(all_lines)} total)")
    print()

    checks = [
        check_no_crash(lines),
        check_no_heap_errors(lines),
        check_no_stack_overflow(lines),
        check_frame_activity(lines),
    ]

    max_severity = 0
    for check in checks:
        if check.passed:
            icon = green("PASS")
        elif check.severity == 1:
            icon = yellow("WARN")
        else:
            icon = red("FAIL")

        print(f" [{icon}] {check.name}: {check.message}")
        max_severity = max(max_severity, check.severity)

    print()

    # Summary: the worst severity seen decides the verdict and the exit code.
    passed = sum(1 for c in checks if c.passed)
    total = len(checks)

    if max_severity == 0:
        verdict = green("HEALTHY")
    elif max_severity == 1:
        verdict = yellow("DEGRADED")
    else:
        verdict = red("UNHEALTHY")
    print(f" {verdict} — {passed}/{total} checks passed")

    return max_severity
|
||||
|
||||
|
||||
def main():
    """Parse command-line arguments, run the health checks and exit.

    The process exit status is the value returned by ``run_health_checks``
    (0 = healthy, 1 = degraded, 2 = unhealthy).
    """
    parser = argparse.ArgumentParser(
        description="QEMU Post-Fault Health Checker — ADR-061 Layer 9",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # The example mirrors the report format this script actually prints.
        epilog=(
            "Example output:\n"
            " [PASS] No crash: No crash indicators found\n"
            " [PASS] No heap errors: No heap errors found\n"
            " [PASS] No stack overflow: No stack overflow detected\n"
            " [WARN] Frame activity: No frame/CSI activity detected after fault\n"
            "\n"
            " DEGRADED — 3/4 checks passed\n"
            "\n"
            "Exit codes: 0 = healthy, 1 = degraded, 2 = unhealthy"
        ),
    )
    parser.add_argument(
        "--log", required=True,
        help="Path to the log file (or log segment) to check",
    )
    parser.add_argument(
        "--after-fault", required=True,
        help="Name of the fault that was injected (for reporting)",
    )
    parser.add_argument(
        "--tail", type=int, default=200,
        help="Number of lines from end of log to analyze (default: 200)",
    )
    args = parser.parse_args()

    exit_code = run_health_checks(
        log_path=Path(args.log),
        fault_name=args.after_fault,
        tail_lines=args.tail,
    )
    sys.exit(exit_code)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -131,7 +131,7 @@ def define_configs() -> List[NvsConfig]:
|
|||
NvsEntry("edge_tier", "data", "u8", "2"),
|
||||
NvsEntry("pres_thresh", "data", "u16", "100"),
|
||||
NvsEntry("fall_thresh", "data", "u16", "3000"),
|
||||
NvsEntry("vital_win", "data", "u16", "512"),
|
||||
NvsEntry("vital_win", "data", "u16", "256"),
|
||||
NvsEntry("vital_int", "data", "u16", "500"),
|
||||
NvsEntry("subk_count", "data", "u8", "16"),
|
||||
],
|
||||
|
|
@ -160,6 +160,10 @@ def define_configs() -> List[NvsConfig]:
|
|||
NvsEntry("password", "data", "string", "testpass123"),
|
||||
NvsEntry("target_ip", "data", "string", "10.0.2.2"),
|
||||
NvsEntry("edge_tier", "data", "u8", "2"),
|
||||
# wasm_verify=1 + a 32-byte dummy Ed25519 pubkey
|
||||
NvsEntry("wasm_verify", "data", "u8", "1"),
|
||||
NvsEntry("wasm_pubkey", "data", "hex2bin",
|
||||
"0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"),
|
||||
],
|
||||
))
|
||||
|
||||
|
|
@ -172,6 +176,8 @@ def define_configs() -> List[NvsConfig]:
|
|||
NvsEntry("password", "data", "string", "testpass123"),
|
||||
NvsEntry("target_ip", "data", "string", "10.0.2.2"),
|
||||
NvsEntry("edge_tier", "data", "u8", "2"),
|
||||
NvsEntry("wasm_verify", "data", "u8", "0"),
|
||||
NvsEntry("wasm_max", "data", "u8", "2"),
|
||||
],
|
||||
))
|
||||
|
||||
|
|
@ -187,10 +193,12 @@ def define_configs() -> List[NvsConfig]:
|
|||
],
|
||||
))
|
||||
|
||||
# 11. boundary-max - maximum values for all numeric fields
|
||||
# 11. boundary-max - maximum VALID values for all numeric fields
|
||||
# Uses firmware-validated max ranges (not raw u8/u16 max):
|
||||
# vital_win: 32-256, top_k: 1-32, power_duty: 10-100
|
||||
configs.append(NvsConfig(
|
||||
name="boundary-max",
|
||||
description="Boundary test: maximum values for all numeric NVS fields",
|
||||
description="Boundary test: maximum valid values per firmware validation ranges",
|
||||
entries=[
|
||||
NvsEntry("ssid", "data", "string", "TestNetwork"),
|
||||
NvsEntry("password", "data", "string", "testpass123"),
|
||||
|
|
@ -200,16 +208,17 @@ def define_configs() -> List[NvsConfig]:
|
|||
NvsEntry("edge_tier", "data", "u8", "2"),
|
||||
NvsEntry("pres_thresh", "data", "u16", "65535"),
|
||||
NvsEntry("fall_thresh", "data", "u16", "65535"),
|
||||
NvsEntry("vital_win", "data", "u16", "65535"),
|
||||
NvsEntry("vital_win", "data", "u16", "256"), # max validated
|
||||
NvsEntry("vital_int", "data", "u16", "10000"),
|
||||
NvsEntry("subk_count", "data", "u8", "32"),
|
||||
NvsEntry("power_duty", "data", "u8", "100"),
|
||||
],
|
||||
))
|
||||
|
||||
# 12. boundary-min - minimum values for all numeric fields
|
||||
# 12. boundary-min - minimum VALID values for all numeric fields
|
||||
configs.append(NvsConfig(
|
||||
name="boundary-min",
|
||||
description="Boundary test: minimum values for all numeric NVS fields",
|
||||
description="Boundary test: minimum valid values per firmware validation ranges",
|
||||
entries=[
|
||||
NvsEntry("ssid", "data", "string", "TestNetwork"),
|
||||
NvsEntry("password", "data", "string", "testpass123"),
|
||||
|
|
@ -218,10 +227,11 @@ def define_configs() -> List[NvsConfig]:
|
|||
NvsEntry("node_id", "data", "u8", "0"),
|
||||
NvsEntry("edge_tier", "data", "u8", "0"),
|
||||
NvsEntry("pres_thresh", "data", "u16", "1"),
|
||||
NvsEntry("fall_thresh", "data", "u16", "1"),
|
||||
NvsEntry("vital_win", "data", "u16", "1"),
|
||||
NvsEntry("fall_thresh", "data", "u16", "100"), # min valid (0.1 rad/s²)
|
||||
NvsEntry("vital_win", "data", "u16", "32"), # min validated
|
||||
NvsEntry("vital_int", "data", "u16", "100"),
|
||||
NvsEntry("subk_count", "data", "u8", "1"),
|
||||
NvsEntry("power_duty", "data", "u8", "10"),
|
||||
],
|
||||
))
|
||||
|
||||
|
|
@ -234,6 +244,7 @@ def define_configs() -> List[NvsConfig]:
|
|||
NvsEntry("password", "data", "string", "testpass123"),
|
||||
NvsEntry("target_ip", "data", "string", "10.0.2.2"),
|
||||
NvsEntry("edge_tier", "data", "u8", "1"),
|
||||
NvsEntry("power_duty", "data", "u8", "10"),
|
||||
],
|
||||
))
|
||||
|
||||
|
|
@ -303,15 +314,24 @@ def generate_nvs_binary(csv_content: str, size: int) -> bytes:
|
|||
return f.read()
|
||||
|
||||
# Last resort: try as a module
|
||||
subprocess.check_call([
|
||||
sys.executable, "-m", "nvs_partition_gen", "generate",
|
||||
csv_path, bin_path, hex(size)
|
||||
])
|
||||
with open(bin_path, "rb") as f:
|
||||
return f.read()
|
||||
try:
|
||||
subprocess.check_call([
|
||||
sys.executable, "-m", "nvs_partition_gen", "generate",
|
||||
csv_path, bin_path, hex(size)
|
||||
])
|
||||
with open(bin_path, "rb") as f:
|
||||
return f.read()
|
||||
except (subprocess.CalledProcessError, FileNotFoundError):
|
||||
print("ERROR: NVS partition generator tool not found.", file=sys.stderr)
|
||||
print("Install: pip install esp-idf-nvs-partition-gen", file=sys.stderr)
|
||||
print("Or set IDF_PATH to your ESP-IDF installation", file=sys.stderr)
|
||||
raise RuntimeError(
|
||||
"NVS partition generator not available. "
|
||||
"Install: pip install esp-idf-nvs-partition-gen"
|
||||
)
|
||||
|
||||
finally:
|
||||
for p in (csv_path, bin_path):
|
||||
for p in set((csv_path, bin_path)): # deduplicate in case paths are identical
|
||||
if os.path.isfile(p):
|
||||
os.unlink(p)
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,258 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
QEMU Fault Injector — ADR-061 Layer 9
|
||||
|
||||
Connects to a QEMU monitor socket and injects a specified fault type.
|
||||
Used by qemu-chaos-test.sh to stress-test firmware resilience.
|
||||
|
||||
Supported faults:
|
||||
wifi_kill - Pause/resume VM (simulates WiFi reconnect)
|
||||
ring_flood - Send 1000 rapid commands to stress ring buffer
|
||||
heap_exhaust - Write to heap metadata region to simulate OOM
|
||||
timer_starvation - Pause VM for 500ms to starve FreeRTOS timers
|
||||
corrupt_frame - Write bad magic bytes to CSI frame buffer area
|
||||
nvs_corrupt - Write garbage to NVS flash region (offset 0x9000)
|
||||
|
||||
Usage:
|
||||
python3 inject_fault.py --socket /path/to/qemu.sock --fault wifi_kill
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import random
|
||||
import socket
|
||||
import sys
|
||||
import time
|
||||
|
||||
|
||||
# Timeout for each monitor command (seconds)
|
||||
CMD_TIMEOUT = 5.0
|
||||
|
||||
# QEMU monitor response buffer size
|
||||
RECV_BUFSIZE = 4096
|
||||
|
||||
|
||||
def connect_monitor(sock_path: str, timeout: float = CMD_TIMEOUT) -> socket.socket:
    """Connect to the QEMU monitor Unix domain socket.

    Args:
        sock_path: Filesystem path of the monitor socket.
        timeout: Socket timeout in seconds, used for the connect and for the
            initial banner read.

    Returns:
        The connected socket with the initial banner already consumed.

    Exits the process with status 2 when the connection cannot be made. A
    missing or late banner only produces a warning on stderr.
    """
    s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    s.settimeout(timeout)
    try:
        s.connect(sock_path)
    except OSError as e:  # socket.error is OSError; covers FileNotFoundError
        # Release the descriptor before bailing out so it is not leaked if a
        # caller intercepts SystemExit.
        s.close()
        print(f"ERROR: Cannot connect to QEMU monitor at {sock_path}: {e}",
              file=sys.stderr)
        sys.exit(2)

    # Read (and discard) the initial QEMU monitor banner/prompt.
    try:
        banner = s.recv(RECV_BUFSIZE)
        if not banner:
            print(f"WARNING: Connected to {sock_path} but received no banner data. "
                  f"QEMU monitor may not be ready.", file=sys.stderr)
    except socket.timeout:
        print(f"WARNING: Connected to {sock_path} but timed out waiting for banner "
              f"after {timeout}s. QEMU monitor may be unresponsive.", file=sys.stderr)

    return s
|
||||
|
||||
|
||||
def send_cmd(s: socket.socket, cmd: str, timeout: float = CMD_TIMEOUT) -> str:
    """Send a command to the QEMU monitor and return the response.

    Args:
        s: Connected monitor socket.
        cmd: Monitor command without the trailing newline.
        timeout: Per-call socket timeout in seconds.

    Returns:
        Everything received up to and including the chunk that completes the
        ``(qemu)`` prompt, decoded as UTF-8 with replacement. Returns whatever
        arrived before a timeout or EOF, and ``""`` when the command could
        not be sent because the connection was lost.
    """
    s.settimeout(timeout)
    try:
        s.sendall((cmd + "\n").encode("utf-8"))
    except (BrokenPipeError, ConnectionResetError) as e:
        print(f"ERROR: Lost connection to QEMU monitor: {e}", file=sys.stderr)
        return ""

    # Read response (may be multi-line). Accumulate raw bytes and decode once
    # so a multibyte character split across two recv() calls is not mangled.
    prompt = b"(qemu)"
    received = bytearray()
    try:
        while True:
            chunk = s.recv(RECV_BUFSIZE)
            if not chunk:
                break
            received += chunk
            # Look at the new chunk plus just enough preceding bytes to catch
            # a prompt that straddles the boundary between two chunks.
            window = len(chunk) + len(prompt) - 1
            if prompt in received[-window:]:
                break
    except socket.timeout:
        pass  # Response may not have a clean prompt

    return received.decode("utf-8", errors="replace")
|
||||
|
||||
|
||||
def fault_wifi_kill(s: socket.socket) -> None:
    """Stop the VM, wait two seconds, then continue it.

    Used as a stand-in for a WiFi disconnect/reconnect cycle.
    """
    pause_seconds = 2.0

    print("[wifi_kill] Pausing VM...")
    send_cmd(s, "stop")

    time.sleep(pause_seconds)

    print("[wifi_kill] Resuming VM...")
    send_cmd(s, "cont")

    print("[wifi_kill] Injected: 2s pause/resume cycle")
|
||||
|
||||
|
||||
def fault_ring_flood(s: socket.socket) -> None:
    """Fire 1000 back-to-back ``nmi`` monitor commands, then drain replies.

    The commands are written without waiting for individual responses. The
    burst stops early if the connection is lost. Afterwards any queued
    monitor output is read and discarded until the socket has been quiet
    for one second or the peer closes it.

    NOTE(review): the intent appears to be stressing the firmware's frame
    ring buffer via the interrupt path — confirm against the firmware.
    """
    print("[ring_flood] Sending 1000 rapid commands...")

    total = 1000
    delivered = 0
    while delivered < total:
        try:
            # Use 'nmi' to trigger interrupt handler (mock CSI frame path)
            s.sendall(b"nmi\n")
        except (BrokenPipeError, ConnectionResetError):
            print(f"[ring_flood] Connection lost after {delivered} commands")
            break
        delivered += 1

    # Drain any accumulated responses
    s.settimeout(1.0)
    try:
        while s.recv(RECV_BUFSIZE):
            pass
    except socket.timeout:
        pass

    print(f"[ring_flood] Injected: {delivered}/1000 rapid NMI triggers")
|
||||
|
||||
|
||||
def fault_heap_exhaust(s: socket.socket, flash_path: str = None) -> None:
    """Probe the heap region, then pause the VM for three seconds.

    No memory is written: the function only reads four words at the heap
    base through the monitor and prints them, then stops and continues the
    VM. Writing to the heap would need a GDB stub (-gdb tcp::1234), which the
    printed warning points out. ``flash_path`` is accepted but unused.
    """
    heap_base = 0x3FC88000
    probe_cmd = "xp /4xw 0x{:08x}".format(heap_base)

    print("[heap_exhaust] Probing heap region...")
    header = send_cmd(s, probe_cmd)
    print(f"[heap_exhaust] Heap header: {header.strip()}")

    # Pause VM to stress memory management
    print("[heap_exhaust] Pausing VM for 3s to stress heap management...")
    send_cmd(s, "stop")
    time.sleep(3.0)
    send_cmd(s, "cont")

    for notice in (
        "[heap_exhaust] WARNING: Actual heap corruption requires GDB stub (-gdb tcp::1234)",
        "[heap_exhaust] Injected: 3s VM pause (simulates memory pressure)",
    ):
        print(notice)
|
||||
|
||||
|
||||
def fault_timer_starvation(s: socket.socket) -> None:
    """Stop the VM for 500 ms and continue it, withholding execution time."""
    pause_seconds = 0.5

    print("[timer_starvation] Pausing VM for 500ms...")
    send_cmd(s, "stop")

    time.sleep(pause_seconds)

    send_cmd(s, "cont")
    print("[timer_starvation] Injected: 500ms execution pause")
|
||||
|
||||
|
||||
def fault_corrupt_frame(s: socket.socket, flash_path: str = None) -> None:
    """Probe the frame buffer address, then pause the VM for one second.

    No memory is written: the function reads four bytes at the frame buffer
    address through the monitor and prints them, then stops and continues the
    VM. Writing to the buffer would need a GDB stub (-gdb tcp::1234), which
    the printed warning points out. ``flash_path`` is accepted but unused.
    """
    frame_buf_addr = 0x3FCA0000
    probe_cmd = "xp /4xb 0x{:08x}".format(frame_buf_addr)

    print(f"[corrupt_frame] Probing frame buffer at 0x{frame_buf_addr:08X}...")
    contents = send_cmd(s, probe_cmd)
    print(f"[corrupt_frame] Frame buffer: {contents.strip()}")

    # Pause VM briefly to disrupt frame processing timing
    print("[corrupt_frame] Pausing VM for 1s to disrupt frame processing...")
    send_cmd(s, "stop")
    time.sleep(1.0)
    send_cmd(s, "cont")

    for notice in (
        "[corrupt_frame] WARNING: Actual frame corruption requires GDB stub (-gdb tcp::1234)",
        "[corrupt_frame] Injected: 1s VM pause during frame processing",
    ):
        print(notice)
|
||||
|
||||
|
||||
def fault_nvs_corrupt(s: socket.socket, flash_path: str = None) -> None:
    """Write garbage to the NVS flash region on disk.

    Args:
        s: Connected monitor socket; only used for the read-only fallback.
        flash_path: Path to the flash image file. When it names an existing
            file, 16 random bytes are written at offset 0x9000 of that file.

    Without a usable flash image nothing is corrupted: the function performs
    a read-only probe via the monitor and prints a warning that says whether
    the path was missing or simply did not point at a file.
    """
    if flash_path and os.path.isfile(flash_path):
        nvs_offset = 0x9000
        garbage = bytes(random.randint(0, 255) for _ in range(16))
        with open(flash_path, "r+b") as f:
            f.seek(nvs_offset)
            f.write(garbage)
        print(f"[nvs_corrupt] Wrote 16 garbage bytes at flash offset 0x{nvs_offset:X}")
        print(f"[nvs_corrupt] Flash image: {flash_path}")
        return

    # Fallback: attempt via monitor (read-only probe)
    resp = send_cmd(s, "xp /8xb 0x3C009000")
    print(f"[nvs_corrupt] NVS region (read-only probe): {resp.strip()}")
    if flash_path:
        # A path was given but is unusable; say so instead of claiming the
        # option was omitted.
        print(f"[nvs_corrupt] WARNING: Flash image not found: {flash_path}; "
              f"NVS corruption was NOT injected")
    else:
        print("[nvs_corrupt] WARNING: No --flash path provided; NVS corruption was NOT injected")
    print("[nvs_corrupt] Pass --flash /path/to/flash.bin for actual corruption")
|
||||
|
||||
|
||||
# Map fault names to injection functions
|
||||
FAULT_MAP = {
|
||||
"wifi_kill": fault_wifi_kill,
|
||||
"ring_flood": fault_ring_flood,
|
||||
"heap_exhaust": fault_heap_exhaust,
|
||||
"timer_starvation": fault_timer_starvation,
|
||||
"corrupt_frame": fault_corrupt_frame,
|
||||
"nvs_corrupt": fault_nvs_corrupt,
|
||||
}
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: connect to the monitor and inject one fault.

    Exits with status 1 when the fault function raises. The monitor socket is
    closed on both the success and the failure path.
    """
    parser = argparse.ArgumentParser(
        description="QEMU Fault Injector — ADR-061 Layer 9",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--socket",
        required=True,
        help="Path to QEMU monitor Unix domain socket",
    )
    parser.add_argument(
        "--fault",
        required=True,
        choices=list(FAULT_MAP.keys()),
        help="Fault type to inject",
    )
    parser.add_argument(
        "--timeout",
        type=float,
        default=CMD_TIMEOUT,
        help=f"Per-command timeout in seconds (default: {CMD_TIMEOUT})",
    )
    parser.add_argument(
        "--flash",
        default=None,
        help="Path to flash image (for nvs_corrupt direct file writes)",
    )
    args = parser.parse_args()

    print(f"[inject_fault] Connecting to {args.socket}...")
    s = connect_monitor(args.socket, timeout=args.timeout)

    print(f"[inject_fault] Injecting fault: {args.fault}")
    try:
        fault_fn = FAULT_MAP[args.fault]
        # Only some fault functions declare a flash_path parameter; forward
        # --flash to those and call the rest with the socket alone.
        import inspect
        takes_flash = "flash_path" in inspect.signature(fault_fn).parameters
        extra_kwargs = {"flash_path": args.flash} if takes_flash else {}
        fault_fn(s, **extra_kwargs)
    except Exception as e:
        print(f"ERROR: Fault injection failed: {e}", file=sys.stderr)
        s.close()
        sys.exit(1)

    s.close()
    print(f"[inject_fault] Complete: {args.fault}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -0,0 +1,337 @@
|
|||
#!/bin/bash
|
||||
# install-qemu.sh — Install QEMU with ESP32-S3 support (Espressif fork)
|
||||
# Usage: bash scripts/install-qemu.sh [OPTIONS]
|
||||
set -euo pipefail
|
||||
|
||||
# ── Colors ────────────────────────────────────────────────────────────────────
|
||||
# ANSI-C quoting ($'...') stores the real ESC byte in each variable. The colours
# therefore render through `echo -e` *and* inside plain heredocs (`cat <<EOF`),
# and a literal backslash printed right before ${NC} can no longer merge with
# the escape sequence.
RED=$'\033[0;31m'; GREEN=$'\033[0;32m'; YELLOW=$'\033[1;33m'
BLUE=$'\033[0;34m'; CYAN=$'\033[0;36m'; BOLD=$'\033[1m'; NC=$'\033[0m'

# Tagged, coloured log helpers. All arguments are joined into one message.
info() { echo -e "${BLUE}[INFO]${NC} $*"; }
ok()   { echo -e "${GREEN}[OK]${NC} $*"; }
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
# Errors go to stderr so they are not mixed into captured stdout.
err()  { echo -e "${RED}[ERROR]${NC} $*" >&2; }
# Section banner, preceded by a blank line.
step() { echo -e "\n${CYAN}${BOLD}▶ $*${NC}"; }
|
||||
|
||||
# ── Defaults ──────────────────────────────────────────────────────────────────
|
||||
# Source of the QEMU fork and the ref that gets built (--branch).
QEMU_REPO="https://github.com/espressif/qemu.git"
BRANCH="esp-develop"

# Where the checkout and build live (--install-dir).
INSTALL_DIR="$HOME/.espressif/qemu"

# Parallel build jobs (--jobs); empty means "not chosen on the command line".
JOBS=""

# Mode switches, flipped to true by --skip-deps / --uninstall / --check.
SKIP_DEPS=false
UNINSTALL=false
CHECK_ONLY=false
|
||||
|
||||
# ── Usage ─────────────────────────────────────────────────────────────────────
|
||||
# Print the help text to stdout. The heredoc delimiter is unquoted so that
# the ${BOLD}/${NC} variables are expanded.
usage() {
    cat <<USAGE_TEXT
${BOLD}install-qemu.sh${NC} — Install QEMU with ESP32-S3 support (Espressif fork)

${BOLD}USAGE${NC}
bash scripts/install-qemu.sh [OPTIONS]

${BOLD}OPTIONS${NC}
--install-dir DIR Installation directory (default: ~/.espressif/qemu)
--branch TAG QEMU branch or tag to build (default: esp-develop)
--jobs N Parallel build jobs (default: nproc)
--skip-deps Skip system dependency installation
--uninstall Remove QEMU installation
--check Verify existing installation and exit
-h, --help Show this help

${BOLD}EXIT CODES${NC}
0 Success
1 Dependency installation failed
2 Build failed
3 Unsupported OS

${BOLD}EXAMPLES${NC}
bash scripts/install-qemu.sh
bash scripts/install-qemu.sh --install-dir /opt/qemu-esp --jobs 8
bash scripts/install-qemu.sh --check
bash scripts/install-qemu.sh --uninstall
USAGE_TEXT
}
|
||||
|
||||
# ── Parse args ────────────────────────────────────────────────────────────────
|
||||
# Abort with a usage error when a value-taking option is the last argument.
# Without this check `set -u` ends the script with an opaque
# "$2: unbound variable" message.
require_value() {
    if [[ $# -lt 2 ]]; then
        err "Option $1 requires a value"
        usage
        exit 1
    fi
}

# Consume the command line left to right; unknown options are fatal.
while [[ $# -gt 0 ]]; do
    case "$1" in
        --install-dir)
            require_value "$@"
            INSTALL_DIR="$2"; shift 2 ;;
        --branch)
            require_value "$@"
            BRANCH="$2"; shift 2 ;;
        --jobs)
            require_value "$@"
            # The value is later passed to `make -j`, so insist on a positive integer.
            if ! [[ "$2" =~ ^[1-9][0-9]*$ ]]; then
                err "--jobs expects a positive integer, got: $2"
                exit 1
            fi
            JOBS="$2"; shift 2 ;;
        --skip-deps) SKIP_DEPS=true; shift ;;
        --uninstall) UNINSTALL=true; shift ;;
        --check)     CHECK_ONLY=true; shift ;;
        -h|--help)   usage; exit 0 ;;
        *)           err "Unknown option: $1"; usage; exit 1 ;;
    esac
done
|
||||
|
||||
# ── OS detection ──────────────────────────────────────────────────────────────
|
||||
# Detect the host platform and set three globals:
#   OS      linux | macos
#   DISTRO  package-manager family (debian, fedora, arch, suse, macos) or the
#           raw os-release ID when the family is not recognised
#   IS_WSL  true when /proc/version mentions Microsoft
# Exits with status 3 on native Windows shells and other unsupported systems.
detect_os() {
    OS="unknown"
    DISTRO="unknown"
    IS_WSL=false

    case "$(uname -s)" in
        Linux)
            OS="linux"
            if grep -qi microsoft /proc/version 2>/dev/null; then
                IS_WSL=true
            fi
            if [ -f /etc/os-release ]; then
                # shellcheck disable=SC1091
                . /etc/os-release
                # ID may be absent from os-release; default it so `set -u`
                # does not abort the script.
                case "${ID:-unknown}" in
                    ubuntu|debian|pop|linuxmint|elementary) DISTRO="debian" ;;
                    fedora|rhel|centos|rocky|alma) DISTRO="fedora" ;;
                    arch|manjaro|endeavouros) DISTRO="arch" ;;
                    opensuse*|sles) DISTRO="suse" ;;
                    *)
                        # Derivative distros name their parents in ID_LIKE
                        # (space-separated); use that before giving up.
                        case " ${ID_LIKE:-} " in
                            *" debian "*|*" ubuntu "*) DISTRO="debian" ;;
                            *" fedora "*|*" rhel "*)   DISTRO="fedora" ;;
                            *" arch "*)                DISTRO="arch" ;;
                            *" suse "*|*" opensuse "*) DISTRO="suse" ;;
                            *)                         DISTRO="${ID:-unknown}" ;;
                        esac
                        ;;
                esac
            fi
            ;;
        Darwin) OS="macos"; DISTRO="macos" ;;
        MINGW*|MSYS*)
            err "Native Windows/MINGW detected."
            err "QEMU ESP32-S3 must be built on Linux or macOS."
            err "Options:"
            err " 1. Use WSL: wsl bash scripts/install-qemu.sh"
            err " 2. Use Docker: docker run -it ubuntu:22.04 bash"
            err " 3. Download pre-built: https://github.com/espressif/qemu/releases"
            exit 3
            ;;
        *) err "Unsupported OS: $(uname -s)"; exit 3 ;;
    esac

    info "Detected: OS=${OS} Distro=${DISTRO} WSL=${IS_WSL}"
}
|
||||
|
||||
# ── Check existing installation ───────────────────────────────────────────────
|
||||
# Report whether a usable qemu-system-xtensa exists.
# Looks first at the binary built under $INSTALL_DIR/build, then at PATH.
# Returns 0 when one is found, 1 otherwise.
check_installation() {
    local built_bin="$INSTALL_DIR/build/qemu-system-xtensa"
    local banner=""

    # Locally built binary: it must be executable and print a version line.
    if [ -x "$built_bin" ]; then
        banner=$("$built_bin" --version 2>/dev/null | head -1) || true
        if [ -n "$banner" ]; then
            ok "QEMU installed: $banner"
            ok "Binary: $built_bin"
            return 0
        fi
    fi

    # Otherwise accept whatever qemu-system-xtensa is on PATH.
    if ! command -v qemu-system-xtensa &>/dev/null; then
        warn "QEMU with ESP32-S3 support not found"
        return 1
    fi
    banner=$(qemu-system-xtensa --version 2>/dev/null | head -1) || true
    ok "QEMU found in PATH: $banner"
    return 0
}
|
||||
|
||||
# --check: report the installation state and stop (0 = found, 1 = missing).
if $CHECK_ONLY; then
    detect_os
    check_installation && exit 0
    exit 1
fi

# ── Uninstall ─────────────────────────────────────────────────────────────────
if $UNINSTALL; then
    step "Uninstalling QEMU from $INSTALL_DIR"

    # INSTALL_DIR comes from the command line; never hand `rm -rf` an empty
    # path, the filesystem root or the home directory.
    case "${INSTALL_DIR%/}" in
        ""|"${HOME%/}")
            err "Refusing to remove unsafe directory: '$INSTALL_DIR'"
            exit 1
            ;;
    esac

    if [ -d "$INSTALL_DIR" ]; then
        rm -rf "$INSTALL_DIR"
        ok "Removed $INSTALL_DIR"
    else
        warn "Directory not found: $INSTALL_DIR"
    fi

    # Remove the convenience symlink, if present.
    local_bin="$HOME/.local/bin/qemu-system-xtensa"
    if [ -L "$local_bin" ]; then
        rm -f "$local_bin"
        ok "Removed symlink $local_bin"
    fi
    ok "Uninstall complete"
    exit 0
fi
|
||||
|
||||
# ── Main install flow ─────────────────────────────────────────────────────────
|
||||
detect_os

# When --jobs was not given, use the CPU count: nproc if available, else
# `sysctl -n hw.ncpu`, else 4.
if [[ -z "$JOBS" ]]; then
    JOBS=4
    if command -v nproc &>/dev/null; then
        JOBS=$(nproc)
    elif command -v sysctl &>/dev/null; then
        JOBS=$(sysctl -n hw.ncpu 2>/dev/null || echo 4)
    fi
fi
info "Build parallelism: $JOBS jobs"
|
||||
|
||||
# ── Step 1: Install dependencies ──────────────────────────────────────────────
|
||||
# Install the build toolchain and libraries with the package manager that
# matches $DISTRO. Returns 1 for an unrecognised distro; exits 1 on macOS
# when Homebrew is missing.
install_deps() {
    step "Installing build dependencies"

    case "$DISTRO" in
        debian)
            info "Using apt (Debian/Ubuntu)"
            sudo apt-get update -qq
            sudo apt-get install -y -qq \
                git build-essential python3 python3-pip python3-venv \
                ninja-build pkg-config libglib2.0-dev libpixman-1-dev \
                libslirp-dev libgcrypt-dev
            ;;
        fedora)
            info "Using dnf (Fedora/RHEL)"
            sudo dnf install -y \
                git gcc gcc-c++ make python3 python3-pip \
                ninja-build pkgconfig glib2-devel pixman-devel \
                libslirp-devel libgcrypt-devel
            ;;
        arch)
            info "Using pacman (Arch)"
            sudo pacman -S --needed --noconfirm \
                git base-devel python python-pip \
                ninja pkgconf glib2 pixman libslirp libgcrypt
            ;;
        suse)
            info "Using zypper (openSUSE)"
            sudo zypper install -y \
                git gcc gcc-c++ make python3 python3-pip \
                ninja pkg-config glib2-devel libpixman-1-0-devel \
                libslirp-devel libgcrypt-devel
            ;;
        macos)
            info "Using Homebrew"
            if ! command -v brew &>/dev/null; then
                err "Homebrew not found. Install from https://brew.sh"
                exit 1
            fi
            # Still best-effort, but a failure is reported instead of hidden.
            brew install glib pixman ninja pkg-config libslirp libgcrypt \
                || warn "brew install reported errors — continuing"
            ;;
        *)
            warn "Unknown distro '$DISTRO' — install these manually:"
            # libgcrypt is listed because the build is configured with --enable-gcrypt.
            warn " git, gcc/g++, python3, ninja, pkg-config, glib2-dev, pixman-dev, libslirp-dev, libgcrypt-dev"
            return 1
            ;;
    esac
    ok "Dependencies installed"
}

if ! $SKIP_DEPS; then
    install_deps || { err "Dependency installation failed"; exit 1; }
else
    info "Skipping dependency installation (--skip-deps)"
fi
|
||||
|
||||
# ── Step 2: Clone Espressif QEMU fork ─────────────────────────────────────────
|
||||
step "Cloning Espressif QEMU fork"

# The checkout lives directly in the install directory.
SRC_DIR="$INSTALL_DIR"
if [ -d "$SRC_DIR/.git" ]; then
    info "Repository already exists at $SRC_DIR"
    info "Fetching latest changes on branch $BRANCH"
    git -C "$SRC_DIR" fetch --depth=1 origin "$BRANCH"
    # Check out the commit that was just fetched. Checking out the local
    # branch name would stay on the old commit, because fetch does not move
    # local branches. FETCH_HEAD works for branches and tags alike.
    git -C "$SRC_DIR" checkout --detach FETCH_HEAD
    ok "Updated to latest $BRANCH"
else
    info "Cloning $QEMU_REPO (branch: $BRANCH)"
    mkdir -p "$(dirname "$SRC_DIR")"
    git clone --depth=1 --branch "$BRANCH" "$QEMU_REPO" "$SRC_DIR"
    ok "Cloned to $SRC_DIR"
fi
|
||||
|
||||
# ── Step 3: Configure and build ───────────────────────────────────────────────
|
||||
step "Configuring QEMU (target: xtensa-softmmu)"

BUILD_DIR="$SRC_DIR/build"
mkdir -p "$BUILD_DIR"
cd "$SRC_DIR"

# Print the troubleshooting hints and exit with the documented build-failure
# code (2). $1 is a short description of what went wrong.
build_failed() {
    err "Build failed — $1"
    err "Troubleshooting:"
    err " 1. Check build output above for errors"
    err " 2. Ensure all dependencies are installed: re-run without --skip-deps"
    err " 3. Try with fewer jobs: --jobs 1"
    err " 4. On macOS, ensure Xcode CLT: xcode-select --install"
    exit 2
}

# The script runs under `set -e -o pipefail`, so a bare failing pipeline would
# abort before any hint is printed. Testing the pipeline in an `if` keeps
# control here. Only the tail of each tool's output is shown.
if ! ./configure \
        --target-list=xtensa-softmmu \
        --enable-slirp \
        --enable-gcrypt \
        --prefix="$INSTALL_DIR/dist" \
        2>&1 | tail -5; then
    build_failed "configure exited with an error"
fi

step "Building QEMU ($JOBS parallel jobs)"
if ! make -j"$JOBS" -C "$BUILD_DIR" 2>&1 | tail -20; then
    build_failed "make exited with an error"
fi

if [ ! -x "$BUILD_DIR/qemu-system-xtensa" ]; then
    build_failed "qemu-system-xtensa binary not found"
fi
ok "Build succeeded: $BUILD_DIR/qemu-system-xtensa"
|
||||
|
||||
# ── Step 4: Create symlink / add to PATH ──────────────────────────────────────
|
||||
step "Setting up PATH access"

LOCAL_BIN="$HOME/.local/bin"
mkdir -p "$LOCAL_BIN"
ln -sf "$BUILD_DIR/qemu-system-xtensa" "$LOCAL_BIN/qemu-system-xtensa"
ok "Symlinked to $LOCAL_BIN/qemu-system-xtensa"

# Literal membership test on the colon-separated PATH. A `case` match avoids
# treating the directory name as a regular expression and does not depend on
# the exit status of a multi-stage pipeline under `pipefail`.
case ":$PATH:" in
    *":$LOCAL_BIN:"*) ;;
    *)
        warn "$LOCAL_BIN is not in your PATH"
        warn "Add this to your shell profile (~/.bashrc or ~/.zshrc):"
        echo -e " ${BOLD}export PATH=\"\$HOME/.local/bin:\$PATH\"${NC}"
        ;;
esac
|
||||
|
||||
# ── Step 5: Verify ────────────────────────────────────────────────────────────
|
||||
step "Verifying installation"

# First line of `--version` from the freshly built binary.
QEMU_VERSION=$("$BUILD_DIR/qemu-system-xtensa" --version | head -1)
ok "$QEMU_VERSION"

# Look for the esp32s3 machine in the `-machine help` listing; its absence is
# only a warning.
if ! "$BUILD_DIR/qemu-system-xtensa" -machine help 2>/dev/null | grep -q esp32s3; then
    warn "ESP32-S3 machine type not listed (may still work with newer builds)"
else
    ok "ESP32-S3 machine type available"
fi
|
||||
|
||||
# ── Step 6: Install Python packages ──────────────────────────────────────────
|
||||
step "Installing Python packages (esptool, pyyaml, nvs-partition-gen)"

# Hold the command as an array so "python3 -m pip" expands to separate words
# without relying on unquoted word splitting.
if command -v pip3 &>/dev/null; then
    PIP_CMD=(pip3)
else
    PIP_CMD=(python3 -m pip)
fi

# A failure is non-fatal, but success is only reported when pip succeeded.
if "${PIP_CMD[@]}" install --user --quiet \
        esptool \
        pyyaml \
        esp-idf-nvs-partition-gen \
        2>&1; then
    ok "Python packages installed"
else
    warn "Some Python packages failed to install (non-fatal)"
fi
|
||||
|
||||
# ── Done ──────────────────────────────────────────────────────────────────────
|
||||
# Print one cyan command line. %b expands the colour variables and %s prints
# the command text verbatim, so a trailing backslash (the line continuation
# shown to the user) cannot combine with the colour-reset escape after it.
show_cmd() { printf ' %b%s%b\n' "$CYAN" "$1" "$NC"; }

echo ""
echo -e "${GREEN}${BOLD}Installation complete!${NC}"
echo ""
echo -e "${BOLD}Next steps:${NC}"
echo ""
echo " 1. Run a smoke test:"
show_cmd 'qemu-system-xtensa -nographic -machine esp32s3 \'
show_cmd ' -drive file=firmware.bin,if=mtd,format=raw \'
show_cmd ' -serial mon:stdio'
echo ""
echo " 2. Run the project QEMU tests:"
show_cmd "cd $(dirname "$0")/.."
show_cmd "pytest firmware/esp32-csi-node/tests/qemu/ -v"
echo ""
echo " 3. Binary location:"
show_cmd "$BUILD_DIR/qemu-system-xtensa"
echo ""
echo " 4. Uninstall:"
show_cmd "bash scripts/install-qemu.sh --uninstall"
echo ""
|
||||
|
|
@ -0,0 +1,397 @@
|
|||
#!/bin/bash
|
||||
# QEMU Chaos / Fault Injection Test Runner — ADR-061 Layer 9
|
||||
#
|
||||
# Launches firmware under QEMU and injects a series of faults to verify
|
||||
# the firmware's resilience. Each fault is injected via the QEMU monitor
|
||||
# socket (or GDB stub), followed by a recovery window and health check.
|
||||
#
|
||||
# Fault types:
|
||||
# 1. wifi_kill — Pause/resume VM to simulate WiFi reconnect
|
||||
# 2. ring_flood — Inject 1000 rapid mock frames (ring buffer stress)
|
||||
# 3. heap_exhaust — Write to heap metadata to simulate low memory
|
||||
# 4. timer_starvation — Pause VM for 500ms to starve FreeRTOS timers
|
||||
# 5. corrupt_frame — Inject a CSI frame with bad magic bytes
|
||||
# 6. nvs_corrupt — Write garbage to NVS flash region
|
||||
#
|
||||
# Environment variables:
|
||||
# QEMU_PATH - Path to qemu-system-xtensa (default: qemu-system-xtensa)
|
||||
# QEMU_TIMEOUT - Boot timeout in seconds (default: 15)
|
||||
# FLASH_IMAGE - Path to merged flash image (default: build/qemu_flash.bin)
|
||||
# FAULT_WAIT - Seconds to wait after fault injection (default: 5)
|
||||
#
|
||||
# Exit codes:
|
||||
# 0 PASS — all checks passed
|
||||
# 1 WARN — non-critical checks failed
|
||||
# 2 FAIL — critical checks failed
|
||||
# 3 FATAL — build error, crash, or infrastructure failure
|
||||
|
||||
# ── Help ──────────────────────────────────────────────────────────────
|
||||
# Print the help text and exit 0. The heredoc delimiter is quoted, so the text
# is emitted verbatim with no expansion.
usage() {
    cat <<'USAGE_TEXT'
Usage: qemu-chaos-test.sh [OPTIONS]

Launch firmware under QEMU and inject a series of faults to verify the
firmware's resilience. Each fault is injected via the QEMU monitor socket,
followed by a recovery window and health check.

Fault types:
wifi_kill Pause/resume VM to simulate WiFi reconnect
ring_flood Inject 1000 rapid mock frames (ring buffer stress)
heap_exhaust Write to heap metadata to simulate low memory
timer_starvation Pause VM for 500ms to starve FreeRTOS timers
corrupt_frame Inject a CSI frame with bad magic bytes
nvs_corrupt Write garbage to NVS flash region

Options:
-h, --help Show this help message and exit

Environment variables:
QEMU_PATH Path to qemu-system-xtensa (default: qemu-system-xtensa)
QEMU_TIMEOUT Boot timeout in seconds (default: 15)
FLASH_IMAGE Path to merged flash image (default: build/qemu_flash.bin)
FAULT_WAIT Seconds to wait after injection (default: 5)

Examples:
./qemu-chaos-test.sh
QEMU_TIMEOUT=30 FAULT_WAIT=10 ./qemu-chaos-test.sh
FLASH_IMAGE=/path/to/image.bin ./qemu-chaos-test.sh

Exit codes:
0 PASS — all checks passed
1 WARN — non-critical checks failed
2 FAIL — critical checks failed
3 FATAL — build error, crash, or infrastructure failure
USAGE_TEXT
    exit 0
}

# Only the first argument is inspected for a help request.
if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
    usage
fi
|
||||
|
||||
set -euo pipefail

# Locations derived from where this script lives.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
FIRMWARE_DIR="$PROJECT_ROOT/firmware/esp32-csi-node"
BUILD_DIR="$FIRMWARE_DIR/build"

# Environment overrides, each with its documented default.
QEMU_BIN="${QEMU_PATH:-qemu-system-xtensa}"
BOOT_TIMEOUT="${QEMU_TIMEOUT:-15}"
: "${FLASH_IMAGE:=$BUILD_DIR/qemu_flash.bin}"
: "${FAULT_WAIT:=5}"

# Runtime artefacts: monitor socket, per-fault logs and the UART capture.
MONITOR_SOCK="$BUILD_DIR/qemu-chaos.sock"
LOG_DIR="$BUILD_DIR/chaos-tests"
UART_LOG="$LOG_DIR/qemu_uart.log"
QEMU_PID=""

# Faults are injected in this order; results are collected as "name:code".
FAULTS=("wifi_kill" "ring_flood" "heap_exhaust" "timer_starvation" "corrupt_frame" "nvs_corrupt")
FAULT_RESULTS=()
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Cleanup
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Stop the QEMU child (if still running) and remove the monitor socket.
# Installed as the EXIT trap, so it runs once on every way out of the script.
cleanup() {
    # Disarm the traps so a signal arriving during cleanup cannot re-enter it.
    trap - EXIT INT TERM
    echo ""
    echo "[cleanup] Shutting down QEMU and removing socket..."
    if [ -n "$QEMU_PID" ] && kill -0 "$QEMU_PID" 2>/dev/null; then
        kill "$QEMU_PID" 2>/dev/null || true
        wait "$QEMU_PID" 2>/dev/null || true
    fi
    rm -f "$MONITOR_SOCK"
    echo "[cleanup] Done."
}
trap cleanup EXIT
# A signal handler that merely returns lets the script carry on after Ctrl-C.
# Convert signals into an exit (128 + signal number) so the EXIT trap performs
# the cleanup and the run actually stops.
trap 'exit 130' INT
trap 'exit 143' TERM
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Helpers
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Send one command line to the QEMU monitor socket via socat.
#   $1  monitor command text
#   $2  connect timeout in seconds (default 5)
# socat's stderr is discarded; anything it writes to stdout is passed through.
monitor_cmd() {
    local command_text="$1"
    local connect_secs="${2:-5}"
    local endpoint="UNIX-CONNECT:$MONITOR_SOCK,connect-timeout=$connect_secs"

    printf '%s\n' "$command_text" | socat - "$endpoint" 2>/dev/null
}
|
||||
|
||||
# Print the number of lines currently in $UART_LOG, or 0 when the file is
# missing or unreadable.
log_line_count() {
    local count=0
    # Test readability first: a failed `<` redirection is reported by the
    # shell before a later `2>/dev/null` on the same command takes effect.
    if [ -r "$UART_LOG" ]; then
        count=$(wc -l < "$UART_LOG") || count=0
    fi
    # Arithmetic expansion strips the padding some `wc` implementations add.
    echo "$((count))"
}
|
||||
|
||||
# Poll $UART_LOG once per second until a boot marker appears.
# Returns 0 as soon as a marker is seen, 1 after $BOOT_TIMEOUT polls.
wait_for_boot() {
    local boot_markers="app_main|main_task|ESP32-S3|mock_csi"
    local waited=0

    while [ "$waited" -lt "$BOOT_TIMEOUT" ]; do
        [ -f "$UART_LOG" ] && grep -qE "$boot_markers" "$UART_LOG" 2>/dev/null && return 0
        sleep 1
        waited=$((waited + 1))
    done
    return 1
}
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Fault injection functions
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Shared building blocks for the injectors below.

# Halt the VM through the monitor and keep it halted for $1 seconds.
# The caller is responsible for sending "cont" afterwards.
_vm_freeze_for() {
    monitor_cmd "stop"
    sleep "$1"
}

# Run inject_fault.py against the monitor socket.
# $1 is the fault name; any further arguments are passed through unchanged.
_run_injector() {
    python3 "$SCRIPT_DIR/inject_fault.py" \
        --socket "$MONITOR_SOCK" \
        --fault "$@"
}

# WiFi disconnect/reconnect, modelled as a 2 s VM pause followed by a resume.
inject_wifi_kill() {
    echo " [inject] Pausing VM for 2s (simulating WiFi disconnect)..."
    _vm_freeze_for 2
    echo " [inject] Resuming VM (simulating WiFi reconnect)..."
    monitor_cmd "cont"
}

# Ring buffer stress; the work is delegated to inject_fault.py.
inject_ring_flood() {
    echo " [inject] Flooding ring buffer with 1000 rapid frame triggers..."
    _run_injector ring_flood
}

# Heap pressure; the work is delegated to inject_fault.py.
inject_heap_exhaust() {
    echo " [inject] Simulating heap pressure via VM pause..."
    _run_injector heap_exhaust
}

# Timer starvation: a 500 ms VM pause followed by a resume.
inject_timer_starvation() {
    echo " [inject] Starving timers (500ms pause)..."
    _vm_freeze_for 0.5
    monitor_cmd "cont"
}

# Corrupt CSI frame; the work is delegated to inject_fault.py.
inject_corrupt_frame() {
    echo " [inject] Injecting corrupt CSI frame (bad magic)..."
    _run_injector corrupt_frame
}

# NVS corruption; inject_fault.py is additionally given the flash image path.
inject_nvs_corrupt() {
    echo " [inject] Corrupting NVS flash region..."
    _run_injector nvs_corrupt \
        --flash "$FLASH_IMAGE"
}
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Pre-flight checks
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
echo "=== QEMU Chaos Test Runner — ADR-061 Layer 9 ==="
echo "QEMU binary: $QEMU_BIN"
echo "Flash image: $FLASH_IMAGE"
echo "Boot timeout: ${BOOT_TIMEOUT}s"
echo "Fault wait: ${FAULT_WAIT}s"
echo "Faults: ${FAULTS[*]}"
echo ""

# Every missing prerequisite is an infrastructure failure (exit 3).

if ! command -v "$QEMU_BIN" &>/dev/null; then
    echo "ERROR: QEMU binary not found: $QEMU_BIN"
    # QEMU is launched with `-machine esp32s3`, so point at the Espressif fork
    # installer rather than a distribution QEMU package.
    echo " Install: bash scripts/install-qemu.sh # builds the Espressif QEMU fork"
    echo " Or set QEMU_PATH to the qemu-system-xtensa binary."
    exit 3
fi

if ! command -v socat &>/dev/null; then
    echo "ERROR: socat not found (needed for QEMU monitor communication)."
    echo " Install: sudo apt install socat # Debian/Ubuntu"
    echo " Install: brew install socat # macOS"
    exit 3
fi

if ! command -v python3 &>/dev/null; then
    echo "ERROR: python3 not found (needed for fault injection scripts)."
    echo " Install: sudo apt install python3 # Debian/Ubuntu"
    echo " Install: brew install python # macOS"
    exit 3
fi

if [ ! -f "$FLASH_IMAGE" ]; then
    echo "ERROR: Flash image not found: $FLASH_IMAGE"
    echo "Run qemu-esp32s3-test.sh first to build the flash image."
    exit 3
fi

mkdir -p "$LOG_DIR"
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Launch QEMU
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
echo "── Launching QEMU ──"
echo ""

# Start from a clean slate: no stale socket, empty UART log.
rm -f "$MONITOR_SOCK"
: > "$UART_LOG"

QEMU_ARGS=(
    -machine esp32s3
    -nographic
    -drive "file=$FLASH_IMAGE,if=mtd,format=raw"
    -serial "file:$UART_LOG"
    -no-reboot
    -monitor "unix:$MONITOR_SOCK,server,nowait"
)

"$QEMU_BIN" "${QEMU_ARGS[@]}" &
QEMU_PID=$!
echo "[qemu] PID=$QEMU_PID"

# Wait up to 10 s for the monitor socket, but stop at once if QEMU has already
# exited (bad arguments, unsupported machine, ...).
waited=0
while [ ! -S "$MONITOR_SOCK" ] && [ "$waited" -lt 10 ]; do
    if ! kill -0 "$QEMU_PID" 2>/dev/null; then
        echo "ERROR: QEMU exited before creating the monitor socket"
        exit 3
    fi
    sleep 1
    waited=$((waited + 1))
done

if [ ! -S "$MONITOR_SOCK" ]; then
    echo "ERROR: QEMU monitor socket did not appear after 10s"
    exit 3
fi

# A missing boot marker is reported but does not stop the run.
echo "[boot] Waiting for firmware boot (up to ${BOOT_TIMEOUT}s)..."
if wait_for_boot; then
    echo "[boot] Firmware booted successfully."
else
    echo "[boot] No boot indicator found (continuing anyway)."
fi

# Let firmware stabilize for a few seconds
echo "[boot] Stabilizing (3s)..."
sleep 3
echo ""
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Fault injection loop
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
echo "── Fault Injection ──"
echo ""

MAX_EXIT=0

# Append "name:code" to FAULT_RESULTS and raise MAX_EXIT when this code is
# worse than anything seen so far.
record_result() {
    local name="$1" code="$2"
    FAULT_RESULTS+=("${name}:${code}")
    if [ "$code" -gt "$MAX_EXIT" ]; then
        MAX_EXIT=$code
    fi
}

for fault in "${FAULTS[@]}"; do
    echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    echo " Fault: $fault"
    echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

    # Record log position before injection
    pre_lines=$(log_line_count)

    # A dead QEMU makes every remaining fault meaningless: record and stop.
    if ! kill -0 "$QEMU_PID" 2>/dev/null; then
        echo " ERROR: QEMU process died before fault injection"
        record_result "$fault" 3
        break
    fi

    # Inject the fault. The status is captured instead of letting `set -e`
    # abort the whole run with the injector's own exit code, which would be
    # misread against the documented 0-3 exit codes.
    inject_rc=0
    case "$fault" in
        wifi_kill)        inject_wifi_kill || inject_rc=$? ;;
        ring_flood)       inject_ring_flood || inject_rc=$? ;;
        heap_exhaust)     inject_heap_exhaust || inject_rc=$? ;;
        timer_starvation) inject_timer_starvation || inject_rc=$? ;;
        corrupt_frame)    inject_corrupt_frame || inject_rc=$? ;;
        nvs_corrupt)      inject_nvs_corrupt || inject_rc=$? ;;
        *)
            echo " ERROR: Unknown fault type: $fault"
            record_result "$fault" 2
            continue
            ;;
    esac

    # A fault that was never injected cannot be health-checked; count it as an
    # infrastructure failure and move on to the next fault.
    if [ "$inject_rc" -ne 0 ]; then
        echo " ERROR: Fault injection command failed (exit $inject_rc)"
        record_result "$fault" 3
        echo ""
        continue
    fi

    # Wait for firmware to respond/recover
    echo " [recovery] Waiting ${FAULT_WAIT}s for recovery..."
    sleep "$FAULT_WAIT"

    # Extract the UART lines written since the injection started.
    post_lines=$(log_line_count)
    new_lines=$((post_lines - pre_lines))
    fault_log="$LOG_DIR/fault_${fault}.log"

    if [ "$new_lines" -gt 0 ]; then
        tail -n "$new_lines" "$UART_LOG" > "$fault_log"
    else
        # Grab last 50 lines as context
        tail -n 50 "$UART_LOG" > "$fault_log"
    fi

    echo " [check] Captured $new_lines new log lines"

    # Health check; its exit status becomes this fault's result code.
    fault_exit=0
    python3 "$SCRIPT_DIR/check_health.py" \
        --log "$fault_log" \
        --after-fault "$fault" || fault_exit=$?

    case "$fault_exit" in
        0) echo " [result] HEALTHY — firmware recovered gracefully" ;;
        1) echo " [result] DEGRADED — firmware running but with issues" ;;
        *) echo " [result] UNHEALTHY — firmware in bad state" ;;
    esac

    record_result "$fault" "$fault_exit"

    echo ""
done
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Summary
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
echo "── Chaos Test Results ──"
echo ""

# Tally the "name:code" entries: 0 = pass, 1 = degraded, anything else = fail.
PASS=0
DEGRADED=0
FAIL=0

for entry in "${FAULT_RESULTS[@]}"; do
    fault_name="${entry%%:*}"
    status="${entry##*:}"
    if [ "$status" = "0" ]; then
        echo " [PASS] $fault_name"
        PASS=$((PASS + 1))
    elif [ "$status" = "1" ]; then
        echo " [DEGRADED] $fault_name"
        DEGRADED=$((DEGRADED + 1))
    else
        echo " [FAIL] $fault_name"
        FAIL=$((FAIL + 1))
    fi
done

echo ""
echo " $PASS passed, $DEGRADED degraded, $FAIL failed out of ${#FAULTS[@]} faults"
echo ""

# A QEMU process that is gone by now raises the overall result to 3.
if ! kill -0 "$QEMU_PID" 2>/dev/null; then
    echo " WARNING: QEMU process died during fault injection."
    if [ "$MAX_EXIT" -lt 3 ]; then
        MAX_EXIT=3
    fi
else
    echo " QEMU process survived all fault injections."
fi

echo ""
echo "=== Chaos Test Complete (exit code: $MAX_EXIT) ==="
exit "$MAX_EXIT"
|
||||
|
|
@ -0,0 +1,362 @@
|
|||
#!/usr/bin/env bash
|
||||
# ============================================================================
|
||||
# qemu-cli.sh — Unified QEMU ESP32-S3 testing CLI (ADR-061)
|
||||
# Version: 1.0.0
|
||||
#
|
||||
# Single entry point for all QEMU testing operations.
|
||||
# Run `qemu-cli.sh help` or `qemu-cli.sh --help` for usage.
|
||||
# ============================================================================
|
||||
set -euo pipefail
|
||||
|
||||
VERSION="1.0.0"
|
||||
|
||||
# --- Colors ----------------------------------------------------------------
|
||||
# ANSI-C quoting ($'...') stores the real ESC byte in each variable, so the
# colors render both through `echo -e` and inside plain `cat <<EOF` heredocs.
# A single-quoted '\033[..m' is only expanded by `echo -e`; `cat` would print
# the backslash sequence verbatim.
if [[ -t 1 ]]; then
    RED=$'\033[0;31m'; GREEN=$'\033[0;32m'; YELLOW=$'\033[1;33m'
    BLUE=$'\033[0;34m'; CYAN=$'\033[0;36m'; BOLD=$'\033[1m'; RST=$'\033[0m'
else
    # stdout is not a terminal (pipe, file, CI log): emit no escape sequences.
    RED=''; GREEN=''; YELLOW=''; BLUE=''; CYAN=''; BOLD=''; RST=''
fi
|
||||
|
||||
# --- Resolve paths ---------------------------------------------------------
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
FIRMWARE_DIR="$PROJECT_ROOT/firmware/esp32-csi-node"
|
||||
FUZZ_DIR="$FIRMWARE_DIR/test"
|
||||
|
||||
# --- Helpers ---------------------------------------------------------------
|
||||
# Leveled log helpers. Each prints a colored tag followed by all arguments
# joined with spaces; backslash escapes in the text are expanded.
# Only err writes to stderr.
info() {
    printf '%b\n' "${BLUE}[INFO]${RST} $*"
}

ok() {
    printf '%b\n' "${GREEN}[OK]${RST} $*"
}

warn() {
    printf '%b\n' "${YELLOW}[WARN]${RST} $*"
}

err() {
    printf '%b\n' "${RED}[ERROR]${RST} $*" >&2
}

# Report an error through err, then terminate the script with status 1.
die() {
    err "$@"
    exit 1
}
|
||||
|
||||
# Guard for commands that require QEMU: abort (via die, exit status 1) with an
# install hint unless detect_qemu succeeds. The detected path is discarded.
need_qemu() {
    if ! detect_qemu >/dev/null 2>&1; then
        die "QEMU not found. Install with: ${CYAN}qemu-cli.sh install${RST}"
    fi
}
|
||||
|
||||
# Locate the qemu-system-xtensa binary and print its path on stdout.
#
# Search order:
#   1. $QEMU_PATH, when set and executable
#   2. qemu-system-xtensa found on PATH
#   3. Espressif default build location under ~/.espressif/qemu/build:
#      first the `make install` layout (build/bin/, which matches the
#      --prefix shown by `qemu-cli.sh install`), then the bare build dir
#
# Returns 0 when a binary was found, 1 otherwise (nothing is printed).
detect_qemu() {
    local candidate

    # 1. Explicit env var
    if [[ -n "${QEMU_PATH:-}" && -x "$QEMU_PATH" ]]; then
        echo "$QEMU_PATH"
        return 0
    fi

    # 2. On PATH (`|| true` keeps errexit from firing when it is absent)
    candidate="$(command -v qemu-system-xtensa 2>/dev/null || true)"
    if [[ -n "$candidate" ]]; then
        echo "$candidate"
        return 0
    fi

    # 3. Espressif default build location
    local espressif_root="${HOME:-}/.espressif/qemu/build"
    for candidate in \
        "$espressif_root/bin/qemu-system-xtensa" \
        "$espressif_root/qemu-system-xtensa"; do
        if [[ -x "$candidate" ]]; then
            echo "$candidate"
            return 0
        fi
    done

    return 1
}
|
||||
|
||||
# Print the Python interpreter to use: the resolved location of python3 if it
# exists, else that of python, else the literal name "python3" as a last
# resort. Always returns 0.
detect_python() {
    local interpreter
    for interpreter in python3 python; do
        if command -v "$interpreter" 2>/dev/null; then
            return 0
        fi
    done
    echo "python3"
}
|
||||
|
||||
# --- Command: help ---------------------------------------------------------
|
||||
cmd_help() {
|
||||
cat <<EOF
|
||||
${BOLD}qemu-cli.sh${RST} v${VERSION} — Unified QEMU ESP32-S3 testing CLI
|
||||
|
||||
${BOLD}USAGE${RST}
|
||||
qemu-cli.sh <command> [options]
|
||||
|
||||
${BOLD}COMMANDS${RST}
|
||||
${CYAN}install${RST} Install QEMU with ESP32-S3 support
|
||||
${CYAN}test${RST} Run single-node firmware test
|
||||
${CYAN}mesh${RST} [N] Run multi-node mesh test (default: 3 nodes)
|
||||
${CYAN}swarm${RST} [args] Run swarm configurator (qemu_swarm.py)
|
||||
${CYAN}snapshot${RST} [args] Run snapshot-based tests
|
||||
${CYAN}chaos${RST} [args] Run chaos / fault injection tests
|
||||
${CYAN}fuzz${RST} [--duration N] Run all 3 fuzz targets (clang libFuzzer)
|
||||
${CYAN}nvs${RST} [args] Generate NVS test matrix
|
||||
${CYAN}health${RST} <logfile> Check firmware health from QEMU log
|
||||
${CYAN}status${RST} Show installation status and versions
|
||||
${CYAN}help${RST} Show this help message
|
||||
|
||||
${BOLD}EXAMPLES${RST}
|
||||
qemu-cli.sh install # Install QEMU
|
||||
qemu-cli.sh test # Run basic firmware test
|
||||
qemu-cli.sh test --timeout 120 # Test with longer timeout
|
||||
qemu-cli.sh swarm --preset smoke # Quick swarm test
|
||||
qemu-cli.sh swarm --preset standard # Standard 3-node test
|
||||
qemu-cli.sh swarm --list-presets # List available presets
|
||||
qemu-cli.sh mesh 3 # 3-node mesh test
|
||||
qemu-cli.sh chaos # Run chaos tests
|
||||
qemu-cli.sh fuzz --duration 60 # Fuzz for 60 seconds
|
||||
qemu-cli.sh nvs --list # List NVS configs
|
||||
qemu-cli.sh health build/qemu_output.log
|
||||
qemu-cli.sh status # Show what's installed
|
||||
|
||||
${BOLD}TAB COMPLETION${RST}
|
||||
Source the completions in your shell:
|
||||
eval "\$(qemu-cli.sh --completions)"
|
||||
|
||||
${BOLD}ENVIRONMENT${RST}
|
||||
QEMU_PATH Path to qemu-system-xtensa binary (auto-detected)
|
||||
FUZZ_DURATION Override fuzz duration in seconds (default: 30)
|
||||
FUZZ_JOBS Parallel fuzzing jobs (default: 1)
|
||||
|
||||
EOF
|
||||
}
|
||||
|
||||
# --- Command: install ------------------------------------------------------
|
||||
cmd_install() {
|
||||
if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
|
||||
echo "Usage: qemu-cli.sh install"
|
||||
echo "Install QEMU with Espressif ESP32-S3 support."
|
||||
return 0
|
||||
fi
|
||||
local installer="$SCRIPT_DIR/install-qemu.sh"
|
||||
if [[ -f "$installer" ]]; then
|
||||
info "Running install-qemu.sh ..."
|
||||
bash "$installer" "$@"
|
||||
else
|
||||
info "No install-qemu.sh found. Showing manual install steps."
|
||||
cat <<EOF
|
||||
|
||||
${BOLD}Manual QEMU ESP32-S3 installation:${RST}
|
||||
1. git clone https://github.com/espressif/qemu.git ~/.espressif/qemu-src
|
||||
2. cd ~/.espressif/qemu-src
|
||||
3. ./configure --target-list=xtensa-softmmu --prefix=\$HOME/.espressif/qemu/build \\
|
||||
--enable-gcrypt --disable-bsd-user --disable-docs
|
||||
4. make -j\$(nproc) && make install
|
||||
5. Add to PATH: export PATH="\$HOME/.espressif/qemu/build/bin:\$PATH"
|
||||
|
||||
EOF
|
||||
fi
|
||||
}
|
||||
|
||||
# --- Command: test ----------------------------------------------------------
|
||||
# `test` command: run the single-node firmware test script, forwarding every
# argument unchanged. -h/--help as the first argument prints usage instead.
cmd_test() {
    case "${1:-}" in
        -h|--help)
            echo "Usage: qemu-cli.sh test [--timeout N] [extra args...]"
            echo "Run single-node QEMU ESP32-S3 firmware test."
            return 0
            ;;
    esac

    need_qemu
    info "Running single-node firmware test ..."
    bash "$SCRIPT_DIR/qemu-esp32s3-test.sh" "$@"
}
|
||||
|
||||
# --- Command: mesh ----------------------------------------------------------
|
||||
# `mesh` command: run the multi-node mesh test.
#
# Arguments:
#   $1    number of nodes (optional, default 3); must consist of digits only
#   $2..  forwarded unchanged to qemu-mesh-test.sh after the node count
#
# Dies with status 1 when the node count is not a whole number, instead of
# handing an option or typo to the mesh script as its node count.
cmd_mesh() {
    if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
        echo "Usage: qemu-cli.sh mesh [N] [extra args...]"
        echo "Run multi-node mesh test. N = number of nodes (default: 3)."
        return 0
    fi
    need_qemu

    local nodes="${1:-3}"
    # Drop the node count only when one was actually supplied.
    if [[ $# -gt 0 ]]; then
        shift
    fi

    if ! [[ "$nodes" =~ ^[0-9]+$ ]]; then
        die "Node count must be a whole number, got: $nodes"
    fi

    info "Running ${nodes}-node mesh test ..."
    bash "$SCRIPT_DIR/qemu-mesh-test.sh" "$nodes" "$@"
}
|
||||
|
||||
# --- Command: swarm ---------------------------------------------------------
|
||||
# `swarm` command: run the swarm configurator (qemu_swarm.py), forwarding all
# arguments unchanged. -h/--help as the first argument prints usage instead.
#
# The QEMU prerequisite check is skipped when --list-presets is requested, so
# presets can be listed on a machine where QEMU is not installed yet.
# NOTE(review): assumes qemu_swarm.py --list-presets does not itself launch
# QEMU — confirm against qemu_swarm.py.
cmd_swarm() {
    if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
        echo "Usage: qemu-cli.sh swarm [--preset NAME] [--list-presets] [args...]"
        echo "Run QEMU swarm configurator (qemu_swarm.py)."
        echo ""
        echo "Presets: smoke, standard, full, stress"
        echo "List: qemu-cli.sh swarm --list-presets"
        return 0
    fi

    local arg listing_only=0
    for arg in "$@"; do
        if [[ "$arg" == "--list-presets" ]]; then
            listing_only=1
        fi
    done
    if [[ "$listing_only" -eq 0 ]]; then
        need_qemu
    fi

    local py
    py="$(detect_python)"
    info "Running swarm configurator ..."
    "$py" "$SCRIPT_DIR/qemu_swarm.py" "$@"
}
|
||||
|
||||
# --- Command: snapshot ------------------------------------------------------
|
||||
# `snapshot` command: run the snapshot-based test script, forwarding every
# argument unchanged. -h/--help as the first argument prints usage instead.
cmd_snapshot() {
    case "${1:-}" in
        -h|--help)
            echo "Usage: qemu-cli.sh snapshot [args...]"
            echo "Run snapshot-based QEMU tests."
            return 0
            ;;
    esac

    need_qemu
    info "Running snapshot tests ..."
    bash "$SCRIPT_DIR/qemu-snapshot-test.sh" "$@"
}
|
||||
|
||||
# --- Command: chaos ---------------------------------------------------------
|
||||
# `chaos` command: run the chaos / fault-injection test script, forwarding
# every argument unchanged. -h/--help as the first argument prints usage.
cmd_chaos() {
    case "${1:-}" in
        -h|--help)
            echo "Usage: qemu-cli.sh chaos [args...]"
            echo "Run chaos / fault injection tests."
            return 0
            ;;
    esac

    need_qemu
    info "Running chaos tests ..."
    bash "$SCRIPT_DIR/qemu-chaos-test.sh" "$@"
}
|
||||
|
||||
# --- Command: fuzz ----------------------------------------------------------
|
||||
# `fuzz` command: build and run the fuzz targets through `make run_all` in
# $FUZZ_DIR.
#
# Options:
#   --duration N | --duration=N   seconds per target; overrides $FUZZ_DURATION
#                                 (default 30)
#   -h, --help                    (first argument only) print usage
#
# Unknown options produce a warning and are ignored. Dies with status 1 when
# --duration has no value, when the duration is not a whole number, or when
# clang is not installed.
cmd_fuzz() {
    local duration="${FUZZ_DURATION:-30}"
    if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
        echo "Usage: qemu-cli.sh fuzz [--duration N]"
        echo "Build and run all 3 fuzz targets (clang libFuzzer)."
        echo "Requires: clang with libFuzzer support."
        return 0
    fi

    while [[ $# -gt 0 ]]; do
        case "$1" in
            --duration)
                # Under `set -u` a bare "$2" would abort with an unhelpful
                # "unbound variable" message when the value is missing.
                if [[ $# -lt 2 ]]; then
                    die "--duration requires a value (seconds)"
                fi
                duration="$2"
                shift 2
                ;;
            --duration=*)
                duration="${1#--duration=}"
                shift
                ;;
            *)
                warn "Unknown fuzz option: $1"
                shift
                ;;
        esac
    done

    if ! [[ "$duration" =~ ^[0-9]+$ ]]; then
        die "Fuzz duration must be a whole number of seconds, got: $duration"
    fi

    if ! command -v clang >/dev/null 2>&1; then
        die "clang not found. Fuzz targets require clang with libFuzzer."
    fi

    info "Building and running fuzz targets (${duration}s each) ..."
    make -C "$FUZZ_DIR" run_all FUZZ_DURATION="$duration"
    ok "Fuzz testing complete."
}
|
||||
|
||||
# --- Command: nvs -----------------------------------------------------------
|
||||
# `nvs` command: run generate_nvs_matrix.py with the detected Python
# interpreter, forwarding every argument unchanged. -h/--help as the first
# argument prints usage instead.
cmd_nvs() {
    case "${1:-}" in
        -h|--help)
            echo "Usage: qemu-cli.sh nvs [--list] [args...]"
            echo "Generate NVS test configuration matrix."
            return 0
            ;;
    esac

    local py
    py="$(detect_python)"
    info "Running NVS matrix generator ..."
    "$py" "$SCRIPT_DIR/generate_nvs_matrix.py" "$@"
}
|
||||
|
||||
# --- Command: health --------------------------------------------------------
|
||||
# `health` command: run check_health.py on a QEMU output log.
#
# Arguments:
#   $1  path to the log file (required)
#
# Dies with status 1 when the argument is missing or is not a regular file.
cmd_health() {
    case "${1:-}" in
        -h|--help)
            echo "Usage: qemu-cli.sh health <logfile>"
            echo "Analyze firmware health from a QEMU output log."
            return 0
            ;;
        "")
            die "Usage: qemu-cli.sh health <logfile>"
            ;;
    esac

    local logfile="$1"
    [[ -f "$logfile" ]] || die "Log file not found: $logfile"

    local py
    py="$(detect_python)"
    info "Analyzing health from: $logfile"
    "$py" "$SCRIPT_DIR/check_health.py" --log "$logfile" --after-fault manual
}
|
||||
|
||||
# --- Command: status --------------------------------------------------------
|
||||
# `status` command: print a report of what is installed and built — QEMU,
# ESP-IDF ($IDF_PATH), Python, clang, the firmware binary and the swarm
# presets. Purely informational; missing components are reported as warnings.
cmd_status() {
    # Status should never fail — disable errexit locally
    set +e
    echo -e "${BOLD}=== QEMU ESP32-S3 Testing Status ===${RST}"
    echo ""

    # QEMU
    local qemu_bin
    qemu_bin="$(detect_qemu 2>/dev/null)"
    if [[ -n "$qemu_bin" ]]; then
        local qemu_ver
        qemu_ver="$("$qemu_bin" --version 2>/dev/null | head -1 || echo "unknown")"
        # A binary that exits 0 but prints nothing would leave "()" in the report.
        [[ -n "$qemu_ver" ]] || qemu_ver="unknown"
        ok "QEMU: ${GREEN}installed${RST} ($qemu_ver)"
        echo " Path: $qemu_bin"
    else
        warn "QEMU: ${YELLOW}not found${RST} (run: qemu-cli.sh install)"
    fi

    # ESP-IDF
    if [[ -n "${IDF_PATH:-}" ]] && [[ -d "$IDF_PATH" ]]; then
        ok "ESP-IDF: ${GREEN}available${RST} ($IDF_PATH)"
    else
        warn "ESP-IDF: ${YELLOW}IDF_PATH not set${RST}"
    fi

    # Python
    local py
    py="$(detect_python)"
    if command -v "$py" >/dev/null 2>&1; then
        ok "Python: ${GREEN}$("$py" --version 2>&1)${RST}"
    else
        warn "Python: ${YELLOW}not found${RST}"
    fi

    # Clang (for fuzz)
    if command -v clang >/dev/null 2>&1; then
        ok "Clang: ${GREEN}$(clang --version 2>/dev/null | head -1)${RST}"
    else
        warn "Clang: ${YELLOW}not found${RST} (needed for fuzz targets only)"
    fi

    # Firmware binary (GNU stat first, then BSD stat, then give up on the size)
    local fw_bin="$FIRMWARE_DIR/build/esp32-csi-node.bin"
    if [[ -f "$fw_bin" ]]; then
        local fw_size
        fw_size="$(stat -c%s "$fw_bin" 2>/dev/null || stat -f%z "$fw_bin" 2>/dev/null || echo "?")"
        ok "Firmware: ${GREEN}built${RST} ($fw_bin, ${fw_size} bytes)"
    else
        warn "Firmware: ${YELLOW}not built${RST} (expected at $fw_bin)"
    fi

    # Swarm presets: list entry names with a .yaml/.json suffix removed,
    # de-duplicated and joined with ", ". Suffixes are stripped with parameter
    # expansion rather than sed's `\|`, which is a GNU extension in basic
    # regular expressions and not portable to BSD sed.
    local preset_dir="$SCRIPT_DIR/swarm_presets"
    if [[ -d "$preset_dir" ]]; then
        local preset_path preset_name presets=""
        while IFS= read -r preset_name; do
            presets+="${presets:+, }${preset_name}"
        done < <(
            for preset_path in "$preset_dir"/*; do
                # An unmatched glob stays literal; skip it.
                [[ -e "$preset_path" ]] || continue
                preset_name="${preset_path##*/}"
                preset_name="${preset_name%.yaml}"
                preset_name="${preset_name%.json}"
                printf '%s\n' "$preset_name"
            done | sort -u
        )
        if [[ -n "$presets" ]]; then
            ok "Presets: ${GREEN}${presets}${RST}"
        else
            warn "Presets: ${YELLOW}none found${RST} in $preset_dir"
        fi
    fi

    echo ""
    set -e
}
|
||||
|
||||
# --- Completions output -----------------------------------------------------
|
||||
print_completions() {
|
||||
cat <<'COMP'
|
||||
_qemu_cli_completions() {
|
||||
local cmds="install test mesh swarm snapshot chaos fuzz nvs health status help"
|
||||
local cur="${COMP_WORDS[COMP_CWORD]}"
|
||||
if [[ $COMP_CWORD -eq 1 ]]; then
|
||||
COMPREPLY=( $(compgen -W "$cmds" -- "$cur") )
|
||||
fi
|
||||
}
|
||||
complete -F _qemu_cli_completions qemu-cli.sh
|
||||
COMP
|
||||
}
|
||||
|
||||
# --- Main dispatch ----------------------------------------------------------
|
||||
# Entry point: the first argument selects the sub-command (default: help);
# all remaining arguments are handed to that command's cmd_<name> function.
# An unrecognized command prints an error plus the help text and exits 1.
main() {
    local cmd="${1:-help}"
    if [[ $# -gt 0 ]]; then
        shift
    fi

    case "$cmd" in
        install|test|mesh|swarm|snapshot|chaos|fuzz|nvs|health|status)
            # Every sub-command is implemented by a function named cmd_<name>.
            "cmd_${cmd}" "$@"
            ;;
        help|-h|--help)
            cmd_help
            ;;
        --version)
            echo "qemu-cli.sh v${VERSION}"
            ;;
        --completions)
            print_completions
            ;;
        *)
            err "Unknown command: ${BOLD}${cmd}${RST}"
            echo ""
            cmd_help
            exit 1
            ;;
    esac
}
|
||||
|
||||
main "$@"
|
||||
|
|
@ -12,10 +12,44 @@
|
|||
# NVS_BIN - Path to a pre-built NVS binary to inject (optional)
|
||||
#
|
||||
# Exit codes:
|
||||
# 0 All checks passed
|
||||
# 1 Warnings (non-critical checks failed)
|
||||
# 2 Errors (critical checks failed)
|
||||
# 3 Fatal (crash detected or build failure)
|
||||
# 0 PASS — all checks passed
|
||||
# 1 WARN — non-critical checks failed
|
||||
# 2 FAIL — critical checks failed
|
||||
# 3 FATAL — build error, crash, or infrastructure failure
|
||||
|
||||
# ── Help ──────────────────────────────────────────────────────────────
|
||||
usage() {
|
||||
cat <<'HELP'
|
||||
Usage: qemu-esp32s3-test.sh [OPTIONS]
|
||||
|
||||
Build ESP32-S3 firmware with mock CSI, merge binaries into a single flash
|
||||
image, run under QEMU with a timeout, and validate the UART output.
|
||||
|
||||
Options:
|
||||
-h, --help Show this help message and exit
|
||||
|
||||
Environment variables:
|
||||
QEMU_PATH Path to qemu-system-xtensa (default: qemu-system-xtensa)
|
||||
QEMU_TIMEOUT Timeout in seconds (default: 60)
|
||||
SKIP_BUILD Set to "1" to skip idf.py build (default: unset)
|
||||
NVS_BIN Path to pre-built NVS binary (optional)
|
||||
QEMU_NET Set to "0" to disable networking (default: 1)
|
||||
|
||||
Examples:
|
||||
./qemu-esp32s3-test.sh
|
||||
SKIP_BUILD=1 ./qemu-esp32s3-test.sh
|
||||
QEMU_PATH=/opt/qemu/bin/qemu-system-xtensa QEMU_TIMEOUT=120 ./qemu-esp32s3-test.sh
|
||||
|
||||
Exit codes:
|
||||
0 PASS — all checks passed
|
||||
1 WARN — non-critical checks failed
|
||||
2 FAIL — critical checks failed
|
||||
3 FATAL — build error, crash, or infrastructure failure
|
||||
HELP
|
||||
exit 0
|
||||
}
|
||||
|
||||
case "${1:-}" in -h|--help) usage ;; esac
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
|
|
@ -35,10 +69,33 @@ echo "QEMU binary: $QEMU_BIN"
|
|||
echo "Timeout: ${TIMEOUT_SEC}s"
|
||||
echo ""
|
||||
|
||||
# Verify QEMU is available
|
||||
# ── Prerequisite checks ───────────────────────────────────────────────
|
||||
if ! command -v "$QEMU_BIN" &>/dev/null; then
|
||||
echo "ERROR: QEMU binary not found: $QEMU_BIN"
|
||||
echo "Set QEMU_PATH to the qemu-system-xtensa binary."
|
||||
echo " Install: sudo apt install qemu-system-misc # Debian/Ubuntu"
|
||||
echo " Install: brew install qemu # macOS"
|
||||
echo " Or set QEMU_PATH to the qemu-system-xtensa binary."
|
||||
exit 3
|
||||
fi
|
||||
|
||||
if ! command -v python3 &>/dev/null; then
|
||||
echo "ERROR: python3 not found."
|
||||
echo " Install: sudo apt install python3 # Debian/Ubuntu"
|
||||
echo " Install: brew install python # macOS"
|
||||
exit 3
|
||||
fi
|
||||
|
||||
if ! python3 -m esptool version &>/dev/null 2>&1; then
|
||||
echo "ERROR: esptool not found (needed to merge flash binaries)."
|
||||
echo " Install: pip install esptool"
|
||||
exit 3
|
||||
fi
|
||||
|
||||
# ── SKIP_BUILD precheck ──────────────────────────────────────────────
|
||||
if [ "${SKIP_BUILD:-}" = "1" ] && [ ! -f "$BUILD_DIR/esp32-csi-node.bin" ]; then
|
||||
echo "ERROR: SKIP_BUILD=1 but flash image not found: $BUILD_DIR/esp32-csi-node.bin"
|
||||
echo "Build the firmware first: ./qemu-esp32s3-test.sh (without SKIP_BUILD)"
|
||||
echo "Or unset SKIP_BUILD to build automatically."
|
||||
exit 3
|
||||
fi
|
||||
|
||||
|
|
@ -111,21 +168,26 @@ if ! command -v timeout &>/dev/null; then
|
|||
fi
|
||||
|
||||
QEMU_EXIT=0
|
||||
|
||||
# Common QEMU arguments
|
||||
QEMU_ARGS=(
|
||||
-machine esp32s3
|
||||
-nographic
|
||||
-drive "file=$FLASH_IMAGE,if=mtd,format=raw"
|
||||
-serial mon:stdio
|
||||
-no-reboot
|
||||
)
|
||||
|
||||
# Enable SLIRP user-mode networking for UDP if available
|
||||
if [ "${QEMU_NET:-1}" != "0" ]; then
|
||||
QEMU_ARGS+=(-nic "user,model=open_eth,net=10.0.2.0/24,host=10.0.2.2")
|
||||
fi
|
||||
|
||||
if [ -n "$TIMEOUT_CMD" ]; then
|
||||
$TIMEOUT_CMD "$TIMEOUT_SEC" "$QEMU_BIN" \
|
||||
-machine esp32s3 \
|
||||
-nographic \
|
||||
-drive file="$FLASH_IMAGE",if=mtd,format=raw \
|
||||
-serial mon:stdio \
|
||||
-no-reboot \
|
||||
$TIMEOUT_CMD "$TIMEOUT_SEC" "$QEMU_BIN" "${QEMU_ARGS[@]}" \
|
||||
2>&1 | tee "$LOG_FILE" || QEMU_EXIT=$?
|
||||
else
|
||||
"$QEMU_BIN" \
|
||||
-machine esp32s3 \
|
||||
-nographic \
|
||||
-drive file="$FLASH_IMAGE",if=mtd,format=raw \
|
||||
-serial mon:stdio \
|
||||
-no-reboot \
|
||||
"$QEMU_BIN" "${QEMU_ARGS[@]}" \
|
||||
2>&1 | tee "$LOG_FILE" || QEMU_EXIT=$?
|
||||
fi
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,414 @@
|
|||
#!/bin/bash
|
||||
# QEMU ESP32-S3 Multi-Node Mesh Simulation (ADR-061 Layer 3)
|
||||
#
|
||||
# Spawns N ESP32-S3 QEMU instances connected via a Linux bridge, each with
|
||||
# unique NVS provisioning (node ID, TDM slot), and a Rust aggregator that
|
||||
# collects frames from all nodes. After a configurable timeout the script
|
||||
# tears everything down and runs validate_mesh_test.py.
|
||||
#
|
||||
# Usage:
|
||||
# sudo ./qemu-mesh-test.sh [N_NODES]
|
||||
#
|
||||
# Environment variables:
|
||||
# QEMU_PATH - Path to qemu-system-xtensa (default: qemu-system-xtensa)
|
||||
# QEMU_TIMEOUT - Timeout in seconds (default: 45)
|
||||
# MESH_TIMEOUT - Deprecated alias for QEMU_TIMEOUT
|
||||
# SKIP_BUILD - Set to "1" to skip the idf.py build step
|
||||
# BRIDGE_NAME - Bridge interface name (default: qemu-br0)
|
||||
# BRIDGE_SUBNET - Bridge IP/mask (default: 10.0.0.1/24)
|
||||
# AGGREGATOR_PORT - UDP port the aggregator listens on (default: 5005)
|
||||
#
|
||||
# Prerequisites:
|
||||
# - Linux with bridge-utils and iproute2
|
||||
# - QEMU with ESP32-S3 machine support (qemu-system-xtensa)
|
||||
# - provision.py capable of --dry-run NVS generation
|
||||
# - Rust workspace with wifi-densepose-hardware crate (aggregator binary)
|
||||
#
|
||||
# Exit codes:
|
||||
# 0 PASS — all checks passed
|
||||
# 1 WARN — non-critical checks failed
|
||||
# 2 FAIL — critical checks failed
|
||||
# 3 FATAL — build error, crash, or infrastructure failure
|
||||
|
||||
# ── Help ──────────────────────────────────────────────────────────────
|
||||
usage() {
|
||||
cat <<'HELP'
|
||||
Usage: sudo ./qemu-mesh-test.sh [OPTIONS] [N_NODES]
|
||||
|
||||
Spawn N ESP32-S3 QEMU instances connected via a Linux bridge, each with
|
||||
unique NVS provisioning (node ID, TDM slot), and a Rust aggregator that
|
||||
collects frames from all nodes.
|
||||
|
||||
NOTE: Requires root/sudo for TAP/bridge creation.
|
||||
|
||||
Options:
|
||||
-h, --help Show this help message and exit
|
||||
|
||||
Positional:
|
||||
N_NODES Number of mesh nodes (default: 3, minimum: 2)
|
||||
|
||||
Environment variables:
|
||||
QEMU_PATH Path to qemu-system-xtensa (default: qemu-system-xtensa)
|
||||
QEMU_TIMEOUT Timeout in seconds (default: 45)
|
||||
MESH_TIMEOUT Alias for QEMU_TIMEOUT (deprecated)(default: 45)
|
||||
SKIP_BUILD Set to "1" to skip idf.py build (default: unset)
|
||||
BRIDGE_NAME Bridge interface name (default: qemu-br0)
|
||||
BRIDGE_SUBNET Bridge IP/mask (default: 10.0.0.1/24)
|
||||
AGGREGATOR_PORT UDP port for aggregator (default: 5005)
|
||||
|
||||
Examples:
|
||||
sudo ./qemu-mesh-test.sh
|
||||
sudo QEMU_TIMEOUT=90 ./qemu-mesh-test.sh 5
|
||||
sudo SKIP_BUILD=1 ./qemu-mesh-test.sh 4
|
||||
|
||||
Exit codes:
|
||||
0 PASS — all checks passed
|
||||
1 WARN — non-critical checks failed
|
||||
2 FAIL — critical checks failed
|
||||
3 FATAL — build error, crash, or infrastructure failure
|
||||
HELP
|
||||
exit 0
|
||||
}
|
||||
|
||||
case "${1:-}" in -h|--help) usage ;; esac
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Paths
|
||||
# ---------------------------------------------------------------------------
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
|
||||
FIRMWARE_DIR="$PROJECT_ROOT/firmware/esp32-csi-node"
|
||||
BUILD_DIR="$FIRMWARE_DIR/build"
|
||||
RUST_DIR="$PROJECT_ROOT/rust-port/wifi-densepose-rs"
|
||||
PROVISION_SCRIPT="$FIRMWARE_DIR/provision.py"
|
||||
VALIDATE_SCRIPT="$SCRIPT_DIR/validate_mesh_test.py"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Configuration
|
||||
# ---------------------------------------------------------------------------
|
||||
N_NODES="${1:-3}"
|
||||
QEMU_BIN="${QEMU_PATH:-qemu-system-xtensa}"
|
||||
TIMEOUT="${QEMU_TIMEOUT:-${MESH_TIMEOUT:-45}}"
|
||||
BRIDGE="${BRIDGE_NAME:-qemu-br0}"
|
||||
BRIDGE_IP="${BRIDGE_SUBNET:-10.0.0.1/24}"
|
||||
AGG_PORT="${AGGREGATOR_PORT:-5005}"
|
||||
RESULTS_FILE="$BUILD_DIR/mesh_test_results.json"
|
||||
|
||||
echo "=== QEMU Multi-Node Mesh Test (ADR-061 Layer 3) ==="
|
||||
echo "Nodes: $N_NODES"
|
||||
echo "Bridge: $BRIDGE ($BRIDGE_IP)"
|
||||
echo "Aggregator: 0.0.0.0:$AGG_PORT"
|
||||
echo "QEMU binary: $QEMU_BIN"
|
||||
echo "Timeout: ${TIMEOUT}s"
|
||||
echo ""
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Preflight checks
|
||||
# ---------------------------------------------------------------------------
|
||||
# N_NODES comes straight from the command line. Reject non-numeric input
# explicitly: `[ x -lt 2 ]` on a non-integer only prints an error and
# evaluates false, which would let an invalid value slip past the check below.
if ! [[ "$N_NODES" =~ ^[0-9]+$ ]]; then
    echo "ERROR: N_NODES must be a whole number (got '$N_NODES')"
    exit 3
fi
# Force base 10 so a value such as "08" is not parsed as invalid octal by the
# $(( )) arithmetic used later in the script.
N_NODES=$((10#$N_NODES))

if [ "$N_NODES" -lt 2 ]; then
    echo "ERROR: Need at least 2 nodes for mesh simulation (got $N_NODES)"
    exit 3
fi
|
||||
|
||||
if ! command -v "$QEMU_BIN" &>/dev/null; then
|
||||
echo "ERROR: QEMU binary not found: $QEMU_BIN"
|
||||
echo " Install: sudo apt install qemu-system-misc # Debian/Ubuntu"
|
||||
echo " Install: brew install qemu # macOS"
|
||||
echo " Or set QEMU_PATH to the qemu-system-xtensa binary."
|
||||
exit 3
|
||||
fi
|
||||
|
||||
if ! command -v python3 &>/dev/null; then
|
||||
echo "ERROR: python3 not found."
|
||||
echo " Install: sudo apt install python3 # Debian/Ubuntu"
|
||||
echo " Install: brew install python # macOS"
|
||||
exit 3
|
||||
fi
|
||||
|
||||
if ! command -v ip &>/dev/null; then
|
||||
echo "ERROR: 'ip' command not found."
|
||||
echo " Install: sudo apt install iproute2 # Debian/Ubuntu"
|
||||
exit 3
|
||||
fi
|
||||
|
||||
if ! command -v brctl &>/dev/null && ! ip link help bridge &>/dev/null 2>&1; then
|
||||
echo "WARNING: bridge-utils not found; will use 'ip link' for bridge creation."
|
||||
fi
|
||||
|
||||
# socat is optional: when it is missing, print an install hint and carry on.
if ! command -v socat &>/dev/null; then
    echo "NOTE: socat not found (optional, used for advanced monitor communication)."
    echo " Install: sudo apt install socat # Debian/Ubuntu"
    echo " Install: brew install socat # macOS"
fi
|
||||
|
||||
if ! command -v cargo &>/dev/null; then
|
||||
echo "ERROR: cargo not found (needed to build the Rust aggregator)."
|
||||
echo " Install: curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh"
|
||||
exit 3
|
||||
fi
|
||||
|
||||
if [ "$(id -u)" -ne 0 ]; then
|
||||
echo "ERROR: This script must be run as root (for TAP/bridge creation)."
|
||||
echo "Usage: sudo $0 [N_NODES]"
|
||||
exit 3
|
||||
fi
|
||||
|
||||
mkdir -p "$BUILD_DIR"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cleanup trap — runs on EXIT regardless of success/failure
|
||||
# ---------------------------------------------------------------------------
|
||||
QEMU_PIDS=()
|
||||
AGG_PID=""
|
||||
|
||||
# EXIT-trap handler: stop every QEMU instance recorded in QEMU_PIDS and the
# aggregator (AGG_PID), then remove the tap0..tap<N_NODES-1> interfaces and
# the bridge. Every step is best-effort; failures are ignored so the remaining
# teardown still runs.
cleanup() {
    echo ""
    echo "--- Cleaning up ---"

    # Kill QEMU instances that are still alive and reap them.
    local qemu_pid
    for qemu_pid in "${QEMU_PIDS[@]}"; do
        kill -0 "$qemu_pid" 2>/dev/null || continue
        kill "$qemu_pid" 2>/dev/null || true
        wait "$qemu_pid" 2>/dev/null || true
    done

    # Kill aggregator
    if [ -n "$AGG_PID" ]; then
        if kill -0 "$AGG_PID" 2>/dev/null; then
            kill "$AGG_PID" 2>/dev/null || true
            wait "$AGG_PID" 2>/dev/null || true
        fi
    fi

    # Tear down TAP interfaces and bridge
    local tap idx
    for ((idx = 0; idx < N_NODES; idx++)); do
        tap="tap${idx}"
        ip link show "$tap" &>/dev/null || continue
        ip link set "$tap" down 2>/dev/null || true
        ip link delete "$tap" 2>/dev/null || true
    done

    if ip link show "$BRIDGE" &>/dev/null; then
        ip link set "$BRIDGE" down 2>/dev/null || true
        ip link delete "$BRIDGE" type bridge 2>/dev/null || true
    fi

    echo "Cleanup complete."
}
|
||||
|
||||
trap cleanup EXIT
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 1. Build flash image (if not already built)
|
||||
# ---------------------------------------------------------------------------
|
||||
if [ "${SKIP_BUILD:-}" != "1" ]; then
|
||||
echo "[1/6] Building firmware (mock CSI + QEMU overlay)..."
|
||||
idf.py -C "$FIRMWARE_DIR" \
|
||||
-D SDKCONFIG_DEFAULTS="sdkconfig.defaults;sdkconfig.qemu" \
|
||||
build
|
||||
echo ""
|
||||
else
|
||||
echo "[1/6] Skipping build (SKIP_BUILD=1)"
|
||||
echo ""
|
||||
fi
|
||||
|
||||
# Verify build artifacts
|
||||
FLASH_IMAGE_BASE="$BUILD_DIR/qemu_flash_base.bin"
|
||||
for artifact in \
|
||||
"$BUILD_DIR/bootloader/bootloader.bin" \
|
||||
"$BUILD_DIR/partition_table/partition-table.bin" \
|
||||
"$BUILD_DIR/esp32-csi-node.bin"; do
|
||||
if [ ! -f "$artifact" ]; then
|
||||
echo "ERROR: Build artifact not found: $artifact"
|
||||
echo "Run without SKIP_BUILD=1 or build the firmware first."
|
||||
exit 3
|
||||
fi
|
||||
done
|
||||
|
||||
# Merge into base flash image
|
||||
echo "[2/6] Creating base flash image..."
|
||||
# Optional OTA data blob at 0xf000. The offset/path pair is kept in an array so
# a BUILD_DIR containing spaces or glob characters still reaches esptool as a
# single argument (an unquoted string would be word-split).
OTA_DATA_ARGS=()
if [ -f "$BUILD_DIR/ota_data_initial.bin" ]; then
    OTA_DATA_ARGS=(0xf000 "$BUILD_DIR/ota_data_initial.bin")
fi

# The ${arr[@]+...} form expands to nothing for an empty array without
# tripping `set -u` on bash versions older than 4.4.
python3 -m esptool --chip esp32s3 merge_bin -o "$FLASH_IMAGE_BASE" \
    --flash_mode dio --flash_freq 80m --flash_size 8MB \
    0x0 "$BUILD_DIR/bootloader/bootloader.bin" \
    0x8000 "$BUILD_DIR/partition_table/partition-table.bin" \
    ${OTA_DATA_ARGS[@]+"${OTA_DATA_ARGS[@]}"} \
    0x20000 "$BUILD_DIR/esp32-csi-node.bin"
|
||||
|
||||
echo "Base flash image: $FLASH_IMAGE_BASE ($(stat -c%s "$FLASH_IMAGE_BASE" 2>/dev/null || stat -f%z "$FLASH_IMAGE_BASE") bytes)"
|
||||
echo ""
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 3. Generate per-node NVS and flash images
|
||||
# ---------------------------------------------------------------------------
|
||||
echo "[3/6] Generating per-node NVS images..."
|
||||
|
||||
# The aggregator listens on the bridge's own address: strip the /mask suffix from BRIDGE_IP
|
||||
AGG_IP="${BRIDGE_IP%%/*}"
|
||||
|
||||
# For each node: generate its NVS image with provision.py, then build the
# node's flash image by copying the base image and overwriting the NVS region.
for i in $(seq 0 $((N_NODES - 1))); do
    NVS_BIN="$BUILD_DIR/nvs_node${i}.bin"
    NODE_FLASH="$BUILD_DIR/qemu_flash_node${i}.bin"

    # Remove any leftover output first, so the existence check below can only
    # succeed on a file produced by this invocation.
    rm -f "nvs_provision.bin"

    # Generate NVS with provision.py --dry-run
    # --port is required by argparse but unused in dry-run; pass a dummy
    python3 "$PROVISION_SCRIPT" \
        --port /dev/null \
        --dry-run \
        --node-id "$i" \
        --tdm-slot "$i" \
        --tdm-total "$N_NODES" \
        --target-ip "$AGG_IP" \
        --target-port "$AGG_PORT"

    # provision.py --dry-run writes to nvs_provision.bin in CWD
    if [ -f "nvs_provision.bin" ]; then
        mv "nvs_provision.bin" "$NVS_BIN"
    else
        echo "ERROR: provision.py did not produce nvs_provision.bin for node $i"
        exit 3
    fi

    # Copy base image and inject NVS at 0x9000. 0x9000 is a multiple of 4096,
    # so seeking in 4 KiB blocks lands on the same offset as bs=1 would while
    # copying a block at a time instead of one byte per write.
    cp "$FLASH_IMAGE_BASE" "$NODE_FLASH"
    dd if="$NVS_BIN" of="$NODE_FLASH" \
        bs=4096 seek=$((0x9000 / 4096)) conv=notrunc 2>/dev/null

    echo " Node $i: flash=$NODE_FLASH nvs=$NVS_BIN (TDM slot $i/$N_NODES)"
done
|
||||
echo ""
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 4. Create bridge and TAP interfaces
|
||||
# ---------------------------------------------------------------------------
|
||||
echo "[4/6] Setting up network bridge and TAP interfaces..."
|
||||
|
||||
# Create bridge
|
||||
ip link add name "$BRIDGE" type bridge 2>/dev/null || true
|
||||
ip addr add "$BRIDGE_IP" dev "$BRIDGE" 2>/dev/null || true
|
||||
ip link set "$BRIDGE" up
|
||||
|
||||
# Create TAP interfaces and attach to bridge
|
||||
for i in $(seq 0 $((N_NODES - 1))); do
|
||||
TAP="tap${i}"
|
||||
ip tuntap add dev "$TAP" mode tap 2>/dev/null || true
|
||||
ip link set "$TAP" master "$BRIDGE"
|
||||
ip link set "$TAP" up
|
||||
echo " $TAP -> $BRIDGE"
|
||||
done
|
||||
echo ""
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 5. Start aggregator and QEMU instances
|
||||
# ---------------------------------------------------------------------------
|
||||
echo "[5/6] Starting aggregator and $N_NODES QEMU nodes..."
|
||||
|
||||
# Start Rust aggregator in background
|
||||
echo " Starting aggregator: listen=0.0.0.0:$AGG_PORT expect-nodes=$N_NODES"
|
||||
cargo run --manifest-path "$RUST_DIR/Cargo.toml" \
|
||||
-p wifi-densepose-hardware --bin aggregator -- \
|
||||
--listen "0.0.0.0:$AGG_PORT" \
|
||||
--expect-nodes "$N_NODES" \
|
||||
--output "$RESULTS_FILE" \
|
||||
> "$BUILD_DIR/aggregator.log" 2>&1 &
|
||||
AGG_PID=$!
|
||||
echo " Aggregator PID: $AGG_PID"
|
||||
|
||||
# Give aggregator a moment to bind
|
||||
sleep 1
|
||||
|
||||
if ! kill -0 "$AGG_PID" 2>/dev/null; then
|
||||
echo "ERROR: Aggregator failed to start. Check $BUILD_DIR/aggregator.log"
|
||||
cat "$BUILD_DIR/aggregator.log" 2>/dev/null || true
|
||||
exit 3
|
||||
fi
|
||||
|
||||
# Launch QEMU instances
|
||||
for i in $(seq 0 $((N_NODES - 1))); do
|
||||
TAP="tap${i}"
|
||||
NODE_FLASH="$BUILD_DIR/qemu_flash_node${i}.bin"
|
||||
NODE_LOG="$BUILD_DIR/qemu_node${i}.log"
|
||||
NODE_MAC=$(printf "52:54:00:00:00:%02x" "$i")
|
||||
|
||||
echo " Starting QEMU node $i (tap=$TAP, mac=$NODE_MAC)..."
|
||||
|
||||
"$QEMU_BIN" \
|
||||
-machine esp32s3 \
|
||||
-nographic \
|
||||
-drive "file=$NODE_FLASH,if=mtd,format=raw" \
|
||||
-serial "file:$NODE_LOG" \
|
||||
-no-reboot \
|
||||
-nic "tap,ifname=$TAP,script=no,downscript=no,mac=$NODE_MAC" \
|
||||
> /dev/null 2>&1 &
|
||||
|
||||
QEMU_PIDS+=($!)
|
||||
echo " PID: ${QEMU_PIDS[-1]}, log: $NODE_LOG"
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "All nodes launched. Waiting ${TIMEOUT}s for mesh simulation..."
|
||||
echo ""
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Wait for timeout
|
||||
# ---------------------------------------------------------------------------
|
||||
sleep "$TIMEOUT"
|
||||
|
||||
echo "Timeout reached. Stopping all processes..."
|
||||
|
||||
# Kill QEMU instances (aggregator killed in cleanup)
|
||||
for pid in "${QEMU_PIDS[@]}"; do
|
||||
if kill -0 "$pid" 2>/dev/null; then
|
||||
kill "$pid" 2>/dev/null || true
|
||||
fi
|
||||
done
|
||||
|
||||
# Give aggregator a moment to flush results
|
||||
sleep 2
|
||||
|
||||
# Kill aggregator
|
||||
if [ -n "$AGG_PID" ] && kill -0 "$AGG_PID" 2>/dev/null; then
|
||||
kill "$AGG_PID" 2>/dev/null || true
|
||||
wait "$AGG_PID" 2>/dev/null || true
|
||||
fi
|
||||
|
||||
echo ""
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 6. Validate results
|
||||
# ---------------------------------------------------------------------------
|
||||
echo "[6/6] Validating mesh test results..."
|
||||
|
||||
VALIDATE_ARGS=("--nodes" "$N_NODES")
|
||||
|
||||
# Pass results file if it was produced
|
||||
if [ -f "$RESULTS_FILE" ]; then
|
||||
VALIDATE_ARGS+=("--results" "$RESULTS_FILE")
|
||||
else
|
||||
echo "WARNING: Aggregator results file not found: $RESULTS_FILE"
|
||||
echo "Validation will rely on node logs only."
|
||||
fi
|
||||
|
||||
# Pass node log files
|
||||
for i in $(seq 0 $((N_NODES - 1))); do
|
||||
NODE_LOG="$BUILD_DIR/qemu_node${i}.log"
|
||||
if [ -f "$NODE_LOG" ]; then
|
||||
VALIDATE_ARGS+=("--log" "$NODE_LOG")
|
||||
fi
|
||||
done
|
||||
|
||||
# Capture the validator's status explicitly. Under `set -e` a bare failing
# command would abort the script on that line, so the assignment from $? and
# the summary line below would never run for a non-zero result.
VALIDATE_EXIT=0
python3 "$VALIDATE_SCRIPT" "${VALIDATE_ARGS[@]}" || VALIDATE_EXIT=$?

echo ""
echo "=== Mesh Test Complete (exit code: $VALIDATE_EXIT) ==="
exit "$VALIDATE_EXIT"
|
||||
|
|
@ -0,0 +1,373 @@
|
|||
#!/bin/bash
|
||||
# QEMU Snapshot-Based Test Runner — ADR-061 Layer 8
|
||||
#
|
||||
# Uses QEMU VM snapshots to accelerate repeated test runs.
|
||||
# Instead of rebooting and re-initializing for each test scenario,
|
||||
# we snapshot the VM state after boot and after the first CSI frame,
|
||||
# then restore from the snapshot for each individual test.
|
||||
#
|
||||
# This dramatically reduces per-test wall time from ~15s (full boot)
|
||||
# to ~2s (snapshot restore + execution).
|
||||
#
|
||||
# Environment variables:
|
||||
# QEMU_PATH - Path to qemu-system-xtensa (default: qemu-system-xtensa)
|
||||
# QEMU_TIMEOUT - Per-test timeout in seconds (default: 10)
|
||||
# FLASH_IMAGE - Path to merged flash image (default: build/qemu_flash.bin)
|
||||
# SKIP_SNAPSHOT - Set to "1" to run without snapshots (baseline timing)
|
||||
#
|
||||
# Exit codes:
|
||||
# 0 PASS — all checks passed
|
||||
# 1 WARN — non-critical checks failed
|
||||
# 2 FAIL — critical checks failed
|
||||
# 3 FATAL — build error, crash, or infrastructure failure
|
||||
|
||||
# ── Help ──────────────────────────────────────────────────────────────
|
||||
# Print the command-line help text and terminate the script with status 0.
# The heredoc delimiter is quoted, so the text is emitted verbatim (no expansion).
usage() {
    cat <<'HELP'
Usage: qemu-snapshot-test.sh [OPTIONS]

Use QEMU VM snapshots to accelerate repeated test runs. Snapshots the VM
state after boot and after the first CSI frame, then restores from the
snapshot for each individual test (~2s vs ~15s per test).

Options:
  -h, --help      Show this help message and exit

Environment variables:
  QEMU_PATH       Path to qemu-system-xtensa (default: qemu-system-xtensa)
  QEMU_TIMEOUT    Per-test timeout in seconds (default: 10)
  FLASH_IMAGE     Path to merged flash image (default: build/qemu_flash.bin)
  SKIP_SNAPSHOT   Set to "1" to run without snapshots (baseline timing)

Examples:
  ./qemu-snapshot-test.sh
  QEMU_TIMEOUT=20 ./qemu-snapshot-test.sh
  FLASH_IMAGE=/path/to/image.bin ./qemu-snapshot-test.sh

Exit codes:
  0  PASS  — all checks passed
  1  WARN  — non-critical checks failed
  2  FAIL  — critical checks failed
  3  FATAL — build error, crash, or infrastructure failure
HELP
    exit 0
}
|
||||
|
||||
# Handle -h/--help before strict mode is switched on.
case "${1:-}" in -h|--help) usage ;; esac

set -euo pipefail

# Resolve paths relative to this script so it can be invoked from any directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

FIRMWARE_DIR="$PROJECT_ROOT/firmware/esp32-csi-node"
BUILD_DIR="$FIRMWARE_DIR/build"

# Environment overrides with their defaults.
# NOTE(review): SKIP_SNAPSHOT is listed in the header and --help text but is
# never read anywhere in this script.
QEMU_BIN="${QEMU_PATH:-qemu-system-xtensa}"
FLASH_IMAGE="${FLASH_IMAGE:-$BUILD_DIR/qemu_flash.bin}"
TIMEOUT_SEC="${QEMU_TIMEOUT:-10}"

MONITOR_SOCK="$BUILD_DIR/qemu-monitor.sock"
LOG_DIR="$BUILD_DIR/snapshot-tests"
QEMU_PID=""   # set by start_qemu; empty means "no QEMU launched yet"

# Timing accumulators
SNAPSHOT_TOTAL_MS=0
BASELINE_TOTAL_MS=0

# Track test results: array of "test_name:exit_code"
declare -a TEST_RESULTS=()
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Cleanup
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Stop the background QEMU process (if one is still alive) and remove the
# monitor socket. Runs from the EXIT trap, so it executes exactly once on any
# termination path; the script's exit status is left untouched.
cleanup() {
    echo ""
    echo "[cleanup] Shutting down QEMU and removing socket..."
    if [ -n "$QEMU_PID" ] && kill -0 "$QEMU_PID" 2>/dev/null; then
        kill "$QEMU_PID" 2>/dev/null || true
        # Reap the child so no zombie is left behind.
        wait "$QEMU_PID" 2>/dev/null || true
    fi
    rm -f "$MONITOR_SOCK"
    echo "[cleanup] Done."
}

trap cleanup EXIT
# A signal handler that merely returns would let the script resume after
# Ctrl-C with QEMU already killed. Convert signals into a real exit instead;
# the EXIT trap above then performs the cleanup (128 + signal number).
trap 'exit 130' INT
trap 'exit 143' TERM
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Helpers
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Print the current wall-clock time in milliseconds since the epoch.
# Prefers `date +%s%N` (GNU), then perl's Time::HiRes, then whole seconds.
now_ms() {
    local ns
    # `|| ns=""` keeps a failing `date` from aborting the script under `set -e`.
    ns=$(date +%s%N 2>/dev/null) || ns=""
    # A nanosecond timestamp has well over 16 digits. A `date` without %N
    # support yields either a non-numeric string or bare seconds, and both
    # must fall through to the fallbacks rather than be divided as nanoseconds.
    if [[ "$ns" =~ ^[0-9]{16,}$ ]]; then
        echo $(( ns / 1000000 ))
    else
        perl -MTime::HiRes=time -e 'printf "%d\n", time()*1000' 2>/dev/null || \
            echo $(( $(date +%s) * 1000 ))
    fi
}
|
||||
|
||||
# Send one command line to the QEMU monitor socket and print the reply.
#   $1  monitor command text
#   $2  socat connect timeout in seconds (default: 5)
# Returns 1 if socat is not installed; otherwise the pipeline's status.
monitor_cmd() {
    local cmd="$1"
    local timeout="${2:-5}"
    if ! command -v socat &>/dev/null; then
        echo "ERROR: socat not found (required for QEMU monitor)" >&2
        return 1
    fi
    # printf avoids echo's option/escape handling for arbitrary command text.
    printf '%s\n' "$cmd" | socat - "UNIX-CONNECT:$MONITOR_SOCK,connect-timeout=$timeout" 2>/dev/null
}
|
||||
|
||||
# Poll a log file once per second until a grep (basic regex) pattern appears.
#   $1  log file path (may not exist yet)
#   $2  grep pattern
#   $3  maximum number of one-second polls
# Returns 0 as soon as the pattern is found, 1 if the poll budget runs out.
wait_for_pattern() {
    local log_file="$1"
    local pattern="$2"
    local timeout="$3"
    local elapsed=0
    while [ "$elapsed" -lt "$timeout" ]; do
        # `--` stops option parsing so a pattern starting with '-' is safe.
        if [ -f "$log_file" ] && grep -q -- "$pattern" "$log_file" 2>/dev/null; then
            return 0
        fi
        sleep 1
        elapsed=$((elapsed + 1))
    done
    return 1
}
|
||||
|
||||
# Launch QEMU in the background with a UNIX monitor socket and record its PID
# in QEMU_PID. UART output is written to $LOG_DIR/qemu_uart.log.
# Returns 1 if QEMU exits early or the socket does not appear within 10s.
start_qemu() {
    echo "[qemu] Launching QEMU with monitor socket..."

    # Remove any stale socket so the -S test below reflects this launch only.
    rm -f "$MONITOR_SOCK"

    local qemu_args=(
        -machine esp32s3
        -nographic
        -drive "file=$FLASH_IMAGE,if=mtd,format=raw"
        -serial "file:$LOG_DIR/qemu_uart.log"
        -no-reboot
        -monitor "unix:$MONITOR_SOCK,server,nowait"
    )

    "$QEMU_BIN" "${qemu_args[@]}" &
    QEMU_PID=$!
    echo "[qemu] PID=$QEMU_PID"

    # Wait for the monitor socket to appear, giving up early if QEMU has
    # already died instead of sleeping out the whole budget.
    local waited=0
    while [ ! -S "$MONITOR_SOCK" ] && [ "$waited" -lt 10 ]; do
        if ! kill -0 "$QEMU_PID" 2>/dev/null; then
            echo "ERROR: QEMU process exited prematurely" >&2
            return 1
        fi
        sleep 1
        waited=$((waited + 1))
    done

    if [ ! -S "$MONITOR_SOCK" ]; then
        echo "ERROR: QEMU monitor socket did not appear after 10s" >&2
        return 1
    fi

    # Verify QEMU is still running
    if ! kill -0 "$QEMU_PID" 2>/dev/null; then
        echo "ERROR: QEMU process exited prematurely" >&2
        return 1
    fi

    echo "[qemu] Monitor socket ready: $MONITOR_SOCK"
}
|
||||
|
||||
# Ask the QEMU monitor to save a VM snapshot under the given name.
#   $1  snapshot name
# Returns monitor_cmd's status if the monitor could not be reached. A monitor
# reply containing "Error:" is reported as a warning but does not fail the call.
save_snapshot() {
    local name="$1"
    local reply
    echo "[snapshot] Saving snapshot: $name"
    reply=$(monitor_cmd "savevm $name" 5) || return $?
    if [ -n "$reply" ]; then
        echo "$reply"
    fi
    # The human monitor prefixes failures with "Error:"; do not claim success then.
    if [[ "$reply" == *"Error:"* ]]; then
        echo "WARNING: QEMU monitor reported an error for 'savevm $name'" >&2
    else
        echo "[snapshot] Saved: $name"
    fi
}
|
||||
|
||||
# Ask the QEMU monitor to restore the VM from a named snapshot.
#   $1  snapshot name
# Returns monitor_cmd's status if the monitor could not be reached. A monitor
# reply containing "Error:" is reported as a warning but does not fail the call.
restore_snapshot() {
    local name="$1"
    local reply
    echo "[snapshot] Restoring snapshot: $name"
    reply=$(monitor_cmd "loadvm $name" 5) || return $?
    if [ -n "$reply" ]; then
        echo "$reply"
    fi
    # The human monitor prefixes failures with "Error:"; do not claim success then.
    if [[ "$reply" == *"Error:"* ]]; then
        echo "WARNING: QEMU monitor reported an error for 'loadvm $name'" >&2
    else
        echo "[snapshot] Restored: $name"
    fi
}
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Pre-flight checks
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
echo "=== QEMU Snapshot Test Runner — ADR-061 Layer 8 ==="
echo "QEMU binary: $QEMU_BIN"
echo "Flash image: $FLASH_IMAGE"
echo "Timeout/test: ${TIMEOUT_SEC}s"
echo ""

# Every missing prerequisite is an infrastructure failure: exit code 3 (FATAL).
if ! command -v "$QEMU_BIN" &>/dev/null; then
    echo "ERROR: QEMU binary not found: $QEMU_BIN"
    echo " Install: sudo apt install qemu-system-misc # Debian/Ubuntu"
    echo " Install: brew install qemu # macOS"
    echo " Or set QEMU_PATH to the qemu-system-xtensa binary."
    exit 3
fi

# NOTE(review): qemu-img is required here but never invoked by this script.
if ! command -v qemu-img &>/dev/null; then
    echo "ERROR: qemu-img not found (needed for snapshot disk management)."
    echo " Install: sudo apt install qemu-utils # Debian/Ubuntu"
    echo " Install: brew install qemu # macOS"
    exit 3
fi

if ! command -v socat &>/dev/null; then
    echo "ERROR: socat not found (needed for QEMU monitor communication)."
    echo " Install: sudo apt install socat # Debian/Ubuntu"
    echo " Install: brew install socat # macOS"
    exit 3
fi

if [ ! -f "$FLASH_IMAGE" ]; then
    echo "ERROR: Flash image not found: $FLASH_IMAGE"
    echo "Run qemu-esp32s3-test.sh first to build the flash image."
    exit 3
fi

mkdir -p "$LOG_DIR"
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Phase 1: Boot and create snapshots
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
echo "── Phase 1: Boot and snapshot creation ──"
echo ""

# Clear any previous UART log
: > "$LOG_DIR/qemu_uart.log"

# A QEMU launch failure is an infrastructure failure, so exit with the
# documented FATAL code. A bare failure under `set -e` would exit with 1,
# which this script documents as WARN.
start_qemu || exit 3

# Wait for boot (look for boot indicators, max 5s). Missing indicators are
# tolerated: the snapshot is taken regardless.
echo "[boot] Waiting for firmware boot (up to 5s)..."
if wait_for_pattern "$LOG_DIR/qemu_uart.log" "app_main\|main_task\|ESP32-S3" 5; then
    echo "[boot] Firmware booted successfully."
else
    echo "[boot] No boot indicator found after 5s (continuing anyway)."
fi

# Save post-boot snapshot
save_snapshot "post_boot"
echo ""

# Wait for first mock CSI frame (additional 5s)
echo "[frame] Waiting for first CSI frame (up to 5s)..."
if wait_for_pattern "$LOG_DIR/qemu_uart.log" "frame\|CSI\|mock_csi\|iq_data\|subcarrier" 5; then
    echo "[frame] First CSI frame detected."
else
    echo "[frame] No frame indicator found after 5s (continuing anyway)."
fi

# Save post-first-frame snapshot; Phase 2 restores this one before each test.
save_snapshot "post_first_frame"
echo ""
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Phase 2: Run tests from snapshot
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
echo "── Phase 2: Running tests from snapshot ──"
echo ""

# NOTE(review): every entry runs the same restore/wait/validate cycle; the test
# name only selects the log file name and the label in the summary.
TESTS=("test_presence" "test_fall" "test_multi_person")
MAX_EXIT=0   # worst validator exit code seen so far; becomes the script's exit code

for test_name in "${TESTS[@]}"; do
    echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    echo " Test: $test_name"
    echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

    test_log="$LOG_DIR/${test_name}.log"
    t_start=$(now_ms)

    # Restore to post_first_frame state
    restore_snapshot "post_first_frame"

    # Record current log length so we can extract only new lines
    pre_lines=$(wc -l < "$LOG_DIR/qemu_uart.log" 2>/dev/null || echo 0)

    # Let execution continue for TIMEOUT_SEC seconds
    echo "[test] Running for ${TIMEOUT_SEC}s..."
    sleep "$TIMEOUT_SEC"

    # Capture only the new log lines produced during this test
    tail -n +$((pre_lines + 1)) "$LOG_DIR/qemu_uart.log" > "$test_log"

    t_end=$(now_ms)
    elapsed_ms=$((t_end - t_start))
    SNAPSHOT_TOTAL_MS=$((SNAPSHOT_TOTAL_MS + elapsed_ms))

    echo "[test] Captured $(wc -l < "$test_log") lines in ${elapsed_ms}ms"

    # Validate. `|| test_exit=$?` captures a non-zero validator status without
    # tripping `set -e`.
    echo "[test] Validating..."
    test_exit=0
    python3 "$SCRIPT_DIR/validate_qemu_output.py" "$test_log" || test_exit=$?

    TEST_RESULTS+=("${test_name}:${test_exit}")
    if [ "$test_exit" -gt "$MAX_EXIT" ]; then
        MAX_EXIT=$test_exit
    fi

    echo ""
done
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Phase 3: Baseline timing (without snapshots) for comparison
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
echo "── Phase 3: Timing comparison ──"
echo ""

# Estimate baseline: full boot (5s) + frame wait (5s) + test run per test.
# This is a computed estimate; no non-snapshot run is actually performed.
BASELINE_PER_TEST=$((5 + 5 + TIMEOUT_SEC))
BASELINE_TOTAL_MS=$((BASELINE_PER_TEST * ${#TESTS[@]} * 1000))
SNAPSHOT_PER_TEST=$((SNAPSHOT_TOTAL_MS / ${#TESTS[@]}))

echo "Timing Summary:"
echo " Tests run: ${#TESTS[@]}"
echo " With snapshots:"
echo " Total wall time: ${SNAPSHOT_TOTAL_MS}ms"
echo " Per-test average: ${SNAPSHOT_PER_TEST}ms"
echo " Without snapshots (estimated):"
echo " Total wall time: ${BASELINE_TOTAL_MS}ms"
echo " Per-test average: $((BASELINE_PER_TEST * 1000))ms"
echo ""

# Integer arithmetic only: the ratio is scaled by 100 (e.g. 350 means 3.5x).
if [ "$SNAPSHOT_TOTAL_MS" -gt 0 ] && [ "$BASELINE_TOTAL_MS" -gt 0 ]; then
    SPEEDUP=$((BASELINE_TOTAL_MS * 100 / SNAPSHOT_TOTAL_MS))
    echo " Speedup: ${SPEEDUP}% (${SPEEDUP}x/100)"
else
    echo " Speedup: N/A (insufficient data)"
fi

echo ""
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Summary
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
echo "── Test Results Summary ──"
echo ""
PASS_COUNT=0
FAIL_COUNT=0
for result in "${TEST_RESULTS[@]}"; do
    # Entries are "test_name:exit_code"; split on the first / last colon.
    name="${result%%:*}"
    code="${result##*:}"
    # Exit codes 0 (PASS) and 1 (WARN) both count as passed.
    if [ "$code" -le 1 ]; then
        echo " [PASS] $name (exit=$code)"
        PASS_COUNT=$((PASS_COUNT + 1))
    else
        echo " [FAIL] $name (exit=$code)"
        FAIL_COUNT=$((FAIL_COUNT + 1))
    fi
done

echo ""
echo " $PASS_COUNT passed, $FAIL_COUNT failed out of ${#TESTS[@]} tests"
echo ""
echo "=== Snapshot Test Complete (exit code: $MAX_EXIT) ==="
# Propagate the worst validator exit code as this script's exit code.
exit "$MAX_EXIT"
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,671 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
QEMU Swarm Health Oracle (ADR-062)
|
||||
|
||||
Validates collective health of a multi-node ESP32-S3 QEMU swarm.
|
||||
Checks cross-node assertions like TDM ordering, inter-node communication,
|
||||
and swarm-level frame rates.
|
||||
|
||||
Usage:
|
||||
python3 swarm_health.py --config swarm_config.yaml --log-dir build/swarm_logs/
|
||||
python3 swarm_health.py --log-dir build/swarm_logs/ --assertions all_nodes_boot no_crashes
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import re
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except ImportError:
|
||||
yaml = None # type: ignore[assignment]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ANSI helpers (disabled when not a TTY)
|
||||
# ---------------------------------------------------------------------------
|
||||
USE_COLOR = sys.stdout.isatty()
|
||||
|
||||
|
||||
def _color(text: str, code: str) -> str:
|
||||
return f"\033[{code}m{text}\033[0m" if USE_COLOR else text
|
||||
|
||||
|
||||
def green(t: str) -> str:
|
||||
return _color(t, "32")
|
||||
|
||||
|
||||
def yellow(t: str) -> str:
|
||||
return _color(t, "33")
|
||||
|
||||
|
||||
def red(t: str) -> str:
|
||||
return _color(t, "1;31")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Data types
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
class AssertionResult:
    """Result of a single swarm-level assertion."""

    # Identifier shown in the report.
    name: str
    # True when the assertion held.
    passed: bool
    # Human-readable explanation shown next to the name.
    message: str
    # 0 = pass, 1 = warn, 2 = fail
    severity: int
|
||||
|
||||
|
||||
@dataclass
class NodeLog:
    """Parsed log for a single QEMU node."""

    # Index of the node the log belongs to.
    node_id: int
    # Log content split into lines.
    lines: List[str]
    # Log content as one string.
    text: str
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Log loading
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def load_logs(log_dir: Path, node_count: int) -> List[NodeLog]:
    """Load qemu_node{i}.log (or node_{i}.log fallback) from *log_dir*.

    Args:
        log_dir: Directory holding the per-node log files.
        node_count: Number of nodes; indices ``0 .. node_count - 1`` are loaded.

    Returns:
        One ``NodeLog`` per index, in order. A node whose log file is missing
        under both names gets an empty log rather than being omitted.
    """
    logs: List[NodeLog] = []
    for i in range(node_count):
        path = log_dir / f"qemu_node{i}.log"
        if not path.exists():
            path = log_dir / f"node_{i}.log"
        # errors="replace" keeps stray non-UTF-8 bytes from raising.
        text = path.read_text(encoding="utf-8", errors="replace") if path.exists() else ""
        logs.append(NodeLog(node_id=i, lines=text.splitlines(), text=text))
    return logs
|
||||
|
||||
|
||||
def _node_count_from_dir(log_dir: Path) -> int:
|
||||
"""Auto-detect node count by scanning for qemu_node*.log (or node_*.log) files."""
|
||||
count = 0
|
||||
while (log_dir / f"qemu_node{count}.log").exists() or (log_dir / f"node_{count}.log").exists():
|
||||
count += 1
|
||||
return count
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Individual assertions
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_BOOT_PATTERNS = [
|
||||
r"app_main\(\)", r"main_task:", r"main:", r"ESP32-S3 CSI Node",
|
||||
]
|
||||
|
||||
_CRASH_PATTERNS = [
|
||||
r"Guru Meditation", r"assert failed", r"abort\(\)", r"panic",
|
||||
r"LoadProhibited", r"StoreProhibited", r"InstrFetchProhibited",
|
||||
r"IllegalInstruction", r"Unhandled debug exception", r"Fatal exception",
|
||||
]
|
||||
|
||||
_HEAP_PATTERNS = [
|
||||
r"HEAP_ERROR", r"out of memory", r"heap_caps_alloc.*failed",
|
||||
r"malloc.*fail", r"heap corruption", r"CORRUPT HEAP",
|
||||
r"multi_heap", r"heap_lock",
|
||||
]
|
||||
|
||||
_FRAME_PATTERNS = [
|
||||
r"frame", r"CSI", r"mock_csi", r"iq_data", r"subcarrier",
|
||||
r"csi_collector", r"enqueue",
|
||||
]
|
||||
|
||||
_FALL_PATTERNS = [r"fall[=: ]+1", r"fall detected", r"fall_event"]
|
||||
|
||||
|
||||
def assert_all_nodes_boot(logs: List[NodeLog], timeout_s: float = 10.0) -> AssertionResult:
    """Check each node's log for boot patterns.

    Args:
        logs: Parsed node logs to inspect.
        timeout_s: Only echoed in the success message; no timing is measured here.

    Returns:
        Severity 0 when every log matches at least one entry of
        ``_BOOT_PATTERNS``, otherwise severity 2 listing the node IDs that
        show no boot indicator.
    """
    missing: List[int] = [
        nl.node_id
        for nl in logs
        if not any(re.search(p, nl.text) for p in _BOOT_PATTERNS)
    ]

    if missing:
        return AssertionResult(
            name="all_nodes_boot", passed=False,
            message=f"Nodes missing boot indicator: {missing}",
            severity=2,
        )
    return AssertionResult(
        name="all_nodes_boot", passed=True,
        message=f"All {len(logs)} nodes booted (timeout={timeout_s}s)",
        severity=0,
    )
|
||||
|
||||
|
||||
def assert_no_crashes(logs: List[NodeLog]) -> AssertionResult:
    """Check no node has crash patterns.

    Matching is case-sensitive against ``_CRASH_PATTERNS``. Only the first
    offending line of each node is recorded.

    Returns:
        Severity 0 when no node matches, otherwise severity 2 quoting the
        first crash line (truncated to 100 characters) and the count of
        further affected nodes.
    """
    crashed: List[str] = []
    for nl in logs:
        for line in nl.lines:
            if any(re.search(pat, line) for pat in _CRASH_PATTERNS):
                crashed.append(f"node_{nl.node_id}: {line.strip()[:100]}")
                break  # one crash per node is enough

    if not crashed:
        return AssertionResult(
            name="no_crashes", passed=True,
            message="No crash indicators in any node",
            severity=0,
        )
    extra = f" (+{len(crashed)-1} more)" if len(crashed) > 1 else ""
    return AssertionResult(
        name="no_crashes", passed=False,
        message=f"Crashes found: {crashed[0]}" + extra,
        severity=2,
    )
|
||||
|
||||
|
||||
def assert_tdm_no_collision(logs: List[NodeLog]) -> AssertionResult:
    """Parse TDM slot assignments from logs, verify uniqueness.

    Only the first ``tdm_slot``-style line of each node is considered.

    Returns:
        Severity 0 when no slot is claimed by more than one node (including
        the case where no assignment is found at all), otherwise severity 2
        with the colliding ``slot -> node IDs`` mapping.
    """
    slot_map: Dict[int, List[int]] = {}  # slot -> [node_ids]
    tdm_pat = re.compile(r"tdm[_ ]?slot[=: ]+(\d+)", re.IGNORECASE)

    for nl in logs:
        for line in nl.lines:
            m = tdm_pat.search(line)
            if m is None:
                continue
            owners = slot_map.setdefault(int(m.group(1)), [])
            if nl.node_id not in owners:
                owners.append(nl.node_id)
            break  # first occurrence per node

    if not slot_map:
        return AssertionResult(
            name="tdm_no_collision", passed=True,
            message="No TDM slot assignments found (may be N/A)",
            severity=0,
        )

    collisions = {s: nids for s, nids in slot_map.items() if len(nids) > 1}
    if not collisions:
        return AssertionResult(
            name="tdm_no_collision", passed=True,
            message=f"TDM slots unique across {len(slot_map)} assignments",
            severity=0,
        )
    return AssertionResult(
        name="tdm_no_collision", passed=False,
        message=f"TDM collisions: {collisions}",
        severity=2,
    )
|
||||
|
||||
|
||||
def assert_all_nodes_produce_frames(
    logs: List[NodeLog],
    sensor_ids: Optional[List[int]] = None,
) -> AssertionResult:
    """Each sensor node has CSI frame output.

    Args:
        logs: Parsed node logs.
        sensor_ids: If provided, only check these node IDs (skip coordinators).
            If None, check all nodes (legacy behavior).

    Returns:
        Severity 0 when every checked node has a line matching
        ``_FRAME_PATTERNS`` (case-insensitive), otherwise severity 1 listing
        the silent node IDs. A requested sensor ID that has no entry in
        *logs* is reported as silent, since it cannot have produced frames.
    """
    by_id = {nl.node_id: nl for nl in logs}
    target_ids = [nl.node_id for nl in logs] if sensor_ids is None else list(sensor_ids)

    silent: List[int] = []
    for nid in target_ids:
        nl = by_id.get(nid)
        has_frames = nl is not None and any(
            re.search(p, line, re.IGNORECASE)
            for line in nl.lines for p in _FRAME_PATTERNS
        )
        if not has_frames:
            silent.append(nid)

    checked = len(target_ids)
    if not silent:
        return AssertionResult(
            name="all_nodes_produce_frames", passed=True,
            message=f"All {checked} checked nodes show frame activity",
            severity=0,
        )
    return AssertionResult(
        name="all_nodes_produce_frames", passed=False,
        message=f"Nodes with no frame activity: {silent}",
        severity=1,
    )
|
||||
|
||||
|
||||
def assert_coordinator_receives_from_all(
    logs: List[NodeLog],
    coordinator_id: int = 0,
    sensor_ids: Optional[List[int]] = None,
) -> AssertionResult:
    """Coordinator log shows frames from each sensor's node_id.

    Args:
        logs: Parsed node logs.
        coordinator_id: Node ID whose log is treated as the coordinator's.
        sensor_ids: Node IDs expected to appear in the coordinator log.
            Defaults to every node in *logs* other than the coordinator.

    Returns:
        Severity 2 if the coordinator has no entry in *logs*; severity 1 if
        some sensor IDs never appear after ``from`` / ``node_id`` / ``src``
        in the coordinator log; severity 0 otherwise.
    """
    coord_log = next((nl for nl in logs if nl.node_id == coordinator_id), None)
    if coord_log is None:
        return AssertionResult(
            name="coordinator_receives_from_all", passed=False,
            message=f"Coordinator node_{coordinator_id} log not found",
            severity=2,
        )

    if sensor_ids is None:
        sensor_ids = [nl.node_id for nl in logs if nl.node_id != coordinator_id]

    # Only the first sender tag on each line is taken into account.
    recv_pat = re.compile(r"(from|node_id|src)[=: ]+(\d+)", re.IGNORECASE)
    received_ids: set = set()
    for line in coord_log.lines:
        m = recv_pat.search(line)
        if m:
            received_ids.add(int(m.group(2)))

    missing: List[int] = [sid for sid in sensor_ids if sid not in received_ids]

    if not missing:
        return AssertionResult(
            name="coordinator_receives_from_all", passed=True,
            message=f"Coordinator received from all sensors: {sensor_ids}",
            severity=0,
        )
    return AssertionResult(
        name="coordinator_receives_from_all", passed=False,
        message=f"Coordinator missing frames from nodes: {missing}",
        severity=1,
    )
|
||||
|
||||
|
||||
def assert_fall_detected(logs: List[NodeLog], node_id: int) -> AssertionResult:
    """Specific node reports fall detection.

    Returns:
        Severity 2 if *node_id* has no entry in *logs*; severity 1 if its log
        has no line matching ``_FALL_PATTERNS`` (case-insensitive);
        severity 0 otherwise.
    """
    result_name = f"fall_detected_node_{node_id}"
    node_log = next((nl for nl in logs if nl.node_id == node_id), None)

    if node_log is None:
        return AssertionResult(
            name=result_name, passed=False,
            message=f"Node {node_id} log not found",
            severity=2,
        )

    found = any(
        re.search(p, line, re.IGNORECASE)
        for line in node_log.lines for p in _FALL_PATTERNS
    )
    if found:
        return AssertionResult(
            name=result_name, passed=True,
            message=f"Node {node_id} reported fall event",
            severity=0,
        )
    return AssertionResult(
        name=result_name, passed=False,
        message=f"Node {node_id} did not report fall event",
        severity=1,
    )
|
||||
|
||||
|
||||
def assert_frame_rate_above(logs: List[NodeLog], min_fps: float = 10.0) -> AssertionResult:
    """Each node meets minimum frame rate.

    The rate of a node is the highest explicit ``fps`` / ``frame rate`` value
    in its log. If there is none, it is estimated from the first and last
    frame-count values, assuming consecutive counts are one second apart.
    Nodes for which no rate can be derived are not flagged.

    Returns:
        Severity 0 when no node is below *min_fps*, otherwise severity 1
        listing the offending nodes with their rate.
    """
    fps_pat = re.compile(r"(?:fps|frame.?rate)[=: ]+([0-9.]+)", re.IGNORECASE)
    count_pat = re.compile(r"(?:frame[_ ]?count|frames)[=: ]+(\d+)", re.IGNORECASE)
    below: List[str] = []

    for nl in logs:
        best_fps: Optional[float] = None

        # Try explicit FPS
        for line in nl.lines:
            m = fps_pat.search(line)
            if not m:
                continue
            try:
                best_fps = max(best_fps or 0.0, float(m.group(1)))
            except ValueError:
                # The character class also matches things like "1.2.3"; skip those.
                pass

        # Fallback: estimate from frame count (assume 1-second intervals)
        if best_fps is None:
            counts = [
                int(m.group(1))
                for m in (count_pat.search(line) for line in nl.lines)
                if m
            ]
            if len(counts) >= 2:
                best_fps = float(counts[-1] - counts[0]) / max(len(counts) - 1, 1)

        if best_fps is not None and best_fps < min_fps:
            below.append(f"node_{nl.node_id}={best_fps:.1f}")

    if not below:
        return AssertionResult(
            name="frame_rate_above", passed=True,
            message=f"All nodes meet minimum {min_fps} fps",
            severity=0,
        )
    return AssertionResult(
        name="frame_rate_above", passed=False,
        message=f"Nodes below {min_fps} fps: {', '.join(below)}",
        severity=1,
    )
|
||||
|
||||
|
||||
def assert_max_boot_time(logs: List[NodeLog], max_seconds: float = 10.0) -> AssertionResult:
    """All nodes boot within N seconds (based on timestamp in log).

    For each node, the first line matching ``_BOOT_PATTERNS`` is located and
    the first parenthesised integer on that line is read as a timestamp.
    A boot line without such a number is accepted as on time.

    Returns:
        Severity 0 when every node has a boot line within the limit,
        otherwise severity 1 listing slow nodes and nodes with no boot line.
    """
    # presumably the "(12345)" millisecond tick of the firmware log prefix — TODO confirm
    boot_time_pat = re.compile(r"\((\d+)\)\s", re.IGNORECASE)
    slow: List[str] = []

    for nl in logs:
        boot_found = False
        for line in nl.lines:
            if not any(re.search(p, line) for p in _BOOT_PATTERNS):
                continue
            boot_found = True
            m = boot_time_pat.search(line)
            if m:
                ms = int(m.group(1))
                if ms > max_seconds * 1000:
                    slow.append(f"node_{nl.node_id}={ms}ms")
            break  # only the first boot line of a node is evaluated
        if not boot_found:
            slow.append(f"node_{nl.node_id}=no_boot")

    if not slow:
        return AssertionResult(
            name="max_boot_time", passed=True,
            message=f"All nodes booted within {max_seconds}s",
            severity=0,
        )
    return AssertionResult(
        name="max_boot_time", passed=False,
        message=f"Slow/missing boot: {', '.join(slow)}",
        severity=1,
    )
|
||||
|
||||
|
||||
def assert_no_heap_errors(logs: List[NodeLog]) -> AssertionResult:
    """No OOM/heap errors in any log.

    Matching is case-insensitive against ``_HEAP_PATTERNS``. Only the first
    offending line of each node is recorded.

    Returns:
        Severity 0 when no node matches, otherwise severity 2 quoting the
        first error line (truncated to 100 characters) and the count of
        further affected nodes.
    """
    errors: List[str] = []
    for nl in logs:
        for line in nl.lines:
            if any(re.search(pat, line, re.IGNORECASE) for pat in _HEAP_PATTERNS):
                errors.append(f"node_{nl.node_id}: {line.strip()[:100]}")
                break  # one report per node is enough

    if not errors:
        return AssertionResult(
            name="no_heap_errors", passed=True,
            message="No heap errors in any node",
            severity=0,
        )
    extra = f" (+{len(errors)-1} more)" if len(errors) > 1 else ""
    return AssertionResult(
        name="no_heap_errors", passed=False,
        message=f"Heap errors: {errors[0]}" + extra,
        severity=2,
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Assertion registry & dispatcher
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Maps an assertion name (as written in YAML or on the command line) to the
# function implementing it. Each function takes the node logs as its first argument.
ASSERTION_REGISTRY: Dict[str, Any] = {
    "all_nodes_boot": assert_all_nodes_boot,
    "no_crashes": assert_no_crashes,
    "tdm_no_collision": assert_tdm_no_collision,
    "all_nodes_produce_frames": assert_all_nodes_produce_frames,
    "coordinator_receives_from_all": assert_coordinator_receives_from_all,
    "frame_rate_above": assert_frame_rate_above,
    "max_boot_time": assert_max_boot_time,
    "no_heap_errors": assert_no_heap_errors,
    # fall_detected is parameterized, handled separately
}
|
||||
|
||||
|
||||
def _parse_assertion_spec(spec: Any) -> tuple:
|
||||
"""Parse a YAML assertion entry into (name, kwargs).
|
||||
|
||||
Supported forms:
|
||||
- "all_nodes_boot" -> ("all_nodes_boot", {})
|
||||
- {"frame_rate_above": 15} -> ("frame_rate_above", {"min_fps": 15})
|
||||
- "fall_detected_by_node_2" -> ("fall_detected", {"node_id": 2})
|
||||
- {"max_boot_time_s": 10} -> ("max_boot_time", {"max_seconds": 10})
|
||||
"""
|
||||
if isinstance(spec, str):
|
||||
# Check for fall_detected_by_node_N pattern
|
||||
m = re.match(r"fall_detected_by_node_(\d+)", spec)
|
||||
if m:
|
||||
return ("fall_detected", {"node_id": int(m.group(1))})
|
||||
return (spec, {})
|
||||
|
||||
if isinstance(spec, dict):
|
||||
for key, val in spec.items():
|
||||
m = re.match(r"fall_detected_by_node_(\d+)", str(key))
|
||||
if m:
|
||||
return ("fall_detected", {"node_id": int(m.group(1))})
|
||||
if key == "frame_rate_above":
|
||||
return ("frame_rate_above", {"min_fps": float(val)})
|
||||
if key == "max_boot_time_s":
|
||||
return ("max_boot_time", {"max_seconds": float(val)})
|
||||
if key == "coordinator_receives_from_all":
|
||||
return ("coordinator_receives_from_all", {})
|
||||
return (str(key), {})
|
||||
|
||||
return (str(spec), {})
|
||||
|
||||
|
||||
def run_assertions(
    logs: List[NodeLog],
    assertion_specs: List[Any],
    config: Optional[Dict] = None,
) -> List[AssertionResult]:
    """Run all requested assertions against loaded logs.

    Args:
        logs: Parsed node logs.
        assertion_specs: Assertion entries in any form accepted by
            ``_parse_assertion_spec``.
        config: Optional swarm config. Its ``nodes`` list supplies the
            coordinator ID (last node with role ``coordinator``, default 0)
            and the sensor IDs (nodes with role ``sensor``).

    Returns:
        One ``AssertionResult`` per spec, in order. Unknown assertion names
        and a ``fall_detected`` spec without a node ID produce a severity-1
        result instead of raising.
    """
    results: List[AssertionResult] = []

    # Derive coordinator/sensor IDs from config if available
    coordinator_id = 0
    sensor_ids: Optional[List[int]] = None
    if config and "nodes" in config:
        node_defs = config["nodes"] or []  # tolerate an empty `nodes:` key
        for node_def in node_defs:
            if node_def.get("role") == "coordinator":
                coordinator_id = node_def.get("node_id", 0)
        # With no sensor roles declared, keep None so the frame and coordinator
        # checks cover every node instead of passing vacuously on an empty list.
        sensor_ids = [
            n["node_id"] for n in node_defs
            if n.get("role") == "sensor" and "node_id" in n
        ] or None

    for spec in assertion_specs:
        name, kwargs = _parse_assertion_spec(spec)

        if name == "fall_detected":
            if "node_id" not in kwargs:
                # A bare "fall_detected" carries no node; report instead of raising TypeError.
                results.append(AssertionResult(
                    name=name, passed=False,
                    message="fall_detected needs a node id (use fall_detected_by_node_<N>)",
                    severity=1,
                ))
            else:
                results.append(assert_fall_detected(logs, **kwargs))
        elif name == "coordinator_receives_from_all":
            results.append(assert_coordinator_receives_from_all(
                logs, coordinator_id=coordinator_id, sensor_ids=sensor_ids,
            ))
        elif name == "all_nodes_produce_frames":
            results.append(assert_all_nodes_produce_frames(
                logs, sensor_ids=sensor_ids, **kwargs,
            ))
        elif name in ASSERTION_REGISTRY:
            fn = ASSERTION_REGISTRY[name]
            results.append(fn(logs, **kwargs))
        else:
            results.append(AssertionResult(
                name=name, passed=False,
                message=f"Unknown assertion: {name}",
                severity=1,
            ))

    return results
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Report printing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def print_report(results: List[AssertionResult], swarm_name: str = "") -> int:
    """Print the assertion report and return max severity.

    Args:
        results: Assertion results to list, one line each.
        swarm_name: Optional name appended to the report header.

    Returns:
        The highest ``severity`` among *results* (0 when *results* is empty).
    """
    header = "QEMU Swarm Health Report (ADR-062)"
    if swarm_name:
        header += f" - {swarm_name}"

    print()
    print("=" * 60)
    print(f" {header}")
    print("=" * 60)
    print()

    max_sev = 0
    for r in results:
        # Severity, not `passed`, selects the label: 0 PASS, 1 WARN, anything else FAIL.
        if r.severity == 0:
            icon = green("PASS")
        elif r.severity == 1:
            icon = yellow("WARN")
        else:
            icon = red("FAIL")

        print(f" [{icon}] {r.name}: {r.message}")
        max_sev = max(max_sev, r.severity)

    print()
    passed = sum(1 for r in results if r.passed)
    total = len(results)
    summary = f" {passed}/{total} assertions passed"

    if max_sev == 0:
        print(green(summary))
    elif max_sev == 1:
        print(yellow(summary + " (with warnings)"))
    else:
        print(red(summary + " (with failures)"))

    print()
    return max_sev
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main():
    """Command-line entry point for the swarm health oracle.

    Parses the arguments, optionally loads a swarm YAML config, determines
    the node count (``--node-count``, then the config's ``nodes`` list, then
    auto-detection from the log directory), loads the node logs, runs the
    selected assertions and prints the report.

    Exit status:
        The maximum severity returned by ``print_report``, or 2 when the
        log directory, the config file or the node count is unusable.
    """
    parser = argparse.ArgumentParser(
        description="QEMU Swarm Health Oracle (ADR-062)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "Example:\n"
            " python3 swarm_health.py --config scripts/swarm_presets/standard.yaml \\\n"
            " --log-dir build/swarm_logs/\n"
            "\n"
            " python3 swarm_health.py --log-dir build/swarm_logs/ \\\n"
            " --assertions all_nodes_boot no_crashes\n"
            "\n"
            "Example output:\n"
            " ============================================================\n"
            " QEMU Swarm Health Report (ADR-062) - standard\n"
            " ============================================================\n"
            "\n"
            " [PASS] all_nodes_boot: All 3 nodes booted (timeout=10.0s)\n"
            " [PASS] no_crashes: No crash indicators in any node\n"
            " [PASS] tdm_no_collision: TDM slots unique across 3 assignments\n"
            " [PASS] all_nodes_produce_frames: All 3 nodes show frame activity\n"
            " [PASS] coordinator_receives_from_all: Coordinator received from all\n"
            " [WARN] fall_detected_node_2: Node 2 did not report fall event\n"
            " [PASS] frame_rate_above: All nodes meet minimum 15.0 fps\n"
            "\n"
            " 6/7 assertions passed (with warnings)\n"
        ),
    )
    parser.add_argument(
        "--config", type=str, default=None,
        help="Path to swarm YAML config (defines nodes and assertions)",
    )
    parser.add_argument(
        "--log-dir", type=str, required=True,
        help="Directory containing node_0.log, node_1.log, etc.",
    )
    parser.add_argument(
        "--assertions", nargs="*", default=None,
        help="Override assertions (space-separated). Ignores YAML assertion list.",
    )
    parser.add_argument(
        "--node-count", type=int, default=None,
        help="Number of nodes (auto-detected from log files if omitted)",
    )
    args = parser.parse_args()

    log_dir = Path(args.log_dir)
    if not log_dir.is_dir():
        print(f"ERROR: Log directory not found: {log_dir}", file=sys.stderr)
        sys.exit(2)

    # Load YAML config if provided
    config: Optional[Dict] = None
    swarm_name = ""
    yaml_assertions: List[Any] = []

    if args.config:
        if yaml is None:
            print("ERROR: PyYAML is required for --config. Install with: pip install pyyaml",
                  file=sys.stderr)
            sys.exit(2)
        config_path = Path(args.config)
        if not config_path.exists():
            print(f"ERROR: Config file not found: {config_path}", file=sys.stderr)
            sys.exit(2)
        # An uncaught parse error would end the process with status 1, which
        # this tool also uses for "warnings"; report it as a usage error.
        try:
            with open(config_path, "r", encoding="utf-8") as f:
                loaded = yaml.safe_load(f)
        except (OSError, yaml.YAMLError) as exc:
            print(f"ERROR: Failed to load config {config_path}: {exc}", file=sys.stderr)
            sys.exit(2)

        # safe_load() yields None for an empty document; that case is treated
        # like "no config". Any other non-mapping root cannot be interpreted.
        if loaded is not None:
            if not isinstance(loaded, dict):
                print(f"ERROR: Config root must be a mapping: {config_path}",
                      file=sys.stderr)
                sys.exit(2)
            config = loaded
            swarm_section = config.get("swarm")
            if isinstance(swarm_section, dict):
                swarm_name = swarm_section.get("name", "")
            # A bare "assertions:" key parses as None; fall back to an empty list.
            yaml_assertions = config.get("assertions") or []

    # Determine node count
    nodes_cfg = config.get("nodes") if config else None
    if args.node_count is not None:
        node_count = args.node_count
    elif isinstance(nodes_cfg, (list, dict)):
        node_count = len(nodes_cfg)
    else:
        node_count = _node_count_from_dir(log_dir)

    if node_count <= 0:
        print("ERROR: No node logs found and node count not specified.", file=sys.stderr)
        sys.exit(2)

    # Load logs
    logs = load_logs(log_dir, node_count)

    # Determine which assertions to run: CLI override, then YAML, then defaults
    if args.assertions is not None:
        assertion_specs = args.assertions
    elif yaml_assertions:
        assertion_specs = yaml_assertions
    else:
        # Default set
        assertion_specs = ["all_nodes_boot", "no_crashes", "no_heap_errors"]

    # Run assertions
    results = run_assertions(logs, assertion_specs, config)

    # Print report and exit with the worst severity as the process status
    max_sev = print_report(results, swarm_name)
    sys.exit(max_sev)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -0,0 +1,31 @@
|
|||
# CI-optimized preset: 3 nodes, star topology, 30s, minimal assertions
|
||||
swarm:
|
||||
name: ci-matrix
|
||||
duration_s: 30
|
||||
topology: star
|
||||
aggregator_port: 5005
|
||||
|
||||
nodes:
|
||||
- role: coordinator
|
||||
node_id: 0
|
||||
scenario: 0
|
||||
channel: 6
|
||||
edge_tier: 1
|
||||
|
||||
- role: sensor
|
||||
node_id: 1
|
||||
scenario: 1
|
||||
channel: 6
|
||||
tdm_slot: 1
|
||||
|
||||
- role: sensor
|
||||
node_id: 2
|
||||
scenario: 2
|
||||
channel: 6
|
||||
tdm_slot: 2
|
||||
|
||||
assertions:
|
||||
- all_nodes_boot
|
||||
- no_crashes
|
||||
- tdm_no_collision
|
||||
- max_boot_time_s: 10
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
# Mixed scenarios: 5 nodes with different CSI scenarios, star topology, 90s
|
||||
swarm:
|
||||
name: heterogeneous
|
||||
duration_s: 90
|
||||
topology: star
|
||||
aggregator_port: 5005
|
||||
|
||||
nodes:
|
||||
- role: coordinator
|
||||
node_id: 0
|
||||
scenario: 0
|
||||
channel: 6
|
||||
edge_tier: 2
|
||||
is_gateway: true
|
||||
|
||||
- role: sensor
|
||||
node_id: 1
|
||||
scenario: 1
|
||||
channel: 6
|
||||
tdm_slot: 1
|
||||
|
||||
- role: sensor
|
||||
node_id: 2
|
||||
scenario: 2
|
||||
channel: 6
|
||||
tdm_slot: 2
|
||||
|
||||
- role: sensor
|
||||
node_id: 3
|
||||
scenario: 3
|
||||
channel: 6
|
||||
tdm_slot: 3
|
||||
|
||||
- role: sensor
|
||||
node_id: 4
|
||||
scenario: 5
|
||||
channel: 11
|
||||
tdm_slot: 4
|
||||
|
||||
assertions:
|
||||
- all_nodes_boot
|
||||
- no_crashes
|
||||
- tdm_no_collision
|
||||
- all_nodes_produce_frames
|
||||
- coordinator_receives_from_all
|
||||
- fall_detected_by_node_3
|
||||
- no_heap_errors
|
||||
- frame_rate_above: 12
|
||||
- max_boot_time_s: 12
|
||||
|
|
@ -0,0 +1,54 @@
|
|||
# Scale test: 6 fully-connected nodes in mesh topology, 90s
|
||||
swarm:
|
||||
name: large-mesh
|
||||
duration_s: 90
|
||||
topology: mesh
|
||||
aggregator_port: 5005
|
||||
|
||||
nodes:
|
||||
- role: coordinator
|
||||
node_id: 0
|
||||
scenario: 0
|
||||
channel: 6
|
||||
edge_tier: 2
|
||||
is_gateway: true
|
||||
|
||||
- role: sensor
|
||||
node_id: 1
|
||||
scenario: 1
|
||||
channel: 6
|
||||
tdm_slot: 1
|
||||
|
||||
- role: sensor
|
||||
node_id: 2
|
||||
scenario: 2
|
||||
channel: 6
|
||||
tdm_slot: 2
|
||||
|
||||
- role: sensor
|
||||
node_id: 3
|
||||
scenario: 3
|
||||
channel: 6
|
||||
tdm_slot: 3
|
||||
|
||||
- role: sensor
|
||||
node_id: 4
|
||||
scenario: 4
|
||||
channel: 6
|
||||
tdm_slot: 4
|
||||
|
||||
- role: sensor
|
||||
node_id: 5
|
||||
scenario: 5
|
||||
channel: 6
|
||||
tdm_slot: 5
|
||||
|
||||
assertions:
|
||||
- all_nodes_boot
|
||||
- no_crashes
|
||||
- tdm_no_collision
|
||||
- all_nodes_produce_frames
|
||||
- coordinator_receives_from_all
|
||||
- no_heap_errors
|
||||
- frame_rate_above: 10
|
||||
- max_boot_time_s: 15
|
||||
|
|
@ -0,0 +1,39 @@
|
|||
# Multi-hop relay chain: 4 nodes in line topology, 60s
|
||||
swarm:
|
||||
name: line-relay
|
||||
duration_s: 60
|
||||
topology: line
|
||||
aggregator_port: 5005
|
||||
|
||||
nodes:
|
||||
- role: gateway
|
||||
node_id: 0
|
||||
scenario: 0
|
||||
channel: 6
|
||||
edge_tier: 2
|
||||
is_gateway: true
|
||||
|
||||
- role: coordinator
|
||||
node_id: 1
|
||||
scenario: 0
|
||||
channel: 6
|
||||
edge_tier: 1
|
||||
|
||||
- role: sensor
|
||||
node_id: 2
|
||||
scenario: 2
|
||||
channel: 6
|
||||
tdm_slot: 2
|
||||
|
||||
- role: sensor
|
||||
node_id: 3
|
||||
scenario: 1
|
||||
channel: 6
|
||||
tdm_slot: 3
|
||||
|
||||
assertions:
|
||||
- all_nodes_boot
|
||||
- no_crashes
|
||||
- tdm_no_collision
|
||||
- all_nodes_produce_frames
|
||||
- max_boot_time_s: 12
|
||||
|
|
@ -0,0 +1,41 @@
|
|||
# Ring topology with fault injection: 4 nodes, 75s
|
||||
swarm:
|
||||
name: ring-fault
|
||||
duration_s: 75
|
||||
topology: ring
|
||||
aggregator_port: 5005
|
||||
|
||||
nodes:
|
||||
- role: coordinator
|
||||
node_id: 0
|
||||
scenario: 0
|
||||
channel: 6
|
||||
edge_tier: 2
|
||||
is_gateway: true
|
||||
|
||||
- role: sensor
|
||||
node_id: 1
|
||||
scenario: 1
|
||||
channel: 6
|
||||
tdm_slot: 1
|
||||
|
||||
- role: sensor
|
||||
node_id: 2
|
||||
scenario: 2
|
||||
channel: 6
|
||||
tdm_slot: 2
|
||||
|
||||
- role: sensor
|
||||
node_id: 3
|
||||
scenario: 3
|
||||
channel: 6
|
||||
tdm_slot: 3
|
||||
|
||||
assertions:
|
||||
- all_nodes_boot
|
||||
- no_crashes
|
||||
- tdm_no_collision
|
||||
- all_nodes_produce_frames
|
||||
- coordinator_receives_from_all
|
||||
- no_heap_errors
|
||||
- max_boot_time_s: 12
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
# Quick CI smoke test: 2 nodes, star topology, 15s duration
|
||||
swarm:
|
||||
name: smoke
|
||||
duration_s: 15
|
||||
topology: star
|
||||
aggregator_port: 5005
|
||||
|
||||
nodes:
|
||||
- role: coordinator
|
||||
node_id: 0
|
||||
scenario: 0
|
||||
channel: 6
|
||||
edge_tier: 1
|
||||
|
||||
- role: sensor
|
||||
node_id: 1
|
||||
scenario: 1
|
||||
channel: 6
|
||||
tdm_slot: 1
|
||||
|
||||
assertions:
|
||||
- all_nodes_boot
|
||||
- no_crashes
|
||||
- max_boot_time_s: 10
|
||||
|
|
@ -0,0 +1,36 @@
|
|||
# Standard 3-node test: 2 sensors + 1 coordinator, star topology, 60s
|
||||
swarm:
|
||||
name: standard
|
||||
duration_s: 60
|
||||
topology: star
|
||||
aggregator_port: 5005
|
||||
|
||||
nodes:
|
||||
- role: coordinator
|
||||
node_id: 0
|
||||
scenario: 0
|
||||
channel: 6
|
||||
edge_tier: 2
|
||||
is_gateway: true
|
||||
|
||||
- role: sensor
|
||||
node_id: 1
|
||||
scenario: 2
|
||||
channel: 6
|
||||
tdm_slot: 1
|
||||
|
||||
- role: sensor
|
||||
node_id: 2
|
||||
scenario: 3
|
||||
channel: 6
|
||||
tdm_slot: 2
|
||||
|
||||
assertions:
|
||||
- all_nodes_boot
|
||||
- no_crashes
|
||||
- tdm_no_collision
|
||||
- all_nodes_produce_frames
|
||||
- coordinator_receives_from_all
|
||||
- fall_detected_by_node_2
|
||||
- frame_rate_above: 15
|
||||
- max_boot_time_s: 10
|
||||
|
|
@ -0,0 +1,504 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
QEMU Multi-Node Mesh Validation (ADR-061 Layer 3)
|
||||
|
||||
Validates the output of a multi-node mesh simulation run by qemu-mesh-test.sh.
|
||||
Parses the aggregator results JSON and per-node UART logs, then runs 6 checks:
|
||||
|
||||
1. All nodes booted - every node log contains a boot indicator
|
||||
2. TDM ordering - slot assignments are sequential 0..N-1
|
||||
3. No slot collision - no two nodes share a TDM slot
|
||||
4. Frame count balance - per-node frame counts within +/-10%
|
||||
5. ADR-018 compliance - magic 0xC5110001 present in frames
|
||||
6. Vitals per node - each node produced vitals output
|
||||
|
||||
Usage:
|
||||
python3 validate_mesh_test.py --nodes N [--results results.json] [--log node0.log] ...
|
||||
|
||||
Exit codes:
|
||||
0 All checks passed (or only SKIP-level)
|
||||
1 Warnings (non-critical checks failed)
|
||||
2 Errors (critical checks failed)
|
||||
3 Fatal (crash or missing nodes)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from enum import IntEnum
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Severity / reporting (matches validate_qemu_output.py pattern)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class Severity(IntEnum):
    """Outcome level of a single check, ordered from best to worst.

    The numeric order matters: the report picks its overall result with
    ``max()`` and compares levels with ``<=``.
    """

    PASS = 0   # check succeeded
    SKIP = 1   # check could not be evaluated; counted as passed in the summary
    WARN = 2   # non-critical problem
    ERROR = 3  # critical check failed
    FATAL = 4  # worst level (used for crashes and for no node booting)
|
||||
|
||||
|
||||
# Colour is only emitted when stdout is an interactive terminal.
USE_COLOR = sys.stdout.isatty()

_ANSI_RESET = "\033[0m"


def color(text: str, code: str) -> str:
    """Wrap *text* in the ANSI SGR sequence *code*, or return it unchanged
    when colour output is disabled."""
    return f"\033[{code}m{text}{_ANSI_RESET}" if USE_COLOR else text


def green(text: str) -> str:
    """Return *text* coloured green (SGR 32)."""
    return color(text, "32")


def yellow(text: str) -> str:
    """Return *text* coloured yellow (SGR 33)."""
    return color(text, "33")


def red(text: str) -> str:
    """Return *text* coloured red (SGR 31)."""
    return color(text, "31")


def bold_red(text: str) -> str:
    """Return *text* coloured bold red (SGR 1;31)."""
    return color(text, "1;31")
|
||||
|
||||
|
||||
@dataclass
class CheckResult:
    """Outcome of one validation check as stored in a ValidationReport."""

    name: str           # short check title printed before the message
    severity: Severity  # outcome level of the check
    message: str        # human-readable detail printed after the name
    count: int = 0      # optional counter; printed as "(count=N)" only when > 0
|
||||
|
||||
|
||||
@dataclass
class ValidationReport:
    """Ordered collection of check results with console rendering."""

    checks: List[CheckResult] = field(default_factory=list)

    def add(self, name: str, severity: Severity, message: str, count: int = 0):
        """Record the outcome of one check at the end of the report."""
        entry = CheckResult(name, severity, message, count)
        self.checks.append(entry)

    @property
    def max_severity(self) -> Severity:
        """Worst severity recorded so far; PASS when no check was added."""
        return max((entry.severity for entry in self.checks),
                   default=Severity.PASS)

    def print_report(self):
        """Print every check followed by a coloured pass-count summary."""
        rule = "=" * 60
        print(f"\n{rule}")
        print(" Multi-Node Mesh Validation Report (ADR-061 Layer 3)")
        print(f"{rule}\n")

        # Label per severity; anything not listed is rendered as FATAL.
        labels = {
            Severity.PASS: (green, "PASS"),
            Severity.SKIP: (yellow, "SKIP"),
            Severity.WARN: (yellow, "WARN"),
            Severity.ERROR: (red, "FAIL"),
        }
        for entry in self.checks:
            paint, label = labels.get(entry.severity, (bold_red, "FATAL"))
            icon = paint(label)
            count_str = ""
            if entry.count > 0:
                count_str = f" (count={entry.count})"
            print(f" [{icon}] {entry.name}: {entry.message}{count_str}")

        print()

        # SKIP counts as passed for the summary line.
        total = len(self.checks)
        passed = len([e for e in self.checks if e.severity <= Severity.SKIP])
        summary = f" {passed}/{total} checks passed"

        worst = self.max_severity
        if worst <= Severity.SKIP:
            paint, suffix = green, ""
        elif worst == Severity.WARN:
            paint, suffix = yellow, " (with warnings)"
        elif worst == Severity.ERROR:
            paint, suffix = red, " (with errors)"
        else:
            paint, suffix = bold_red, " (FATAL issues detected)"
        print(paint(summary + suffix))

        print()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Log parsing helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def check_node_booted(log_text: str) -> bool:
    """Report whether *log_text* contains any known boot marker.

    Matching is case-sensitive and applied to the whole log text.
    """
    boot_markers = (
        r"app_main\(\)",
        r"main_task:",
        r"main:",
        r"ESP32-S3 CSI Node",
    )
    for marker in boot_markers:
        if re.search(marker, log_text):
            return True
    return False
|
||||
|
||||
|
||||
def check_node_crashed(log_text: str) -> Optional[str]:
    """Find the first log line that contains a crash indicator.

    Returns:
        That line, stripped and truncated to 120 characters, or None when
        no line matches. Matching is case-sensitive.
    """
    # Any single indicator is enough, so one alternation covers them all.
    crash_re = re.compile(
        r"Guru Meditation|assert failed|abort\(\)"
        r"|panic|LoadProhibited|StoreProhibited"
        r"|InstrFetchProhibited|IllegalInstruction"
    )
    hits = (row for row in log_text.splitlines() if crash_re.search(row))
    first = next(hits, None)
    if first is None:
        return None
    return first.strip()[:120]
|
||||
|
||||
|
||||
def extract_node_id_from_log(log_text: str) -> Optional[int]:
    """Pull the first numeric identifier out of the UART log.

    Lines are scanned in order; within a line the ``node_id`` form is tried
    first, then ``Node ID``, then ``TDM slot`` (all case-insensitive).

    Returns:
        The integer from the first match, or None when nothing matches.
    """
    id_regexes = [
        re.compile(expr, re.IGNORECASE)
        for expr in (
            r"node_id[=: ]+(\d+)",
            r"Node ID[=: ]+(\d+)",
            r"TDM slot[=: ]+(\d+)",
        )
    ]
    for row in log_text.splitlines():
        for regex in id_regexes:
            hit = regex.search(row)
            if hit is None:
                continue
            try:
                return int(hit.group(1))
            except (ValueError, IndexError):
                continue
    return None
|
||||
|
||||
|
||||
def check_vitals_in_log(log_text: str) -> bool:
    """Report whether any log line mentions a vitals keyword.

    Keywords are matched case-insensitively as substrings.
    """
    vitals_re = re.compile(
        r"vitals|breathing|breathing_bpm|heart_rate|heartrate",
        re.IGNORECASE,
    )
    for row in log_text.splitlines():
        if vitals_re.search(row):
            return True
    return False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Validation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _coerce_int(value) -> Optional[int]:
    """Return *value* converted with ``int()``, or None when that fails."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return None


def _aggregator_nodes(results: Optional[dict]) -> List[dict]:
    """Return the dict entries of ``results["nodes"]``; [] when absent or malformed."""
    if not isinstance(results, dict):
        return []
    entries = results.get("nodes")
    if not isinstance(entries, list):
        return []
    return [entry for entry in entries if isinstance(entry, dict)]


def validate_mesh(
    n_nodes: int,
    results_path: Optional[Path],
    log_paths: List[Path],
) -> ValidationReport:
    """Run all 6 mesh validation checks.

    Malformed aggregator data (a non-object JSON root, non-dict node
    entries, null or non-numeric ids/slots/frame counts) is ignored or
    reported as a check result instead of raising. An uncaught exception
    would end the process with status 1, which this script uses for
    "warnings".

    Args:
        n_nodes: Expected number of mesh nodes; nodes are indexed 0..n_nodes-1.
        results_path: Aggregator results JSON, or None to rely on logs only.
        log_paths: Per-node UART logs; position ``i`` is treated as node ``i``.
            A path that does not exist counts as an empty log.

    Returns:
        A ValidationReport holding one entry per check, plus a
        "Results JSON" entry when the results file is missing or unusable.
    """
    report = ValidationReport()

    # Load aggregator results if available
    results: Optional[dict] = None
    if results_path:
        if not results_path.exists():
            print(f"WARNING: Aggregator results file not found: {results_path}",
                  file=sys.stderr)
            report.add("Results JSON", Severity.WARN,
                       f"Results file not found: {results_path}")
        else:
            try:
                loaded = json.loads(results_path.read_text(encoding="utf-8"))
            except (ValueError, OSError) as exc:
                # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
                report.add("Results JSON", Severity.ERROR,
                           f"Failed to parse results: {exc}")
            else:
                if isinstance(loaded, dict):
                    results = loaded
                else:
                    report.add("Results JSON", Severity.ERROR,
                               "Expected a JSON object at top level, "
                               f"got {type(loaded).__name__}")

    agg_nodes = _aggregator_nodes(results)

    # Load per-node logs
    node_logs: Dict[int, str] = {}
    for idx, lp in enumerate(log_paths):
        if lp.exists():
            node_logs[idx] = lp.read_text(encoding="utf-8", errors="replace")
        else:
            node_logs[idx] = ""

    # ---- Check 1: All nodes booted ----
    booted = []
    not_booted = []
    crashed = []
    for idx in range(n_nodes):
        log_text = node_logs.get(idx, "")
        if not log_text.strip():
            not_booted.append(idx)
            continue
        crash_line = check_node_crashed(log_text)
        if crash_line:
            crashed.append((idx, crash_line))
        if check_node_booted(log_text):
            booted.append(idx)
        else:
            not_booted.append(idx)

    # A crash on any node outranks every other boot outcome.
    if crashed:
        crash_desc = "; ".join(f"node {i}: {msg}" for i, msg in crashed)
        report.add("All nodes booted", Severity.FATAL,
                   f"Crash detected: {crash_desc}", count=len(crashed))
    elif len(booted) == n_nodes:
        report.add("All nodes booted", Severity.PASS,
                   f"All {n_nodes} nodes booted successfully", count=n_nodes)
    elif len(booted) == 0:
        report.add("All nodes booted", Severity.FATAL,
                   f"No nodes booted (expected {n_nodes})")
    else:
        missing = ", ".join(str(i) for i in not_booted)
        report.add("All nodes booted", Severity.ERROR,
                   f"{len(booted)}/{n_nodes} booted; missing: [{missing}]",
                   count=len(booted))

    # ---- Check 2: TDM ordering ----
    # Extract TDM slots either from aggregator results or from logs
    tdm_slots: Dict[int, int] = {}

    # Try aggregator results first
    for node_entry in agg_nodes:
        nid = _coerce_int(node_entry.get("node_id"))
        slot = _coerce_int(node_entry.get("tdm_slot"))
        if nid is not None and slot is not None:
            tdm_slots[nid] = slot

    # Fall back to log extraction
    # NOTE(review): the value taken from the log may be a node_id rather than
    # a TDM slot (extract_node_id_from_log matches both) - confirm intent.
    if not tdm_slots:
        for idx in range(n_nodes):
            log_text = node_logs.get(idx, "")
            nid = extract_node_id_from_log(log_text)
            if nid is not None:
                tdm_slots[idx] = nid

    if len(tdm_slots) == n_nodes:
        expected = list(range(n_nodes))
        actual = [tdm_slots.get(i, -1) for i in range(n_nodes)]
        if actual == expected:
            report.add("TDM ordering", Severity.PASS,
                       f"Slots sequential 0..{n_nodes - 1}")
        else:
            report.add("TDM ordering", Severity.ERROR,
                       f"Expected slots {expected}, got {actual}")
    elif len(tdm_slots) > 0:
        report.add("TDM ordering", Severity.WARN,
                   f"Only {len(tdm_slots)}/{n_nodes} TDM slots detected",
                   count=len(tdm_slots))
    else:
        report.add("TDM ordering", Severity.SKIP,
                   "No TDM slot info found in results or logs")

    # ---- Check 3: No slot collision ----
    if tdm_slots:
        slot_to_nodes: Dict[int, List[int]] = {}
        for nid, slot in tdm_slots.items():
            slot_to_nodes.setdefault(slot, []).append(nid)

        collisions = {s: nodes for s, nodes in slot_to_nodes.items() if len(nodes) > 1}
        if not collisions:
            report.add("No slot collision", Severity.PASS,
                       f"All {len(tdm_slots)} slots unique")
        else:
            desc = "; ".join(f"slot {s}: nodes {ns}" for s, ns in collisions.items())
            report.add("No slot collision", Severity.ERROR,
                       f"Slot collisions: {desc}", count=len(collisions))
    else:
        report.add("No slot collision", Severity.SKIP,
                   "No TDM slot data to check for collisions")

    # ---- Check 4: Frame count balance (within +/-10%) ----
    frame_counts: Dict[int, int] = {}

    # Try aggregator results; entries without a usable id or count are skipped
    for node_entry in agg_nodes:
        nid = _coerce_int(node_entry.get("node_id"))
        fc = _coerce_int(
            node_entry.get("frame_count", node_entry.get("frames", 0)))
        if nid is not None and fc is not None:
            frame_counts[nid] = fc

    # Fall back to log extraction: keep the largest counter seen per node
    if not frame_counts:
        frame_pats = [
            r"frame[_ ]count[=: ]+(\d+)",
            r"frames?[=: ]+(\d+)",
            r"emitted[=: ]+(\d+)",
        ]
        for idx in range(n_nodes):
            log_text = node_logs.get(idx, "")
            max_fc = 0
            for line in log_text.splitlines():
                for pat in frame_pats:
                    m = re.search(pat, line, re.IGNORECASE)
                    if m:
                        max_fc = max(max_fc, int(m.group(1)))
            if max_fc > 0:
                frame_counts[idx] = max_fc

    if len(frame_counts) >= 2:
        counts = list(frame_counts.values())
        avg = sum(counts) / len(counts)
        if avg > 0:
            max_deviation = max(abs(c - avg) / avg for c in counts)
            details = ", ".join(f"node {nid}={fc}" for nid, fc in sorted(frame_counts.items()))
            if max_deviation <= 0.10:
                report.add("Frame count balance", Severity.PASS,
                           f"Within +/-10% (avg={avg:.0f}): {details}",
                           count=int(avg))
            elif max_deviation <= 0.25:
                report.add("Frame count balance", Severity.WARN,
                           f"Deviation {max_deviation:.0%} exceeds 10%: {details}",
                           count=int(avg))
            else:
                report.add("Frame count balance", Severity.ERROR,
                           f"Severe imbalance {max_deviation:.0%}: {details}",
                           count=int(avg))
        else:
            report.add("Frame count balance", Severity.ERROR,
                       "All frame counts are zero")
    elif len(frame_counts) == 1:
        report.add("Frame count balance", Severity.WARN,
                   f"Only 1 node reported frames: {frame_counts}")
    else:
        report.add("Frame count balance", Severity.WARN,
                   "No frame count data found")

    # ---- Check 5: ADR-018 compliance (magic 0xC5110001) ----
    ADR018_MAGIC = "c5110001"
    magic_found = False

    # Check aggregator results
    if results:
        results_str = json.dumps(results).lower()
        if ADR018_MAGIC in results_str or "0xc5110001" in results_str:
            magic_found = True
        # Also check a dedicated field
        if results.get("adr018_magic") or results.get("magic"):
            magic_found = True
        # Check per-node entries
        for node_entry in agg_nodes:
            magic = node_entry.get("magic", "")
            if isinstance(magic, str) and ADR018_MAGIC in magic.lower():
                magic_found = True
            elif isinstance(magic, int) and magic == 0xC5110001:
                magic_found = True

    # Check logs for serialization/ADR-018 markers
    if not magic_found:
        adr018_pats = [
            r"0xC5110001",
            r"c5110001",
            r"ADR-018",
            r"magic[=: ]+0x[Cc]5110001",
        ]
        for idx in range(n_nodes):
            log_text = node_logs.get(idx, "")
            if any(re.search(p, log_text, re.IGNORECASE) for p in adr018_pats):
                magic_found = True
                break

    if magic_found:
        report.add("ADR-018 compliance", Severity.PASS,
                   "Magic 0xC5110001 found in frame data")
    else:
        report.add("ADR-018 compliance", Severity.WARN,
                   "Magic 0xC5110001 not found (may require deeper frame inspection)")

    # ---- Check 6: Vitals per node ----
    vitals_nodes = []
    no_vitals_nodes = []
    for idx in range(n_nodes):
        log_text = node_logs.get(idx, "")
        if check_vitals_in_log(log_text):
            vitals_nodes.append(idx)
        else:
            no_vitals_nodes.append(idx)

    # Also check aggregator results for vitals data
    for node_entry in agg_nodes:
        nid = _coerce_int(node_entry.get("node_id"))
        has_vitals = (
            node_entry.get("vitals") is not None
            or node_entry.get("breathing_bpm") is not None
            or node_entry.get("heart_rate") is not None
        )
        if has_vitals and nid is not None and nid not in vitals_nodes:
            vitals_nodes.append(nid)
            if nid in no_vitals_nodes:
                no_vitals_nodes.remove(nid)

    if len(vitals_nodes) == n_nodes:
        report.add("Vitals per node", Severity.PASS,
                   f"All {n_nodes} nodes produced vitals output",
                   count=n_nodes)
    elif len(vitals_nodes) > 0:
        missing = ", ".join(str(i) for i in no_vitals_nodes)
        report.add("Vitals per node", Severity.WARN,
                   f"{len(vitals_nodes)}/{n_nodes} nodes have vitals; "
                   f"missing: [{missing}]",
                   count=len(vitals_nodes))
    else:
        report.add("Vitals per node", Severity.WARN,
                   "No vitals output found from any node")

    return report
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main():
    """Command-line entry point: validate a mesh run and exit with its status.

    Exit status is 0 when the worst check is PASS or SKIP, 1 for WARN,
    2 for ERROR and 3 otherwise; 3 is also used when ``--nodes`` is below 2.
    """
    parser = argparse.ArgumentParser(
        description="Validate multi-node mesh QEMU test output (ADR-061 Layer 3)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "Examples:\n"
            " python3 validate_mesh_test.py --nodes 3 --results mesh_results.json\n"
            " python3 validate_mesh_test.py --nodes 4 --log node0.log --log node1.log"
        ),
    )
    parser.add_argument("--results", default=None,
                        help="Path to mesh_test_results.json from aggregator")
    parser.add_argument("--nodes", "-n", type=int, required=True,
                        help="Expected number of mesh nodes")
    parser.add_argument("--log", action="append", default=[],
                        help="Path to a per-node QEMU log (can be repeated)")

    args = parser.parse_args()

    if args.nodes < 2:
        print("ERROR: --nodes must be >= 2", file=sys.stderr)
        sys.exit(3)

    results_path = Path(args.results) if args.results else None
    log_paths = [Path(lp) for lp in args.log]

    # If no log files given, use the conventional paths. Every index is kept,
    # even when its file is missing: validate_mesh() maps list position i to
    # node i and treats a missing file as an empty log, so dropping a missing
    # entry would attribute later logs to the wrong node.
    if not log_paths:
        log_paths = [Path(f"build/qemu_node{i}.log") for i in range(args.nodes)]

    report = validate_mesh(args.nodes, results_path, log_paths)
    report.print_report()

    # Map max severity to exit code
    max_sev = report.max_severity
    if max_sev <= Severity.SKIP:
        sys.exit(0)
    elif max_sev == Severity.WARN:
        sys.exit(1)
    elif max_sev == Severity.ERROR:
        sys.exit(2)
    else:
        sys.exit(3)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -3,8 +3,9 @@
|
|||
QEMU ESP32-S3 UART Output Validator (ADR-061)
|
||||
|
||||
Parses the UART log captured from a QEMU firmware run and validates
|
||||
14 checks covering boot, NVS, mock CSI, edge processing, vitals,
|
||||
presence/fall detection, serialization, and crash indicators.
|
||||
16 checks covering boot, NVS, mock CSI, edge processing, vitals,
|
||||
presence/fall detection, serialization, crash indicators, scenario
|
||||
completion, and frame rate sanity.
|
||||
|
||||
Usage:
|
||||
python3 validate_qemu_output.py <log_file>
|
||||
|
|
@ -16,6 +17,7 @@ Exit codes:
|
|||
3 Fatal (crash or corruption detected)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import re
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
|
|
@ -119,7 +121,7 @@ class ValidationReport:
|
|||
|
||||
|
||||
def validate_log(log_text: str) -> ValidationReport:
|
||||
"""Run all 14 validation checks against the UART log text."""
|
||||
"""Run all 16 validation checks against the UART log text."""
|
||||
report = ValidationReport()
|
||||
lines = log_text.splitlines()
|
||||
log_lower = log_text.lower()
|
||||
|
|
@ -131,7 +133,7 @@ def validate_log(log_text: str) -> ValidationReport:
|
|||
if boot_found:
|
||||
report.add("Boot", Severity.PASS, "Firmware booted successfully")
|
||||
else:
|
||||
report.add("Boot", Severity.ERROR, "No boot indicator found (app_main / main_task)")
|
||||
report.add("Boot", Severity.FATAL, "No boot indicator found (app_main / main_task)")
|
||||
|
||||
# ---- Check 2: NVS load ----
|
||||
nvs_patterns = [r"nvs_config:", r"nvs_config_load", r"NVS", r"csi_cfg"]
|
||||
|
|
@ -327,15 +329,55 @@ def validate_log(log_text: str) -> ValidationReport:
|
|||
report.add("Clean exit", Severity.WARN,
|
||||
"Reboot detected (may indicate crash or watchdog)")
|
||||
|
||||
# ---- Check 15: Scenario completion (when running all scenarios) ----
|
||||
all_scenarios_pattern = r"All (\d+) scenarios complete"
|
||||
scenario_match = re.search(all_scenarios_pattern, log_text)
|
||||
if scenario_match:
|
||||
n_scenarios = int(scenario_match.group(1))
|
||||
report.add("Scenario completion", Severity.PASS,
|
||||
f"All {n_scenarios} scenarios completed", count=n_scenarios)
|
||||
else:
|
||||
# Check if individual scenario started indicators exist
|
||||
scenario_starts = re.findall(r"=== Scenario (\d+) started ===", log_text)
|
||||
if scenario_starts:
|
||||
report.add("Scenario completion", Severity.WARN,
|
||||
f"Started {len(scenario_starts)} scenarios but no completion marker",
|
||||
count=len(scenario_starts))
|
||||
else:
|
||||
report.add("Scenario completion", Severity.SKIP,
|
||||
"No scenario tracking (single scenario or mock not enabled)")
|
||||
|
||||
# ---- Check 16: Frame rate sanity ----
|
||||
# Extract scenario frame counts and check they're reasonable
|
||||
frame_reports = re.findall(r"scenario=\d+ frames=(\d+)", log_text)
|
||||
if frame_reports:
|
||||
max_frames = max(int(f) for f in frame_reports)
|
||||
if max_frames > 0:
|
||||
report.add("Frame rate", Severity.PASS,
|
||||
f"Peak frame counter: {max_frames}", count=max_frames)
|
||||
else:
|
||||
report.add("Frame rate", Severity.ERROR,
|
||||
"Frame counters are all zero")
|
||||
else:
|
||||
report.add("Frame rate", Severity.SKIP,
|
||||
"No periodic frame reports found")
|
||||
|
||||
return report
|
||||
|
||||
|
||||
def main():
|
||||
if len(sys.argv) < 2:
|
||||
print(f"Usage: {sys.argv[0]} <log_file>", file=sys.stderr)
|
||||
sys.exit(3)
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Validate QEMU ESP32-S3 UART output (ADR-061)",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="Example: python3 validate_qemu_output.py build/qemu_output.log",
|
||||
)
|
||||
parser.add_argument(
|
||||
"log_file",
|
||||
help="Path to QEMU UART log file",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
log_path = Path(sys.argv[1])
|
||||
log_path = Path(args.log_file)
|
||||
if not log_path.exists():
|
||||
print(f"ERROR: Log file not found: {log_path}", file=sys.stderr)
|
||||
sys.exit(3)
|
||||
|
|
|
|||
Loading…
Reference in New Issue