"""Regenerate results/nan_windows_mask.npy + results/big_windows_mask.npy by scanning a PRISTINE kagglehub download of the WiFlow-STD dataset (kaka2434/wiflow-dataset v1, csi_windows.npy, 360,000 windows of 540x20). ============================ READ THIS FIRST =============================== This script MUST be run against an UNCLEANED copy of the dataset. remote/clean_v2.py (and its predecessor clean_nan.py) repair the dataset by zeroing the corrupted windows IN PLACE, with no backup. A cleaned copy contains no non-finite values and no out-of-range amplitudes, so on a cleaned copy this scan produces ALL-FALSE masks -- silently wrong ground truth. The script errors out loudly in that case (see the sanity check in main()). That irreversibility is exactly why the two committed mask files under results/ (gitignore-negated) are the canonical ground truth: once a download has been cleaned, the masks can NEVER be regenerated from it. Only run this on a fresh `kagglehub.dataset_download("kaka2434/wiflow-dataset")`. ============================================================================ Criteria (per window; mirrors the original 2026-06-10 scan and the remote/clean_v2.py repair criteria): nan mask: any non-finite value (NaN/Inf) anywhere in the 540x20 window big mask: max |finite value| > 1.5 (the data is otherwise [0,1]-normalized; the corrupted files contain garbage up to 3.4e38, float32 max) Expected result on the pristine Kaggle download (RESULTS.md defect 5): nan: 9,070 True | big: 9,072 True | union: 9,072 -- all windows in dataset files 487-499 (the final 13 files), window indices 350,922-359,999. Usage: PYTHONUTF8=1 .venv/Scripts/python.exe generate_corruption_masks.py \ [--data-dir ] [--out-dir results] """ import argparse import os import sys import numpy as np HERE = os.path.dirname(os.path.abspath(__file__)) RESULTS = os.path.join(HERE, "results") EXPECTED = {"nan": 9070, "big": 9072, "union": 9072, "files": (487, 499), "windows": (350922, 359999)} def scan(csi_path, chunk=4000): """Chunked scan of the (mmap'd) windows array; returns (nan_mask, big_mask).""" csi = np.load(csi_path, mmap_mode="r") n = len(csi) nan_mask = np.zeros(n, dtype=bool) big_mask = np.zeros(n, dtype=bool) for i in range(0, n, chunk): block = np.asarray(csi[i:i + chunk]) finite = np.isfinite(block) nan_mask[i:i + chunk] = (~finite).any(axis=(1, 2)) big_mask[i:i + chunk] = ( np.abs(np.where(finite, block, 0)).max(axis=(1, 2)) > 1.5) if (i // chunk) % 10 == 0: print(f" scanned {min(i + chunk, n):,}/{n:,} windows " f"(nan={int(nan_mask.sum()):,} big={int(big_mask.sum()):,})", flush=True) return nan_mask, big_mask def describe_files(data_dir, mask): """Map marked windows to dataset file indices via window_info.npz.""" info = os.path.join(data_dir, "window_info.npz") if not os.path.exists(info): return None w2f = np.load(info)["window_to_file"] return np.unique(w2f[mask]) def main(): parser = argparse.ArgumentParser( description="Regenerate the corruption masks from a PRISTINE " "(uncleaned) kagglehub download. See module docstring.") parser.add_argument("--data-dir", default=os.path.join( os.path.expanduser("~"), ".cache", "kagglehub", "datasets", "kaka2434", "wiflow-dataset", "versions", "1", "preprocessed_csi_data"), help="Directory containing csi_windows.npy (PRISTINE copy)") parser.add_argument("--out-dir", default=RESULTS, help="Where to write the two .npy masks") parser.add_argument("--chunk", type=int, default=4000, help="Windows per scan chunk (memory/speed tradeoff)") args = parser.parse_args() csi_path = os.path.join(args.data_dir, "csi_windows.npy") if not os.path.exists(csi_path): sys.exit(f"csi_windows.npy not found in {args.data_dir}") print(f"scanning {csi_path} (chunk={args.chunk}) ...") nan_mask, big_mask = scan(csi_path, args.chunk) union = nan_mask | big_mask print(f"nan: {int(nan_mask.sum()):,} | big: {int(big_mask.sum()):,} | " f"union: {int(union.sum()):,} of {len(union):,} windows") # ---- sanity check: an all-False result means a CLEANED copy ------------ if not union.any(): sys.exit( "ERROR: scan found ZERO corrupted windows.\n" "\n" "The pristine Kaggle download (kaka2434/wiflow-dataset v1) is " "known to contain\n" "9,072 corrupted windows (NaN/Inf + amplitudes up to 3.4e38) in " "dataset files\n" "487-499 (RESULTS.md, reproducibility defect 5). Finding none " "means this copy\n" "has almost certainly already been repaired by remote/clean_v2.py " "(or clean_nan.py),\n" "which zeroes the corrupted windows IN PLACE -- after that the " "corruption evidence\n" "is gone and the masks CANNOT be regenerated from this copy.\n" "\n" "Refusing to overwrite the committed ground-truth masks with " "all-False ones.\n" "Re-download the dataset (kagglehub.dataset_download(" "'kaka2434/wiflow-dataset'))\n" "and point --data-dir at the fresh, uncleaned copy.") files = describe_files(args.data_dir, union) if files is not None: print(f"marked windows span dataset files {files.min()}-{files.max()}: " f"{files.tolist()}") lo, hi = EXPECTED["files"] if files.min() != lo or files.max() != hi: print(f"WARNING: expected marked files exactly {lo}-{hi} " f"(the pristine v1 download); got {files.min()}-{files.max()}. " f"Different dataset version, or a partially cleaned copy?") for name, mask, exp in (("nan", nan_mask, EXPECTED["nan"]), ("big", big_mask, EXPECTED["big"])): if int(mask.sum()) != exp: print(f"WARNING: {name} mask has {int(mask.sum()):,} True windows; " f"the pristine v1 download yields {exp:,}.") os.makedirs(args.out_dir, exist_ok=True) for name, mask in (("nan_windows_mask.npy", nan_mask), ("big_windows_mask.npy", big_mask)): out = os.path.join(args.out_dir, name) np.save(out, mask) print(f"wrote {out} ({int(mask.sum()):,} True)") if __name__ == "__main__": main()