name: Verify Pipeline Determinism

# Triggers: pushes and pull requests that touch the v1 pipeline sources, the
# proof data, the pinned requirements, or the verify-pipeline workflow file,
# plus manual runs from the Actions tab.
on:
  push:
    branches:
      - main
      - master
      - 'claude/**'
    paths:
      - 'archive/v1/src/core/**'
      - 'archive/v1/src/hardware/**'
      - 'archive/v1/data/proof/**'
      - 'archive/v1/requirements-lock.txt'
      - '.github/workflows/verify-pipeline.yml'
  pull_request:
    branches:
      - main
      - master
    # Same path filter as the push trigger above.
    paths:
      - 'archive/v1/src/core/**'
      - 'archive/v1/src/hardware/**'
      - 'archive/v1/data/proof/**'
      - 'archive/v1/requirements-lock.txt'
      - '.github/workflows/verify-pipeline.yml'
  workflow_dispatch:

jobs:
  # Single job: regenerates the reference signal, runs the proof verification
  # script twice, and scans production code for unseeded np.random usage.
  verify-determinism:
    name: Verify Pipeline Determinism
    runs-on: ubuntu-latest
    # Least privilege: the steps only read the checked-out repository, so the
    # GITHUB_TOKEN does not need any write scope.
    permissions:
      contents: read
    strategy:
      matrix:
        # Quoted so the version stays a string (e.g. 3.10 must not become 3.1).
        python-version: ['3.11']

    steps:
      # Check out the repository so the later steps can run its scripts.
      - uses: actions/checkout@v4
        name: Checkout repository

      # Install the interpreter version selected by the job matrix.
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.python-version }}
          # Reuse pip's download cache between runs. The lock file is not at
          # the repository root, so its path must be given explicitly for the
          # cache key to be derived from it.
          cache: pip
          cache-dependency-path: archive/v1/requirements-lock.txt

      # Install the dependency set listed in the v1 lock file.
      - name: Install pinned dependencies
        run: |
          # Invoke pip through the interpreter for both commands so the
          # packages are installed into the Python selected above, regardless
          # of which `pip` executable happens to be first on PATH.
          python -m pip install --upgrade pip
          python -m pip install -r archive/v1/requirements-lock.txt

      # Runs the reference-signal generator, validates the metadata file it is
      # expected to produce, and reports whether the run modified any tracked
      # file under archive/v1/data/proof/.
      - name: Verify reference signal is reproducible
        run: |
          echo "=== Regenerating reference signal ==="
          python archive/v1/data/proof/generate_reference_signal.py
          echo ""
          echo "=== Validating reference signal metadata ==="
          # A quoted heredoc keeps the Python source free of shell quoting and
          # expansion rules.
          python - <<'PY'
          import json

          with open('archive/v1/data/proof/sample_csi_meta.json', encoding='utf-8') as f:
              meta = json.load(f)
          # `is True` rejects truthy non-boolean values such as 1.
          assert meta['is_synthetic'] is True, 'Metadata must mark signal as synthetic'
          assert meta['numpy_seed'] == 42, 'Seed must be 42'
          print('Reference signal metadata validated.')
          PY
          echo ""
          echo "=== Checking data file matches committed version ==="
          # List tracked proof files whose content differs from the checked-out
          # commit after regeneration.
          # NOTE(review): reported as a warning rather than a failure because
          # it is not visible from this workflow whether the generator output
          # is byte-stable (e.g. embedded timestamps) - promote to `exit 1`
          # once that is confirmed.
          CHANGED=$(git diff --name-only -- archive/v1/data/proof/)
          if [ -n "$CHANGED" ]; then
            echo "::warning title=Reference signal drift::Regenerating the reference signal modified tracked files under archive/v1/data/proof/ (see step log)."
            echo "Files that differ from the committed version:"
            echo "$CHANGED"
          else
            echo "Regenerated proof files match the committed versions."
          fi

      # First verification pass: runs data/proof/verify.py from archive/v1.
      - name: Run pipeline verification
        env:
          # Force scipy.fft / BLAS onto a single thread. With multiple threads
          # the reduction order varies between CI runs, which makes the result
          # non-deterministic. Per the issue #560 follow-up, quantizing to 9
          # and then 6 decimals did not help, because the divergence comes
          # from threading order rather than SIMD reordering. The cost of
          # single-threading is a ~2-3x slowdown.
          MKL_NUM_THREADS: "1"
          NUMEXPR_NUM_THREADS: "1"
          OMP_NUM_THREADS: "1"
          OPENBLAS_NUM_THREADS: "1"
          VECLIB_MAXIMUM_THREADS: "1"
        working-directory: archive/v1
        run: |
          echo "=== Running pipeline verification ==="
          python data/proof/verify.py
          echo
          echo "Pipeline verification PASSED."

      # Second verification pass: invokes data/proof/verify.py once more with
      # the same thread-pinning variables as the preceding verification step.
      - name: Run verification twice to confirm determinism
        env:
          # Single-threaded numeric libraries, matching the first pass.
          MKL_NUM_THREADS: "1"
          NUMEXPR_NUM_THREADS: "1"
          OMP_NUM_THREADS: "1"
          OPENBLAS_NUM_THREADS: "1"
          VECLIB_MAXIMUM_THREADS: "1"
        working-directory: archive/v1
        run: |
          echo "=== Second run for determinism confirmation ==="
          python data/proof/verify.py
          echo "Determinism confirmed across multiple runs."

      # Advisory scan of archive/v1/src/ for np.random calls that do not use
      # an explicit seed. Findings are reported as a warning and never fail
      # the job.
      - name: Check for unseeded np.random in production code
        run: |
          echo "=== Scanning for unseeded np.random usage in production code ==="
          # Lines carrying one of these comment markers are treated as known
          # placeholder / mock / test code and skipped.
          ALLOWED_MARKERS='# (placeholder|mock|test)'
          # Mentions of the seed-aware APIs are accepted unless they are called
          # with empty parentheses: RandomState(), seed() and default_rng()
          # without an argument draw entropy from the OS and are therefore
          # unseeded. A name followed by anything other than "()" (an argument,
          # a type-hint context, a call continued on the next line) is accepted.
          SEEDED_USAGE='np\.random\.(RandomState|seed|default_rng)([^(]|\([^)]|\($|$)'

          # `|| true` keeps the step alive when grep finds nothing (exit 1).
          VIOLATIONS=$(grep -rn "np\.random\." archive/v1/src/ \
            --include="*.py" \
            --exclude-dir="__pycache__" \
            | grep -Ev "$ALLOWED_MARKERS" \
            | grep -Ev "$SEEDED_USAGE" \
            || true)

          if [ -n "$VIOLATIONS" ]; then
            # Surface the finding as an annotation on the run summary; plain
            # log output of a passing step is easy to miss.
            echo "::warning title=Unseeded np.random usage::Found potential unseeded np.random usage in production code (see step log for the list)."
            echo ""
            echo "WARNING: Found potential unseeded np.random usage in production code:"
            echo "$VIOLATIONS"
            echo ""
            echo "Each np.random call should either:"
            echo "  1. Use np.random.RandomState(seed) or np.random.default_rng(seed)"
            echo "  2. Be in a test/mock context (add '# placeholder' comment)"
            echo ""
            # Note: This is a warning, not a failure, because some existing
            # placeholder code in parsers uses np.random for mock data.
            # Once hardware integration is complete, these should be removed.
            echo "WARNING: Review the above usages. Existing parser placeholders are expected."
          else
            echo "No unseeded np.random usage found in production code."
          fi