cloverleaf-larry/lib/nc-regression.sh

#!/usr/bin/env bash
# nc-regression.sh — Example 6 orchestrator: end-to-end regression testing
# between two Cloverleaf environments.
#
# Phases:
#   1. discover  → list inbound threads in scope (uses nc-inbound.sh)
#   2. sample    → grab N messages per inbound from env-A smatdb → input .msgs files
#   3. route-A   → run route_test on env-A for each inbound → captured outputs/env-a/...
#   4. route-B   → run route_test on env-B with same inputs → captured outputs/env-b/...
#   5. diff      → hl7-diff every paired output file with --ignore MSH.7 → per-pair report
#   6. summary   → one master regression-summary.md compiling everything
#
# Phases 3 and 4 require Cloverleaf's route_test command on each box. The
# command is parameterized via --route-test-cmd with placeholders:
#     {THREAD}      → the inbound thread name
#     {INPUT}       → absolute path to the .msgs input file
#     {OUTPUT_DIR}  → absolute path where output files should land
#     {HCIROOT}     → the env's HCIROOT
#     {HCISITE}     → the env's HCISITE
# Default: not set — you must pass it once for your shop's invocation pattern.
#
# A common pattern: a wrapper script that sources the Cloverleaf profile and
# runs `<thread> route_test <INPUT>`, with output redirected to OUTPUT_DIR.
# Example template you might pass:
#   --route-test-cmd 'cd {HCIROOT}/{HCISITE} && . ./.profile && {THREAD} route_test {INPUT} && cp *.out.* {OUTPUT_DIR}/'
#
# Usage:
#   nc-regression.sh --scope <SCOPE>
#                    --count N
#                    --env-a HCIROOT_A --site-a SITENAME
#                    --env-b HCIROOT_B --site-b SITENAME
#                    --out DIR
#                    --route-test-cmd 'TEMPLATE'
#                    [--ignore "FIELDS"]
#                    [--include-fields "FIELDS"]
#                    [--phase 1|2|3|4|5|6|all|env-a|env-b]
#                    [--dry-run]
#                    [--inbound-mode tcp-listen|icl-or-file|all]
#                    [--env-b-host HOST] [--env-b-user USER]   # for scp of inputs
#
# Scope formats:
#   thread:NAME              one specific thread
#   threads:N1,N2,N3         comma-separated list
#   site                     every inbound in the configured site
#   server                   every inbound in every site under HCIROOT
#
# ─────────────────────────────────────────────────────────────────────────────
# CHAIN-WALK MODE (v0.8.23, Bryan's REGRESSION CHAIN-WALK route-test capture)
# ─────────────────────────────────────────────────────────────────────────────
# A second, orthogonal entry point that captures the route_test output at EVERY
# routing(inbound) thread along an nc-paths chain, single-env (per-env capture).
# Bryan runs it on env-A and env-B with the same START input, then the existing
# diff phase (hl7-diff --ignore MSH.7) compares the captured outputs.
#
#   nc-regression.sh --chain-walk
#                    --start <site/thread | thread> [--site SITE]
#                    --count N
#                    --hciroot HCIROOT          # single env (this box)
#                    --out DIR
#                    [--route-test-bin hciroutetest]   # default: hciroutetest
#                    [--target <site/thread>]   # restrict to the chain ending here
#                    [--dry-run]                # generate cmds, do NOT exec engine
#
# Workflow (Bryan's spec):
#   1. Resolve the downstream chain(s) from START via nc-paths (--format jsonl).
#      Each chain is "site/thread --> ... ==> ... --> site/thread".  --target
#      restricts to the single chain whose terminus matches.
#   2. Grab the N most-recent messages from the START inbound's SMAT (nc-msgs
#      --limit N --format raw) → the chain-walk's ORIGINAL INPUT .msgs file.
#   3. Walk the chain. The route_test ENTRY threads are the START node plus every
#      node that immediately follows a cross-site "==>" hop (the remote inbound).
#      At each entry thread E:
#        hciroutetest -a -d -f nl -s <out_base> <E> <input>
#      route_test writes ONE FILE PER DESTINATION named  <out_base>.out.<DEST>
#      (the OUTBOUND/dest thread is the suffix; -f nl gives NEWLINE output, so no
#      manual len2nl+delete).  We capture ALL fan-out branches.
#   4. File SELECTION for the next step: the node IMMEDIATELY AFTER E in the path
#      is the suffix to select.  For E -->(intra) X, select  .out.<X>.  For the
#      cross-site boundary  S ==> R  (S = E's local outbound sender, the path node
#      right after E), the selected  .out.<S>  payload is the INPUT fed to the
#      next site's inbound R.  Continue to the chain terminus.
#   PRODUCES (under $OUT/chain/<chain-id>/):
#       input.msgs                        — the original START messages
#       step-NN.<E>.out.<DEST>            — every per-destination route_test output
#       step-NN.<E>.selected → next input — the file chosen to feed the next step
#       commands.sh                       — the exact generated command sequence
#       chain.txt                         — the resolved v1 chain line(s)
#
# hciroutetest is a Linux engine binary needing a LIVE engine — NOT runnable off
# the server. So chain-walk GENERATES the orchestration + the exact command
# STRINGS + the file-chaining/selection, and (with --dry-run) stubs the engine
# call. Real execution is on Bryan's server. The proven command syntax is mined
# verbatim from the v1/v2 wrappers (route_test_wrapper.py:10/149-156 and
# legacy_workflow_commands.py:1015 .out.<DEST> naming).
set -o pipefail

# Resolve this script's own directory so the sibling helper scripts are found
# no matter where the caller's working directory is.
#   CDPATH=''  with CDPATH exported and a relative $0, `cd` may resolve the
#              directory through CDPATH and print it on stdout, which would be
#              captured into LIB_DIR in front of pwd's output.
#   --         stops option parsing, so a path starting with "-" stays a path.
NC_SELF="$0"
LIB_DIR="$(CDPATH='' cd -- "$(dirname -- "$NC_SELF")" && pwd)"

# Sibling helpers used by the phases and by chain-walk mode.
NCP="$LIB_DIR/nc-parse.sh"
NCI="$LIB_DIR/nc-inbound.sh"
NCM="$LIB_DIR/nc-msgs.sh"
NCPATHS="$LIB_DIR/nc-paths.sh"
HL7DIFF="$LIB_DIR/hl7-diff.sh"

# v0.7.5: CR-safety primitive coerce_int. Byte counts taken from
# `wc -c | tr -d ' '` can carry a stray \r under Cygwin's wc.exe, and that
# breaks integer tests such as `[ "$got" -ge "$COUNT" ]`. Prefer the shared
# library when it is readable; otherwise define a local fallback.
if [ ! -r "$LIB_DIR/cygwin-safe.sh" ]; then
  # coerce_int RAW [DEFAULT]
  #   Print only the decimal digits found in RAW. When RAW contains no digit,
  #   print DEFAULT instead (DEFAULT itself falls back to 0).
  coerce_int() {
    local raw="${1:-}" fallback="${2:-0}" digits
    digits=$(printf '%s' "$raw" | tr -cd '0-9')
    printf '%s' "${digits:-$fallback}"
  }
else
  # shellcheck disable=SC1090,SC1091
  . "$LIB_DIR/cygwin-safe.sh"
fi

# Print a fatal message on stderr (prefixed with the tool name) and exit 1.
die() {
  printf '%s\n' "nc-regression: $*" >&2
  exit 1
}
# Print an informational message on stderr, prefixed with the tool name.
say() {
  printf '%s\n' "nc-regression: $*" >&2
}

# ── Option defaults (overridden by the command-line parser below) ───────────

# What to test and how many messages to sample per inbound.
SCOPE="";  COUNT=10

# The two environments under comparison: HCIROOT plus site name for each.
ENV_A="";  SITE_A=""
ENV_B="";  SITE_B=""

# Artifact directory and the route_test command template.
OUT=""
ROUTE_TEST_CMD=""

# Diff tuning handed to hl7-diff (--ignore / --include-fields).
IGNORE="MSH.7";  INCLUDE=""

# Run control.
PHASE="all";  DRY_RUN=0
INBOUND_MODE="all"

# Legacy scp target used to copy inputs to env-B.
ENV_B_HOST="";  ENV_B_USER=""

# Bundles: BUNDLE_OUT — after the env-A phases, tar up the artifacts here;
#          BUNDLE_IN  — at start, untar a bundle here as the env-A artifacts.
BUNDLE_OUT="";  BUNDLE_IN=""

# v0.6.8: cross-env via ssh-helper.sh ControlMaster aliases. When set, phases
# 1–4 read/write against the named remote alias; phases 5–6 always run locally
# because every artifact lives in $OUT (local).
SOURCE_SSH_ALIAS="";  TARGET_SSH_ALIAS=""

# v0.8.23: chain-walk mode (single-env, per-env route_test capture along a chain).
CHAIN_WALK=0
START=""
# Single-env HCIROOT for chain-walk (alias for --env-a in this mode).
# NOTE(review): this assignment unconditionally replaces any HCIROOT inherited
# from the caller's environment — confirm that is intended.
HCIROOT=""
SITE=""      # optional explicit site for the START thread
TARGET=""    # optional: restrict to the chain whose terminus is this node
ROUTE_TEST_BIN="hciroutetest"

# Parse the command line into the option globals declared above.
#   - Flags that take a value require one to follow. Previously a value flag
#     given as the very last argument silently stored an empty string.
#     An explicitly empty value (e.g. --ignore "") is still accepted.
#   - -h/--help prints the header comment of this script and exits 0.
#   - Unknown flags and stray positional arguments are fatal (die).
nc_parse_args() {
  while [ $# -gt 0 ]; do
    # Guard: every value-taking flag must be followed by its value.
    case "$1" in
      --start|--hciroot|--site|--target|--route-test-bin|--scope|--count|\
      --env-a|--site-a|--env-b|--site-b|--out|--route-test-cmd|--ignore|\
      --include-fields|--phase|--inbound-mode|--env-b-host|--env-b-user|\
      --bundle-out|--bundle-in|--source-ssh-alias|--target-ssh-alias)
        [ $# -ge 2 ] || die "missing value for $1"
        ;;
    esac
    case "$1" in
      --chain-walk)       CHAIN_WALK=1 ;;
      --start)            shift; START="$1" ;;
      --hciroot)          shift; HCIROOT="$1" ;;
      --site)             shift; SITE="$1" ;;
      --target)           shift; TARGET="$1" ;;
      --route-test-bin)   shift; ROUTE_TEST_BIN="$1" ;;
      --scope)            shift; SCOPE="$1" ;;
      --count)            shift; COUNT="$1" ;;
      --env-a)            shift; ENV_A="$1" ;;
      --site-a)           shift; SITE_A="$1" ;;
      --env-b)            shift; ENV_B="$1" ;;
      --site-b)           shift; SITE_B="$1" ;;
      --out)              shift; OUT="$1" ;;
      --route-test-cmd)   shift; ROUTE_TEST_CMD="$1" ;;
      --ignore)           shift; IGNORE="$1" ;;
      --include-fields)   shift; INCLUDE="$1" ;;
      --phase)            shift; PHASE="$1" ;;
      --dry-run)          DRY_RUN=1 ;;
      --inbound-mode)     shift; INBOUND_MODE="$1" ;;
      --env-b-host)       shift; ENV_B_HOST="$1" ;;
      --env-b-user)       shift; ENV_B_USER="$1" ;;
      --bundle-out)       shift; BUNDLE_OUT="$1" ;;
      --bundle-in)        shift; BUNDLE_IN="$1" ;;
      --source-ssh-alias) shift; SOURCE_SSH_ALIAS="$1" ;;
      --target-ssh-alias) shift; TARGET_SSH_ALIAS="$1" ;;
      -h|--help)          sed -n '2,94p' "$NC_SELF"; exit 0 ;;
      -*)                 die "unknown flag: $1" ;;
      *)                  die "extra arg: $1" ;;
    esac
    shift
  done
}
nc_parse_args "$@"
# The original top-level loop consumed every positional parameter; keep that
# post-condition for any later code.
set --

# Locate ssh-helper.sh for cross-env support. A caller-exported LARRY_LIB_DIR
# is tried first; when that candidate is not executable we fall back to the
# copy sitting next to this script.
SSH_HELPER="${LARRY_LIB_DIR:-$LIB_DIR}/ssh-helper.sh"
if [ ! -x "$SSH_HELPER" ]; then
  SSH_HELPER="$LIB_DIR/ssh-helper.sh"
fi

# _run_on ALIAS CMD...
#   Run CMD (all remaining arguments joined into a single string) either on
#   the remote ALIAS through ssh-helper.sh, or locally via `bash -c` when
#   ALIAS is empty. Dies if a remote run is requested but ssh-helper.sh is
#   not executable. Returns the status of the command that was run.
_run_on() {
  local host_alias="$1"
  shift
  if [ -n "$host_alias" ]; then
    [ -x "$SSH_HELPER" ] || die "ssh-helper.sh not found at $SSH_HELPER (needed for --source/target-ssh-alias)"
    "$SSH_HELPER" exec "$host_alias" "$*"
    return
  fi
  bash -c "$*"
}

# ═════════════════════════════════════════════════════════════════════════════
# CHAIN-WALK MODE (v0.8.23) — single-env route_test capture along an nc-paths
# chain. Orthogonal to the phase pipeline above; runs and exits here.
# ═════════════════════════════════════════════════════════════════════════════

# Reduce a chain node to its bare thread name: everything up to and including
# the last "/" is dropped, so "site/thread" → "thread" and "thread" → "thread".
cw_thread_of() {
  local node="$1"
  printf '%s' "${node##*/}"
}
# Site part of a "site/thread" node: the text before the first "/".
# Prints nothing when the node contains no slash.
cw_site_of() {
  local node="$1"
  if [[ "$node" == */* ]]; then
    printf '%s' "${node%%/*}"
  fi
}

# Tokenize ONE v1 chain line ("site/a --> site/b ==> site2/c ...") on stdout.
# The line is split on runs of spaces/tabs and, in left-to-right order, each
# token is emitted as one tagged record:
#     "A\t<arrow>"   for the arrow tokens "-->" (intra-site) / "==>" (cross-site)
#     "N\t<node>"    for every other non-empty token
# awk does the tokenizing so no bash regex is needed.
cw_parse_chain() {
  local chain="$1"
  awk '
    {
      # Default field splitting breaks the record on runs of blanks and
      # ignores leading/trailing whitespace, so every field is a real token.
      for (i = 1; i <= NF; i++) {
        if ($i == "-->" || $i == "==>") print "A\t" $i
        else                            print "N\t" $i
      }
    }' <<< "$chain"
}

# Compute the route_test ENTRY indices for a parsed chain.
# Arguments: the chain's arrow tokens in order, where arrow k sits between
#            node k and node k+1 (so the node count is #arrows + 1).
# Output:    one index per line — always 0 (the START node), plus k+1 for
#            every "==>" arrow, i.e. the node that immediately follows a
#            cross-site hop (the remote inbound).
cw_entry_indices() {
  # `a` is declared local so the loop variable no longer leaks into (and
  # overwrites) a same-named variable of the caller.
  local i=0 a
  echo 0                      # START is always a route_test entry
  for a in "$@"; do
    if [ "$a" = "==>" ]; then
      echo $((i+1))           # node right after a cross-site hop is an entry
    fi
    i=$((i+1))
  done
}

# Chain-walk entry point. Resolves the downstream chain(s) from $START via
# nc-paths and, for each chain, writes under $OUT/chain/NN/:
#     chain.txt    — the resolved v1 chain line
#     commands.sh  — the message grab plus one route_test invocation per
#                    entry thread, with the per-step file selection
# The engine binary is never invoked here (with or without --dry-run); the
# generated commands.sh is meant to be run on the engine box.
# Globals read:    START HCIROOT OUT SITE TARGET COUNT ROUTE_TEST_BIN NCPATHS NCM
# Globals written: SITE (defaulted from a "site/thread" START when unset)
# Dies on: missing options/helpers, a non-numeric --count, an output directory
#          that cannot be created, or when no chain can be resolved.
chain_walk() {
  [ -n "$START" ]   || die "chain-walk: missing --start <site/thread | thread>"
  [ -n "$HCIROOT" ] || die "chain-walk: missing --hciroot (single-env root)"
  [ -n "$OUT" ]     || die "chain-walk: missing --out DIR"
  [ -x "$NCPATHS" ] || die "chain-walk: nc-paths.sh not found/executable at $NCPATHS"
  [ -x "$NCM" ]     || die "chain-walk: nc-msgs.sh not found/executable at $NCM"
  # COUNT is pasted unquoted into the generated script (--limit N), so only
  # plain digits are acceptable.
  case "$COUNT" in
    ''|*[!0-9]*) die "chain-walk: --count must be a non-negative integer (got '$COUNT')" ;;
  esac

  # Fail loudly here rather than emitting one redirect error per chain and
  # still reporting success at the end.
  mkdir -p "$OUT" 2>/dev/null || die "chain-walk: cannot create output dir $OUT"

  # 1. Resolve downstream chains from START.
  say "=== CHAIN-WALK: resolve downstream chains from $START ==="
  local jsonl
  jsonl=$("$NCPATHS" "$START" --downstream --hciroot "$HCIROOT" --format jsonl 2>/dev/null)
  [ -n "$jsonl" ] || die "chain-walk: nc-paths returned no downstream chains for $START"

  # Extract the v1 path strings (one per JSON line). Pure-awk pull of "path":"…".
  local paths
  paths=$(printf '%s\n' "$jsonl" | awk -F'"path":"' 'NF>1 { p=$2; sub(/".*/,"",p); print p }')
  [ -n "$paths" ] || die "chain-walk: could not extract path strings from nc-paths jsonl"

  # If --target given, keep only the chain whose terminus matches it (bare-thread
  # match, so caller can pass either site/thread or just the thread name).
  if [ -n "$TARGET" ]; then
    local tgt; tgt=$(cw_thread_of "$TARGET")
    paths=$(printf '%s\n' "$paths" | awk -v t="$tgt" '
      { n=split($0,a,/[ \t]+/); last=a[n]; sub(/^.*\//,"",last); if (last==t) print }')
    [ -n "$paths" ] || die "chain-walk: --target $TARGET matched no chain terminus from $START"
  fi

  local nchains; nchains=$(printf '%s\n' "$paths" | grep -c .)
  say "resolved $nchains chain(s) to walk"

  # The START inbound's bare thread name + its site (for the message grab).
  local start_thread start_site
  start_thread=$(cw_thread_of "$START")
  start_site=$(cw_site_of "$START")
  [ -n "$start_site" ] && [ -z "$SITE" ] && SITE="$start_site"

  # 2. Generate the message-grab for the START inbound's SMAT (nc-msgs).
  #    We point nc-msgs at the START site dir via HCISITEDIR and ask for the
  #    N most-recent messages in raw format.
  local start_sitedir
  if [ -n "$SITE" ]; then
    start_sitedir="$HCIROOT/$SITE"
  else
    start_sitedir=$(find "$HCIROOT" -maxdepth 5 -name "${start_thread}.smatdb" -type f 2>/dev/null \
      | head -1 | sed 's#/exec/processes/.*##')
  fi
  # An empty site dir would silently produce HCISITEDIR='' in the generated
  # grab command — tell the operator instead of hiding it.
  [ -n "$start_sitedir" ] \
    || say "WARNING: could not locate ${start_thread}.smatdb under $HCIROOT; pass --site so the generated grab command gets a valid HCISITEDIR"
  local grab_cmd="HCISITEDIR='$start_sitedir' '$NCM' '$start_thread' --limit $COUNT --format raw"

  local chain_id=0 chain_line
  local tag val
  while IFS= read -r chain_line; do
    [ -z "$chain_line" ] && continue
    chain_id=$((chain_id+1))
    local cdir; cdir="$OUT/chain/$(printf '%02d' "$chain_id")"
    mkdir -p "$cdir" 2>/dev/null || die "chain-walk: cannot create chain dir $cdir"
    printf '%s\n' "$chain_line" > "$cdir/chain.txt"

    # Parse into NODES[] / ARROWS[] (ARROWS[i] sits between NODES[i] and NODES[i+1]).
    local NODES=() ARROWS=()
    while IFS=$'\t' read -r tag val; do
      case "$tag" in
        N) NODES+=("$val") ;;
        A) ARROWS+=("$val") ;;
      esac
    done < <(cw_parse_chain "$chain_line")

    local nnodes=${#NODES[@]}
    say "--- chain $chain_id ($nnodes nodes): $chain_line ---"

    # Determine route_test entry indices.
    local entries; entries=$(cw_entry_indices "${ARROWS[@]}")

    # 3. Walk. The first entry's input is the START messages; thereafter each
    #    step's input is the file SELECTED from the prior route_test.
    local cmds="$cdir/commands.sh"
    {
      printf '#!/usr/bin/env bash\n'
      printf '# GENERATED by nc-regression.sh --chain-walk (v0.8.23). Run ON THE ENGINE BOX.\n'
      printf '# Chain: %s\n' "$chain_line"
      printf '# Source the Cloverleaf profile so HCIROOT/HCISITE + engine libs are set:\n'
      printf '#   export HCIROOT=%s ; cd "$HCIROOT" ; . ./.profile   # or your shop pattern\n' "$HCIROOT"
      printf 'set -e\n\n'
      printf '# --- step 0: grab the %s most-recent messages from the START inbound SMAT ---\n' "$COUNT"
      # The redirect target is single-quoted like every other path in the
      # generated commands, so an --out directory containing spaces works.
      printf "%s > '%s'\n\n" "$grab_cmd" "$cdir/input.msgs"
    } > "$cmds"

    local step=0 prev_input="$cdir/input.msgs"
    local e
    while IFS= read -r e; do
      [ -z "$e" ] && continue
      local node="${NODES[$e]}"
      local ethread; ethread=$(cw_thread_of "$node")
      local esite;   esite=$(cw_site_of "$node")
      # The node IMMEDIATELY AFTER E is the DEST suffix to SELECT for the next step.
      local nexti=$((e+1))
      local select_suffix=""
      if [ "$nexti" -lt "$nnodes" ]; then
        select_suffix=$(cw_thread_of "${NODES[$nexti]}")
      fi
      local out_base="$cdir/step-$(printf '%02d' "$step").${ethread}"
      # The route_test command form (see the file header):
      #   hciroutetest -a -d -f nl -s <out_base> <source_thread> <input>
      # HCISITE is the entry node's own site, falling back to $SITE.
      local rt_cmd="HCISITE='${esite:-$SITE}' '$ROUTE_TEST_BIN' -a -d -f nl -s '$out_base' '$ethread' '$prev_input'"

      {
        printf '# --- step %s: route_test at routing(inbound) thread %s%s ---\n' \
          "$step" "$ethread" "$([ -n "$esite" ] && printf ' (site=%s)' "$esite")"
        printf '%s\n' "$rt_cmd"
        if [ -n "$select_suffix" ]; then
          printf '# select the per-destination output that feeds the NEXT step:\n'
          printf '#   route_test wrote one file per DEST as  %s.out.<DEST>\n' "$out_base"
          printf '#   the next node in the chain is %s → select  %s.out.%s\n' \
            "$select_suffix" "$out_base" "$select_suffix"
          printf 'cp "%s.out.%s" "%s.selected"\n\n' "$out_base" "$select_suffix" "$out_base"
        else
          printf '# chain terminus — no further selection; %s.out.* are the final outputs.\n\n' "$out_base"
        fi
      } >> "$cmds"

      # SELF-VERIFY echo (stderr): the generated command + selection for this step.
      say "  step $step  ENTRY=$ethread${esite:+ (site=$esite)}"
      say "    route_test : $rt_cmd"
      if [ -n "$select_suffix" ]; then
        say "    select     : ${out_base##*/}.out.$select_suffix  →  feeds next step"
      else
        say "    select     : (terminus) ${out_base##*/}.out.* are final"
      fi

      # Generation only: nothing is executed at this point, whether or not
      # --dry-run was given.

      # The selected file becomes the next step's input (for the NEXT entry).
      if [ -n "$select_suffix" ]; then
        prev_input="${out_base}.out.${select_suffix}"
      fi
      step=$((step+1))
    done <<< "$entries"

    say "  → generated: $cmds"
  done <<< "$paths"

  say "chain-walk done. Per-chain artifacts under: $OUT/chain/"
  say "Run each chain/NN/commands.sh ON THE ENGINE BOX (profile sourced) to capture outputs."
}

# Chain-walk is a self-contained mode: when requested, run it and stop here so
# none of the phase-pipeline code below is reached.
case "$CHAIN_WALK" in
  1)
    chain_walk
    exit 0
    ;;
esac

# ── Validate the phase-pipeline options ─────────────────────────────────────
if [ -z "$OUT" ]; then
  die "missing --out DIR"
fi

# With --bundle-in the bundle supplies scope/env-a/etc., so those options are
# only mandatory when no bundle is given.
if [ -z "$BUNDLE_IN" ]; then
  if [ -z "$SCOPE" ]; then
    die "missing --scope (thread:NAME | threads:N1,N2 | site | server)"
  fi
  if [ -z "$ENV_A" ]; then
    die "missing --env-a HCIROOT_A"
  fi
  if [ -z "$ENV_B" ]; then
    die "missing --env-b HCIROOT_B"
  fi
  # The env-a directory can only be checked when the source is local (no
  # --source-ssh-alias).
  if [ -z "$SOURCE_SSH_ALIAS" ] && [ ! -d "$ENV_A" ]; then
    die "env-a is not a directory: $ENV_A (and --source-ssh-alias unset)"
  fi
fi

# Either ssh alias switches on cross-env mode, which needs ssh-helper.sh.
if [ -n "$SOURCE_SSH_ALIAS$TARGET_SSH_ALIAS" ] && [ ! -x "$SSH_HELPER" ]; then
  die "ssh-helper.sh required for cross-env mode but not found at $SSH_HELPER"
fi

case "$PHASE" in
  1|2|3|4|5|6|all|env-a|env-b) ;;
  *) die "bad --phase (use 1|2|3|4|5|6|all|env-a|env-b)" ;;
esac

# Outside dry-run, a missing route_test template only earns a warning.
if [ "$DRY_RUN" != "1" ] && [ -z "$ROUTE_TEST_CMD" ]; then
  say "WARNING: --route-test-cmd is unset; phases 3 and 4 will be skipped (you can run them manually using the generated input files)"
fi

# Create the artifact tree. Every later phase writes below $OUT, so an
# uncreatable tree is fatal rather than silently ignored.
mkdir -p "$OUT" "$OUT/inputs" "$OUT/outputs/env-a" "$OUT/outputs/env-b" "$OUT/diff" 2>/dev/null \
  || die "cannot create output tree under $OUT"

# If --bundle-in given, untar the bundle into $OUT first. Manifest tells us
# what env-A was and (optionally) what route_test command to use.
if [ -n "$BUNDLE_IN" ]; then
  [ -f "$BUNDLE_IN" ] || die "bundle-in file not found: $BUNDLE_IN"
  say "unpacking bundle $BUNDLE_IN into $OUT/"
  # Show only the tail of tar's chatter, but judge success by tar's own exit
  # status (PIPESTATUS[0]) — `tail` would otherwise mask a failed extraction.
  tar -xzf "$BUNDLE_IN" -C "$OUT" 2>&1 | tail -5
  [ "${PIPESTATUS[0]}" -eq 0 ] || die "failed to unpack bundle $BUNDLE_IN into $OUT"
  if [ -f "$OUT/manifest.json" ]; then
    say "manifest from env-A:"
    cat "$OUT/manifest.json" >&2
    # Take the scope from the manifest unless --scope was given (needs jq).
    if [ -z "$SCOPE" ] && command -v jq >/dev/null 2>&1; then
      SCOPE=$(jq -r '.scope // ""' "$OUT/manifest.json")
    fi
  fi
fi

# ─────────────────────────────────────────────────────────────────────────────
# Phase 1: discover inbound threads in scope
# ─────────────────────────────────────────────────────────────────────────────
# Print the inbound thread names found in ONE NetConfig file, one per line.
# Runs nc-inbound ($NCI) in TSV mode and keeps column 1 of every row after
# the header line.
_inbounds_in_netconfig() {
  "$NCI" "$1" --mode "$INBOUND_MODE" --format tsv \
    | awk -F'\t' 'NR>1 {print $1}'
}

# Print, one per line, the inbound threads selected by $SCOPE:
#   thread:NAME      → NAME
#   threads:A,B,C    → A, B and C
#   site             → every inbound in $ENV_A/$SITE_A/NetConfig (needs --site-a)
#   server           → every inbound in every NetConfig up to two levels below $ENV_A
# With --source-ssh-alias the NetConfig files are pulled through ssh-helper.sh
# and parsed locally. Dies on an unknown scope, on scope=site without
# --site-a, or when the remote site NetConfig cannot be pulled.
discover_inbounds() {
  # All scratch variables are local; the loop variable `nc` used to leak into
  # the global scope in the local `server` branch.
  local nc local_nc ncs remote_nc
  case "$SCOPE" in
    thread:*)   printf '%s\n' "${SCOPE#thread:}" ;;
    threads:*)  printf '%s\n' "${SCOPE#threads:}" | tr ',' '\n' ;;
    site)
      [ -n "$SITE_A" ] || die "scope=site requires --site-a"
      if [ -n "$SOURCE_SSH_ALIAS" ]; then
        # Pull the remote NetConfig locally first, then parse it with our NCI.
        remote_nc="$ENV_A/$SITE_A/NetConfig"
        local_nc=$("$SSH_HELPER" pull "$SOURCE_SSH_ALIAS" "$remote_nc" 2>/dev/null | tail -1)
        if [ -z "$local_nc" ] || [ ! -f "$local_nc" ]; then
          die "could not pull remote NetConfig $remote_nc from $SOURCE_SSH_ALIAS"
        fi
        _inbounds_in_netconfig "$local_nc"
      else
        _inbounds_in_netconfig "$ENV_A/$SITE_A/NetConfig"
      fi
      ;;
    server)
      if [ -n "$SOURCE_SSH_ALIAS" ]; then
        # Find NetConfigs remotely, then pull each one and parse it locally.
        # Lines starting with "[ssh_exec:" are helper chatter, not paths.
        ncs=$("$SSH_HELPER" exec "$SOURCE_SSH_ALIAS" "find $ENV_A -maxdepth 2 -name NetConfig -type f 2>/dev/null" 2>/dev/null \
              | grep -v '^\[ssh_exec:' )
        while IFS= read -r nc; do
          [ -z "$nc" ] && continue
          local_nc=$("$SSH_HELPER" pull "$SOURCE_SSH_ALIAS" "$nc" 2>/dev/null | tail -1)
          # A NetConfig that cannot be pulled is skipped, not fatal.
          [ -n "$local_nc" ] && [ -f "$local_nc" ] || continue
          _inbounds_in_netconfig "$local_nc"
        done <<< "$ncs"
      else
        while IFS= read -r nc; do
          _inbounds_in_netconfig "$nc"
        done < <(find "$ENV_A" -maxdepth 2 -name NetConfig -type f 2>/dev/null)
      fi
      ;;
    *) die "bad --scope: $SCOPE" ;;
  esac
}

# Phase 1: resolve the scope into a de-duplicated, sorted list of inbound
# threads. Writes the list to $OUT/inbounds.txt and prints it as a bullet list
# on stdout. Returns 1 (after a notice) when the scope yields no inbound.
phase_1() {
  say "=== PHASE 1: discover inbounds ==="
  local found
  found=$(discover_inbounds | sort -u | grep -v '^$')
  if [ -n "$found" ]; then
    printf '%s\n' "$found" > "$OUT/inbounds.txt"
    local n
    n=$(wc -l <<< "$found" | tr -d ' ')
    say "discovered $n inbound thread(s) → $OUT/inbounds.txt"
    sed 's/^/  - /' <<< "$found"
  else
    say "no inbounds found in scope $SCOPE"
    return 1
  fi
}

# ─────────────────────────────────────────────────────────────────────────────
# Phase 2: sample N messages per inbound from env-A's smatdbs
# ─────────────────────────────────────────────────────────────────────────────
# Phase 2: build one input file per inbound thread ($OUT/inputs/<thread>.msgs)
# holding up to $COUNT messages, each followed by a 0x1c byte.
#   Remote source (--source-ssh-alias): pull the thread's recent messages with
#     `ssh-helper.sh pull-smat` and base64-decode them locally.
#   Local source: run nc-msgs ($NCM) with HCISITEDIR pointing at the site dir.
# When --site-a is unset the site is inferred from the location of
# <thread>.smatdb under $ENV_A. Runs phase 1 first if inbounds.txt is missing
# (returning 1 if that fails). In --dry-run mode only the intent is reported.
phase_2() {
  say "=== PHASE 2: sample $COUNT messages per inbound from env-A${SOURCE_SSH_ALIAS:+ via ssh:$SOURCE_SSH_ALIAS} ==="
  [ -f "$OUT/inbounds.txt" ] || { say "no inbounds file from phase 1; running phase 1 first"; phase_1 || return 1; }
  local thread sitedir input got
  local site_for_smat remote_smatdb rel_path sample_tsv smatdb_path
  # Columns of the pull-smat TSV; only b64 is consumed. Declared local so the
  # read loop does not leave them behind as globals.
  local unix_ts dir typ src dst b64
  local count=0
  while IFS= read -r thread; do
    [ -z "$thread" ] && continue
    input="$OUT/inputs/${thread}.msgs"

    if [ -n "$SOURCE_SSH_ALIAS" ]; then
      # Remote sampling. pull-smat needs a site name; when --site-a is unset we
      # try to infer it from a remote find for the thread's .smatdb.
      if [ "$DRY_RUN" = "1" ]; then
        say "  [dry-run] would pull-smat $thread (sampled days=14) from $SOURCE_SSH_ALIAS → $input"
      else
        site_for_smat="${SITE_A:-}"
        if [ -z "$site_for_smat" ]; then
          # Try to locate the .smatdb path remotely and infer the site.
          remote_smatdb=$("$SSH_HELPER" exec "$SOURCE_SSH_ALIAS" \
            "find $ENV_A -maxdepth 5 -name ${thread}.smatdb -type f 2>/dev/null | head -1" 2>/dev/null \
            | grep -v '^\[ssh_exec:' | head -1)
          if [ -n "$remote_smatdb" ]; then
            # site = first dir component under $ENV_A. Plain prefix/suffix
            # stripping: ENV_A is treated as literal text, never as a regex.
            rel_path=${remote_smatdb#"$ENV_A"/}
            site_for_smat=${rel_path%%/*}
          fi
        fi
        if [ -n "$site_for_smat" ]; then
          # Pull recent (last 14d) messages as TSV+b64; decode locally into the
          # input file separated by 0x1c (matching nc-msgs --format raw output).
          sample_tsv=$(mktemp)
          "$SSH_HELPER" pull-smat "$SOURCE_SSH_ALIAS" "$site_for_smat" "$thread" 14 > "$sample_tsv" 2>/dev/null
          # decode b64 column 6, separate messages with 0x1c
          : > "$input"
          got=0
          while IFS=$'\t' read -r unix_ts dir typ src dst b64; do
            [ -z "$b64" ] && continue
            printf '%s' "$b64" | base64 -d >> "$input" 2>/dev/null && {
              printf '\x1c' >> "$input"
              got=$((got+1))
              [ "$got" -ge "$COUNT" ] && break
            }
          done < "$sample_tsv"
          rm -f "$sample_tsv"
          say "  sampled $thread (remote, site=$site_for_smat) → $input ($got messages)"
        else
          say "  skip $thread: could not infer remote site (set --site-a)"
          continue
        fi
      fi
    else
      # Local sampling.
      sitedir=""
      if [ -n "$SITE_A" ]; then
        sitedir="$ENV_A/$SITE_A"
      else
        # The smatdb sits at <ENV_A>/<site>/exec/processes/<proc>/<thread>.smatdb,
        # i.e. five levels below ENV_A. The former -maxdepth 4 could never reach
        # it, so every thread was skipped whenever --site-a was unset.
        smatdb_path=$(find "$ENV_A" -maxdepth 5 -name "${thread}.smatdb" -type f 2>/dev/null | head -1)
        if [ -n "$smatdb_path" ]; then
          sitedir=$(dirname "$smatdb_path" | sed "s#/exec/processes/.*##")
        fi
        [ -z "$sitedir" ] && { say "  skip $thread: smatdb not found under $ENV_A"; continue; }
      fi
      if [ "$DRY_RUN" = "1" ]; then
        say "  [dry-run] would sample $COUNT msgs from $thread → $input"
      else
        HCISITEDIR="$sitedir" "$NCM" "$thread" --limit "$COUNT" --format raw > "$input" 2>/dev/null
        # v0.7.5: coerce_int on wc output — Cygwin wc.exe can emit \r.
        got=$(coerce_int "$(tr -cd $'\x1c' < "$input" | wc -c)" 0)
        say "  sampled $thread → $input ($got messages)"
      fi
    fi
    count=$((count+1))
  done < "$OUT/inbounds.txt"
  say "phase 2 done: $count thread(s) processed"
}

# ─────────────────────────────────────────────────────────────────────────────
# Phase 3 / 4: execute route_test on each env
# ─────────────────────────────────────────────────────────────────────────────
# render_cmd TEMPLATE THREAD INPUT OUTPUT_DIR HCIROOT HCISITE
#   Expand the {THREAD} {INPUT} {OUTPUT_DIR} {HCIROOT} {HCISITE} placeholders
#   of TEMPLATE (every occurrence) and print the result without a trailing
#   newline. Values are inserted literally.
render_cmd() {
  local tmpl="$1" thread="$2" input="$3" outdir="$4" hciroot="$5" hcisite="$6"
  local cmd="$tmpl"
  # The replacement is quoted so an "&" inside a value stays a literal "&":
  # bash >= 5.2 (patsub_replacement) otherwise expands an unquoted "&" in the
  # replacement to the matched placeholder text. The assignments are left
  # without OUTER double quotes on purpose — bash <= 4.2 does not run quote
  # removal on the replacement of a double-quoted substitution and would keep
  # the inner quotes as literal characters.
  cmd=${cmd//\{THREAD\}/"$thread"}
  cmd=${cmd//\{INPUT\}/"$input"}
  cmd=${cmd//\{OUTPUT_DIR\}/"$outdir"}
  cmd=${cmd//\{HCIROOT\}/"$hciroot"}
  cmd=${cmd//\{HCISITE\}/"$hcisite"}
  printf '%s' "$cmd"
}

# phase_routes LABEL HCIROOT HCISITE [SSH_ALIAS]
#   Shared body of phases 3 and 4: for every thread in $OUT/inbounds.txt that
#   has an input file, render $ROUTE_TEST_CMD and run it, collecting outputs
#   under $OUT/outputs/env-LABEL/<thread>.
#     SSH_ALIAS empty → the command runs locally via `bash -c`.
#     SSH_ALIAS set   → the input is pushed to a fixed staging path on the
#                       alias, the command (rendered with the remote paths) is
#                       run there, and every file in the remote output dir is
#                       pulled back.
#   Without --route-test-cmd the phase is skipped (returns 0). In --dry-run
#   mode the planned actions are only reported.
phase_routes() {
  local label="$1" hciroot="$2" hcisite="$3" ssh_alias="${4:-}"
  local where="(local)"
  if [ -n "$ssh_alias" ]; then
    where="via ssh:${ssh_alias}"
  fi
  say "=== PHASE ${label}: route_test on env-${label} ${where} ==="

  if [ -z "$ROUTE_TEST_CMD" ]; then
    say "no --route-test-cmd; skipping phase ${label}"
    say "to run manually, use the input files at $OUT/inputs/*.msgs"
    return 0
  fi

  local thread input outdir cmd
  local remote_input remote_outdir out_listing rf
  while IFS= read -r thread; do
    [ -n "$thread" ] || continue
    input="$OUT/inputs/${thread}.msgs"
    outdir="$OUT/outputs/env-${label}/${thread}"
    if [ ! -f "$input" ]; then
      say "  skip $thread: no input file"
      continue
    fi
    mkdir -p "$outdir"

    # ── local execution ──
    if [ -z "$ssh_alias" ]; then
      cmd=$(render_cmd "$ROUTE_TEST_CMD" "$thread" "$input" "$outdir" "$hciroot" "$hcisite")
      if [ "$DRY_RUN" = "1" ]; then
        say "  [dry-run] $thread:"
      else
        say "  $thread:"
      fi
      say "    \$ $cmd"
      if [ "$DRY_RUN" != "1" ]; then
        # Output is indented for readability; a failing command is reported
        # but does not stop the remaining threads.
        bash -c "$cmd" 2>&1 | sed 's/^/    /' || say "    (route_test exit non-zero — continuing)"
      fi
      continue
    fi

    # ── cross-env execution: push input, run remotely, pull outputs ──
    remote_input="/tmp/larry-regress/inputs/${thread}.msgs"
    remote_outdir="/tmp/larry-regress/outputs/env-${label}/${thread}"
    cmd=$(render_cmd "$ROUTE_TEST_CMD" "$thread" "$remote_input" "$remote_outdir" "$hciroot" "$hcisite")
    if [ "$DRY_RUN" = "1" ]; then
      say "  [dry-run] $thread (remote):"
      say "    ssh_push $input → $ssh_alias:$remote_input"
      say "    ssh_exec $ssh_alias: $cmd"
      say "    ssh_pull $ssh_alias:$remote_outdir/* → $outdir"
      continue
    fi
    say "  $thread (remote on $ssh_alias):"
    "$SSH_HELPER" exec "$ssh_alias" "mkdir -p $remote_outdir $(dirname "$remote_input")" >/dev/null 2>&1 || true
    if ! "$SSH_HELPER" push "$ssh_alias" "$input" "$remote_input" >/dev/null 2>&1; then
      say "    push failed for $thread"
      continue
    fi
    say "    \$ $cmd"
    "$SSH_HELPER" exec "$ssh_alias" "$cmd" 2>&1 | sed 's/^/    /' || say "    (route_test exit non-zero — continuing)"
    # Pull every file from remote_outdir back to the local outdir. Lines that
    # start with "[ssh_exec:" are helper chatter, not file names.
    out_listing=$("$SSH_HELPER" exec "$ssh_alias" "find $remote_outdir -maxdepth 1 -type f 2>/dev/null" 2>/dev/null | grep -v '^\[ssh_exec:')
    while IFS= read -r rf; do
      [ -z "$rf" ] && continue
      "$SSH_HELPER" pull "$ssh_alias" "$rf" "$outdir/$(basename "$rf")" >/dev/null 2>&1 || say "    pull failed: $rf"
    done <<< "$out_listing"
  done < "$OUT/inbounds.txt"
}

# Phase 3: run route_test for env-A (through --source-ssh-alias when set).
phase_3() {
  phase_routes "a" "$ENV_A" "$SITE_A" "$SOURCE_SSH_ALIAS"
}

# Phase 4: run route_test for env-B.
#   With --target-ssh-alias, phase_routes does the push/exec/pull itself.
#   Otherwise, when --env-b-host is given, the legacy path first copies the
#   input files to that host with plain ssh/scp (kept for environments where
#   the ssh-helper master isn't open); a failed copy is reported but not fatal.
phase_4() {
  if [ -n "$ENV_B_HOST" ] && [ -z "$TARGET_SSH_ALIAS" ]; then
    local login="${ENV_B_USER:-$USER}@${ENV_B_HOST}"
    say "copying input files to ${login}:${OUT}/inputs/"
    case "$DRY_RUN" in
      1)
        say "  [dry-run] scp -r $OUT/inputs/ ${login}:${OUT}/"
        ;;
      *)
        ssh "$login" "mkdir -p $OUT/inputs $OUT/outputs/env-b" || true
        scp -r "$OUT/inputs/" "${login}:${OUT}/" || say "scp failed; you'll need to copy manually"
        ;;
    esac
  fi
  phase_routes "b" "$ENV_B" "$SITE_B" "$TARGET_SSH_ALIAS"
}

# ─────────────────────────────────────────────────────────────────────────────
# Phase 5: diff outputs pair-by-pair
# ─────────────────────────────────────────────────────────────────────────────
# Phase 5: compare the captured outputs of env-A and env-B.
# For every thread in $OUT/inbounds.txt that has output dirs on both sides,
# files are paired by name and each pair is diffed with hl7-diff ($HL7DIFF,
# --ignore "$IGNORE" plus optional --include-fields "$INCLUDE"):
#   $OUT/diff/<thread>.<file>.md  — per-pair text report
#   $OUT/diff/_index.md           — header, one table row per file, summary
# A file present on only one side gets a "(missing on env-b)" or
# "(missing on env-a)" row. In --dry-run mode no diff is run; rows are marked
# [dry-run].
phase_5() {
  say "=== PHASE 5: diff env-a vs env-b outputs ==="
  local diff_index="$OUT/diff/_index.md"
  {
    printf '# Regression diff index\n\n'
    # NOTE: format string starts with '- ', so use printf '--' separator —
    # otherwise bash 3.2's printf (macOS default) reads the leading '-' as a
    # bad option and emits nothing. This was a latent bug pre-v0.6.8.
    printf -- '- env-A: `%s`%s\n- env-B: `%s`%s\n- scope: `%s`\n- count: %s msgs per inbound\n- ignore: `%s`\n%s\n\n' \
      "$ENV_A" "$([ -n "$SOURCE_SSH_ALIAS" ] && printf ' (via ssh:%s)' "$SOURCE_SSH_ALIAS")" \
      "$ENV_B" "$([ -n "$TARGET_SSH_ALIAS" ] && printf ' (via ssh:%s)' "$TARGET_SSH_ALIAS")" \
      "$SCOPE" "$COUNT" "$IGNORE" \
      "$([ -n "$INCLUDE" ] && printf -- '- include-only: `%s`' "$INCLUDE")"
    printf '| thread | dest | diffs | report |\n|---|---|---|---|\n'
  } > "$diff_index"

  local total_diff=0 total_pairs=0
  local thread destfile destname
  local a_dir b_dir b_pair report count
  local -a diff_args
  while IFS= read -r thread; do
    [ -z "$thread" ] && continue
    a_dir="$OUT/outputs/env-a/${thread}"
    b_dir="$OUT/outputs/env-b/${thread}"
    [ -d "$a_dir" ] || { say "  skip $thread: no env-a outputs"; continue; }
    [ -d "$b_dir" ] || { say "  skip $thread: no env-b outputs"; continue; }

    # Pair up by filename. The listing is sorted so the index rows come out in
    # the same order on every run.
    while IFS= read -r destfile; do
      destname=$(basename "$destfile")
      b_pair="$b_dir/$destname"
      total_pairs=$((total_pairs+1))
      if [ ! -f "$b_pair" ]; then
        echo "| \`$thread\` | \`$destname\` | (missing on env-b) | — |" >> "$diff_index"
        continue
      fi
      report="$OUT/diff/${thread}.${destname}.md"
      if [ "$DRY_RUN" = "1" ]; then
        # Reported through say (stderr) like every other dry-run notice.
        say "  [dry-run] would diff $destfile vs $b_pair → $report"
        echo "| \`$thread\` | \`$destname\` | [dry-run] | — |" >> "$diff_index"
        continue
      fi
      diff_args=(--ignore "$IGNORE")
      [ -n "$INCLUDE" ] && diff_args+=(--include-fields "$INCLUDE")
      count=$("$HL7DIFF" "${diff_args[@]}" --format count "$destfile" "$b_pair" 2>/dev/null || echo "?")
      {
        printf '# Diff: %s → %s\n\n- env-A: `%s`\n- env-B: `%s`\n\n' "$thread" "$destname" "$destfile" "$b_pair"
        "$HL7DIFF" "${diff_args[@]}" --format text "$destfile" "$b_pair" 2>/dev/null || true
      } > "$report"
      echo "| \`$thread\` | \`$destname\` | $count | [report](./$(basename "$report")) |" >> "$diff_index"
      # v0.7.5: coerce_int — defends against (a) a CR-tainted count from
      # hl7-diff under Cygwin and (b) the legitimate `?` fallback above, which
      # would otherwise crash the arithmetic below.
      total_diff=$((total_diff + $(coerce_int "$count" 0)))
      say "  $thread → $destname: $count diff(s)"
    done < <(find "$a_dir" -maxdepth 1 -type f 2>/dev/null | LC_ALL=C sort)

    # Reverse pass: an output that exists only on env-B is a difference too
    # and used to go completely unreported.
    while IFS= read -r destfile; do
      destname=$(basename "$destfile")
      [ -f "$a_dir/$destname" ] && continue
      total_pairs=$((total_pairs+1))
      echo "| \`$thread\` | \`$destname\` | (missing on env-a) | — |" >> "$diff_index"
      say "  $thread → $destname: present on env-b only"
    done < <(find "$b_dir" -maxdepth 1 -type f 2>/dev/null | LC_ALL=C sort)
  done < "$OUT/inbounds.txt"

  {
    printf '\n## Summary\n\n- pairs compared: %s\n- total field differences (post-ignore): %s\n' "$total_pairs" "$total_diff"
  } >> "$diff_index"
  say "phase 5 done: $total_pairs pairs compared, $total_diff total diffs"
  say "index: $diff_index"
}

# ─────────────────────────────────────────────────────────────────────────────
# Phase 6: master summary
# ─────────────────────────────────────────────────────────────────────────────
# phase_6 — write the top-level run summary to $OUT/regression-summary.md.
#
# Globals read: OUT, SCOPE, COUNT, ENV_A, SITE_A, ENV_B, SITE_B,
#               SOURCE_SSH_ALIAS, TARGET_SSH_ALIAS, IGNORE, INCLUDE
# The file lists the run configuration, every line of $OUT/inbounds.txt (if
# that file exists), every regular file directly under $OUT/inputs, the two
# output directories, and a pointer to diff/_index.md.
phase_6() {
  say "=== PHASE 6: master regression-summary.md ==="
  local report_path="$OUT/regression-summary.md"
  local stamp via_a="" via_b="" entry

  # ISO-8601 timestamp where `date -Iseconds` is supported, plain `date` otherwise.
  stamp=$(date -Iseconds 2>/dev/null || date)
  # The " via ssh:<alias>" suffix appears only when an alias is configured.
  if [ -n "$SOURCE_SSH_ALIAS" ]; then
    via_a=" via ssh:$SOURCE_SSH_ALIAS"
  fi
  if [ -n "$TARGET_SSH_ALIAS" ]; then
    via_b=" via ssh:$TARGET_SSH_ALIAS"
  fi

  {
    printf '# Regression test summary\n\nGenerated: %s\n\n## Configuration\n\n' "$stamp"
    # Every bullet format starts with '-', hence the `printf --` guard
    # (bash 3.2 / macOS would otherwise parse it as an option).
    printf -- '- scope: `%s`\n- count: %s messages per inbound\n' "$SCOPE" "$COUNT"
    printf -- '- env-A: `%s` (site=%s)%s\n' "$ENV_A" "${SITE_A:-auto}" "$via_a"
    printf -- '- env-B: `%s` (site=%s)%s\n' "$ENV_B" "${SITE_B:-auto}" "$via_b"
    printf -- '- ignore: `%s`\n' "$IGNORE"
    if [ -n "$INCLUDE" ]; then
      printf -- '- include-only: `%s`\n' "$INCLUDE"
    fi

    printf '\n## Inbounds tested\n\n'
    if [ -f "$OUT/inbounds.txt" ]; then
      # One bullet per line, including a last line without a trailing newline.
      while IFS= read -r entry || [ -n "$entry" ]; do
        printf -- '- `%s`\n' "$entry"
      done < "$OUT/inbounds.txt"
    fi

    printf '\n## Inputs\n\n'
    # A missing inputs directory simply yields an empty section.
    while IFS= read -r entry || [ -n "$entry" ]; do
      printf -- '- `%s`\n' "$entry"
    done < <(find "$OUT/inputs" -maxdepth 1 -type f 2>/dev/null)

    printf '\n## Output directories\n\n'
    printf -- '- env-A: `%s`\n- env-B: `%s`\n' "$OUT/outputs/env-a" "$OUT/outputs/env-b"
    printf '\n## Diff details\n\nSee [diff/_index.md](./diff/_index.md) for the per-pair table.\n'
  } > "$report_path"
  say "summary: $report_path"
}

# ─────────────────────────────────────────────────────────────────────────────
# Dispatch
# ─────────────────────────────────────────────────────────────────────────────
# Run the phase(s) selected by $PHASE. Chained phases are joined with `&&`,
# so a failing phase stops the rest of its chain. Note that `2` re-runs
# phase 1 first, while 3–6 run on their own and rely on artifacts already
# present under $OUT.
case "$PHASE" in
  1)     phase_1 ;;
  2)     phase_1 && phase_2 ;;
  3)     phase_3 ;;
  4)     phase_4 ;;
  5)     phase_5 ;;
  6)     phase_6 ;;
  all)   phase_1 && phase_2 && phase_3 && phase_4 && phase_5 && phase_6 ;;
  env-a) phase_1 && phase_2 && phase_3 ;;   # everything that uses env-A
  env-b) phase_4 && phase_5 && phase_6 ;;   # everything that uses env-B + diff
  # Without a default arm an unrecognized value ran nothing, silently.
  # Warn only (no exit) so any later steps of the script still happen.
  *)     say "WARN: unrecognized --phase '$PHASE' — no phases were run (expected 1|2|3|4|5|6|all|env-a|env-b)" ;;
esac

# _nc_json_escape STRING
# Print STRING escaped for use inside a JSON string literal: backslash and
# double quote get a backslash, newline/CR/tab become \n \r \t, and any other
# control character becomes \uXXXX. Everything else is passed through as-is.
_nc_json_escape() {
  local raw=$1 out='' ch code i
  for ((i = 0; i < ${#raw}; i++)); do
    ch=${raw:i:1}
    case "$ch" in
      '\')   out+='\\' ;;
      '"')   out+='\"' ;;
      $'\n') out+='\n' ;;
      $'\r') out+='\r' ;;
      $'\t') out+='\t' ;;
      [[:cntrl:]])
        # "'c" makes printf emit the character's numeric code.
        printf -v code '\\u%04x' "'$ch"
        out+=$code
        ;;
      *)     out+=$ch ;;
    esac
  done
  printf '%s' "$out"
}

# Optional: produce a portable bundle of the env-A artifacts (inputs + a-outputs)
# so the user can move them to the env-B box manually.
#
# Writes $OUT/manifest.json and $OUT/README.md, then tars them together with
# inputs/, outputs/env-a/ and inbounds.txt into $BUNDLE_OUT.
if [ -n "$BUNDLE_OUT" ]; then
  say "producing bundle: $BUNDLE_OUT"

  # "count" is emitted as a bare JSON number; anything that is not a plain
  # non-negative integer would make the manifest unparseable, so use null.
  case "$COUNT" in
    '' | *[!0-9]*) _nc_count_json=null ;;
    *)             _nc_count_json=$COUNT ;;
  esac

  # Every string value goes through _nc_json_escape: paths and especially the
  # route-test command template may contain quotes or backslashes.
  {
    printf '{'
    printf '"generated":"%s",' "$(_nc_json_escape "$(date -Iseconds 2>/dev/null || date)")"
    printf '"host":"%s",' "$(_nc_json_escape "$(hostname 2>/dev/null || echo unknown)")"
    printf '"env_a":"%s",' "$(_nc_json_escape "$ENV_A")"
    printf '"site_a":"%s",' "$(_nc_json_escape "$SITE_A")"
    printf '"env_b_expected":"%s",' "$(_nc_json_escape "$ENV_B")"
    printf '"site_b_expected":"%s",' "$(_nc_json_escape "$SITE_B")"
    printf '"scope":"%s",' "$(_nc_json_escape "$SCOPE")"
    printf '"count":%s,' "$_nc_count_json"
    printf '"ignore":"%s",' "$(_nc_json_escape "$IGNORE")"
    printf '"route_test_cmd_hint":"%s"' "$(_nc_json_escape "$ROUTE_TEST_CMD")"
    printf '}\n'
  } > "$OUT/manifest.json"

  # Unquoted heredoc on purpose: $(basename ...) and $OUT are expanded, while
  # \` and \\ produce a literal backtick and a literal backslash.
  cat > "$OUT/README.md" <<EOF
# Regression bundle — env-A artifacts

Take this bundle to the env-B box and run:

\`\`\`
nc-regression.sh --bundle-in $(basename "$BUNDLE_OUT") --out $OUT \\
  --env-b /path/to/env-b/integrator --site-b <site> \\
  --route-test-cmd '<env-b route_test command>' \\
  --phase env-b
\`\`\`

The bundle contains:
- inputs/        — sampled messages from env-A (one .msgs per inbound)
- outputs/env-a/ — route_test outputs from env-A
- manifest.json  — env-A metadata
- inbounds.txt   — the threads tested

env-B side will produce outputs/env-b/ and the diff/ tree.
EOF

  # tar's stderr stays suppressed as before, but its exit status is no longer
  # ignored: "bundle ready" is only claimed when an archive actually exists.
  _nc_tar_rc=0
  tar -czf "$BUNDLE_OUT" -C "$OUT" inputs outputs/env-a inbounds.txt manifest.json README.md 2>/dev/null \
    || _nc_tar_rc=$?
  if [ ! -s "$BUNDLE_OUT" ]; then
    say "bundle FAILED: tar could not write $BUNDLE_OUT (tar exit $_nc_tar_rc)"
  elif [ "$_nc_tar_rc" -ne 0 ]; then
    # An archive was written but tar complained, e.g. one of the listed paths
    # was absent. Keep the best-effort result and flag it.
    say "bundle ready with warnings (tar exit $_nc_tar_rc; some listed paths may be missing): $BUNDLE_OUT ($(du -h "$BUNDLE_OUT" | awk '{print $1}'))"
  else
    say "bundle ready: $BUNDLE_OUT ($(du -h "$BUNDLE_OUT" | awk '{print $1}'))"
  fi
fi

# Closing status line pointing the user at the output root.
# NOTE(review): as far as this section shows, this is printed unconditionally —
# the result of the phase dispatch is not checked before reaching this line.
say "regression run done. Output root: $OUT"