cloverleaf-larry/lib/nc-smat-diff.sh
Bryan Johnson 9dd5821436 v0.7.5: OAuth CR-taint fix + mouse opt-in + CR-safety sweep
- Fix bash arithmetic crash on MobaXterm/Cygwin: $(date +%s) was
  returning CR-tainted values landing in $(( )) operands
- Mouse mode off by default; opt in via LARRY_MOUSE=1 or /mouse on
- Comprehensive CR-safety sweep across lib/*.sh and larry.sh — every
  command-substitution result, file read, and user input that feeds
  an arithmetic context, case dispatcher, or path/header is now
  CR-stripped at the source

New shared helper lib/cygwin-safe.sh defines three primitives:
  coerce_int VAL [DEFAULT]   — for arithmetic / integer-test operands
  strip_cr VAL               — for case patterns, regex tests, paths, headers
  read_clean VAR [PROMPT]    — read -r wrapper that strips CR pre-assign

Hardened call sites (14 files, 60+ patch points):
  - larry.sh:  status-line date/tput, 3 y/N approvals, auth menu, API key
  - lib/oauth.sh:  cmd_login + cmd_refresh date+%s captures
  - lib/nc-engine.sh:  5 y/N action prompts + find|wc arithmetic
  - lib/nc-msgs.sh:  parse_time_ms (4 date sites) + meta-TSV time + MSG_COUNT
  - lib/nc-regression.sh:  tr|wc count + hl7-diff ?-fallback arithmetic
  - lib/nc-smat-diff.sh:  A_COUNT/B_COUNT/DIFFS_TOTAL
  - lib/nc-insert-protocol.sh:  every awk-emitted line number → head/tail math
  - lib/journal.sh:  _next_seq wc -l arithmetic
  - lib/lessons.sh:  _next_id/_count + 2 y/N prompts
  - lib/hl7-sanitize.sh:  cmd_count + clear-table y/N
  - lib/ssh-helper.sh:  4 local+remote wc -c integer compares
  - lib/nc-find.sh, lib/nc-table.sh, lib/nc-document.sh, larry-rollback.sh

Reproduces the exact error Bryan hit:
  bash: ...: arithmetic syntax error: invalid arithmetic operator (error token is "")

lib/cygwin-safe.sh added to MANIFEST so it auto-syncs on next launch.

Co-Authored-By: Clover (Claude Opus 4.7) <noreply@anthropic.com>
2026-05-27 19:17:48 -07:00

174 lines
6.3 KiB
Bash
Executable File

#!/usr/bin/env bash
# nc-smat-diff.sh — diff smat (message archive) content across two environments.
# Different from nc-regression which runs route_test; this just samples actual
# stored messages from each env's smatdb and compares them.
#
# Pairing strategy:
# By default, sample N most-recent messages from each side, then pair by
# MSH.10 (message control ID) — the standard HL7 unique identifier.
# Unmatched IDs are reported as A-only / B-only.
#
# Usage:
# nc-smat-diff.sh <thread> --env-a HCIROOT_A --env-b HCIROOT_B [opts]
#
# Options:
# --site-a SITE site on env-A (default $HCISITE)
# --site-b SITE site on env-B (default same as --site-a)
# --limit N messages to sample per side (default 50)
# --ignore FIELDS hl7-diff --ignore list (default "MSH.7")
# --out DIR output directory for per-msg diffs + summary
# --pair-on FIELD HL7 field to pair messages on (default MSH.10)
# --include-history include SmatHistory archives on both sides
# --after EXPR only messages after this time (e.g. "3 days ago")
set -o pipefail

# Locate the directory this script lives in so the sibling helper scripts can
# be addressed by absolute path regardless of the caller's working directory.
NC_SELF="$0"

# CDPATH is blanked for the cd: when it is set, a relative dirname (e.g. "lib")
# can resolve to a different directory, and cd then echoes that directory into
# the command substitution, corrupting the captured path.
LIB_DIR="$(CDPATH='' cd -- "$(dirname -- "$NC_SELF")" && pwd)" || {
  printf 'nc-smat-diff: cannot resolve script directory for %s\n' "$NC_SELF" >&2
  exit 1
}

# Helper scripts invoked further down in this file.
NCM="$LIB_DIR/nc-msgs.sh"
HL7F="$LIB_DIR/hl7-field.sh"
HL7DIFF="$LIB_DIR/hl7-diff.sh"
# v0.7.5: shared CR-safety primitives — A_COUNT/B_COUNT/DIFFS_TOTAL all feed
# `%d` printf and arithmetic; Cygwin wc.exe CR-taint would crash them.
if [ -r "$LIB_DIR/cygwin-safe.sh" ]; then
  # shellcheck disable=SC1090,SC1091
  . "$LIB_DIR/cygwin-safe.sh"
fi

# Fallback, used whenever the shared library did not provide coerce_int
# (file missing, or sourced without defining the function).
if ! declare -F coerce_int >/dev/null 2>&1; then
  # coerce_int VAL [DEFAULT]
  #   Print VAL reduced to its decimal digits; every other character (CR,
  #   whitespace, sign, "?") is dropped. Prints DEFAULT (0 when omitted or
  #   empty) if no digit remains.
  #   Leading zeros are removed so the result is never read as an octal
  #   constant by $(( )) or printf %d — "08" would otherwise abort both.
  coerce_int() {
    local raw="${1:-}" fallback="${2:-0}" digits
    digits=$(printf '%s' "$raw" | tr -cd '0-9')
    if [ -z "$digits" ]; then
      printf '%s' "$fallback"
      return
    fi
    # Trim leading zeros but keep a lone "0".
    while [ "${#digits}" -gt 1 ] && [ "${digits#0}" != "$digits" ]; do
      digits="${digits#0}"
    done
    printf '%s' "$digits"
  }
fi
# die MESSAGE...
#   Write the arguments, joined into one line and prefixed with the tool name,
#   to stderr, then terminate the script with exit status 1.
die() {
  local msg="$*"
  printf '%s\n' "nc-smat-diff: ${msg}" 1>&2
  exit 1
}
# Option state. These are plain globals: dump_side and the reporting code
# later in this file read them directly.
THREAD=""             # positional: thread name
ENV_A=""              # --env-a
ENV_B=""              # --env-b
SITE_A="${HCISITE:-}" # --site-a (defaults to $HCISITE)
SITE_B=""             # --site-b (filled from SITE_A later when left empty)
LIMIT=50              # --limit
IGNORE="MSH.7"        # --ignore
OUT=""                # --out
PAIR_ON="MSH.10"      # --pair-on
INC_HIST=0            # --include-history (1 when given)
AFTER=""              # --after

# parse_args ARG...
#   Populate the option globals above from the command line.
#   Dies when a value-taking flag is the last argument (previously it was
#   silently accepted with an empty value), on an unknown flag, on a second
#   positional argument, or when --limit is not a non-negative integer.
#   -h/--help prints the header comment of this script and exits 0.
parse_args() {
  local flag
  while [ $# -gt 0 ]; do
    case "$1" in
      --env-a | --env-b | --site-a | --site-b | --limit | --ignore | --out | --pair-on | --after)
        flag="$1"
        [ $# -ge 2 ] || die "missing value for $flag"
        shift
        case "$flag" in
          --env-a) ENV_A="$1" ;;
          --env-b) ENV_B="$1" ;;
          --site-a) SITE_A="$1" ;;
          --site-b) SITE_B="$1" ;;
          --limit) LIMIT="$1" ;;
          --ignore) IGNORE="$1" ;;
          --out) OUT="$1" ;;
          --pair-on) PAIR_ON="$1" ;;
          --after) AFTER="$1" ;;
        esac
        ;;
      --include-history)
        INC_HIST=1
        ;;
      -h | --help)
        sed -n '2,22p' "$NC_SELF"
        exit 0
        ;;
      -*)
        die "unknown flag: $1"
        ;;
      *)
        if [ -z "$THREAD" ]; then
          THREAD="$1"
        else
          die "extra arg: $1"
        fi
        ;;
    esac
    shift
  done

  # LIMIT is later printed with %d, so reject anything that is not plain digits.
  case "$LIMIT" in
    '' | *[!0-9]*) die "--limit must be a non-negative integer (got: $LIMIT)" ;;
  esac
  # Force base 10 so a value such as "08" is not rejected as invalid octal.
  LIMIT=$((10#$LIMIT))
}

parse_args "$@"
# Validate the required inputs gathered from the command line.
[ -n "$THREAD" ] || die "missing thread name"
if [ -z "$ENV_A" ] || [ -z "$ENV_B" ]; then
  die "--env-a and --env-b required"
fi
[ -n "$SITE_A" ] || die "--site-a required (or set HCISITE)"

# Site B falls back to site A when not given.
if [ -z "$SITE_B" ]; then
  SITE_B="$SITE_A"
fi

# Without --out, work in a fresh temporary directory.
if [ -z "$OUT" ]; then
  OUT=$(mktemp -d) || die "cannot create temporary output directory"
fi

# Fail here, with a clear message, rather than at the first redirect into a
# directory that could not be created.
mkdir -p -- "$OUT/a" "$OUT/b" "$OUT/diff" \
  || die "cannot create output directories under $OUT"
# dump_side HCIROOT SITE TARGET_DIR
#   Dump one environment's messages for $THREAD into TARGET_DIR and index them.
#   Globals read: THREAD, LIMIT, INC_HIST, AFTER, PAIR_ON, NCM, HL7F.
#   Files written under TARGET_DIR:
#     all.raw        stdout of "$NCM" (run with HCISITEDIR=HCIROOT/SITE)
#     err            stderr of "$NCM"
#     msg_NNNNN.hl7  one file per 0x1c-separated record of all.raw
#     index.tsv      "<PAIR_ON value><TAB><msg file>" per message; a message
#                    with no value is keyed "(no-<PAIR_ON>)"
dump_side() {
  local hciroot="$1" site="$2" target_dir="$3"
  local msg_file key
  local nc_args=("$THREAD" --limit "$LIMIT" --format raw)
  if [ "$INC_HIST" = "1" ]; then
    nc_args+=(--include-history)
  fi
  if [ -n "$AFTER" ]; then
    nc_args+=(--after "$AFTER")
  fi

  # Remove per-message files left by an earlier run into the same directory;
  # the glob below would otherwise index them as if they were freshly sampled.
  rm -f -- "${target_dir:?}"/msg_*.hl7

  # Best effort: a failed dump is reported but does not abort the comparison.
  if ! HCISITEDIR="$hciroot/$site" "$NCM" "${nc_args[@]}" \
    > "$target_dir/all.raw" 2> "$target_dir/err"; then
    printf 'nc-smat-diff: warning: message dump failed for %s/%s (details in %s)\n' \
      "$hciroot" "$site" "$target_dir/err" >&2
  fi

  # Split into individual messages on 0x1c; empty records are skipped.
  awk -v RS=$'\x1c' -v dir="$target_dir" '
    NF > 0 || $0 != "" {
      n++
      fpath = dir "/msg_" sprintf("%05d", n) ".hl7"
      printf "%s", $0 > fpath
      close(fpath)
    }
  ' "$target_dir/all.raw"

  # Build pair-on-field → file index.
  : > "$target_dir/index.tsv"
  for msg_file in "$target_dir"/msg_*.hl7; do
    [ -f "$msg_file" ] || continue   # glob matched nothing
    key=$("$HL7F" "$PAIR_ON" "$msg_file" 2>/dev/null | head -1)
    # Drop a trailing CR in case the helper emitted a CR-terminated line.
    key=${key%$'\r'}
    [ -n "$key" ] || key="(no-$PAIR_ON)"
    printf '%s\t%s\n' "$key" "$msg_file" >> "$target_dir/index.tsv"
  done
}
# Announce the run configuration on stderr.
{
  printf 'nc-smat-diff:\n'
  printf ' thread: %s\n' "$THREAD"
  printf ' A: %s/%s\n' "$ENV_A" "$SITE_A"
  printf ' B: %s/%s\n' "$ENV_B" "$SITE_B"
  printf ' limit: %d ignore: %s pair-on: %s\n' "$LIMIT" "$IGNORE" "$PAIR_ON"
  printf ' out: %s\n\n' "$OUT"
} >&2

# Sample and index each environment into its own subdirectory of $OUT.
dump_side "$ENV_A" "$SITE_A" "$OUT/a"
dump_side "$ENV_B" "$SITE_B" "$OUT/b"

# v0.7.5: coerce_int on wc output — Cygwin wc.exe CR-taint defense.
# One index line per sampled message, so the line count is the message count.
A_COUNT=$(
  coerce_int "$(wc -l < "$OUT/a/index.tsv")" 0
)
B_COUNT=$(
  coerce_int "$(wc -l < "$OUT/b/index.tsv")" 0
)
printf 'A: %d msgs B: %d msgs\n\n' "$A_COUNT" "$B_COUNT" 1>&2
# Pair messages by key: sort each side's index, then collect the distinct
# first-column values (the pairing keys) seen on each side.
sort -k1 "$OUT/a/index.tsv" > "$OUT/a/sorted.tsv"
sort -k1 "$OUT/b/index.tsv" > "$OUT/b/sorted.tsv"
A_KEYS=$(cut -f1 "$OUT/a/sorted.tsv" | sort -u)
B_KEYS=$(cut -f1 "$OUT/b/sorted.tsv" | sort -u)

# Start the Markdown summary (overwriting any previous one): run parameters
# first, then the header of the per-pair table that the pairing loop fills in.
SUMMARY="$OUT/_summary.md"
printf '# smat diff: thread=%s\n\n' "$THREAD" > "$SUMMARY"
printf '- A: `%s/%s` (%d messages sampled)\n' "$ENV_A" "$SITE_A" "$A_COUNT" >> "$SUMMARY"
printf '- B: `%s/%s` (%d messages sampled)\n' "$ENV_B" "$SITE_B" "$B_COUNT" >> "$SUMMARY"
printf '- pair-on: `%s`\n- ignore: `%s`\n\n' "$PAIR_ON" "$IGNORE" >> "$SUMMARY"
printf '## Per-pair diffs\n\n| %s | diffs | report |\n|---|---|---|\n' "$PAIR_ON" >> "$SUMMARY"
# first_file_for_key KEY TSV
#   Print the second column of the first row of TSV whose first column is
#   exactly KEY; print nothing when no row matches.
#   The key travels through the environment so awk does not expand backslash
#   escapes in it, and both sides are concatenated with "" to force a string
#   comparison — otherwise numeric-looking IDs such as "0123" and "123" would
#   compare equal.
first_file_for_key() {
  SMAT_PAIR_KEY="$1" awk -F'\t' '
    ($1 "") == (ENVIRON["SMAT_PAIR_KEY"] "") { print $2; exit }
  ' "$2"
}

DIFFS_TOTAL=0
A_ONLY=0
B_ONLY=0
PAIRED=0

# Walk the union of keys seen on either side and classify each one as
# A-only, B-only, or paired; paired messages are diffed into a report file.
while IFS= read -r key; do
  [ -n "$key" ] || continue
  # Exact first-column match. A substring search would let key "123" pick up
  # the row for "9123" and pair unrelated messages.
  a_files=$(first_file_for_key "$key" "$OUT/a/sorted.tsv")
  b_files=$(first_file_for_key "$key" "$OUT/b/sorted.tsv")

  if [ -z "$a_files" ] && [ -z "$b_files" ]; then
    # The key came from these indexes, so this only happens if they cannot be
    # read back; do not count it as a pair or diff two empty paths.
    printf 'nc-smat-diff: warning: no indexed file found for key %s\n' "$key" >&2
    continue
  elif [ -z "$a_files" ]; then
    B_ONLY=$((B_ONLY + 1))
    printf '| `%s` | (only on B) | — |\n' "$key" >> "$SUMMARY"
  elif [ -z "$b_files" ]; then
    A_ONLY=$((A_ONLY + 1))
    printf '| `%s` | (only on A) | — |\n' "$key" >> "$SUMMARY"
  else
    PAIRED=$((PAIRED + 1))
    # Slashes in the key would create subdirectories; flatten them.
    report="$OUT/diff/${key//\//_}.md"
    # "?" marks a pair whose difference count could not be computed.
    cnt=$("$HL7DIFF" --ignore "$IGNORE" --format count "$a_files" "$b_files" 2>/dev/null || echo "?")
    cnt=${cnt//$'\r'/}
    {
      printf '# msg %s\n\nA: `%s`\nB: `%s`\n\n' "$key" "$a_files" "$b_files"
      "$HL7DIFF" --ignore "$IGNORE" "$a_files" "$b_files" 2>/dev/null
    } > "$report"
    printf '| `%s` | %s | [report](./diff/%s) |\n' "$key" "$cnt" "${report##*/}" >> "$SUMMARY"
    # v0.7.5: coerce_int — defends against both Cygwin awk CR-taint AND the
    # legitimate `?` fallback that would crash arithmetic.
    DIFFS_TOTAL=$((DIFFS_TOTAL + $(coerce_int "$cnt" 0)))
  fi
done < <(printf '%s\n%s\n' "$A_KEYS" "$B_KEYS" | sort -u)
# Append the totals section to the Markdown summary.
printf '\n## Summary\n\n- paired (A and B): %d\n- A-only: %d\n- B-only: %d\n- total field differences (post-ignore): %d\n' \
  "$PAIRED" "$A_ONLY" "$B_ONLY" "$DIFFS_TOTAL" >> "$SUMMARY"

# Human-readable note on stderr; the summary path alone on stdout so a caller
# can capture it.
printf 'done. Summary: %s\n' "$SUMMARY" 1>&2
printf '%s\n' "$SUMMARY"