#!/usr/bin/env bash
# nc-smat-diff.sh — diff smat (message archive) content across two environments.
# Different from nc-regression which runs route_test; this just samples actual
# stored messages from each env's smatdb and compares them.
#
# Pairing strategy:
#   By default, sample N most-recent messages from each side, then pair by
#   MSH.10 (message control ID) — the standard HL7 unique identifier.
#   Unmatched IDs are reported as A-only / B-only.
#
# Usage:
#   nc-smat-diff.sh THREAD --env-a HCIROOT_A --env-b HCIROOT_B [opts]
#
# Options:
#   --site-a SITE       site on env-A (default $HCISITE)
#   --site-b SITE       site on env-B (default same as --site-a)
#   --limit N           messages to sample per side (default 50)
#   --ignore FIELDS     hl7-diff --ignore list (default "MSH.7")
#   --out DIR           output directory for per-msg diffs + summary
#                       (default: a fresh `mktemp -d` directory)
#   --pair-on FIELD     HL7 field to pair messages on (default MSH.10)
#   --include-history   include SmatHistory archives on both sides
#   --after EXPR        only messages after this time (e.g. "3 days ago")

set -o pipefail

NC_SELF="$0"
LIB_DIR="$(cd "$(dirname "$NC_SELF")" && pwd)"
NCM="$LIB_DIR/nc-msgs.sh"
HL7F="$LIB_DIR/hl7-field.sh"
HL7DIFF="$LIB_DIR/hl7-diff.sh"

# Print a prefixed message on stderr and abort with status 1.
die() { printf 'nc-smat-diff: %s\n' "$*" >&2; exit 1; }

# Print the leading comment block of this script (everything after the
# shebang up to the first non-comment line), so --help never depends on a
# hard-coded line range.
usage() { awk 'NR == 1 { next } !/^#/ { exit } { print }' "$NC_SELF"; }

# first_file_for_key KEY INDEX_TSV
# Print column 2 of the first row of INDEX_TSV whose column 1 equals KEY.
# The comparison is exact and string-typed: appending "" forces awk to compare
# as strings, so numeric-looking IDs such as "0123" and "123" stay distinct,
# and passing KEY through the environment avoids awk -v escape processing.
first_file_for_key() {
  LOOKUP_KEY="$1" awk -F'\t' '
    ($1 "") == (ENVIRON["LOOKUP_KEY"] "") { print $2; exit }
  ' "$2"
}

THREAD=""
ENV_A=""
ENV_B=""
SITE_A="${HCISITE:-}"
SITE_B=""
LIMIT=50
IGNORE="MSH.7"
OUT=""
PAIR_ON="MSH.10"
INC_HIST=0
AFTER=""

while [ $# -gt 0 ]; do
  case "$1" in
    --env-a|--env-b|--site-a|--site-b|--limit|--ignore|--out|--pair-on|--after)
      # Every one of these flags consumes the following argument.
      [ $# -ge 2 ] || die "missing value for $1"
      case "$1" in
        --env-a)   ENV_A="$2" ;;
        --env-b)   ENV_B="$2" ;;
        --site-a)  SITE_A="$2" ;;
        --site-b)  SITE_B="$2" ;;
        --limit)   LIMIT="$2" ;;
        --ignore)  IGNORE="$2" ;;
        --out)     OUT="$2" ;;
        --pair-on) PAIR_ON="$2" ;;
        --after)   AFTER="$2" ;;
      esac
      shift
      ;;
    --include-history) INC_HIST=1 ;;
    -h|--help) usage; exit 0 ;;
    -*) die "unknown flag: $1" ;;
    *)
      if [ -z "$THREAD" ]; then
        THREAD="$1"
      else
        die "extra arg: $1"
      fi
      ;;
  esac
  shift
done

[ -n "$THREAD" ] || die "missing thread name"
[ -n "$ENV_A" ] && [ -n "$ENV_B" ] || die "--env-a and --env-b required"
[ -n "$SITE_A" ] || die "--site-a required (or set HCISITE)"
[[ "$LIMIT" =~ ^[0-9]+$ ]] || die "--limit must be a non-negative integer: $LIMIT"
[ -z "$SITE_B" ] && SITE_B="$SITE_A"

if [ -z "$OUT" ]; then
  OUT=$(mktemp -d) || die "mktemp -d failed"
fi
mkdir -p "$OUT/a" "$OUT/b" "$OUT/diff" || die "cannot create output dirs under $OUT"

# dump_side HCIROOT SITE TARGET_DIR
# Fetch the sampled messages for one environment into TARGET_DIR:
#   all.raw        raw output of nc-msgs.sh (its stderr goes to err)
#   msg_NNNNN.hl7  one file per 0x1c-separated record of all.raw
#   index.tsv      "<pair-on value>\t<message file>" per message
# Reads globals THREAD, LIMIT, INC_HIST, AFTER, PAIR_ON, NCM, HL7F.
dump_side() {
  local hciroot="$1" site="$2" target_dir="$3"
  local nc_args=("$THREAD" --limit "$LIMIT" --format raw)
  local f key

  if [ "$INC_HIST" = "1" ]; then nc_args+=(--include-history); fi
  if [ -n "$AFTER" ]; then nc_args+=(--after "$AFTER"); fi

  # Best effort: a failing fetch is reported but does not abort the run, so
  # the other side can still be dumped and shown as A-only / B-only.
  if ! HCISITEDIR="$hciroot/$site" "$NCM" "${nc_args[@]}" \
      > "$target_dir/all.raw" 2> "$target_dir/err"; then
    printf 'nc-smat-diff: warning: nc-msgs failed for %s/%s (see %s)\n' \
      "$hciroot" "$site" "$target_dir/err" >&2
  fi

  # Drop message files left by an earlier run into the same --out directory;
  # otherwise the index loop below would pick them up as current messages.
  rm -f -- "$target_dir"/msg_*.hl7

  # Split into individual messages on 0x1c; empty records are skipped.
  awk -v RS=$'\x1c' -v dir="$target_dir" '
    $0 != "" {
      n++
      fpath = dir "/msg_" sprintf("%05d", n) ".hl7"
      printf "%s", $0 > fpath
      close(fpath)
    }
  ' "$target_dir/all.raw"

  # Build the pair-on-field -> file index.
  : > "$target_dir/index.tsv"
  for f in "$target_dir"/msg_*.hl7; do
    [ -f "$f" ] || continue
    key=$("$HL7F" "$PAIR_ON" "$f" 2>/dev/null | head -1)
    [ -z "$key" ] && key="(no-$PAIR_ON)"
    printf '%s\t%s\n' "$key" "$f" >> "$target_dir/index.tsv"
  done
}

printf 'nc-smat-diff:\n thread: %s\n A: %s/%s\n B: %s/%s\n limit: %s ignore: %s pair-on: %s\n out: %s\n\n' \
  "$THREAD" "$ENV_A" "$SITE_A" "$ENV_B" "$SITE_B" "$LIMIT" "$IGNORE" "$PAIR_ON" "$OUT" >&2

dump_side "$ENV_A" "$SITE_A" "$OUT/a"
dump_side "$ENV_B" "$SITE_B" "$OUT/b"

A_COUNT=$(wc -l < "$OUT/a/index.tsv" | tr -d ' ')
B_COUNT=$(wc -l < "$OUT/b/index.tsv" | tr -d ' ')
printf 'A: %d msgs B: %d msgs\n\n' "$A_COUNT" "$B_COUNT" >&2

# Pair messages by key
sort -k1 "$OUT/a/index.tsv" > "$OUT/a/sorted.tsv"
sort -k1 "$OUT/b/index.tsv" > "$OUT/b/sorted.tsv"

SUMMARY="$OUT/_summary.md"
{
  printf '# smat diff: thread=%s\n\n' "$THREAD"
  printf -- '- A: `%s/%s` (%d messages sampled)\n' "$ENV_A" "$SITE_A" "$A_COUNT"
  printf -- '- B: `%s/%s` (%d messages sampled)\n' "$ENV_B" "$SITE_B" "$B_COUNT"
  printf -- '- pair-on: `%s`\n' "$PAIR_ON"
  printf -- '- ignore: `%s`\n\n' "$IGNORE"
  printf '## Per-pair diffs\n\n'
  printf '| %s | diffs | report |\n|---|---|---|\n' "$PAIR_ON"
} > "$SUMMARY"

DIFFS_TOTAL=0
A_ONLY=0
B_ONLY=0
PAIRED=0

# Walk the union of keys seen on either side. When a key occurs more than once
# on a side, only its first row in sorted.tsv takes part in the comparison.
while IFS= read -r key; do
  [ -z "$key" ] && continue
  a_file=$(first_file_for_key "$key" "$OUT/a/sorted.tsv")
  b_file=$(first_file_for_key "$key" "$OUT/b/sorted.tsv")

  if [ -z "$a_file" ] && [ -n "$b_file" ]; then
    B_ONLY=$((B_ONLY + 1))
    printf '| `%s` | (only on B) | — |\n' "$key" >> "$SUMMARY"
  elif [ -z "$b_file" ] && [ -n "$a_file" ]; then
    A_ONLY=$((A_ONLY + 1))
    printf '| `%s` | (only on A) | — |\n' "$key" >> "$SUMMARY"
  else
    PAIRED=$((PAIRED + 1))
    report="$OUT/diff/${key//\//_}.md"

    # Trust the count only when it is a plain integer. The exit status is
    # deliberately not consulted: hl7-diff may (like diff) exit non-zero
    # simply because differences exist — TODO confirm against hl7-diff.sh.
    cnt=$("$HL7DIFF" --ignore "$IGNORE" --format count "$a_file" "$b_file" 2>/dev/null)
    [[ "$cnt" =~ ^[0-9]+$ ]] || cnt="?"

    {
      printf '# msg %s\n\nA: `%s`\nB: `%s`\n\n' "$key" "$a_file" "$b_file"
      "$HL7DIFF" --ignore "$IGNORE" "$a_file" "$b_file" 2>/dev/null
    } > "$report"

    printf '| `%s` | %s | [report](./diff/%s) |\n' "$key" "$cnt" "${report##*/}" >> "$SUMMARY"

    # Unknown counts ("?") are left out of the total; 10# keeps a count with
    # a leading zero from being parsed as octal.
    if [ "$cnt" != "?" ]; then
      DIFFS_TOTAL=$((DIFFS_TOTAL + 10#$cnt))
    fi
  fi
done < <(cut -f1 "$OUT/a/sorted.tsv" "$OUT/b/sorted.tsv" | sort -u)

{
  printf '\n## Summary\n\n'
  printf -- '- paired (A and B): %d\n' "$PAIRED"
  printf -- '- A-only: %d\n' "$A_ONLY"
  printf -- '- B-only: %d\n' "$B_ONLY"
  printf -- '- total field differences (post-ignore): %d\n' "$DIFFS_TOTAL"
} >> "$SUMMARY"

printf 'done. Summary: %s\n' "$SUMMARY" >&2
printf '%s\n' "$SUMMARY"