cloverleaf-larry/lib/hl7-diff.sh
Bryan Johnson e08f030df5 v0.3.0: initial release of Larry-Anywhere
Portable AI agent for Cloverleaf integration work. Pure bash + curl + jq.
Zero dependency on v1 wrapper scripts or v2 cloverleaf-tools.pyz.

27 native Anthropic tools:

NetConfig parsing (read)
  nc_list_protocols, nc_list_processes, nc_protocol_block,
  nc_protocol_field, nc_protocol_nested, nc_protocol_summary,
  nc_destinations, nc_sources, nc_xlate_refs, nc_tclproc_refs

NetConfig modification (journal-backed writes with rollback)
  nc_insert_protocol, nc_add_route, larry_rollback_list

Workflows
  nc_find_inbound, nc_make_jump (3-thread jump pattern), nc_find
  (tbn/tbp/tbh/tbpr/where replacements), nc_document, nc_diff_interface,
  nc_regression

Messages
  hl7_field, nc_msgs (smat is SQLite!), hl7_diff (with --ignore MSH.7)

File system
  read_file, list_dir, grep_files, glob_files, write_file, bash_exec

Validated against a 22-site real Cloverleaf test install. Five worked
examples end-to-end: jump-thread generation, smat MRN search, system
documentation, interface+connected diff, HL7-aware regression diff.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-26 09:46:20 -07:00

249 lines
8.2 KiB
Bash
Executable File

#!/usr/bin/env bash
# hl7-diff.sh — HL7-aware diff with field-level normalization.
# Compares two HL7 message files (or multi-message files), segment-by-segment,
# field-by-field. Lets you ignore fields that always change (default: MSH.7
# timestamp) without losing meaningful differences.
#
# Usage:
# hl7-diff.sh [--ignore "FIELD,FIELD,..."] [--include-fields "FIELD,..."]
# [--format text|tsv|count]
# <left_file> <right_file>
#
# Multi-message handling:
# Files may contain multiple HL7 messages separated by either:
# - the file separator byte 0x1c (the nc_msgs --format raw default), OR
# - blank lines between MSH-starting blocks (legacy).
# The tool auto-detects.
#
# Defaults:
# --ignore MSH.7
#
# Output (text format):
# Per-message header, then one line per differing field:
# SEG.FIELD[.COMP[.SUB]] LEFT_VALUE RIGHT_VALUE
#
# Exit codes: 0 identical (post-ignore), 1 differences found, 2 input error
set -o pipefail

NC_SELF="$0"
# Directory holding this script. Not referenced anywhere below; retained as-is.
# shellcheck disable=SC2034
LIB_DIR="$(cd "$(dirname "$NC_SELF")" && pwd)"

# Print a prefixed error message to stderr and exit with status 2 (input error).
die() { printf 'hl7-diff: %s\n' "$*" >&2; exit 2; }

# Option defaults.
IGNORE="MSH.7"
INCLUDE=""
FORMAT="text"
LEFT=""
RIGHT=""

# Command-line parsing: value-taking flags, help, then up to two positionals.
while [ $# -gt 0 ]; do
  case "$1" in
    --ignore|--include-fields|--format)
      # The value must be present as a separate argument. An empty string is
      # still a legal value (e.g. --ignore "" to ignore nothing), so test the
      # argument count rather than the value itself.
      [ $# -ge 2 ] || die "missing value for $1"
      case "$1" in
        --ignore)         IGNORE="$2" ;;
        --include-fields) INCLUDE="$2" ;;
        --format)         FORMAT="$2" ;;
      esac
      shift
      ;;
    -h|--help)
      # Prints lines 2-25 of this script, i.e. the header comment block.
      sed -n '2,25p' "$NC_SELF"
      exit 0
      ;;
    -*) die "unknown flag: $1" ;;
    *)
      if [ -z "$LEFT" ]; then
        LEFT="$1"
      elif [ -z "$RIGHT" ]; then
        RIGHT="$1"
      else
        die "extra arg: $1"
      fi
      ;;
  esac
  shift
done

# Both positionals are mandatory; report that directly instead of the
# confusing "no such left file: " with an empty name.
if [ -z "$LEFT" ] || [ -z "$RIGHT" ]; then
  die "usage: hl7-diff.sh [--ignore LIST] [--include-fields LIST] [--format text|tsv|count] <left_file> <right_file>"
fi
[ -f "$LEFT" ] || die "no such left file: $LEFT"
[ -f "$RIGHT" ] || die "no such right file: $RIGHT"
[ -r "$LEFT" ] || die "cannot read left file: $LEFT"
[ -r "$RIGHT" ] || die "cannot read right file: $RIGHT"
case "$FORMAT" in text|tsv|count) ;; *) die "bad --format" ;; esac
#######################################
# Split an HL7 file into individual messages.
# Arguments:
#   $1 - input file
#   $2 - output file (overwritten)
# Outputs:
#   Writes to $2 every non-empty message followed by the record separator
#   byte 0x1e. Segments inside a message are separated by a single CR (0x0d);
#   empty segments and leading/trailing segment terminators are dropped.
# Returns:
#   Exit status of the final awk stage (of the whole pipeline when the caller
#   has pipefail enabled).
#######################################
split_messages() {
  local infile="$1" outfile="$2"

  if grep -q $'\x1c' < "$infile" 2>/dev/null; then
    # 0x1c-delimited input (raw nc_msgs format). Any 0x0b bytes are removed
    # first; NOTE(review): this assumes they are MLLP start-of-block framing
    # and never message content - confirm against the nc_msgs raw format.
    tr -d '\013' < "$infile" |
      awk -v RS=$'\x1c' '
        {
          gsub(/\r?\n/, "\r")   # LF or CRLF segment ends become CR
          gsub(/\r+/, "\r")     # collapse runs so no empty segments remain
          sub(/^\r/, "")
          sub(/\r$/, "")
          # A record holding only line terminators (for example the newline
          # after the final 0x1c) is not a message and must not be emitted.
          if ($0 != "") printf "%s\036", $0
        }
      ' > "$outfile"
  else
    # Fallback: each line starting with MSH| opens a new message and every
    # following non-blank line is appended to it. CR is mapped to LF first so
    # that LF, CRLF and CR-only files are all read line by line.
    tr '\r' '\n' < "$infile" |
      awk '
        /^MSH\|/ {
          if (msg != "") printf "%s\036", msg
          msg = $0
          next
        }
        $0 == "" { next }
        { msg = (msg == "") ? $0 : msg "\r" $0 }
        END { if (msg != "") printf "%s\036", msg }
      ' > "$outfile"
  fi
}
# Build awk script that does the comparison. We feed it both message lists and
# the ignore/include lists.
# Temporary files holding the 0x1e-delimited message lists for each side.
TMP_L=""
TMP_R=""

# Remove whichever temporary files were actually created.
hl7_diff_cleanup() {
  [ -z "$TMP_L" ] || rm -f -- "$TMP_L"
  [ -z "$TMP_R" ] || rm -f -- "$TMP_R"
}
# Install the trap before creating anything so a failure between the two
# mktemp calls still cleans up the first file.
trap hl7_diff_cleanup EXIT

TMP_L=$(mktemp) || die "mktemp failed"
TMP_R=$(mktemp) || die "mktemp failed"

# Stop with an input error (status 2) rather than diffing partial output.
split_messages "$LEFT" "$TMP_L" || die "cannot split messages in: $LEFT"
split_messages "$RIGHT" "$TMP_R" || die "cannot split messages in: $RIGHT"
#######################################
# Compare two 0x1e-delimited message lists field by field.
# Globals (read):
#   IGNORE  - comma-separated paths to skip
#   INCLUDE - comma-separated paths to compare exclusively (overrides IGNORE
#             when non-empty)
#   FORMAT  - text | tsv | count
#   LEFT, RIGHT - original file names, used only in the text header
# Arguments:
#   $1 - left message list, $2 - right message list; each message ends with
#        byte 0x1e and its segments are separated by CR and/or LF
# Outputs:
#   text:  header, per-message markers, one line per differing field, total
#   tsv:   MSG_INDEX <tab> PATH <tab> LEFT <tab> RIGHT per difference
#   count: the number of differences only
# Returns:
#   0 when no differences remain after filtering, 1 when differences were
#   found, 2 when a message list is missing.
# List entries may name SEG, SEG.FIELD, SEG.FIELD.COMP or SEG.FIELD.COMP.SUB;
# an entry covers every path below it.
#######################################
run_hl7_diff() {
  local left_msgs="$1" right_msgs="$2"

  if [ ! -f "$left_msgs" ] || [ ! -f "$right_msgs" ]; then
    printf 'hl7-diff: internal error: split message list not found\n' >&2
    return 2
  fi

  # The side=L / side=R operands are awk command-line assignments. They are
  # applied when awk reaches them in the argument list, so each record is
  # attributed to the correct side even when one of the lists is empty.
  awk -v IGNORE="$IGNORE" -v INCLUDE="$INCLUDE" -v FMT="$FORMAT" \
    -v LFILE="$LEFT" -v RFILE="$RIGHT" '
    # NOTE: every helper declares its scratch variables as extra parameters.
    # Undeclared awk variables are global, and a helper that reuses a caller
    # loop counter would corrupt that loop.

    # Load a comma-separated list into set[], trimming blanks around entries.
    function load_list(csv, set,    parts, n, i, ent) {
      n = split(csv, parts, ",")
      for (i = 1; i <= n; i++) {
        ent = parts[i]
        gsub(/^[ \t]+|[ \t]+$/, "", ent)
        if (ent != "") set[ent] = 1
      }
    }

    # Return 1 when SEG, SEG.FIELD, SEG.FIELD.COMP or SEG.FIELD.COMP.SUB
    # (whichever levels are given) is present in set[].
    function listed(set, seg, field, comp, subc,    key) {
      key = seg
      if (key in set) return 1
      key = key "." field
      if (key in set) return 1
      if (comp == "") return 0
      key = key "." comp
      if (key in set) return 1
      if (subc == "") return 0
      key = key "." subc
      return (key in set) ? 1 : 0
    }

    # Return 1 when the path must not be reported. In include mode only
    # listed paths are reported; otherwise listed paths are suppressed.
    function ignored(seg, field, comp, subc) {
      if (INCLUDE != "") return listed(INC_SET, seg, field, comp, subc) ? 0 : 1
      return listed(IGN_SET, seg, field, comp, subc)
    }

    # Record one difference and print it unless only the count is wanted.
    function emit(msg_idx, path, lv, rv) {
      DIFF_COUNT++
      if (FMT == "count") return
      if (FMT == "tsv") printf "%d\t%s\t%s\t%s\n", msg_idx, path, lv, rv
      else printf " %-20s %-30s %s\n", path, lv, rv
    }

    # Split a message into its non-empty segments. out_segs is filled with
    # consecutive indexes 1..count so that blank lines or doubled terminators
    # on one side cannot shift the positional segment pairing.
    function parse_msg(msg, out_segs,    raw, n, i, cnt) {
      split("", out_segs)
      n = split(msg, raw, /[\r\n]+/)
      cnt = 0
      for (i = 1; i <= n; i++) {
        if (raw[i] == "") continue
        out_segs[++cnt] = raw[i]
      }
      return cnt
    }

    # Compare one field, descending to components (^) and subcomponents (&)
    # when either side uses them. Returns the number of differences emitted.
    function compare_field(seg, fidx, lv, rv, msg_idx,    lcomp, rcomp, n_lc, n_rc, nc, j, lc, rc, lsub, rsub, n_ls, n_rs, ns, k, ls, rs, diffs) {
      if (lv == rv) return 0
      # MSH.2 holds the encoding characters themselves (typically ^~\&), so
      # it is compared as one opaque value instead of being split on them.
      if (!(seg == "MSH" && fidx == 2) && (lv ~ /\^/ || rv ~ /\^/)) {
        n_lc = split(lv, lcomp, /\^/)
        n_rc = split(rv, rcomp, /\^/)
        nc = (n_lc > n_rc) ? n_lc : n_rc
        diffs = 0
        for (j = 1; j <= nc; j++) {
          lc = (j <= n_lc) ? lcomp[j] : ""
          rc = (j <= n_rc) ? rcomp[j] : ""
          if (lc == rc) continue
          if (lc ~ /&/ || rc ~ /&/) {
            n_ls = split(lc, lsub, /&/)
            n_rs = split(rc, rsub, /&/)
            ns = (n_ls > n_rs) ? n_ls : n_rs
            for (k = 1; k <= ns; k++) {
              ls = (k <= n_ls) ? lsub[k] : ""
              rs = (k <= n_rs) ? rsub[k] : ""
              if (ls != rs && !ignored(seg, fidx, j, k)) {
                emit(msg_idx, seg "." fidx "." j "." k, ls, rs)
                diffs++
              }
            }
          } else if (!ignored(seg, fidx, j, "")) {
            emit(msg_idx, seg "." fidx "." j, lc, rc)
            diffs++
          }
        }
        return diffs
      }
      if (ignored(seg, fidx, "", "")) return 0
      emit(msg_idx, seg "." fidx, lv, rv)
      return 1
    }

    # Compare two segments of the same type field by field. Piece 1 of the
    # split is the segment name. For MSH the field separator itself counts
    # as MSH.1, so piece i is MSH.i; for other segments piece i is SEG.(i-1).
    function diff_segment(seg_name, lseg, rseg, msg_idx,    lf, rf, nl, nr, nmax, i, fnum) {
      nl = split(lseg, lf, /[|]/)
      nr = split(rseg, rf, /[|]/)
      nmax = (nl > nr) ? nl : nr
      for (i = 2; i <= nmax; i++) {
        fnum = (seg_name == "MSH") ? i : (i - 1)
        compare_field(seg_name, fnum, (i <= nl) ? lf[i] : "", (i <= nr) ? rf[i] : "", msg_idx)
      }
    }

    # Compare two messages segment by segment, pairing segments by position.
    # A differing (or missing) segment type is reported as SEGMENT_ORDER and
    # that pair is not compared further.
    function diff_message(left_msg, right_msg, msg_idx,    l_segs, r_segs, ln, rn, mx, i, l, r, l_name, r_name) {
      ln = parse_msg(left_msg, l_segs)
      rn = parse_msg(right_msg, r_segs)
      mx = (ln > rn) ? ln : rn
      for (i = 1; i <= mx; i++) {
        l = (i <= ln) ? l_segs[i] : ""
        r = (i <= rn) ? r_segs[i] : ""
        l_name = (l != "") ? substr(l, 1, 3) : "(none)"
        r_name = (r != "") ? substr(r, 1, 3) : "(none)"
        if (l_name != r_name) {
          emit(msg_idx, "SEGMENT_ORDER", l_name, r_name)
          continue
        }
        diff_segment(l_name, l, r, msg_idx)
      }
    }

    BEGIN {
      DIFF_COUNT = 0; n_l = 0; n_r = 0
      # Parse the lists once instead of on every field comparison.
      split("", IGN_SET); split("", INC_SET)
      load_list(IGNORE, IGN_SET)
      load_list(INCLUDE, INC_SET)
    }

    # A record holding nothing but line terminators is not a message.
    $0 ~ /^[\r\n]*$/ { next }
    side == "L" { L_MSGS[++n_l] = $0; next }
    { R_MSGS[++n_r] = $0 }

    END {
      total = (n_l > n_r) ? n_l : n_r
      if (FMT == "text") {
        printf "HL7 diff:\n left: %s (%d messages)\n right: %s (%d messages)\n ignore: %s\n", LFILE, n_l, RFILE, n_r, IGNORE
        if (INCLUDE != "") printf " include-only: %s\n", INCLUDE
        printf "\n"
      }
      if (n_l != n_r) {
        if (FMT == "tsv") printf "0\tMESSAGE_COUNT\t%d\t%d\n", n_l, n_r
        else if (FMT == "text") printf " MESSAGE COUNT mismatch: %d vs %d\n", n_l, n_r
        DIFF_COUNT++
      }
      for (m = 1; m <= total; m++) {
        lm = (m <= n_l) ? L_MSGS[m] : ""
        rm = (m <= n_r) ? R_MSGS[m] : ""
        if (lm == "" || rm == "") {
          emit(m, "MESSAGE_PRESENCE", (lm != "" ? "present" : "missing"), (rm != "" ? "present" : "missing"))
          continue
        }
        # Marker for message 1, and for every message once a difference has
        # been seen (it is printed before the message itself is compared).
        if (FMT == "text" && (m == 1 || DIFF_COUNT > 0)) printf "----- message %d -----\n", m
        diff_message(lm, rm, m)
      }
      # The count is only known after every message has been compared.
      if (FMT == "text") printf "\n%d total field difference(s)\n", DIFF_COUNT
      else if (FMT == "count") print DIFF_COUNT
      exit (DIFF_COUNT > 0 ? 1 : 0)
    }
  ' RS=$'\x1e' side=L "$left_msgs" side=R "$right_msgs"
}

run_hl7_diff "$TMP_L" "$TMP_R"