#!/usr/bin/env bash
# hl7-diff.sh — HL7-aware diff with field-level normalization.
# Compares two HL7 message files (or multi-message files), segment-by-segment,
# field-by-field. Lets you ignore fields that always change (default: MSH.7
# timestamp) without losing meaningful differences.
#
# Usage:
#   hl7-diff.sh [--ignore "FIELD,FIELD,..."] [--include-fields "FIELD,..."]
#               [--format text|tsv|count] [--] LEFT_FILE RIGHT_FILE
#
#   NOTE: "--separator SEP" was listed in this usage text, but the option
#   parser does not implement it; passing it fails with "unknown flag".
#
# Multi-message handling:
#   Files may contain multiple HL7 messages separated by either:
#     - the file separator byte 0x1c (the nc_msgs --format raw default), OR
#     - blank lines between MSH-starting blocks (legacy).
#   The tool auto-detects.
#
# Defaults:
#   --ignore MSH.7
#
# Output (text format):
#   Per-message header, then one line per differing field:
#     SEG.FIELD[.COMP[.SUB]]   LEFT_VALUE   RIGHT_VALUE
#
# Exit codes: 0 identical (post-ignore), 1 differences found, 2 input error

set -o pipefail

NC_SELF="$0"
LIB_DIR="$(cd -- "$(dirname -- "$NC_SELF")" && pwd)"

# v0.8.26: shared control-byte sanitizer. The diff echoes raw HL7 field values,
# which can carry C0 control bytes that corrupt a terminal when viewed
# un-redirected. The final awk (sole stdout producer) is piped through the
# tty-gated sanitizer; on a pipe/redirect it passes through raw. See
# lib/cygwin-safe.sh.
if [ -r "$LIB_DIR/cygwin-safe.sh" ]; then
  # shellcheck disable=SC1090,SC1091
  . "$LIB_DIR/cygwin-safe.sh"
fi
# Degrade safe: raw passthrough when the lib is missing (or did not define
# the sanitizer), so the final pipeline always has a consumer.
if ! declare -F _sanitize_ctl_tty >/dev/null 2>&1; then
  _sanitize_ctl_tty() { cat; }
fi

# Print "hl7-diff: MESSAGE" on stderr and exit 2 (the input-error code).
die() {
  printf 'hl7-diff: %s\n' "$*" >&2
  exit 2
}

# Print the comment header that follows the shebang. Scanning for the end of
# the comment block (instead of a fixed line range) keeps --help correct when
# the header grows or shrinks.
print_help() {
  awk 'NR == 1 { next } /^#/ { print; next } { exit }' "$NC_SELF"
}

# Record one positional argument: first LEFT, then RIGHT; a third is an error.
add_file() {
  if [ -z "$LEFT" ]; then
    LEFT="$1"
  elif [ -z "$RIGHT" ]; then
    RIGHT="$1"
  else
    die "extra arg: $1"
  fi
}

IGNORE="MSH.7"
INCLUDE=""
FORMAT="text"
LEFT=""
RIGHT=""

while [ $# -gt 0 ]; do
  case "$1" in
    --ignore)
      # An explicit empty value (--ignore "") is allowed and disables ignoring;
      # a missing value is an error rather than a silent empty list.
      [ $# -ge 2 ] || die "missing value for $1"
      IGNORE="$2"
      shift
      ;;
    --include-fields)
      [ $# -ge 2 ] || die "missing value for $1"
      INCLUDE="$2"
      shift
      ;;
    --format)
      [ $# -ge 2 ] || die "missing value for $1"
      FORMAT="$2"
      shift
      ;;
    -h | --help)
      print_help
      exit 0
      ;;
    --)
      # End of options: everything that follows is a file name, even if it
      # starts with a dash.
      shift
      while [ $# -gt 0 ]; do
        add_file "$1"
        shift
      done
      break
      ;;
    -*)
      die "unknown flag: $1"
      ;;
    *)
      add_file "$1"
      ;;
  esac
  shift
done

if [ -z "$LEFT" ] || [ -z "$RIGHT" ]; then
  die "expected two files: LEFT_FILE RIGHT_FILE (see --help)"
fi
[ -f "$LEFT" ] || die "no such left file: $LEFT"
[ -f "$RIGHT" ] || die "no such right file: $RIGHT"
case "$FORMAT" in
  text | tsv | count) ;;
  *) die "bad --format: $FORMAT (expected text, tsv or count)" ;;
esac

#######################################
# Split a file into individual HL7 messages.
# Arguments:
#   $1 - input file
#   $2 - output file (overwritten)
# Outputs:
#   Writes every message to $2 followed by the record separator byte 0x1e.
#   Segments inside a message stay separated by \r (0x0d).
# Returns:
#   Exit status of the awk that did the split.
#######################################
split_messages() {
  local infile="$1" outfile="$2"

  # Try splitting by 0x1c first (raw nc_msgs format).
  if LC_ALL=C grep -q -- $'\x1c' "$infile" 2>/dev/null; then
    awk -v RS=$'\x1c' '
      {
        # Drop framing around the message: newlines or CRs that follow the
        # previous 0x1c, a leading 0x0b byte, and trailing terminators.
        # Without this a trailing newline after the last 0x1c became an
        # empty phantom message, and a leading newline hid the MSH name.
        sub(/^[\v\r\n]+/, "")
        sub(/[\r\n]+$/, "")
        if ($0 ~ /[^ \t]/) printf "%s\036", $0
      }
    ' "$infile" > "$outfile"
  else
    # Fallback: each `MSH|` starts a new message; everything until the next
    # MSH belongs to the current message.
    awk '
      { sub(/\r$/, "") }        # tolerate CRLF line endings
      $0 == "" { next }         # blank separator lines carry no segment
      /^MSH\|/ {
        if (msg != "") printf "%s\036", msg
        msg = $0
        next
      }
      { msg = (msg == "") ? $0 : msg "\r" $0 }
      END { if (msg != "") printf "%s\036", msg }
    ' "$infile" > "$outfile"
  fi
}

# Temp files holding the split message lists. The trap is installed before the
# files are created so a failure half-way through cannot leak the first one.
TMP_L=""
TMP_R=""
cleanup() {
  local f
  for f in "$TMP_L" "$TMP_R"; do
    [ -n "$f" ] && rm -f -- "$f"
  done
}
trap cleanup EXIT
TMP_L=$(mktemp) || die "mktemp failed"
TMP_R=$(mktemp) || die "mktemp failed"

split_messages "$LEFT" "$TMP_L" || die "cannot split messages from: $LEFT"
split_messages "$RIGHT" "$TMP_R" || die "cannot split messages from: $RIGHT"

# Build awk script that does the comparison. We feed it both message lists and
# the ignore/include lists.
# v0.8.26: pipe the comparison output through the tty-gated sanitizer, then
# propagate ${PIPESTATUS[0]} so awk's exit (0 identical, 1 differs) survives.
# Field-level comparison; this awk program is the sole stdout producer.
#   input files : "$TMP_L" and "$TMP_R", message lists delimited by 0x1e
#   variables   : IGNORE / INCLUDE (comma lists of field paths), FMT,
#                 LFILE / RFILE (names shown in the text header)
#   exit status : 0 when no difference was counted, 1 otherwise
awk -v IGNORE="$IGNORE" -v INCLUDE="$INCLUDE" -v FMT="$FORMAT" \
    -v LFILE="$LEFT" -v RFILE="$RIGHT" -v RS=$'\x1e' '
# String equality. Values produced by split() that look numeric are compared
# numerically by awk, so "007" and "7" (or two long numeric IDs that round to
# the same double) would count as equal. Concatenating "" forces a string
# comparison.
function str_eq(a, b) {
  return (a "") == (b "")
}

# Return 1 when the path seg.field[.comp[.subc]] must be skipped, else 0.
#   INCLUDE set : skip unless the path itself or one of its prefixes
#                 (seg.field, seg.field.comp) is listed in INCLUDE.
#   otherwise   : skip when the field, component or sub-component path is
#                 listed in IGNORE.
# Everything after the first four parameters is a local. The loop counter and
# scratch array used to be globals, which overwrote the loop counter of the
# END block and made multi-message runs skip messages or never terminate.
function ignored(seg, field, comp, subc,    fkey, ckey, skey, list, n, i, entries, ent, hit) {
  fkey = seg "." field
  if (INCLUDE != "") {
    ckey = (comp != "") ? fkey "." comp : fkey
    skey = (subc != "") ? ckey "." subc : ckey
    list = INCLUDE
  } else {
    ckey = fkey "." comp
    skey = ckey "." subc
    list = IGNORE
  }
  hit = 0
  n = split(list, entries, ",")
  for (i = 1; i <= n; i++) {
    ent = entries[i]
    gsub(/^[ \t]+|[ \t]+$/, "", ent)    # tolerate "MSH.7, PID.3"
    if (ent == fkey || ent == ckey || ent == skey) {
      hit = 1
      break
    }
  }
  return (INCLUDE != "") ? !hit : hit
}

# Split msg on CR into out_segs[1..count] and return count. Empty segments are
# dropped and the survivors are packed densely, so a doubled or trailing
# terminator on one side cannot shift the segment alignment.
function parse_msg(msg, out_segs,    n, i, raw_segs, count) {
  delete out_segs
  count = 0
  n = split(msg, raw_segs, "\r")
  for (i = 1; i <= n; i++) {
    if (raw_segs[i] == "") continue
    out_segs[++count] = raw_segs[i]
  }
  return count
}

# Text mode only: print the per-message header once, right before the first
# line that belongs to that message.
function print_header(msg_idx) {
  if (FMT == "text" && HDR_MSG != msg_idx) {
    printf "----- message %d -----\n", msg_idx
    HDR_MSG = msg_idx
  }
}

# Report one difference and count it.
# count mode: accumulate only — no per-diff output (final print is in END).
function emit(msg_idx, path, lv, rv) {
  if (FMT == "tsv") {
    printf "%d\t%s\t%s\t%s\n", msg_idx, path, lv, rv
  } else if (FMT != "count") {
    print_header(msg_idx)
    printf " %-20s %-30s %s\n", path, lv, rv
  }
  DIFF_COUNT++
}

# Compare one field. When either side contains "^" the comparison descends to
# components, and for components containing "&" to sub-components, so only the
# parts that differ are reported. Returns the number of differences emitted.
function compare_field(seg, fidx, lv, rv, msg_idx,    lcomp, rcomp, lsub, rsub, n_lc, n_rc, n_lsc, n_rsc, n_comp, n_sub, lc, rc, ls, rs, j, k, diffs) {
  if (str_eq(lv, rv)) return 0

  # Plain field: no component separator on either side.
  if (lv !~ /\^/ && rv !~ /\^/) {
    if (ignored(seg, fidx, "", "")) return 0
    emit(msg_idx, seg "." fidx, lv, rv)
    return 1
  }

  n_lc = split(lv, lcomp, "^")
  n_rc = split(rv, rcomp, "^")
  n_comp = (n_lc > n_rc) ? n_lc : n_rc
  diffs = 0
  for (j = 1; j <= n_comp; j++) {
    lc = (j <= n_lc) ? lcomp[j] : ""
    rc = (j <= n_rc) ? rcomp[j] : ""
    if (str_eq(lc, rc)) continue

    if (lc !~ /&/ && rc !~ /&/) {
      if (!ignored(seg, fidx, j, "")) {
        emit(msg_idx, seg "." fidx "." j, lc, rc)
        diffs++
      }
      continue
    }

    n_lsc = split(lc, lsub, "&")
    n_rsc = split(rc, rsub, "&")
    n_sub = (n_lsc > n_rsc) ? n_lsc : n_rsc
    for (k = 1; k <= n_sub; k++) {
      ls = (k <= n_lsc) ? lsub[k] : ""
      rs = (k <= n_rsc) ? rsub[k] : ""
      if (!str_eq(ls, rs) && !ignored(seg, fidx, j, k)) {
        emit(msg_idx, seg "." fidx "." j "." k, ls, rs)
        diffs++
      }
    }
  }
  return diffs
}

# Compare two segments of the same name field by field.
# Array slot 1 is the segment name. MSH.1 is the field separator itself, so
# for MSH slot i (i >= 2) is MSH.i; for every other segment slot i is
# SEG.(i-1).
function diff_segment(seg_name, lseg, rseg, msg_idx,    lf, rf, nl, nr, nmax, i, field_num) {
  nl = split(lseg, lf, "|")
  nr = split(rseg, rf, "|")
  nmax = (nl > nr) ? nl : nr
  for (i = 2; i <= nmax; i++) {
    field_num = (seg_name == "MSH") ? i : (i - 1)
    compare_field(seg_name, field_num, (i <= nl) ? lf[i] : "", (i <= nr) ? rf[i] : "", msg_idx)
  }
}

# Compare two messages segment by segment, pairing segments by position.
# A name mismatch (or a segment present on one side only) is reported as
# SEGMENT_ORDER and the pair is not compared further.
function diff_message(left_msg, right_msg, msg_idx,    l_segs, r_segs, ln, rn, mx, i, l, r, l_name, r_name) {
  ln = parse_msg(left_msg, l_segs)
  rn = parse_msg(right_msg, r_segs)
  mx = (ln > rn) ? ln : rn
  for (i = 1; i <= mx; i++) {
    l = (i <= ln) ? l_segs[i] : ""
    r = (i <= rn) ? r_segs[i] : ""
    l_name = (l != "") ? substr(l, 1, 3) : "(none)"
    r_name = (r != "") ? substr(r, 1, 3) : "(none)"
    if (l_name != r_name) {
      emit(msg_idx, "SEGMENT_ORDER", l_name, r_name)
      continue
    }
    diff_segment(l_name, l, r, msg_idx)
  }
}

BEGIN { DIFF_COUNT = 0; HDR_MSG = 0; n_l = 0; n_r = 0 }

# An empty record is never a message.
$0 == "" { next }

# Route records by file name. The FNR == NR idiom breaks when the left list is
# empty: NR is still 0 when the right file starts, so the right-hand messages
# would be filed under left.
FILENAME == ARGV[1] { L_MSGS[++n_l] = $0; next }
{ R_MSGS[++n_r] = $0 }

END {
  # F-1 fix (2026-06-08): the early-exit `if (FMT=="count")` that used to sit
  # here fired BEFORE the diff loop ran, so DIFF_COUNT was always 0. The loop
  # is now unconditional; count output is emitted AFTER the loop at the bottom.
  nm = (n_l > n_r) ? n_l : n_r

  if (FMT == "text") {
    printf "HL7 diff:\n left: %s (%d messages)\n right: %s (%d messages)\n ignore: %s\n", LFILE, n_l, RFILE, n_r, IGNORE
    if (INCLUDE != "") printf " include-only: %s\n", INCLUDE
    printf "\n"
  }

  if (n_l != n_r) {
    if (FMT == "tsv") printf "0\tMESSAGE_COUNT\t%d\t%d\n", n_l, n_r
    else if (FMT != "count") printf " MESSAGE COUNT mismatch: %d vs %d\n", n_l, n_r
    DIFF_COUNT++
  }

  for (msg_no = 1; msg_no <= nm; msg_no++) {
    if (msg_no > n_l || msg_no > n_r) {
      emit(msg_no, "MESSAGE_PRESENCE", (msg_no <= n_l ? "present" : "missing"), (msg_no <= n_r ? "present" : "missing"))
      continue
    }
    # The first message always gets its header; later messages get theirs
    # from emit() when their first difference is reported, so a difference is
    # never printed under the header of an earlier message.
    if (msg_no == 1) print_header(msg_no)
    diff_message(L_MSGS[msg_no], R_MSGS[msg_no], msg_no)
  }

  if (FMT == "count") print DIFF_COUNT
  else if (FMT == "text") printf "\n%d total field difference(s)\n", DIFF_COUNT
  exit (DIFF_COUNT > 0 ? 1 : 0)
}
' "$TMP_L" "$TMP_R" | _sanitize_ctl_tty
exit "${PIPESTATUS[0]}"