#!/usr/bin/env bash
# hl7-diff.sh — HL7-aware diff with field-level normalization.
# Compares two HL7 message files (or multi-message files), segment-by-segment,
# field-by-field. Lets you ignore fields that always change (default: MSH.7
# timestamp) without losing meaningful differences.
#
# Usage:
#   hl7-diff.sh [--ignore "FIELD,FIELD,..."] [--include-fields "FIELD,..."]
#               [--format text|tsv|count] [-h|--help]
#               <left_file> <right_file>
#
# Multi-message handling:
#   Files may contain multiple HL7 messages separated by either:
#     - the file separator byte 0x1c (the nc_msgs --format raw default), OR
#     - blank lines between MSH-starting blocks (legacy).
#   The tool auto-detects.
#
# Defaults:
#   --ignore MSH.7
#
# Output (text format):
#   Per-message header, then one line per differing field:
#     SEG.FIELD[.COMP[.SUB]]    LEFT_VALUE                 RIGHT_VALUE
#
# Exit codes: 0 identical (post-ignore), 1 differences found, 2 input error
set -o pipefail

# Path this script was invoked as; --help prints its header comment from it.
NC_SELF="$0"
# Absolute directory that holds this script and its sibling library.
# CDPATH is emptied for this one cd: with CDPATH set, cd on a relative
# directory name may land in a different directory and print that path on
# stdout, which would end up inside the captured value. The -- guards an
# invocation path that begins with a dash.
LIB_DIR="$(CDPATH='' cd -- "$(dirname -- "$NC_SELF")" && pwd)"

# v0.8.26: shared control-byte sanitizer. The diff echoes raw HL7 field values,
# which can carry C0 control bytes that corrupt a terminal when viewed
# un-redirected. The final awk (sole stdout producer) is piped through the
# tty-gated sanitizer; on a pipe/redirect it passes through raw. See
# lib/cygwin-safe.sh.
if [ -r "$LIB_DIR/cygwin-safe.sh" ]; then
  # shellcheck disable=SC1090,SC1091
  . "$LIB_DIR/cygwin-safe.sh"
fi
# Degrade safe: raw passthrough whenever the sanitizer is still undefined,
# i.e. the lib is missing OR it loaded without providing the function.
if ! declare -F _sanitize_ctl_tty >/dev/null 2>&1; then
  _sanitize_ctl_tty() { cat; }
fi

# die MESSAGE...
# Write "hl7-diff: MESSAGE" to stderr and terminate with status 2, the exit
# code this tool reserves for input errors.
die() {
  printf 'hl7-diff: %s\n' "$*" 1>&2
  exit 2
}

# Option defaults; overwritten by hl7_diff_parse_args.
IGNORE="MSH.7"   # comma-separated field paths whose differences are suppressed
INCLUDE=""       # when non-empty, only these field paths are compared
FORMAT="text"    # text | tsv | count
LEFT=""
RIGHT=""

# hl7_diff_parse_args ARG...
# Fill IGNORE / INCLUDE / FORMAT / LEFT / RIGHT from the command line and
# validate them. Every failure goes through die (exit status 2).
#   - An option that takes a value must be followed by one; previously a
#     trailing "--ignore" silently became an empty ignore list.
#   - "--" ends option parsing so that file names beginning with a dash can
#     be passed.
#   - -h / --help prints the header comment of the script and exits 0.
hl7_diff_parse_args() {
  local opts_done=0
  while [ $# -gt 0 ]; do
    if [ "$opts_done" -eq 0 ]; then
      case "$1" in
        --ignore|--include-fields|--format)
          [ $# -ge 2 ] || die "missing value for $1"
          case "$1" in
            --ignore)         IGNORE="$2" ;;
            --include-fields) INCLUDE="$2" ;;
            --format)         FORMAT="$2" ;;
          esac
          shift 2
          continue
          ;;
        -h|--help)
          sed -n '2,25p' "$NC_SELF"
          exit 0
          ;;
        --)
          opts_done=1
          shift
          continue
          ;;
        -*)
          die "unknown flag: $1"
          ;;
      esac
    fi
    # Positional operand: first one is the left file, second the right file.
    if [ -z "$LEFT" ]; then
      LEFT="$1"
    elif [ -z "$RIGHT" ]; then
      RIGHT="$1"
    else
      die "extra arg: $1"
    fi
    shift
  done

  if [ -z "$LEFT" ] || [ -z "$RIGHT" ]; then
    die "usage: hl7-diff.sh [options] <left_file> <right_file> (see --help)"
  fi
  [ -f "$LEFT" ]  || die "no such left file: $LEFT"
  [ -f "$RIGHT" ] || die "no such right file: $RIGHT"
  case "$FORMAT" in text|tsv|count) ;; *) die "bad --format" ;; esac
}

hl7_diff_parse_args "$@"

# split_messages INFILE OUTFILE
# Split INFILE into individual HL7 messages and write them to OUTFILE as
# records terminated by the record separator 0x1e (one record per message).
# Inside a record the segments are separated by a single CR (0x0d): LF and
# CRLF line ends are converted to CR, runs of CR are collapsed, and CRs at
# either end of a message are dropped. A message that is empty after this
# clean-up is not written, so a newline following the last 0x1c no longer
# produces a phantom empty message.
# Splitting mode:
#   fs     - INFILE contains the file separator byte 0x1c: every 0x1c ends a
#            message.
#   legacy - otherwise: every line starting with "MSH|" begins a new message
#            and all following lines up to the next one belong to it.
# Both tools read INFILE on stdin, so a file name is never mistaken for an
# option or an awk assignment. Returns the exit status of awk.
split_messages() {
  local infile="$1" outfile="$2" mode="legacy"
  if grep -q $'\x1c' 2>/dev/null < "$infile"; then
    mode="fs"
  fi
  awk -v mode="$mode" '
    # Normalise one message and write it as a 0x1e-terminated record.
    function put_msg(m) {
      gsub(/\r?\n/, "\r", m)
      gsub(/\r\r+/, "\r", m)
      sub(/^\r/, "", m)
      sub(/\r$/, "", m)
      if (m != "") printf "%s\036", m
    }
    BEGIN { if (mode == "fs") RS = "\034" }
    mode == "fs" { put_msg($0); next }
    /^MSH\|/ { put_msg(msg); msg = $0; next }
    { msg = msg "\r" $0 }
    END { if (mode != "fs") put_msg(msg) }
  ' < "$infile" > "$outfile"
}

# Scratch files: each receives one input split into 0x1e-terminated message
# records, ready to be read by the comparison awk further down.
TMP_L=""
TMP_R=""
# Remove whichever scratch files exist. Installed before the first mktemp so
# that a failure of the second mktemp still removes the first file.
hl7_diff_cleanup() {
  [ -z "$TMP_L" ] || rm -f -- "$TMP_L"
  [ -z "$TMP_R" ] || rm -f -- "$TMP_R"
}
trap hl7_diff_cleanup EXIT
TMP_L=$(mktemp) || die "cannot create temporary file"
TMP_R=$(mktemp) || die "cannot create temporary file"

# A failed split (for example an unreadable input) is an input error; without
# the check the comparison would silently run against an empty message list.
split_messages "$LEFT"  "$TMP_L" || die "cannot read left file: $LEFT"
split_messages "$RIGHT" "$TMP_R" || die "cannot read right file: $RIGHT"

# v0.8.26: pipe the comparison output through the tty-gated sanitizer, then
# propagate ${PIPESTATUS[0]} so awk's exit (0 identical, 1 differs) survives.
#
# The awk program that follows is one single-quoted shell string, so it must
# never contain an apostrophe, not even inside an awk comment.
# Variables handed to awk with -v:
#   IGNORE, INCLUDE  comma-separated field paths (SEG.FIELD[.COMP[.SUB]])
#   FMT              output format: text | tsv | count
#   LFILE, RFILE     the original file names, used only in the text banner
# NOTE(review): awk -v processes backslash escape sequences in the value, so a
# file name containing backslashes may be shown altered in the banner.
awk -v IGNORE="$IGNORE" -v INCLUDE="$INCLUDE" -v FMT="$FORMAT" \
    -v LFILE="$LEFT" -v RFILE="$RIGHT" '
  # ignored(seg, field, comp, subc)
  # Decide whether a difference at SEG.FIELD[.COMP[.SUB]] is suppressed.
  # Returns 1 to suppress it, 0 to report it.
  #   - INCLUDE non-empty (inclusion mode): report only when the include list
  #     names the path itself or one of its prefixes (SEG.FIELD or
  #     SEG.FIELD.COMP); everything else is suppressed. IGNORE is not consulted.
  #   - otherwise: suppress when the IGNORE list names the path or one of its
  #     prefixes.
  # comp and subc are "" when the difference is not at that depth.
  # Blanks around list entries are trimmed, so "MSH.7, PID.3" works.
  # Every working variable is declared as a local (extra parameter). They were
  # globals before, and the global i is also the message loop counter of the
  # END rule, so each call reset that loop: messages were skipped or the run
  # never finished on multi-message input.
  function ignored(seg, field, comp, subc,    fkey, ckey, skey, list, n, i, ent, verdict) {
    fkey = seg "." field
    ckey = (comp != "") ? fkey "." comp : ""
    skey = (ckey != "" && subc != "") ? ckey "." subc : ""
    if (INCLUDE != "") {
      n = split(INCLUDE, list, ",")
      verdict = 0      # a listed path is reported
    } else {
      n = split(IGNORE, list, ",")
      verdict = 1      # a listed path is suppressed
    }
    for (i = 1; i <= n; i++) {
      ent = list[i]
      gsub(/^[ \t]+|[ \t]+$/, "", ent)
      if (ent == "") continue     # never let an empty entry match an unused key
      if (ent == fkey || ent == ckey || ent == skey) return verdict
    }
    return 1 - verdict
  }

  # parse_msg(msg, out_segs)
  # Break one message into its CR-separated segments.
  # out_segs is emptied first; afterwards out_segs[k] holds the k-th piece with
  # its own first three characters and a "|" put in front ("PID|PID|1|...").
  # Empty pieces get no entry, so indexes can have gaps.
  # Returns the number of pieces the split produced, empty ones included.
  function parse_msg(msg, out_segs,    total, pos, tag, parts) {
    delete out_segs
    total = split(msg, parts, "\r")
    pos = 0
    while (++pos <= total) {
      if (parts[pos] != "") {
        tag = substr(parts[pos], 1, 3)
        out_segs[pos] = tag "|" parts[pos]
      }
    }
    return total
  }

  # compare_field(seg, fidx, lv, rv, msg_idx)
  # Compare one field of segment seg (field number fidx) between the left value
  # lv and the right value rv, and emit one row per difference that is not
  # suppressed by ignored(). Returns the number of rows emitted.
  #   - When either side contains "^" the field is compared component by
  #     component, and a differing component containing "&" on either side is
  #     compared subcomponent by subcomponent. A missing component or
  #     subcomponent counts as "".
  #   - MSH.2 holds the encoding characters themselves (normally ^~\&), so it
  #     is always compared as one opaque value and never split.
  #   - All comparisons are forced to be string comparisons by appending "".
  #     Values produced by split() that look numeric would otherwise be
  #     compared as numbers, and 00123 versus 123 (or 1.0 versus 1) would be
  #     reported as identical.
  function compare_field(seg, fidx, lv, rv, msg_idx,    n_lc, n_rc, lc, rc, lcomp, rcomp, j, k, n_lsc, n_rsc, lsub, rsub, nm, diffs, ls, rs, ns) {
    lv = lv ""
    rv = rv ""
    if (lv == rv) return 0
    # Component level: entered when EITHER side carries a component separator.
    if (!(seg == "MSH" && fidx == 2) && (lv ~ /\^/ || rv ~ /\^/)) {
      n_lc = split(lv, lcomp, "^")
      n_rc = split(rv, rcomp, "^")
      nm = (n_lc > n_rc) ? n_lc : n_rc
      diffs = 0
      for (j = 1; j <= nm; j++) {
        lc = (j <= n_lc) ? lcomp[j] "" : ""
        rc = (j <= n_rc) ? rcomp[j] "" : ""
        if (lc == rc) continue
        if (lc ~ /&/ || rc ~ /&/) {
          # Subcomponent level.
          n_lsc = split(lc, lsub, "&")
          n_rsc = split(rc, rsub, "&")
          ns = (n_lsc > n_rsc) ? n_lsc : n_rsc
          for (k = 1; k <= ns; k++) {
            ls = (k <= n_lsc) ? lsub[k] "" : ""
            rs = (k <= n_rsc) ? rsub[k] "" : ""
            if (ls != rs && !ignored(seg, fidx, j, k)) {
              emit(msg_idx, seg "." fidx "." j "." k, ls, rs)
              diffs++
            }
          }
        } else if (!ignored(seg, fidx, j, "")) {
          emit(msg_idx, seg "." fidx "." j, lc, rc)
          diffs++
        }
      }
      return diffs
    }
    # Whole-field difference.
    if (!ignored(seg, fidx, "", "")) {
      emit(msg_idx, seg "." fidx, lv, rv)
      return 1
    }
    return 0
  }

  # emit(msg_idx, path, lv, rv)
  # Record one difference: DIFF_COUNT always goes up by one, and a row is
  # printed unless FMT is "count" (that format prints only the final total,
  # from the END rule).
  #   tsv : message index, path, left value, right value, tab separated
  #   else: path, left value, right value in padded columns
  function emit(msg_idx, path, lv, rv) {
    DIFF_COUNT++
    if (FMT == "count") return
    if (FMT == "tsv") {
      printf "%d\t%s\t%s\t%s\n", msg_idx, path, lv, rv
      return
    }
    printf "  %-20s  %-30s  %s\n", path, lv, rv
  }

  # diff_segment(seg_name, lseg, rseg, msg_idx)
  # Compare two segments of the same type field by field via compare_field().
  # lseg / rseg are raw segment texts that still start with the segment name
  # ("PID|1|..."), so after splitting on "|" element 1 is the name and is
  # skipped. Field numbering:
  #   MSH   : element k is MSH.k, because MSH.1 is the "|" separator itself
  #           and element 2 is therefore MSH.2 (the encoding characters)
  #   other : element k is SEG.(k-1)
  # A field present on one side only is compared against "".
  function diff_segment(seg_name, lseg, rseg, msg_idx,    lparts, rparts, lcount, rcount, pos, last, is_msh) {
    lcount = split(lseg, lparts, "|")
    rcount = split(rseg, rparts, "|")
    last = (lcount > rcount) ? lcount : rcount
    is_msh = (seg_name == "MSH")
    for (pos = 2; pos <= last; pos++) {
      compare_field(seg_name, (is_msh ? pos : pos - 1), (pos <= lcount ? lparts[pos] : ""), (pos <= rcount ? rparts[pos] : ""), msg_idx)
    }
  }

  # diff_message(left_msg, right_msg, msg_idx)
  # Compare two messages segment by segment, pairing segments by position.
  #   - Positions whose three-character segment names differ (including a
  #     segment present on one side only, shown as "(none)") produce one
  #     SEGMENT_ORDER row and are not compared further.
  #   - Positions empty on both sides are skipped.
  #   - Otherwise the name prefix added by parse_msg() is removed and the two
  #     segments go to diff_segment().
  function diff_message(left_msg, right_msg, msg_idx,     lsegs, rsegs, lcount, rcount, pos, ltag, rtag, lraw, rraw, last) {
    lcount = parse_msg(left_msg,  lsegs)
    rcount = parse_msg(right_msg, rsegs)
    last = (lcount > rcount) ? lcount : rcount
    for (pos = 1; pos <= last; pos++) {
      lraw = (pos <= lcount) ? lsegs[pos] : ""
      rraw = (pos <= rcount) ? rsegs[pos] : ""
      ltag = (lraw == "") ? "(none)" : substr(lraw, 1, 3)
      rtag = (rraw == "") ? "(none)" : substr(rraw, 1, 3)
      if (ltag != rtag) {
        emit(msg_idx, "SEGMENT_ORDER", ltag, rtag)
      } else if (ltag != "(none)") {
        # drop the "NAME|" prefix that parse_msg put in front
        sub(/^[A-Z0-9]+\|/, "", lraw)
        sub(/^[A-Z0-9]+\|/, "", rraw)
        diff_segment(ltag, lraw, rraw, msg_idx)
      }
    }
  }

  BEGIN {
    DIFF_COUNT = 0; n_l = 0; n_r = 0
    # Name of the left-hand input: the first command-line operand that is not
    # a var=value assignment.
    LEFT_INPUT = ""
    for (argi = 1; argi < ARGC; argi++) {
      if (ARGV[argi] ~ /^[A-Za-z_][A-Za-z0-9_]*=/) continue
      LEFT_INPUT = ARGV[argi]
      break
    }
  }

  # Collect the message records of both inputs. Records are attributed by file
  # name rather than by FNR == NR: that test is also true for every record of
  # the second file when the first file has no records at all, which filed the
  # right-hand messages under the left side.
  FILENAME == LEFT_INPUT { L_MSGS[++n_l] = $0; next }
  { R_MSGS[++n_r] = $0 }

  END {
    # F-1 fix (2026-06-08): the early-exit `if (FMT=="count")` that used to sit
    # here fired BEFORE the diff loop ran, so DIFF_COUNT was always 0.  The loop
    # is now unconditional; count output is emitted AFTER the loop at the bottom.
    nm = (n_l > n_r) ? n_l : n_r
    if (FMT == "text") {
      printf "HL7 diff:\n  left:  %s  (%d messages)\n  right: %s  (%d messages)\n  ignore: %s\n", LFILE, n_l, RFILE, n_r, IGNORE
      if (INCLUDE != "") printf "  include-only: %s\n", INCLUDE
      printf "\n"
    }
    if (n_l != n_r) {
      if (FMT == "tsv") printf "0\tMESSAGE_COUNT\t%d\t%d\n", n_l, n_r
      else if (FMT != "count") printf "  MESSAGE COUNT mismatch: %d vs %d\n", n_l, n_r
      DIFF_COUNT++
    }
    # The loop counter has a name no helper function uses. All variables of a
    # rule are global in awk, so a helper that assigns a same-named variable
    # would otherwise move this loop.
    for (msg_no = 1; msg_no <= nm; msg_no++) {
      lm = (msg_no <= n_l) ? L_MSGS[msg_no] : ""
      rm = (msg_no <= n_r) ? R_MSGS[msg_no] : ""
      if (lm == "" || rm == "") {
        emit(msg_no, "MESSAGE_PRESENCE", (lm != "" ? "present" : "missing"), (rm != "" ? "present" : "missing"))
        continue
      }
      # Text rows carry no message number, so every compared message gets its
      # header; otherwise rows of a later message appear under an earlier
      # header.
      if (FMT == "text") printf "----- message %d -----\n", msg_no
      diff_message(lm, rm, msg_no)
    }
    if (FMT == "count") { print DIFF_COUNT; exit (DIFF_COUNT > 0 ? 1 : 0) }
    if (FMT == "text") printf "\n%d total field difference(s)\n", DIFF_COUNT
    exit (DIFF_COUNT > 0 ? 1 : 0)
  }
' RS=$'\x1e' "$TMP_L" "$TMP_R" | _sanitize_ctl_tty
# Exit with the status of awk (first pipeline stage: 0 identical, 1
# differences found), not with the status of the sanitizer stage.
exit "${PIPESTATUS[0]}"