cloverleaf-larry/lib/hl7-sanitize.sh

#!/usr/bin/env bash
# hl7-sanitize.sh — tokenize PHI fields in HL7 v2 messages.
#
# Replaces PHI-likely fields with deterministic local tokens like [[MRN_0001]],
# [[NAME_0001]], etc. Same value → same token across messages, so analysis
# downstream can still correlate patients, find duplicates, etc.
#
# The token ↔ original mapping is stored LOCAL ONLY at:
#   $LARRY_HOME/sanitize/lookup.tsv   (mode 0600)
#
# Use hl7-desanitize.sh to reverse the operation when viewing results locally.
#
# Usage:
#   hl7-sanitize.sh [FILE]                 # sanitize file (or stdin); writes lookup
#   hl7-sanitize.sh --strict [FILE]        # also tokenize unrecognized Z* segments wholesale
#   hl7-sanitize.sh --no-update-table      # use existing tokens; do not create new ones
#   hl7-sanitize.sh --table PATH           # use a different lookup table
#   hl7-sanitize.sh --rules-file PATH      # override the default PHI rules
#
# PHI rule format (one per line):
#   SEGMENT|FIELD_NUM|CATEGORY
#   e.g.   PID|3|MRN
#
# Subcommands (when first arg is one of these):
#   show-rules            print the default PHI-field rules
#   show-table            print the current lookup table (PHI in clear — local only)
#   clear-table           wipe the table (asks for confirmation)
#   count                 number of token entries in the table
set -o pipefail

# Path of this script; the help output re-reads the header from it.
NC_SELF="$0"

# Root of the local state directory. A non-empty LARRY_HOME from the
# environment wins; otherwise fall back to ~/.larry.
: "${LARRY_HOME:=$HOME/.larry}"

# Directory and default file that hold the token <-> original lookup table.
TABLE_DIR="${LARRY_HOME}/sanitize"
DEFAULT_TABLE="${TABLE_DIR}/lookup.tsv"

# die MSG...
# Print MSG (all arguments joined by spaces) to stderr, prefixed with the tool
# name, then terminate the current shell with status 1.
die() {
  printf 'hl7-sanitize: %s\n' "$*" >&2
  exit 1
}

# ─────────────────────────────────────────────────────────────────────────────
# Default PHI rules — the standard set Bryan can override via --rules-file.
# Each rule:  SEGMENT|FIELD|CATEGORY
# ─────────────────────────────────────────────────────────────────────────────
# Built-in rule list: one SEGMENT|FIELD|CATEGORY entry per line; lines that
# start with "#" are section labels and are skipped by the rule parser.
# The quoted heredoc keeps the text literal; command substitution drops the
# final newline, so the value ends at the last rule.
PHI_RULES_DEFAULT=$(cat <<'PHI_RULES_END'
# Patient identification
PID|2|ALTID
PID|3|MRN
PID|4|ALTID
PID|5|NAME
PID|6|NAME
PID|7|DOB
PID|8|SEX
PID|9|NAME
PID|11|ADDR
PID|12|REGION
PID|13|PHONE
PID|14|PHONE
PID|18|ACCT
PID|19|SSN
PID|20|LIC
PID|21|MRN
PID|29|DOD
PID|30|DEATHFLAG
# Patient visit
PV1|7|PROV
PV1|8|PROV
PV1|9|PROV
PV1|17|PROV
PV1|19|VISIT
PV1|50|VISIT
PV1|52|PROV
# Next of kin
NK1|2|NAME
NK1|3|RELATIONSHIP
NK1|4|ADDR
NK1|5|PHONE
NK1|6|PHONE
NK1|16|DOB
# Guarantor
GT1|3|NAME
GT1|4|NAME
GT1|5|ADDR
GT1|6|PHONE
GT1|7|PHONE
GT1|11|DOB
GT1|12|SSN
GT1|19|EMP
# Insurance
IN1|16|NAME
IN1|17|DOB
IN1|18|NAME
IN1|19|ADDR
IN1|20|SSN
IN1|36|INSPOL
IN1|49|INSID
# Observation / orders
OBR|10|PROV
OBR|16|PROV
OBR|32|PROV
OBX|16|PROV
DG1|3|DIAG
DG1|4|DIAG
# Order Common
ORC|10|PROV
ORC|12|PROV
PHI_RULES_END
)

# show-rules: write the built-in PHI rule list to stdout, followed by a newline.
cmd_show_rules() {
  local rules_text="$PHI_RULES_DEFAULT"
  printf '%s\n' "$rules_text"
}

# show-table [TABLE]
# Dump the lookup table (default: $DEFAULT_TABLE) to stdout exactly as stored,
# i.e. with the original values in clear text. When the file does not exist a
# "no table at ..." notice is printed instead and the status is still 0.
cmd_show_table() {
  local lookup_path="${1:-$DEFAULT_TABLE}"
  if [ -f "$lookup_path" ]; then
    cat "$lookup_path"
  else
    echo "no table at $lookup_path"
    return 0
  fi
}

# clear-table [TABLE] [--yes]
# Reset the lookup table to just its header row.
#
# Arguments (any order):
#   TABLE   path of the table to wipe; defaults to $DEFAULT_TABLE. Only the
#           first non-flag argument is used.
#   --yes   skip the interactive confirmation.
# Output:   progress/notice lines on stdout, write failures on stderr.
# Returns:  0 when the table was cleared or does not exist, 1 when the user
#           declined or the file could not be rewritten.
cmd_clear_table() {
  local table="" yes=0 ans="" arg
  # Accept --yes in any position: previously "clear-table --yes" treated the
  # flag as the table path and therefore never touched the default table.
  for arg in "$@"; do
    case "$arg" in
      --yes) yes=1 ;;
      *)     [ -n "$table" ] || table="$arg" ;;
    esac
  done
  table="${table:-$DEFAULT_TABLE}"

  [ -f "$table" ] || { echo "no table at $table"; return 0; }

  if [ "$yes" != "1" ]; then
    printf 'clear lookup table at %s? [y/N]: ' "$table"
    # Read the answer from the terminal, not stdin; no terminal means "no".
    read -r ans </dev/tty || ans=""
    [[ "$ans" =~ ^[Yy]$ ]] || { echo "aborted"; return 1; }
  fi

  umask 077
  # Do not claim success when the truncating write failed.
  if ! printf 'token\tcategory\toriginal\n' > "$table"; then
    printf 'hl7-sanitize: cannot rewrite %s\n' "$table" >&2
    return 1
  fi
  chmod 600 "$table"
  echo "cleared $table"
}

# count [TABLE]
# Print the number of token entries in the lookup table (default:
# $DEFAULT_TABLE). The first line is the header and is never counted; blank
# lines are ignored. A missing or empty file yields 0 rather than a negative
# number, and a final row without a trailing newline is still counted.
cmd_count() {
  local table="${1:-$DEFAULT_TABLE}"
  [ -f "$table" ] || { echo 0; return 0; }
  awk 'NR > 1 && length($0) > 0 { n++ } END { print n + 0 }' "$table"
}

# tokenize-value [--category CAT] [--table PATH] VALUE
# Print the token that stands for VALUE, creating and storing a new one when
# the lookup table has no matching row.
#
#   --category CAT  look up / create the token inside category CAT. Without it,
#                   the first row whose original equals VALUE (any category)
#                   is reused, and a new token is filed under "MANUAL".
#   --table PATH    use PATH instead of $DEFAULT_TABLE.
#
# Output:  the token on stdout.
# Exits:   1 (via die) on missing VALUE / option argument or a failed table
#          read or write; 2 after printing usage for -h/--help.
#
# Values are handed to awk through the environment, not -v: -v interprets
# backslash escapes, which mangles HL7 escape sequences such as \T\ so the
# stored row would never match again. Comparisons append "" to force string
# semantics; otherwise awk compares numeric-looking text numerically and
# "00123" would be treated as equal to "123".
# NOTE: a VALUE containing a tab or newline cannot be represented in the
# three-column TSV; such values are not rejected here.
cmd_tokenize_value() {
  local table="$DEFAULT_TABLE"
  local category=""
  local value=""
  while [ $# -gt 0 ]; do
    case "$1" in
      --category)
        [ $# -ge 2 ] || die "--category requires a value"
        shift; category="$1" ;;
      --table)
        [ $# -ge 2 ] || die "--table requires a PATH"
        shift; table="$1" ;;
      -h|--help)  echo "usage: tokenize-value [--category CAT] VALUE" >&2; exit 2 ;;
      *)          value="$1" ;;
    esac
    shift
  done
  [ -n "$value" ] || die "tokenize-value needs a VALUE"

  init_table "$table"

  # Existing token? An empty category means "match the value in any category".
  local tok
  tok=$(HL7S_CAT="$category" HL7S_VAL="$value" awk -F'\t' '
    BEGIN { want_cat = ENVIRON["HL7S_CAT"]; want_val = ENVIRON["HL7S_VAL"] }
    NR > 1 && ($3 "") == (want_val "") && (want_cat == "" || ($2 "") == (want_cat "")) {
      print $1
      exit
    }
  ' "$table")
  if [ -n "$tok" ]; then
    printf '%s\n' "$tok"
    return 0
  fi

  [ -n "$category" ] || category="MANUAL"

  # Next sequence number = highest numeric suffix used in this category + 1.
  local nextnum
  nextnum=$(HL7S_CAT="$category" awk -F'\t' '
    BEGIN { want_cat = ENVIRON["HL7S_CAT"] }
    NR > 1 && ($2 "") == (want_cat "") && match($1, /_[0-9]+\]\]$/) {
      n = substr($1, RSTART + 1, RLENGTH - 3) + 0
      if (n > max) max = n
    }
    END { print max + 1 }
  ' "$table")
  [ -n "$nextnum" ] || die "cannot read lookup table: $table"

  tok=$(printf '[[%s_%04d]]' "$category" "$nextnum")
  umask 077
  printf '%s\t%s\t%s\n' "$tok" "$category" "$value" >> "$table" \
    || die "cannot write lookup table: $table"
  chmod 600 "$table" 2>/dev/null || true
  printf '%s\n' "$tok"
}

# detokenize-value [--table PATH] TOKEN
# Inverse of tokenize-value: print the original value stored for TOKEN.
#   --table PATH   read PATH instead of $DEFAULT_TABLE.
# Exits 1 (via die) when TOKEN is missing or the table file does not exist,
# exits 2 after usage for -h/--help, and returns 2 with a message on stderr
# when the table holds no row for TOKEN.
cmd_detokenize_value() {
  local lookup_path="$DEFAULT_TABLE"
  local wanted=""
  local original

  while [ $# -gt 0 ]; do
    case "$1" in
      --table)
        shift
        lookup_path="$1"
        ;;
      -h|--help)
        echo "usage: detokenize-value TOKEN" >&2
        exit 2
        ;;
      *)
        wanted="$1"
        ;;
    esac
    shift
  done

  if [ -z "$wanted" ]; then
    die "detokenize-value needs a TOKEN"
  fi
  if [ ! -f "$lookup_path" ]; then
    die "no lookup table at $lookup_path"
  fi

  # First data row whose token column equals TOKEN wins; column 3 is the original.
  original=$(awk -F'\t' -v t="$wanted" 'NR>1 && $1==t { print $3; exit }' "$lookup_path")

  if [ -n "$original" ]; then
    printf '%s\n' "$original"
  else
    printf 'no such token: %s\n' "$wanted" >&2
    return 2
  fi
}

# init_table TABLE
# Make sure TABLE exists: create its parent directory if needed and write the
# header row when the file is absent. An existing table is left untouched.
#
# Permissions: a directory created here is set to 0700 and a new table to
# 0600. A directory that already exists is only tightened when it is the
# default $TABLE_DIR; with a custom --table path the parent may be the current
# directory or a shared location, and changing its mode would lock out other
# uses of that directory.
# Side effect: sets the process umask to 077 when it has to create the file.
# Returns: 0 on success or when the table already exists, non-zero when the
#          table could not be created (message on stderr).
init_table() {
  local table="$1"
  local dir
  dir=$(dirname "$table")

  if [ ! -d "$dir" ]; then
    mkdir -p "$dir" 2>/dev/null && chmod 700 "$dir" 2>/dev/null
  elif [ "$dir" = "${TABLE_DIR:-}" ]; then
    # Best effort: keep the default PHI directory private.
    chmod 700 "$dir" 2>/dev/null || true
  fi

  if [ ! -f "$table" ]; then
    umask 077
    if ! printf 'token\tcategory\toriginal\n' > "$table"; then
      printf 'hl7-sanitize: cannot create lookup table: %s\n' "$table" >&2
      return 1
    fi
    chmod 600 "$table"
  fi
}

# ─────────────────────────────────────────────────────────────────────────────
# Main sanitize logic — single awk pass over HL7 messages.
# ─────────────────────────────────────────────────────────────────────────────
# do_sanitize INPUT_FILE RULES TABLE STRICT UPDATE_TABLE
# Tokenize PHI fields of HL7 v2 messages in one awk pass.
#
#   INPUT_FILE    file to read; empty or "-" means stdin.
#   RULES         rule text, one SEGMENT|FIELD|CATEGORY per line ("#" = comment).
#   TABLE         lookup table (TSV: token, category, original).
#   STRICT        "1" = replace the whole body of Z segments that have no rule.
#   UPDATE_TABLE  "1" = append newly created tokens to TABLE. With any other
#                 value unseen values still receive fresh tokens in the output,
#                 but those tokens are not written back.
#
# Output:  sanitized messages on stdout (every record is terminated by CR),
#          a one-line summary on stderr.
# Returns: the exit status of awk; unreadable input or mktemp failure is fatal.
#
# Segments are normally separated by CR. Files saved with LF or CRLF line
# endings are handled as well: every LF is treated as an additional segment
# boundary and is copied to the output unchanged. Without that, such files
# would pass through with their PHI intact.
do_sanitize() {
  local input_file="$1"
  local rules="$2"
  local table="$3"
  local strict="$4"
  local update_table="$5"

  local from_stdin=1
  if [ -n "$input_file" ] && [ "$input_file" != "-" ]; then
    from_stdin=0
    [ -r "$input_file" ] || die "cannot read input file: $input_file"
  fi

  init_table "$table"

  # Write rules to a temp file because awk -v can't carry newlines on macOS
  local rules_tmp
  rules_tmp=$(mktemp) || die "mktemp failed"
  printf '%s\n' "$rules" > "$rules_tmp"

  local awk_script
  awk_script=$(cat <<'AWK_END'
BEGIN {
  ORS = "\r"
  # Auxiliary files (rules, lookup table) are newline-delimited.
  RS = "\n"
  while ((getline line < RULES_FILE) > 0) {
    gsub(/^[[:space:]]+|[[:space:]]+$/, "", line)
    if (line == "" || substr(line, 1, 1) == "#") continue
    split(line, p, "|")
    if (p[1] == "" || p[2] == "" || p[3] == "") continue
    rule_cat[p[1] SUBSEP (p[2] + 0)] = p[3]
    has_seg[p[1]] = 1
  }
  close(RULES_FILE)

  # n_total mirrors the number of distinct (category, value) keys known so
  # far; kept by hand because length(array) is not available in every awk.
  n_total = 0
  while ((getline tline < TABLE) > 0) {
    if (tline ~ /^token\t/) continue
    split(tline, c, "\t")
    if (c[1] == "" || c[3] == "") continue
    key = c[2] SUBSEP c[3]
    if (!(key in token_for)) n_total++
    token_for[key] = c[1]
    # Remember the highest sequence number already used per category.
    if (match(c[1], /_[0-9]+\]\]$/)) {
      num = substr(c[1], RSTART + 1, RLENGTH - 3) + 0
      if (num > counter[c[2]]) counter[c[2]] = num
    }
  }
  close(TABLE)

  # Switch to CR for the main input (HL7 segments).
  RS = "\r"
}

# Return the token for val in category cat, creating one if needed.
# Empty values and the HL7 null marker "" are returned unchanged.
function tokenize(val, cat,    key, t) {
  if (val == "" || val == "\"\"") return val
  key = cat SUBSEP val
  if (key in token_for) return token_for[key]
  counter[cat]++
  t = sprintf("[[%s_%04d]]", cat, counter[cat])
  token_for[key] = t
  n_total++
  if (UPDATE_TABLE == "1") {
    new_entries[++n_new] = t "\t" cat "\t" val
  }
  return t
}

# Return segment text s with every rule-covered field replaced by its token.
function sanitize_segment(s,    seg, body, nf, f, idx, fnum, key, out, j) {
  if (length(s) < 3) return s
  seg = substr(s, 1, 3)

  # Strict mode: a Z segment without any rule has its whole body tokenized.
  if (STRICT == "1" && substr(seg, 1, 1) == "Z" && !(seg in has_seg)) {
    body = substr(s, 5)  # skip "Zxx|"
    if (body == "") return s
    return seg "|" tokenize(body, "Z" substr(seg, 2, 2))
  }

  if (!(seg in has_seg)) return s

  nf = split(s, f, "|")
  # f[1] is the segment ID and is never tokenized. For MSH, f[2] holds the
  # encoding characters, i.e. MSH.N is f[N]; elsewhere SEG.N is f[N+1].
  # Walking the fields left to right keeps token numbering independent of
  # the array iteration order of the awk in use.
  for (idx = 2; idx <= nf; idx++) {
    fnum = (seg == "MSH") ? idx : idx - 1
    key = seg SUBSEP fnum
    if (key in rule_cat) f[idx] = tokenize(f[idx], rule_cat[key])
  }
  out = f[1]
  for (j = 2; j <= nf; j++) out = out "|" f[j]
  return out
}

{
  # One CR-delimited record. Short records such as a lone 0x1c message
  # separator come back from sanitize_segment unchanged. LF inside the
  # record separates segments of LF/CRLF-style files and is preserved.
  n_parts = split($0, part, "\n")
  rec_out = ""
  for (pn = 1; pn <= n_parts; pn++) {
    if (pn > 1) rec_out = rec_out "\n"
    rec_out = rec_out sanitize_segment(part[pn])
  }
  print rec_out
}

END {
  if (UPDATE_TABLE == "1" && n_new > 0) {
    # Explicit \n: ORS is CR for the HL7 stream, the table is line-based.
    for (i = 1; i <= n_new; i++) printf "%s\n", new_entries[i] >> TABLE
    close(TABLE)
  }
  printf "hl7-sanitize: %d new token(s) created; %d total in %s\n", \
    n_new, n_total, TABLE > "/dev/stderr"
}
AWK_END
)

  # Feed the input through stdin rather than as an awk operand: an operand
  # such as "a=b.hl7" would be parsed by awk as a variable assignment.
  local rc=0
  if [ "$from_stdin" = "0" ]; then
    awk -v RULES_FILE="$rules_tmp" -v TABLE="$table" -v STRICT="$strict" \
        -v UPDATE_TABLE="$update_table" \
        "$awk_script" < "$input_file" || rc=$?
  else
    awk -v RULES_FILE="$rules_tmp" -v TABLE="$table" -v STRICT="$strict" \
        -v UPDATE_TABLE="$update_table" \
        "$awk_script" || rc=$?
  fi

  rm -f -- "$rules_tmp"
  return "$rc"
}

# ─────────────────────────────────────────────────────────────────────────────
# Dispatch
# ─────────────────────────────────────────────────────────────────────────────
# The first argument selects a subcommand; anything else falls through to
# sanitize mode, where the remaining arguments are flags plus at most one
# input FILE (stdin when none is given).
SUB="${1:-}"
case "$SUB" in
  show-rules)       shift; cmd_show_rules ;;
  show-table)       shift; cmd_show_table "$@" ;;
  clear-table)      shift; cmd_clear_table "$@" ;;
  count)            shift; cmd_count "$@" ;;
  tokenize-value)   shift; cmd_tokenize_value "$@" ;;
  detokenize-value) shift; cmd_detokenize_value "$@" ;;
  -h|--help)   sed -n '2,30p' "$NC_SELF"; exit 0 ;;
  *)
    # Default = sanitize mode
    input_file=""
    rules="$PHI_RULES_DEFAULT"
    table="$DEFAULT_TABLE"
    strict=0
    update_table=1
    while [ $# -gt 0 ]; do
      case "$1" in
        --strict)            strict=1 ;;
        --no-update-table)   update_table=0 ;;
        --table)
          # Without this guard a trailing --table left the path empty and the
          # run carried on with no usable lookup table.
          if [ $# -lt 2 ] || [ -z "$2" ]; then
            die "--table requires a PATH"
          fi
          shift; table="$1" ;;
        --rules-file)
          if [ $# -lt 2 ] || [ -z "$2" ]; then
            die "--rules-file requires a PATH"
          fi
          shift
          [ -f "$1" ] || die "no such rules file: $1"
          rules=$(cat "$1") || die "cannot read rules file: $1" ;;
        -h|--help)           sed -n '2,30p' "$NC_SELF"; exit 0 ;;
        -*)                  die "unknown flag: $1" ;;
        *)
          # Only one input is processed; refuse extras instead of silently
          # dropping all but the last one.
          [ -z "$input_file" ] || die "only one input FILE is supported (got '$input_file' and '$1')"
          input_file="$1" ;;
      esac
      shift
    done
    do_sanitize "$input_file" "$rules" "$table" "$strict" "$update_table"
    ;;
esac