#!/usr/bin/env bash
# hl7-sanitize.sh — tokenize PHI fields in HL7 v2 messages.
#
# Replaces PHI-likely fields with deterministic local tokens like [[MRN_0001]],
# [[NAME_0001]], etc. Same value → same token across messages, so analysis
# downstream can still correlate patients, find duplicates, etc.
#
# The token ↔ original mapping is stored LOCAL ONLY at:
#   $LARRY_HOME/sanitize/lookup.tsv   (mode 0600)
#
# Use hl7-desanitize.sh to reverse the operation when viewing results locally.
#
# Usage:
#   hl7-sanitize.sh [FILE]              # sanitize file (or stdin); writes lookup
#   hl7-sanitize.sh --strict [FILE]     # also tokenize unrecognized Z* segments wholesale
#   hl7-sanitize.sh --no-update-table   # use existing tokens; do not create new ones
#   hl7-sanitize.sh --table PATH        # use a different lookup table
#   hl7-sanitize.sh --rules-file PATH   # override the default PHI rules
#
# PHI rule format (one per line):
#   SEGMENT|FIELD_NUM|CATEGORY
#   e.g.   PID|3|MRN
#
# Subcommands (when first arg is one of these):
#   show-rules       print the default PHI-field rules
#   show-table       print the current lookup table (PHI in clear — local only)
#   clear-table      wipe the table (asks for confirmation)
#   count            number of token entries in the table

set -o pipefail

NC_SELF="$0"
NC_LIB_DIR="$(cd "$(dirname "$NC_SELF")" && pwd)"
LARRY_HOME="${LARRY_HOME:-$HOME/.larry}"
TABLE_DIR="$LARRY_HOME/sanitize"
DEFAULT_TABLE="$TABLE_DIR/lookup.tsv"

# v0.8.26: shared control-byte sanitizer. Sanitized HL7 and the show-table dump
# echo message/field bytes that can include C0 controls (and 0x1c framing on
# raw input) that corrupt a terminal when viewed un-redirected. Strip them ONLY
# when stdout is a tty; on a pipe/redirect (feeding analysis tooling) the bytes
# pass through raw and byte-identical. See lib/cygwin-safe.sh.
if [ -r "$NC_LIB_DIR/cygwin-safe.sh" ]; then
  # shellcheck disable=SC1090,SC1091
  . "$NC_LIB_DIR/cygwin-safe.sh"
fi
# Degrade safe: raw passthrough if the lib is missing OR if it was sourced but
# does not provide the filter (otherwise the final dispatch pipeline would fail).
if ! declare -F _sanitize_ctl_tty >/dev/null 2>&1; then
  _sanitize_ctl_tty() { cat; }
fi

# die MESSAGE... — print a prefixed error to stderr and exit 1.
die() {
  printf 'hl7-sanitize: %s\n' "$*" >&2
  exit 1
}

# ─────────────────────────────────────────────────────────────────────────────
# Default PHI rules — the standard set Bryan can override via --rules-file.
# Each rule: SEGMENT|FIELD|CATEGORY
# ─────────────────────────────────────────────────────────────────────────────
PHI_RULES_DEFAULT='# Patient identification
PID|2|ALTID
PID|3|MRN
PID|4|ALTID
PID|5|NAME
PID|6|NAME
PID|7|DOB
PID|8|SEX
PID|9|NAME
PID|11|ADDR
PID|12|REGION
PID|13|PHONE
PID|14|PHONE
PID|18|ACCT
PID|19|SSN
PID|20|LIC
PID|21|MRN
PID|29|DOD
PID|30|DEATHFLAG
# Patient visit
PV1|7|PROV
PV1|8|PROV
PV1|9|PROV
PV1|17|PROV
PV1|19|VISIT
PV1|50|VISIT
PV1|52|PROV
# Next of kin
NK1|2|NAME
NK1|3|RELATIONSHIP
NK1|4|ADDR
NK1|5|PHONE
NK1|6|PHONE
NK1|16|DOB
# Guarantor
GT1|3|NAME
GT1|4|NAME
GT1|5|ADDR
GT1|6|PHONE
GT1|7|PHONE
GT1|11|DOB
GT1|12|SSN
GT1|19|EMP
# Insurance
IN1|16|NAME
IN1|17|DOB
IN1|18|NAME
IN1|19|ADDR
IN1|20|SSN
IN1|36|INSPOL
IN1|49|INSID
# Observation / orders
OBR|10|PROV
OBR|16|PROV
OBR|32|PROV
OBX|16|PROV
DG1|3|DIAG
DG1|4|DIAG
# Order Common
ORC|10|PROV
ORC|12|PROV'

# cmd_show_rules — print the built-in PHI rule set to stdout.
cmd_show_rules() {
  printf '%s\n' "$PHI_RULES_DEFAULT"
}

# cmd_show_table [TABLE] — dump the lookup table (PHI in clear) to stdout.
# Prints a notice and returns 0 when the table does not exist.
cmd_show_table() {
  local table="${1:-$DEFAULT_TABLE}"
  [ -f "$table" ] || { echo "no table at $table"; return 0; }
  cat -- "$table"
}

# cmd_clear_table [TABLE] [--yes] — reset the lookup table to just its header.
# Without --yes the user is asked to confirm; the answer is read from /dev/tty
# because this function's stdout is piped through the tty sanitizer by the
# dispatcher. Returns 1 when the user declines or no answer can be read.
# NOTE(review): the confirmation/truncation statements were damaged in the
# source this was restored from; the accepted answers and the header rewrite
# below are a reconstruction — confirm against the released script.
cmd_clear_table() {
  local table="${1:-$DEFAULT_TABLE}"
  local yes=0
  [ "${2:-}" = "--yes" ] && yes=1
  [ -f "$table" ] || { echo "no table at $table"; return 0; }
  if [ "$yes" != "1" ]; then
    local ans=""
    printf 'clear lookup table at %s? [y/N]: ' "$table"
    # A missing/unreadable tty leaves ans empty, which is treated as "no".
    read -r ans </dev/tty 2>/dev/null || ans=""
    case "$ans" in
      y|Y|yes|YES|Yes) ;;
      *) echo "aborted"; return 1 ;;
    esac
  fi
  # Keep the header row so cmd_count's "lines minus header" stays correct.
  printf 'token\tcategory\tcanonical\toriginal\n' > "$table"
  chmod 600 "$table"
  echo "cleared $table"
}

# cmd_count [TABLE] — print the number of token rows (lines minus the header).
# Prints 0 when the table is missing or has no lines at all.
cmd_count() {
  local table="${1:-$DEFAULT_TABLE}"
  [ -f "$table" ] || { echo 0; return 0; }
  # v0.7.5: strip non-digits at the wc boundary — Cygwin wc.exe CR-taint
  # defense for the surrounding arithmetic.
  local _n
  _n=$(wc -l < "$table" | tr -cd '0-9')
  _n=$(( ${_n:-0} - 1 ))
  # A zero-line file would otherwise report -1 entries.
  [ "$_n" -lt 0 ] && _n=0
  echo "$_n"
}

# ─────────────────────────────────────────────────────────────────────────────
# Auto-category detection + value canonicalization.
# Canonicalization makes "SMITH^JOHN", "Smith, John", "John Smith", and
# "JOHN SMITH" all collapse to the same lookup key so they share one token.
# ─────────────────────────────────────────────────────────────────────────────

# detect_category VALUE — guess category from value shape.
# Returns one of: MRN | SSN | DOB | NAME | MANUAL
# Checks run in that order and the first match wins, so nine bare digits are
# reported as MRN; only dashed forms such as 123-45-6789 reach the SSN test.
detect_category() {
  local v="$1"
  # Trim surrounding whitespace
  v="${v#"${v%%[![:space:]]*}"}"
  v="${v%"${v##*[![:space:]]}"}"

  # Pure digits, 4-15 chars → MRN
  if [[ "$v" =~ ^[0-9]+$ ]] && [ "${#v}" -ge 4 ] && [ "${#v}" -le 15 ]; then
    printf 'MRN'
    return
  fi
  # SSN: 9 digits with optional dashes
  if [[ "$v" =~ ^[0-9]{3}-?[0-9]{2}-?[0-9]{4}$ ]]; then
    printf 'SSN'
    return
  fi
  # Date-like
  if [[ "$v" =~ ^[0-9]{1,4}[/-][0-9]{1,2}[/-][0-9]{1,4}$ ]]; then
    printf 'DOB'
    return
  fi
  # HL7 caret or comma → likely name
  if [[ "$v" == *"^"* ]] || [[ "$v" == *","* ]]; then
    printf 'NAME'
    return
  fi
  # Two+ alpha tokens → name
  local word_count
  word_count=$(printf '%s\n' "$v" \
    | awk '{ c=0; for(i=1;i<=NF;i++) if ($i ~ /^[A-Za-z]+$/) c++; print c }')
  if [ "${word_count:-0}" -ge 2 ]; then
    printf 'NAME'
    return
  fi
  printf 'MANUAL'
}

# normalize_value VALUE CATEGORY — produce canonical lookup key.
#   NAME: lowercase, strip punctuation, sort unique alpha tokens.
#         So "SMITH^JOHN", "Smith, John", "John Smith", "JOHN SMITH" → "john smith"
#   DOB:  parse to YYYY-MM-DD if possible (needs GNU date or BSD date)
#   SSN:  strip dashes/whitespace
#   PHONE: digits only
#   EMAIL: lowercase, outer whitespace trimmed
#   NPI:  strip dashes/whitespace
#   MRN/MANUAL/other: trim outer whitespace (awk field rebuild, so inner
#         whitespace runs also collapse to a single space)
# Output: canonical form on stdout (NAME/DOB/SSN/PHONE/NPI without a trailing
# newline; EMAIL/default with the newline awk adds).
normalize_value() {
  local value="$1"
  local category="$2"
  case "$category" in
    NAME)
      # grep exits 1 when no token survives (e.g. an all-digit value); that is
      # a legitimate empty result, not a failure, so keep it from poisoning the
      # pipeline status under `set -o pipefail`.
      printf '%s' "$value" \
        | tr '[:upper:]' '[:lower:]' \
        | tr ',^/' '   ' \
        | tr -cs '[:alnum:]' ' ' \
        | tr ' ' '\n' \
        | { grep -E '^[a-z][a-z0-9]*$' || true; } \
        | sort -u \
        | tr '\n' ' ' \
        | sed 's/ *$//'
      ;;
    DOB)
      local compact parsed="" dateish=0
      compact=$(printf '%s' "$value" | tr -d '[:space:]')
      # Only hand date-shaped input to date(1). GNU `date -d` also accepts
      # relative/partial expressions ("now", "1", "2020" → today's date), which
      # would make the lookup key depend on the day the script is run.
      if [[ "$compact" =~ ^[0-9]+$ ]]; then
        # Bare digits are only treated as a date in the HL7 YYYYMMDD form.
        [ "${#compact}" -eq 8 ] && dateish=1
      elif [[ "$compact" =~ ^[0-9]{1,4}[/-][0-9]{1,2}[/-][0-9]{1,4}$ ]] \
        || [[ "$value" =~ [0-9]{4} ]]; then
        dateish=1
      fi
      if [ "$dateish" = "1" ]; then
        # GNU date first, then the BSD `-j -f FORMAT` spellings.
        parsed=$(date -d "$value" +%Y-%m-%d 2>/dev/null) \
          || parsed=$(date -j -f "%m/%d/%Y" "$value" +%Y-%m-%d 2>/dev/null) \
          || parsed=$(date -j -f "%Y-%m-%d" "$value" +%Y-%m-%d 2>/dev/null) \
          || parsed=$(date -j -f "%Y%m%d" "$value" +%Y-%m-%d 2>/dev/null) \
          || parsed=""
      fi
      if [ -n "$parsed" ]; then
        printf '%s' "$parsed"
      else
        # Unparseable: fall back to the value with all whitespace removed.
        printf '%s' "$compact"
      fi
      ;;
    SSN)
      # Note: '-' must come at the END of the set so BSD tr doesn't treat
      # it as an option flag. Same caveat applies for any future tr -d call.
      printf '%s' "$value" | tr -d '[:space:]-'
      ;;
    PHONE)
      # v0.7.3: digits-only canonical so "(555) 123-4567", "5551234567",
      # "555.123.4567", "555-123-4567" all collapse to one token.
      # (Originally added in af2ffe8 — reverted with v0.7.1; re-added per
      # v0.7.3 auto-PHI spec.)
      printf '%s' "$value" | tr -cd '[:digit:]'
      ;;
    EMAIL)
      # v0.7.3: lowercase + outer-trim. RFC 5321 says the local-part is
      # technically case-sensitive but in practice every mainstream provider
      # treats it case-insensitively, so collapsing for PHI purposes is safe.
      # (Originally added in af2ffe8 — reverted with v0.7.1; re-added per
      # v0.7.3 auto-PHI spec.)
      printf '%s' "$value" | tr '[:upper:]' '[:lower:]' | awk '{$1=$1; print}'
      ;;
    NPI)
      # v0.7.3: National Provider Identifier — 10 digits. Strip whitespace
      # to canonical 10-digit form.
      printf '%s' "$value" | tr -d '[:space:]-'
      ;;
    *)
      printf '%s' "$value" | awk '{$1=$1; print}'
      ;;
  esac
}

# tokenize-value: take a single value (and optional category) and return the
# existing or newly-created token. Auto-detects category if not given.
# Uses canonicalized form as the lookup key so name variants share one token.
# Table schema (v0.5.5+):  token \t category \t canonical \t original
# Legacy 3-col rows (token \t category \t value) still readable.
#
# Usage:   cmd_tokenize_value [--category CAT] [--table PATH] VALUE
# Outputs: the token on stdout; appends a row to the table when a new token
#          is created.
# Exits:   1 (via die) on a missing VALUE/option argument or when VALUE or the
#          category contains a tab/newline (it could not be stored in the TSV
#          without corrupting it); 2 after printing usage for -h/--help.
cmd_tokenize_value() {
  local table="$DEFAULT_TABLE"
  local category=""
  local value=""
  while [ $# -gt 0 ]; do
    case "$1" in
      --category)
        [ $# -ge 2 ] || die "--category needs a value"
        category="$2"
        shift
        ;;
      --table)
        [ $# -ge 2 ] || die "--table needs a PATH"
        table="$2"
        shift
        ;;
      -h|--help)
        echo "usage: tokenize-value [--category CAT] VALUE" >&2
        exit 2
        ;;
      *)
        value="$1"
        ;;
    esac
    shift
  done
  [ -n "$value" ] || die "tokenize-value needs a VALUE"
  case "$value$category" in
    *$'\t'*|*$'\n'*)
      die "tokenize-value: VALUE and category must not contain tab or newline"
      ;;
  esac
  init_table "$table"

  # Auto-detect category from value shape if not explicitly forced.
  if [ -z "$category" ]; then
    category=$(detect_category "$value")
  fi

  # Canonicalize so different surface forms of the same PHI share one token.
  local canonical
  canonical=$(normalize_value "$value" "$category")
  [ -z "$canonical" ] && canonical="$value"

  # Look up (category, canonical). Column 3 in both new (4-col) and legacy
  # (3-col) rows is the lookup key, so this works for both.
  #
  # Two awk pitfalls are avoided here:
  #  * values are passed through ENVIRON, not -v, because -v applies escape
  #    processing and would alter keys containing backslashes;
  #  * both sides are forced to strings (x "") because awk compares
  #    numeric-looking fields numerically, which would make "0012345" and
  #    "12345" the same key.
  local tok
  tok=$(HL7S_CAT="$category" HL7S_KEY="$canonical" awk -F'\t' '
    BEGIN { cat = ENVIRON["HL7S_CAT"]; key = ENVIRON["HL7S_KEY"] }
    NR > 1 && ($2 "") == (cat "") && ($3 "") == (key "") { print $1; exit }
  ' "$table")
  if [ -n "$tok" ]; then
    printf '%s\n' "$tok"
    return 0
  fi

  # Create new — next number is one past the highest existing number for this
  # category (tokens end in "_NNNN]]").
  local nextnum
  nextnum=$(HL7S_CAT="$category" awk -F'\t' '
    BEGIN { cat = ENVIRON["HL7S_CAT"]; max = 0 }
    NR > 1 && ($2 "") == (cat "") && match($1, /_[0-9]+\]\]$/) {
      n = substr($1, RSTART + 1, RLENGTH - 3) + 0
      if (n > max) max = n
    }
    END { print max + 1 }
  ' "$table")
  tok=$(printf '[[%s_%04d]]' "$category" "$nextnum")
  # Write 4 columns. The restrictive umask is confined to a subshell so it
  # does not change the caller's umask.
  (
    umask 077
    printf '%s\t%s\t%s\t%s\n' "$tok" "$category" "$canonical" "$value" >> "$table"
  ) || die "cannot write lookup table: $table"
  chmod 600 "$table" 2>/dev/null || true
  printf '%s\n' "$tok"
}

# detokenize-value: reverse of tokenize-value (looks up by token, returns original).
# Usage:   cmd_detokenize_value [--table PATH] TOKEN
# Outputs: the original value for TOKEN on stdout.
# Returns: 2 (with a message on stderr) when the token is not in the table.
# Exits:   1 (via die) when TOKEN is missing or the table does not exist;
#          2 after printing usage for -h/--help.
cmd_detokenize_value() {
  local table="$DEFAULT_TABLE"
  local token=""
  while [ $# -gt 0 ]; do
    case "$1" in
      --table)
        [ $# -ge 2 ] || die "--table needs a PATH"
        table="$2"
        shift
        ;;
      -h|--help)
        echo "usage: detokenize-value TOKEN" >&2
        exit 2
        ;;
      *)
        token="$1"
        ;;
    esac
    shift
  done
  [ -n "$token" ] || die "detokenize-value needs a TOKEN"
  [ -f "$table" ] || die "no lookup table at $table"
  local val
  # Prefer column 4 (original surface form) for new rows; fall back to col 3
  # for legacy 3-col rows where col 3 IS the original value.
  # The token travels via ENVIRON (awk -v would apply escape processing) and
  # is compared as a string.
  val=$(HL7S_TOKEN="$token" awk -F'\t' '
    BEGIN { t = ENVIRON["HL7S_TOKEN"] }
    NR > 1 && ($1 "") == (t "") { print ($4 != "" ? $4 : $3); exit }
  ' "$table")
  if [ -z "$val" ]; then
    printf 'no such token: %s\n' "$token" >&2
    return 2
  fi
  printf '%s\n' "$val"
}

# init_table TABLE — make sure the lookup table and its directory exist.
# A directory created here gets mode 0700 and a new table gets mode 0600 with
# the 4-column header row. A pre-existing directory is only re-tightened when
# it is the script's own $TABLE_DIR, so `--table ./x.tsv` never changes the
# permissions of the user's working directory.
# Returns 1 (message on stderr) when the directory or file cannot be created.
init_table() {
  local table="$1"
  local dir
  dir=$(dirname -- "$table")
  if [ ! -d "$dir" ]; then
    # umask is scoped to the subshell so the caller's umask is untouched.
    if ! ( umask 077; mkdir -p -- "$dir" ) 2>/dev/null; then
      printf 'hl7-sanitize: cannot create directory %s\n' "$dir" >&2
      return 1
    fi
    chmod 700 -- "$dir" 2>/dev/null || true
  elif [ -n "${TABLE_DIR:-}" ] && [ "$dir" = "$TABLE_DIR" ]; then
    chmod 700 -- "$dir" 2>/dev/null || true
  fi
  if [ ! -f "$table" ]; then
    if ! ( umask 077; printf 'token\tcategory\tcanonical\toriginal\n' > "$table" ) 2>/dev/null; then
      printf 'hl7-sanitize: cannot create lookup table %s\n' "$table" >&2
      return 1
    fi
    chmod 600 -- "$table" 2>/dev/null || true
  fi
  return 0
}

# ─────────────────────────────────────────────────────────────────────────────
# Main sanitize logic — single awk pass over HL7 messages.
# ─────────────────────────────────────────────────────────────────────────────

# do_sanitize INPUT_FILE RULES TABLE STRICT UPDATE_TABLE
#   INPUT_FILE    path to read, or "" for stdin
#   RULES         rule text, one SEGMENT|FIELD|CATEGORY per line ('#' comments ok)
#   TABLE         lookup table path (created if missing)
#   STRICT        "1" → tokenize the whole body of Z* segments that have no rule
#   UPDATE_TABLE  "1" → append newly created tokens to TABLE
# Outputs: sanitized message on stdout, records terminated by CR; a one-line
#          summary on stderr.
# Returns: awk's exit status, or 1 if the temp file or table cannot be set up.
#
# Segments are recognised whether the input separates them with CR (HL7
# standard), LF, or CRLF: awk splits records on CR and every record is split
# again on LF, with the LF bytes written back unchanged. Without that second
# split an LF/CRLF file would pass through with its PHI untouched.
# Known limitation: awk cannot tell whether the final record ended with CR, so
# the output always ends with one.
# NOTE(review): with UPDATE_TABLE != "1", values not already in the table still
# receive freshly numbered tokens that are never stored; the usage text says
# "do not create new ones" — confirm which behavior is intended.
do_sanitize() {
  local input_file="$1"
  local rules="$2"
  local table="$3"
  local strict="$4"
  local update_table="$5"

  init_table "$table"
  if [ ! -f "$table" ]; then
    printf 'hl7-sanitize: lookup table unavailable: %s\n' "$table" >&2
    return 1
  fi

  # Write rules to a temp file because awk -v can't carry newlines on macOS
  local rules_tmp
  rules_tmp=$(mktemp) || {
    printf 'hl7-sanitize: mktemp failed\n' >&2
    return 1
  }
  printf '%s\n' "$rules" > "$rules_tmp"

  local awk_script
  awk_script=$(cat <<'AWK_END'
BEGIN {
  ORS = "\r"
  # During BEGIN, read auxiliary files with newline separator
  RS = "\n"
  n_total = 0
  n_new = 0
  while ((getline line < RULES_FILE) > 0) {
    gsub(/^[[:space:]]+|[[:space:]]+$/, "", line)
    if (line == "" || substr(line, 1, 1) == "#") continue
    split(line, p, "|")
    if (p[1] == "" || p[2] == "" || p[3] == "") continue
    rule_cat[p[1] SUBSEP (p[2] + 0)] = p[3]
    has_seg[p[1]] = 1
  }
  close(RULES_FILE)
  while ((getline tline < TABLE) > 0) {
    if (tline ~ /^token\t/) continue
    split(tline, c, "\t")
    if (c[1] == "" || c[3] == "") continue
    tkey = c[2] SUBSEP c[3]
    # Count distinct keys by hand; length(array) is not available in every awk.
    if (!(tkey in token_for)) n_total++
    token_for[tkey] = c[1]
    if (match(c[1], /_[0-9]+\]\]$/)) {
      num = substr(c[1], RSTART + 1, RLENGTH - 3) + 0
      if (num > counter[c[2]]) counter[c[2]] = num
    }
  }
  close(TABLE)
  # Switch to CR for the main input (HL7 segments)
  RS = "\r"
}

# tokenize(val, cat): existing or new token for (cat, val). Empty values and
# the HL7 explicit-null "" pass through untouched.
function tokenize(val, cat,    key, t) {
  if (val == "" || val == "\"\"") return val
  key = cat SUBSEP val
  if (key in token_for) return token_for[key]
  counter[cat]++
  t = sprintf("[[%s_%04d]]", cat, counter[cat])
  token_for[key] = t
  n_total++
  if (UPDATE_TABLE == "1") {
    new_entries[++n_new] = t "\t" cat "\t" val
  }
  return t
}

# sanitize_segment(s): return segment s with every rule-covered field replaced
# by its token. Anything shorter than a segment ID, and segments with no rules,
# come back unchanged (this is how framing bytes such as 0x1c pass through).
function sanitize_segment(s,    seg, body, nf, fields, j, fnum, rkey, out) {
  if (length(s) < 3) return s
  seg = substr(s, 1, 3)

  # Strict mode: any Z segment we have no rules for → tokenize the whole body
  if (STRICT == "1" && substr(seg, 1, 1) == "Z" && !(seg in has_seg)) {
    body = substr(s, 5)   # skip "Zxx|"
    if (body == "") return s
    return seg "|" tokenize(body, "Z" substr(seg, 2, 2))
  }

  if (!(seg in has_seg)) return s

  nf = split(s, fields, "|")
  # For MSH, fields[1] is "MSH", fields[2] is the encoding chars (MSH.2).
  #   MSH.N → fields[N] for N >= 2; MSH.1 is the separator char (skip).
  # For others, SEG.N → fields[N+1].
  # Fields are visited left to right so new token numbers are assigned in a
  # fixed order; fields[1] (the segment ID) is never a candidate.
  for (j = 2; j <= nf; j++) {
    fnum = (seg == "MSH") ? j : (j - 1)
    rkey = seg SUBSEP fnum
    if (rkey in rule_cat) fields[j] = tokenize(fields[j], rule_cat[rkey])
  }
  out = fields[1]
  for (j = 2; j <= nf; j++) out = out "|" fields[j]
  return out
}

{
  # One CR-terminated record; it may still hold several LF-separated segments
  # (LF-only files) or start with the LF of a CRLF pair.
  n_parts = split($0, part, "\n")
  rec = ""
  for (i = 1; i <= n_parts; i++) {
    if (i > 1) rec = rec "\n"
    rec = rec sanitize_segment(part[i])
  }
  print rec
}

END {
  if (UPDATE_TABLE == "1" && n_new > 0) {
    # Use explicit \n — ORS is \r for the main HL7 input loop, not for the table.
    for (i = 1; i <= n_new; i++) printf "%s\n", new_entries[i] >> TABLE
    close(TABLE)
  }
  printf "hl7-sanitize: %d new token(s) created; %d total in %s\n", \
    n_new, n_total, TABLE > "/dev/stderr"
}
AWK_END
)

  local rc=0
  awk -v RULES_FILE="$rules_tmp" -v TABLE="$table" -v STRICT="$strict" \
      -v UPDATE_TABLE="$update_table" \
      "$awk_script" "${input_file:-/dev/stdin}" || rc=$?
  # Explicit cleanup instead of a RETURN trap, which would stay installed and
  # fire again on later function returns.
  rm -f -- "$rules_tmp"
  return "$rc"
}

# ─────────────────────────────────────────────────────────────────────────────
# Dispatch
# ─────────────────────────────────────────────────────────────────────────────

SUB="${1:-}"

# v0.8.26: route the whole dispatch's stdout through the tty-gated sanitizer in
# a brace group, then propagate ${PIPESTATUS[0]} so each subcommand's exit code
# survives the pipe. On a pipe/redirect the gate is a no-op (raw passthrough).
# clear-table's confirmation read uses /dev/tty directly, so the pipeline
# subshell does not interfere with it.

# _hl7_sanitize_usage — print this script's leading comment header (every '#'
# line after the shebang, stopping at the first non-comment line). Unlike a
# fixed line range, this keeps working when the header grows or shrinks.
_hl7_sanitize_usage() {
  awk 'NR == 1 { next } /^#/ { print; next } { exit }' "$NC_SELF"
}

{
  case "$SUB" in
    show-rules)       shift; cmd_show_rules ;;
    show-table)       shift; cmd_show_table "$@" ;;
    clear-table)      shift; cmd_clear_table "$@" ;;
    count)            shift; cmd_count "$@" ;;
    tokenize-value)   shift; cmd_tokenize_value "$@" ;;
    detokenize-value) shift; cmd_detokenize_value "$@" ;;
    normalize-value)
      # v0.7.3: normalize-value VALUE [CATEGORY] — emit canonical form without
      # tokenizing or touching the table. Used by larry.sh's v0.7.3 auto-PHI
      # detection to build per-session memory keys so "John Smith" and
      # "JOHN SMITH" share one accept/decline decision.
      # (Originally added in af2ffe8 — reverted with v0.7.1; re-added per
      # v0.7.3 auto-PHI spec.)
      shift
      nv_val="${1:-}"
      nv_cat="${2:-}"
      [ -n "$nv_val" ] || die "normalize-value needs a VALUE"
      [ -n "$nv_cat" ] || nv_cat=$(detect_category "$nv_val")
      normalize_value "$nv_val" "$nv_cat"
      printf '\n'
      ;;
    lookup-original)
      # v0.7.3: lookup-original VALUE CATEGORY — return token if value is
      # already present in the lookup table (Tier-4 "already-known" check).
      # Empty stdout + exit 0 if not found; non-empty token + exit 0 if found.
      # Only the default table is consulted.
      shift
      lo_val="${1:-}"
      lo_cat="${2:-}"
      [ -n "$lo_val" ] || die "lookup-original needs a VALUE"
      [ -n "$lo_cat" ] || lo_cat=$(detect_category "$lo_val")
      lo_canonical=$(normalize_value "$lo_val" "$lo_cat")
      [ -z "$lo_canonical" ] && lo_canonical="$lo_val"
      [ -f "$DEFAULT_TABLE" ] || { printf ''; exit 0; }
      # Values go through ENVIRON (awk -v would apply escape processing to
      # backslashes) and are compared as strings (x "") so numeric-looking
      # keys such as "0012345" and "12345" are not treated as equal.
      HL7S_CAT="$lo_cat" HL7S_KEY="$lo_canonical" HL7S_ORIG="$lo_val" awk -F'\t' '
        BEGIN {
          cat = ENVIRON["HL7S_CAT"]
          key = ENVIRON["HL7S_KEY"]
          orig = ENVIRON["HL7S_ORIG"]
        }
        NR > 1 && ($2 "") == (cat "") && (($3 "") == (key "") || ($4 "") == (orig "")) {
          print $1
          exit
        }
      ' "$DEFAULT_TABLE"
      ;;
    -h|--help)
      _hl7_sanitize_usage
      exit 0
      ;;
    *)
      # Default = sanitize mode
      input_file=""
      rules="$PHI_RULES_DEFAULT"
      table="$DEFAULT_TABLE"
      strict=0
      update_table=1
      while [ $# -gt 0 ]; do
        case "$1" in
          --strict)          strict=1 ;;
          --no-update-table) update_table=0 ;;
          --table)
            [ $# -ge 2 ] && [ -n "$2" ] || die "--table needs a PATH"
            table="$2"
            shift
            ;;
          --rules-file)
            [ $# -ge 2 ] || die "--rules-file needs a PATH"
            [ -f "$2" ] || die "no such rules file: $2"
            rules=$(cat -- "$2") || die "cannot read rules file: $2"
            shift
            ;;
          -h|--help)
            _hl7_sanitize_usage
            exit 0
            ;;
          -*) die "unknown flag: $1" ;;
          *)  input_file="$1" ;;
        esac
        shift
      done
      # Fail before the lookup table is created or touched.
      if [ -n "$input_file" ] && [ ! -r "$input_file" ]; then
        die "cannot read input file: $input_file"
      fi
      do_sanitize "$input_file" "$rules" "$table" "$strict" "$update_table"
      ;;
  esac
} | _sanitize_ctl_tty
exit "${PIPESTATUS[0]}"