#!/usr/bin/env bash
# hl7-sanitize.sh — tokenize PHI fields in HL7 v2 messages.
#
# Replaces PHI-likely fields with deterministic local tokens like [[MRN_0001]],
# [[NAME_0001]], etc. Same value → same token across messages, so analysis
# downstream can still correlate patients, find duplicates, etc.
#
# The token ↔ original mapping is stored LOCAL ONLY at:
#   $LARRY_HOME/sanitize/lookup.tsv   (mode 0600)
#
# Use hl7-desanitize.sh to reverse the operation when viewing results locally.
#
# Usage:
#   hl7-sanitize.sh [FILE]                 # sanitize file (or stdin); writes lookup
#   hl7-sanitize.sh --strict [FILE]        # also tokenize unrecognized Z* segments wholesale
#   hl7-sanitize.sh --no-update-table      # use existing tokens; do not create new ones
#   hl7-sanitize.sh --table PATH           # use a different lookup table
#   hl7-sanitize.sh --rules-file PATH      # override the default PHI rules
#
# PHI rule format (one per line):
#   SEGMENT|FIELD_NUM|CATEGORY
#   e.g.  PID|3|MRN
#
# Subcommands (when first arg is one of these):
#   show-rules        print the default PHI-field rules
#   show-table        print the current lookup table (PHI in clear — local only)
#   clear-table       wipe the table (asks for confirmation)
#   count             number of token entries in the table
set -o pipefail

# Locate this script so that libraries shipped next to it can be sourced.
NC_SELF="$0"
NC_LIB_DIR="$(cd "$(dirname "$NC_SELF")" && pwd)"

# Lookup-table location. LARRY_HOME may be supplied by the environment;
# it falls back to ~/.larry when unset or empty.
: "${LARRY_HOME:=$HOME/.larry}"
TABLE_DIR="${LARRY_HOME}/sanitize"
DEFAULT_TABLE="${TABLE_DIR}/lookup.tsv"

# v0.8.26: shared control-byte sanitizer. Sanitized HL7 and the show-table dump
# echo message/field bytes that can include C0 controls (and 0x1c framing on
# raw input) that corrupt a terminal when viewed un-redirected. Per
# lib/cygwin-safe.sh these are stripped ONLY when stdout is a tty; on a
# pipe/redirect (feeding analysis tooling) the bytes pass through unchanged.
if [ ! -r "${NC_LIB_DIR}/cygwin-safe.sh" ]; then
  # Degrade safely: raw passthrough when the library is missing.
  _sanitize_ctl_tty() {
    cat
  }
else
  # shellcheck disable=SC1090,SC1091
  . "${NC_LIB_DIR}/cygwin-safe.sh"
fi

# die MESSAGE... — report a fatal error on stderr (prefixed with the tool
# name) and terminate the current shell with exit status 1.
die() {
  printf 'hl7-sanitize: %s\n' "$*" >&2
  exit 1
}

# ─────────────────────────────────────────────────────────────────────────────
# Default PHI rules — the built-in set, replaceable with --rules-file.
# One rule per line: SEGMENT|FIELD|CATEGORY. Lines starting with '#' are
# comments. The command substitution drops the heredoc's trailing newline, so
# the value ends with the last rule.
# ─────────────────────────────────────────────────────────────────────────────
PHI_RULES_DEFAULT=$(cat <<'PHI_RULES_END'
# Patient identification
PID|2|ALTID
PID|3|MRN
PID|4|ALTID
PID|5|NAME
PID|6|NAME
PID|7|DOB
PID|8|SEX
PID|9|NAME
PID|11|ADDR
PID|12|REGION
PID|13|PHONE
PID|14|PHONE
PID|18|ACCT
PID|19|SSN
PID|20|LIC
PID|21|MRN
PID|29|DOD
PID|30|DEATHFLAG
# Patient visit
PV1|7|PROV
PV1|8|PROV
PV1|9|PROV
PV1|17|PROV
PV1|19|VISIT
PV1|50|VISIT
PV1|52|PROV
# Next of kin
NK1|2|NAME
NK1|3|RELATIONSHIP
NK1|4|ADDR
NK1|5|PHONE
NK1|6|PHONE
NK1|16|DOB
# Guarantor
GT1|3|NAME
GT1|4|NAME
GT1|5|ADDR
GT1|6|PHONE
GT1|7|PHONE
GT1|11|DOB
GT1|12|SSN
GT1|19|EMP
# Insurance
IN1|16|NAME
IN1|17|DOB
IN1|18|NAME
IN1|19|ADDR
IN1|20|SSN
IN1|36|INSPOL
IN1|49|INSID
# Observation / orders
OBR|10|PROV
OBR|16|PROV
OBR|32|PROV
OBX|16|PROV
DG1|3|DIAG
DG1|4|DIAG
# Order Common
ORC|10|PROV
ORC|12|PROV
PHI_RULES_END
)

# cmd_show_rules — write the built-in PHI rule set to stdout, followed by a
# single newline.
cmd_show_rules() {
  printf '%s\n' "$PHI_RULES_DEFAULT"
}

# cmd_show_table [TABLE] — dump the lookup table verbatim to stdout.
# TABLE defaults to $DEFAULT_TABLE. When the file does not exist a notice is
# printed on stdout instead and the status is still 0.
cmd_show_table() {
  local tsv="${1:-$DEFAULT_TABLE}"

  if [ ! -f "$tsv" ]; then
    echo "no table at $tsv"
    return 0
  fi

  cat "$tsv"
}

# cmd_clear_table [TABLE] [--yes] — reset the lookup table to a header-only file.
#   TABLE  path of the table (default: $DEFAULT_TABLE)
#   --yes  as the SECOND argument, skips the interactive confirmation
# Without --yes the answer is read from /dev/tty; anything but y/Y aborts.
# Returns 0 on success or when no table exists, 1 when aborted or when the
# table cannot be rewritten.
cmd_clear_table() {
  local table="${1:-$DEFAULT_TABLE}"
  local yes=0
  local ans=""   # kept local so the prompt answer does not leak into the caller

  [ "${2:-}" = "--yes" ] && yes=1
  [ -f "$table" ] || { echo "no table at $table"; return 0; }

  if [ "$yes" != "1" ]; then
    printf 'clear lookup table at %s? [y/N]: ' "$table"
    read -r ans </dev/tty || ans=""
    # v0.7.5: strip CR so `Y\r` from a Cygwin pty matches `^[Yy]$`.
    ans="${ans//$'\r'/}"
    [[ "$ans" =~ ^[Yy]$ ]] || { echo "aborted"; return 1; }
  fi

  # Write the same 4-column header init_table creates, so a cleared table and
  # a freshly initialised one are indistinguishable. The subshell keeps the
  # tightened umask from leaking into the calling shell.
  if ! ( umask 077; printf 'token\tcategory\tcanonical\toriginal\n' > "$table" ); then
    printf 'hl7-sanitize: cannot rewrite %s\n' "$table" >&2
    return 1
  fi
  chmod 600 "$table"
  echo "cleared $table"
}

# cmd_count [TABLE] — print the number of token entries in the lookup table.
# TABLE defaults to $DEFAULT_TABLE. The count is the table's line count minus
# the header line; a missing, empty or header-less file reports 0 (never a
# negative number).
cmd_count() {
  local table="${1:-$DEFAULT_TABLE}"
  [ -f "$table" ] || { echo 0; return 0; }

  # v0.7.5: strip non-digits at the wc boundary — Cygwin wc.exe CR-taint
  # defense for the arithmetic below.
  local _n
  _n=$(wc -l < "$table" | tr -cd '0-9')

  local entries=$(( ${_n:-0} - 1 ))
  # A zero-line file has no header to subtract; clamp instead of printing -1.
  if [ "$entries" -lt 0 ]; then
    entries=0
  fi
  echo "$entries"
}

# ─────────────────────────────────────────────────────────────────────────────
# Auto-category detection + value canonicalization.
# Canonicalization makes "SMITH^JOHN", "Smith, John", "John Smith", and
# "JOHN SMITH" all collapse to the same lookup key so they share one token.
# ─────────────────────────────────────────────────────────────────────────────

# detect_category VALUE — guess a category from the shape of VALUE.
# Prints one of MRN | SSN | DOB | NAME | MANUAL (no trailing newline).
# Checks run in this order and the first hit wins, so an undashed 9-digit
# value is reported as MRN, not SSN:
#   1. 4-15 digits only                      → MRN
#   2. 9 digits with optional dashes         → SSN
#   3. N/N/N or N-N-N date-like              → DOB
#   4. contains '^' or ','                   → NAME
#   5. two or more purely alphabetic words   → NAME
#   6. anything else                         → MANUAL
detect_category() {
  local candidate="$1"
  local re_digits='^[0-9]+$'
  local re_ssn='^[0-9]{3}-?[0-9]{2}-?[0-9]{4}$'
  local re_date='^[0-9]{1,4}[/-][0-9]{1,2}[/-][0-9]{1,4}$'
  local alpha_words

  # Drop leading, then trailing, whitespace.
  candidate="${candidate#"${candidate%%[![:space:]]*}"}"
  candidate="${candidate%"${candidate##*[![:space:]]}"}"

  if [[ "$candidate" =~ $re_digits ]] && (( ${#candidate} >= 4 && ${#candidate} <= 15 )); then
    printf 'MRN'
  elif [[ "$candidate" =~ $re_ssn ]]; then
    printf 'SSN'
  elif [[ "$candidate" =~ $re_date ]]; then
    printf 'DOB'
  elif [[ "$candidate" == *"^"* || "$candidate" == *","* ]]; then
    printf 'NAME'
  else
    # Count whitespace-separated words made of ASCII letters only.
    alpha_words=$(printf '%s\n' "$candidate" | awk '{ c=0; for(i=1;i<=NF;i++) if ($i ~ /^[A-Za-z]+$/) c++; print c }')
    if [ "${alpha_words:-0}" -ge 2 ]; then
      printf 'NAME'
    else
      printf 'MANUAL'
    fi
  fi
}

# normalize_value VALUE CATEGORY — print the canonical lookup key for VALUE
# (no trailing newline except where noted).
#   NAME   lowercase, punctuation → spaces, unique alphabetic-leading words
#          sorted and joined by one space. So "SMITH^JOHN", "Smith, John",
#          "John Smith", "JOHN SMITH" → "john smith".
#   DOB    YYYY-MM-DD when date(1) can parse it (GNU `date -d`, then two BSD
#          `date -j -f` formats); otherwise VALUE with all whitespace removed.
#   SSN    whitespace and dashes removed.
#   PHONE  digits only.
#   EMAIL  lowercased, whitespace-trimmed (newline-terminated by awk).
#   NPI    whitespace and dashes removed.
#   other  (MRN, MANUAL, ...) outer whitespace trimmed, inner runs collapsed
#          (newline-terminated by awk).
# May print nothing (e.g. a NAME with no alphabetic word); callers fall back
# to the raw value in that case.
normalize_value() {
  local value="$1"
  local category="$2"

  case "$category" in
    NAME)
      printf '%s' "$value" \
        | tr '[:upper:]' '[:lower:]' \
        | tr ',^/' ' ' \
        | tr -cs '[:alnum:]' ' ' \
        | tr ' ' '\n' \
        | grep -E '^[a-z][a-z0-9]*$' \
        | sort -u \
        | tr '\n' ' ' \
        | sed 's/ *$//'
      ;;
    DOB)
      local parsed=""
      local compact
      compact=$(printf '%s' "$value" | tr -d '[:space:]')
      # Only digit-bearing text is offered to date(1). GNU `date -d` resolves
      # an empty string and words such as "today" relative to the current
      # day, which would make the canonical key change from one day to the
      # next. NOTE(review): digit-bearing relative forms ("1 day ago") are
      # still accepted by GNU date.
      if [[ "$compact" == *[0-9]* ]]; then
        parsed=$(date -d "$value" +%Y-%m-%d 2>/dev/null) \
          || parsed=$(date -j -f "%m/%d/%Y" "$value" +%Y-%m-%d 2>/dev/null) \
          || parsed=$(date -j -f "%Y-%m-%d" "$value" +%Y-%m-%d 2>/dev/null) \
          || parsed=""
      fi
      if [ -n "$parsed" ]; then
        printf '%s' "$parsed"
      else
        printf '%s' "$compact"
      fi
      ;;
    SSN)
      # Note: '-' must come at the END of the set so BSD tr doesn't treat
      # it as an option flag. Same caveat applies for any future tr -d call.
      printf '%s' "$value" | tr -d '[:space:]-'
      ;;
    PHONE)
      # v0.7.3: digits-only canonical so "(555) 123-4567", "5551234567",
      # "555.123.4567", "555-123-4567" all collapse to one token.
      # (Originally added in af2ffe8 — reverted with v0.7.1; re-added per
      # v0.7.3 auto-PHI spec.)
      printf '%s' "$value" | tr -cd '[:digit:]'
      ;;
    EMAIL)
      # v0.7.3: lowercase + outer-trim. RFC 5321 says the local-part is
      # technically case-sensitive but in practice every mainstream provider
      # treats it case-insensitively, so collapsing for PHI purposes is safe.
      # (Originally added in af2ffe8 — reverted with v0.7.1; re-added per
      # v0.7.3 auto-PHI spec.)
      printf '%s' "$value" | tr '[:upper:]' '[:lower:]' | awk '{$1=$1; print}'
      ;;
    NPI)
      # v0.7.3: National Provider Identifier — 10 digits. Strip whitespace
      # and dashes to reach the canonical 10-digit form.
      printf '%s' "$value" | tr -d '[:space:]-'
      ;;
    *)
      printf '%s' "$value" | awk '{$1=$1; print}'
      ;;
  esac
}

# tokenize-value: take a single value (and optional category) and print the
# existing or newly-created token. Auto-detects the category if not given and
# uses the canonicalized form as the lookup key so name variants share a token.
#   Usage: cmd_tokenize_value [--category CAT] [--table PATH] VALUE
# Table schema (v0.5.5+): token \t category \t canonical \t original
# Legacy 3-col rows (token \t category \t value) still readable.
# Exits 1 (via die) when VALUE is missing; exits 2 after printing usage.
cmd_tokenize_value() {
  local table="$DEFAULT_TABLE"
  local category=""
  local value=""

  while [ $# -gt 0 ]; do
    case "$1" in
      --category) shift; category="$1" ;;
      --table) shift; table="$1" ;;
      -h|--help) echo "usage: tokenize-value [--category CAT] VALUE" >&2; exit 2 ;;
      *) value="$1" ;;
    esac
    shift
  done
  [ -n "$value" ] || die "tokenize-value needs a VALUE"

  init_table "$table"

  # Auto-detect category from value shape if not explicitly forced.
  if [ -z "$category" ]; then
    category=$(detect_category "$value")
  fi

  # Canonicalize so different surface forms of the same PHI share one token.
  local canonical
  canonical=$(normalize_value "$value" "$category")
  [ -z "$canonical" ] && canonical="$value"

  # Look up (category, canonical). Column 3 in both new (4-col) and legacy
  # (3-col) rows is the lookup key, so this works for both.
  #
  # The key travels through the environment, not `awk -v`: -v applies escape
  # processing, so a value containing a backslash (HL7 escapes such as \T\)
  # would never match its own stored row. Appending "" forces a STRING
  # comparison; otherwise awk compares numeric-looking operands numerically
  # and "00123" would match "123".
  local tok
  tok=$(HL7S_CAT="$category" HL7S_KEY="$canonical" awk -F'\t' '
    NR > 1 && ($2 "") == (ENVIRON["HL7S_CAT"] "") && ($3 "") == (ENVIRON["HL7S_KEY"] "") {
      print $1
      exit
    }
  ' "$table")
  if [ -n "$tok" ]; then
    printf '%s\n' "$tok"
    return 0
  fi

  # Create new — next number is one past the highest existing token number
  # for this category.
  local nextnum
  nextnum=$(HL7S_CAT="$category" awk -F'\t' '
    NR > 1 && ($2 "") == (ENVIRON["HL7S_CAT"] "") && match($1, /_[0-9]+\]\]$/) {
      n = substr($1, RSTART+1, RLENGTH-3) + 0
      if (n > max) max = n
    }
    END { print max+1 }
  ' "$table")
  printf -v tok '[[%s_%04d]]' "$category" "$nextnum"

  # Write 4 columns; keep the table private.
  umask 077
  printf '%s\t%s\t%s\t%s\n' "$tok" "$category" "$canonical" "$value" >> "$table"
  chmod 600 "$table" 2>/dev/null || true
  printf '%s\n' "$tok"
}

# detokenize-value: reverse of tokenize-value — look a token up and print the
# value stored for it.
#   Usage: cmd_detokenize_value [--table PATH] TOKEN
# Prints column 4 (original surface form) when present, otherwise column 3
# (legacy 3-col rows, where column 3 IS the original value).
# Returns 2 with a message on stderr when the token is unknown; exits 1 (via
# die) when TOKEN is missing or the table file does not exist.
cmd_detokenize_value() {
  local table="$DEFAULT_TABLE" token="" val

  while [ $# -gt 0 ]; do
    case "$1" in
      --table)
        shift
        table="$1"
        ;;
      -h|--help)
        echo "usage: detokenize-value TOKEN" >&2
        exit 2
        ;;
      *)
        token="$1"
        ;;
    esac
    shift
  done

  [ -n "$token" ] || die "detokenize-value needs a TOKEN"
  [ -f "$table" ] || die "no lookup table at $table"

  val=$(awk -F'\t' -v t="$token" 'NR>1 && $1==t { print ($4 != "" ? $4 : $3); exit }' "$table")

  if [ -n "$val" ]; then
    printf '%s\n' "$val"
  else
    printf 'no such token: %s\n' "$token" >&2
    return 2
  fi
}

# init_table TABLE — make sure the lookup table exists.
# Creates TABLE's parent directory (mode 700, best effort) and, when TABLE is
# not yet a regular file, writes the 4-column header and sets mode 600.
# An existing table is left untouched.
init_table() {
  local table="$1"
  local parent
  parent="$(dirname "$table")"

  mkdir -p "$parent" 2>/dev/null
  chmod 700 "$parent" 2>/dev/null || true

  # Nothing more to do when the table is already there.
  [ -f "$table" ] && return 0

  umask 077
  printf 'token\tcategory\tcanonical\toriginal\n' > "$table"
  chmod 600 "$table"
}

# ─────────────────────────────────────────────────────────────────────────────
# Main sanitize logic — single awk pass over HL7 messages.
# ─────────────────────────────────────────────────────────────────────────────

# _normalise_to_cr — stdin → stdout filter that rewrites every segment
# terminator as a bare CR: CRLF → CR, lone LF → CR, existing CR kept as is.
# Output always ends with a CR. It is correct for LF, CRLF, CR-only and mixed
# input, so it is applied unconditionally and no "is this already CR?" probe
# (grep -P, python3) is needed.
_normalise_to_cr() {
  awk 'BEGIN { RS = "\n"; ORS = "\r" } { sub(/\r$/, ""); print }'
}

# do_sanitize INPUT_FILE RULES TABLE STRICT UPDATE_TABLE
#   INPUT_FILE    file to sanitize; empty string means read stdin
#   RULES         rule text, one SEGMENT|FIELD|CATEGORY per line, '#' comments
#   TABLE         lookup-table path (created via init_table when missing)
#   STRICT        "1" → a Z* segment with no rules has its whole body tokenized
#   UPDATE_TABLE  "1" → newly created tokens are appended to TABLE
# Writes the sanitized message to stdout with CR segment terminators and a
# one-line summary to stderr. The lookup key is the raw field value; rows
# appended here are 3-column (token, category, value).
# Returns 1 when INPUT_FILE is given but unreadable, otherwise the pipeline
# status.
#
# Rules and table path reach awk through the environment: `awk -v` applies
# escape processing (it would mangle backslashes in a path) and cannot carry
# newlines on macOS. This also means no temp file needs creating or cleaning.
do_sanitize() {
  local input_file="$1"
  local rules="$2"
  local table="$3"
  local strict="$4"
  local update_table="$5"

  if [ -n "$input_file" ] && [ ! -r "$input_file" ]; then
    printf 'hl7-sanitize: cannot read input file: %s\n' "$input_file" >&2
    return 1
  fi

  init_table "$table"

  local awk_script
  awk_script=$(cat <<'AWK_END'
BEGIN {
  ORS = "\r"
  TABLE = ENVIRON["HL7S_TABLE"]

  # Parse the rule text: SEGMENT|FIELD|CATEGORY, blank and '#' lines skipped.
  n_rules = split(ENVIRON["HL7S_RULES"], rule_line, "\n")
  for (r = 1; r <= n_rules; r++) {
    line = rule_line[r]
    gsub(/^[[:space:]]+|[[:space:]]+$/, "", line)
    if (line == "" || substr(line, 1, 1) == "#") continue
    split(line, p, "|")
    if (p[1] == "" || p[2] == "" || p[3] == "") continue
    rule_cat[p[1] SUBSEP (p[2] + 0)] = p[3]
    has_seg[p[1]] = 1
  }

  # Load existing tokens; the table is newline-separated.
  RS = "\n"
  while ((getline tline < TABLE) > 0) {
    if (tline ~ /^token\t/) continue
    split(tline, c, "\t")
    if (c[1] == "" || c[3] == "") continue
    key = c[2] SUBSEP c[3]
    if (!(key in token_for)) n_total++
    token_for[key] = c[1]
    if (match(c[1], /_[0-9]+\]\]$/)) {
      num = substr(c[1], RSTART + 1, RLENGTH - 3) + 0
      if (num > counter[c[2]]) counter[c[2]] = num
    }
  }
  close(TABLE)

  # Switch to CR for the main input (HL7 segments).
  RS = "\r"
}

# Return the token for (cat, val), creating one when unseen. Empty and
# HL7-null ("") values are returned unchanged.
function tokenize(val, cat,    key, t) {
  if (val == "" || val == "\"\"") return val
  key = cat SUBSEP val
  if (key in token_for) return token_for[key]
  counter[cat]++
  t = sprintf("[[%s_%04d]]", cat, counter[cat])
  token_for[key] = t
  n_total++
  if (UPDATE_TABLE == "1") {
    new_entries[++n_new] = t "\t" cat "\t" val
  }
  return t
}

{
  # Each record is one segment. Records shorter than a segment id — such as
  # a lone 0x1c message separator — are printed through unchanged.
  if (length($0) < 3) { print; next }
  seg = substr($0, 1, 3)

  # Strict mode: a Z segment we have no rules for → tokenize the whole body.
  if (STRICT == "1" && substr(seg, 1, 1) == "Z" && !(seg in has_seg)) {
    body = substr($0, 5)   # skip "Zxx|"
    if (body != "") {
      tok = tokenize(body, "Z" substr(seg, 2, 2))
      print seg "|" tok
    } else {
      print
    }
    next
  }

  if (!(seg in has_seg)) { print; next }

  is_msh = (seg == "MSH")
  nf = split($0, fields, "|")

  # For MSH, fields[1] is "MSH" and fields[2] is the encoding chars (MSH.2),
  # so MSH.N → fields[N]. For every other segment SEG.N → fields[N+1].
  for (k in rule_cat) {
    split(k, kp, SUBSEP)
    if (kp[1] != seg) continue
    fnum = kp[2] + 0
    cat = rule_cat[k]
    idx = is_msh ? fnum : (fnum + 1)
    if (idx < 1 || idx > nf) continue
    fields[idx] = tokenize(fields[idx], cat)
  }

  line = fields[1]
  for (j = 2; j <= nf; j++) line = line "|" fields[j]
  print line
}

END {
  if (UPDATE_TABLE == "1" && n_new > 0) {
    # Explicit \n — ORS is \r for the HL7 output, not for the table.
    for (i = 1; i <= n_new; i++) printf "%s\n", new_entries[i] >> TABLE
    close(TABLE)
  }
  # n_total is a running count: length(array) is not available in every awk.
  printf "hl7-sanitize: %d new token(s) created; %d total in %s\n", \
    n_new, n_total, TABLE > "/dev/stderr"
}
AWK_END
)

  # F-5: the awk pass splits on CR (the Cloverleaf wire format). Normalising
  # every input first guarantees LF and CRLF files are split into segments
  # too, so their PHI fields cannot slip through as cleartext.
  if [ -n "$input_file" ]; then
    _normalise_to_cr < "$input_file"
  else
    _normalise_to_cr
  fi \
    | HL7S_RULES="$rules" HL7S_TABLE="$table" \
      awk -v STRICT="$strict" -v UPDATE_TABLE="$update_table" "$awk_script"
}

# ─────────────────────────────────────────────────────────────────────────────
# Dispatch
# ─────────────────────────────────────────────────────────────────────────────

# _print_usage — print this script's leading comment block (everything after
# the shebang up to the first non-comment line). Driven by content, not fixed
# line numbers, so the first code line is never printed as part of the help.
_print_usage() {
  awk 'NR == 1 { next } !/^#/ { exit } { print }' "$NC_SELF"
}

SUB="${1:-}"

# v0.8.26: route the whole dispatch's stdout through the tty-gated sanitizer in
# a brace group, then propagate ${PIPESTATUS[0]} so each subcommand's exit code
# survives the pipe. clear-table's confirmation read uses /dev/tty directly, so
# the pipeline subshell does not interfere with it.
{
  case "$SUB" in
    show-rules) shift; cmd_show_rules ;;
    show-table) shift; cmd_show_table "$@" ;;
    clear-table) shift; cmd_clear_table "$@" ;;
    count) shift; cmd_count "$@" ;;
    tokenize-value) shift; cmd_tokenize_value "$@" ;;
    detokenize-value) shift; cmd_detokenize_value "$@" ;;
    normalize-value)
      # v0.7.3: normalize-value VALUE [CATEGORY] — emit canonical form without
      # tokenizing or touching the table. Used by larry.sh's v0.7.3 auto-PHI
      # detection to build per-session memory keys so "John Smith" and
      # "JOHN SMITH" share one accept/decline decision.
      # (Originally added in af2ffe8 — reverted with v0.7.1; re-added per
      # v0.7.3 auto-PHI spec.)
      shift
      nv_val="${1:-}"; nv_cat="${2:-}"
      [ -n "$nv_val" ] || die "normalize-value needs a VALUE"
      [ -n "$nv_cat" ] || nv_cat=$(detect_category "$nv_val")
      normalize_value "$nv_val" "$nv_cat"
      printf '\n'
      ;;
    lookup-original)
      # v0.7.3: lookup-original VALUE CATEGORY — return token if value is
      # already present in the lookup table (Tier-4 "already-known" check).
      # Empty stdout + exit 0 if not found; non-empty token + exit 0 if found.
      shift
      lo_val="${1:-}"; lo_cat="${2:-}"
      [ -n "$lo_val" ] || die "lookup-original needs a VALUE"
      [ -n "$lo_cat" ] || lo_cat=$(detect_category "$lo_val")
      lo_canonical=$(normalize_value "$lo_val" "$lo_cat")
      [ -z "$lo_canonical" ] && lo_canonical="$lo_val"
      [ -f "$DEFAULT_TABLE" ] || { printf ''; exit 0; }
      # Keys go through the environment (awk -v would escape-process
      # backslashes) and are compared as strings (appending "" stops awk
      # treating "00123" and "123" as equal numbers).
      HL7S_CAT="$lo_cat" HL7S_KEY="$lo_canonical" HL7S_ORIG="$lo_val" awk -F'\t' '
        NR > 1 && ($2 "") == (ENVIRON["HL7S_CAT"] "") \
          && (($3 "") == (ENVIRON["HL7S_KEY"] "") || ($4 "") == (ENVIRON["HL7S_ORIG"] "")) {
          print $1
          exit
        }
      ' "$DEFAULT_TABLE"
      ;;
    -h|--help) _print_usage; exit 0 ;;
    *)
      # Default = sanitize mode
      input_file=""
      rules="$PHI_RULES_DEFAULT"
      table="$DEFAULT_TABLE"
      strict=0
      update_table=1
      while [ $# -gt 0 ]; do
        case "$1" in
          --strict) strict=1 ;;
          --no-update-table) update_table=0 ;;
          --table)
            [ $# -ge 2 ] || die "--table needs a PATH"
            shift; table="$1"
            ;;
          --rules-file)
            [ $# -ge 2 ] || die "--rules-file needs a PATH"
            shift
            [ -f "$1" ] || die "no such rules file: $1"
            rules=$(cat "$1")
            ;;
          -h|--help) _print_usage; exit 0 ;;
          -*) die "unknown flag: $1" ;;
          *) input_file="$1" ;;
        esac
        shift
      done
      do_sanitize "$input_file" "$rules" "$table" "$strict" "$update_table"
      ;;
  esac
} | _sanitize_ctl_tty

exit "${PIPESTATUS[0]}"