diff --git a/VERSION b/VERSION index 7486fdb..f38fc53 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.7.2 +0.7.3 diff --git a/larry.sh b/larry.sh index 2ac9162..10e9073 100755 --- a/larry.sh +++ b/larry.sh @@ -54,7 +54,7 @@ set -o pipefail # ───────────────────────────────────────────────────────────────────────────── # Config # ───────────────────────────────────────────────────────────────────────────── -LARRY_VERSION="0.7.2" +LARRY_VERSION="0.7.3" LARRY_HOME="${LARRY_HOME:-$HOME/.larry}" # ───────────────────────────────────────────────────────────────────────────── @@ -980,6 +980,597 @@ preprocess_phi_markers() { printf '%s' "$input" } +# ───────────────────────────────────────────────────────────────────────────── +# v0.7.3 — Automatic PHI detection (supersedes the af2ffe8 prototype). +# +# Background +# ---------- +# Bryan's directive: "Err on the side of caution and tokenize anything you +# think you may need to as long as it doesn't break the tools." Priorities, +# in order: +# 1. DO NOT break tools (constraint) +# 2. Catch all PHI (goal) +# 3. Minimize false positives (nice-to-have, secondary) +# +# Reference implementation: commit af2ffe8 (reverted with v0.7.1) — Bryan's +# own first pass. v0.7.3 supersedes it with: +# * Four-tier confidence model (vs. af2ffe8's flat regex set) so we can +# reason about WHY a value tokenizes and gate behavior accordingly. +# * Explicit blacklist contexts (path-like, HL7 field refs like PID.18, +# version strings, port keywords, error/status codes, JSON keys, fenced +# code) — addresses the false-positive failure modes the spec calls out. +# * Tool-result surface: HL7-shaped tool outputs (read_file of .hl7, .txt +# with segments; nc_msgs output) get sanitized BEFORE entering message +# history. Generic outputs (list_dir, grep_files, web search) are NOT +# touched — the spec is explicit about this. +# * Structured JSONL audit log at $LARRY_HOME/log/auto-phi.log. 
+# * `/phi-auto on|off|confirm|status` slash command + LARRY_AUTO_PHI env. +# * Per-turn override `!nophi `. +# +# Surfaces +# -------- +# 1. USER INPUT: invoked AFTER preprocess_phi_markers in main_loop, so any +# explicit @@VALUE / {{phi:V}} markers are already tokenized. Auto-PHI +# fills the gaps Bryan didn't manually mark. +# 2. TOOL RESULTS: invoked from the tool dispatch path (agent_turn) when the +# result text looks like HL7 (contains `\rMSH|` or starts with `MSH|`). +# +# Detection tiers (first match wins per token) +# -------------------------------------------- +# Tier 1 DEFINITE SSN, email, phone (formatted), NPI (with context). +# High confidence. Always tokenize. +# Tier 2 CONTEXTUAL Numeric value immediately preceded by an MRN/Patient/ +# DOB/Account/Visit/Acct/Record/Birth keyword (within 5 +# chars). Always tokenize. +# Tier 3 HL7-CTX When the line/paragraph mentions a known-PHI HL7 field +# ref (PID.3, PID.5, PID.7, PID.11, PID.13, PID.18, +# NK1.*, GT1.*), be aggressive about plausibly-PHI-shaped +# values in the same line. Tokenize. +# Tier 4 KNOWN Value already exists in $LARRY_HOME/sanitize/lookup.tsv +# (Bryan has tokenized this exact value before). Always +# tokenize, reusing the existing token. +# +# Blacklist contexts (NEVER tokenize, even if a tier matches) +# ----------------------------------------------------------- +# * Path-like: starts with /, ./, ../, ~/, contains / +# * HL7 field references THEMSELVES: [A-Z]{3}\.\d+ — the digit after the +# dot is a field index, not PHI. (Critical: spec verification #5.) +# * Version strings: vN.N.N, semver, ISO dates YYYY-MM-DD +# * Port numbers: :NNNN, port NNNN, tcp:NNNN, PROTOCOL.PORT= +# * Error / status codes: error NNN, code NNN, rc=N, HTTP NNN, status NNN +# * JSON key position: token immediately followed by ":" with a string +# value to the right (don't break tool argument JSON) +# * Fenced code: anything inside ``` ... 
#     ``` is skipped wholesale
#
# Behavior controls
# -----------------
#   env LARRY_AUTO_PHI       1 (default, ON) | 0 (off) | confirm (prompt on
#                            Tier 3-4 matches)
#   /phi-auto on|off|confirm|status
#   !nophi <prompt>          per-turn override (strip prefix, skip auto-PHI)
#
# After each pass, a dim status line summarises what was caught:
#   phi> auto-tokenized 3 value(s) [user_input]: MRN×1 NAME×1 DOB×1
#
# Audit
# -----
# Every tokenization writes a JSONL line to $LARRY_HOME/log/auto-phi.log:
#   { "ts": "...", "value": "<original text>", "category": "MRN",
#     "token": "[[MRN_0042]]", "tier": "contextual",
#     "surface": "user_input"|"tool_result", "context": "..." }
# NOTE: "value" and "context" hold the ORIGINAL (un-tokenized) text, so the
# audit log is itself PHI and must stay readable by the owner only.
# ─────────────────────────────────────────────────────────────────────────────

#######################################
# Resolve the auto-PHI mode from the environment.
# Globals:   LARRY_AUTO_PHI (read)
# Outputs:   one of "on" | "off" | "confirm" on stdout (no trailing newline)
#
# Matching is case- and whitespace-insensitive so "Off", "OFF" and " off"
# all disable detection (previously anything but the exact spellings "off"
# / "OFF" silently resolved to "on"). Unset, empty, "1", "on", "aggressive"
# (alias kept for af2ffe8 muscle memory) and any unrecognised value resolve
# to "on" — the caution-first default.
#######################################
_resolve_auto_phi_mode() {
  local v="${LARRY_AUTO_PHI:-1}"
  # tr (not ${v,,}) keeps this working on bash 3.2.
  v="$(printf '%s' "$v" | tr -d '[:space:]' | tr '[:upper:]' '[:lower:]')"
  case "$v" in
    0|off)    printf 'off' ;;
    confirm)  printf 'confirm' ;;
    *)        printf 'on' ;;
  esac
}
AUTO_PHI_MODE="$(_resolve_auto_phi_mode)"
AUTO_PHI_SESSION_COUNT=0
AUTO_PHI_LOG="${LARRY_HOME}/log/auto-phi.log"

# Per-session confirm cache. Keyed on canonical-normalized form so
# "John Smith" / "JOHN SMITH" share a single decision.
# Storage for the per-session accept/decline decisions. bash >= 4 uses
# associative arrays; older bash (macOS 3.2) falls back to "|"-delimited
# strings of the form "|key1|key2".
if (( BASH_VERSINFO[0] >= 4 )); then
  declare -A _AUTO_PHI_ACCEPTED 2>/dev/null
  declare -A _AUTO_PHI_DECLINED 2>/dev/null
else
  _AUTO_PHI_ACCEPTED_LIST=""
  _AUTO_PHI_DECLINED_LIST=""
fi

# Return 0 when KEY ($1) was accepted earlier in this session.
# An empty key never matches (and never reaches the array subscript, which
# bash rejects with "bad array subscript").
_auto_phi_accept_check() {
  local key="$1"
  [ -n "$key" ] || return 1
  if (( BASH_VERSINFO[0] >= 4 )); then
    [ -n "${_AUTO_PHI_ACCEPTED[$key]:-}" ]
  else
    [[ "|$_AUTO_PHI_ACCEPTED_LIST|" == *"|$key|"* ]]
  fi
}

# Return 0 when KEY ($1) was declined earlier in this session.
_auto_phi_decline_check() {
  local key="$1"
  [ -n "$key" ] || return 1
  if (( BASH_VERSINFO[0] >= 4 )); then
    [ -n "${_AUTO_PHI_DECLINED[$key]:-}" ]
  else
    [[ "|$_AUTO_PHI_DECLINED_LIST|" == *"|$key|"* ]]
  fi
}

# Remember KEY ($1) as accepted for the rest of the session. Empty keys are
# ignored.
_auto_phi_mark_accept() {
  local key="$1"
  [ -n "$key" ] || return 0
  if (( BASH_VERSINFO[0] >= 4 )); then
    _AUTO_PHI_ACCEPTED[$key]=1
  else
    _AUTO_PHI_ACCEPTED_LIST="${_AUTO_PHI_ACCEPTED_LIST}|$key"
  fi
}

# Remember KEY ($1) as declined for the rest of the session. Empty keys are
# ignored.
_auto_phi_mark_decline() {
  local key="$1"
  [ -n "$key" ] || return 0
  if (( BASH_VERSINFO[0] >= 4 )); then
    _AUTO_PHI_DECLINED[$key]=1
  else
    _AUTO_PHI_DECLINED_LIST="${_AUTO_PHI_DECLINED_LIST}|$key"
  fi
}

#######################################
# Emit one JSONL audit entry. All fields are passed to jq via --arg so PHI
# characters are quoted safely. Best-effort — never fails the host call.
# Globals:   AUTO_PHI_LOG (read)
# Arguments: $1 value  $2 category  $3 token  $4 tier  $5 surface  $6 context
# Outputs:   appends one JSON object per call to $AUTO_PHI_LOG
# Returns:   always 0
#
# The entry contains the ORIGINAL value, so the directory and file are
# created under umask 077: the log is never readable by other users, not
# even for the instant between creation and the chmod below. The chmod
# calls remain to tighten a directory/file that already existed with looser
# permissions.
#######################################
_auto_phi_log() {
  local value="$1" category="$2" token="$3" tier="$4" surface="$5" context="$6"
  local logdir ts ctx_short
  logdir="$(dirname "$AUTO_PHI_LOG")"
  ts="$(date -Iseconds 2>/dev/null || date)"
  # Keep at most the first 80 chars of context so the log does not bloat.
  ctx_short="${context:0:80}"
  (
    umask 077
    mkdir -p "$logdir" || exit 1
    jq -cn \
      --arg ts "$ts" --arg value "$value" --arg category "$category" \
      --arg token "$token" --arg tier "$tier" --arg surface "$surface" \
      --arg context "$ctx_short" \
      '{ts:$ts,value:$value,category:$category,token:$token,tier:$tier,surface:$surface,context:$context}' \
      >> "$AUTO_PHI_LOG"
  ) 2>/dev/null || true
  chmod 700 "$logdir" 2>/dev/null || true
  chmod 600 "$AUTO_PHI_LOG" 2>/dev/null || true
  return 0
}

# ── Blacklist guards. Return 0 (true) when the token should be SKIPPED.
# ───
# Most guards take (token, left_context, full_line) so we can look at the
# surroundings without re-tokenizing the whole input.

# Path guard: skip anything containing a "/" (absolute, ./, ../, ~/ and
# nested paths all contain one) or starting with a Windows drive prefix
# such as C:\ .
# The former "~/*" arm is gone on purpose: case patterns undergo tilde
# expansion, so it compared against $HOME/* and never matched a literal
# "~/…"; such values were (and still are) caught by the "*/*" arm.
_auto_phi_skip_path_like() {
  local v="$1"
  case "$v" in
    */*)        return 0 ;;
    [A-Z]:\\*)  return 0 ;;
  esac
  return 1
}

# HL7 field-reference guard. The DIGIT in "PID.18" is a field index, NOT
# an MRN. We need to spot this BEFORE the contextual MRN/NPI checks fire.
# Pattern: left_context ends with <segment-id>. immediately before an
# all-digit token. Segment IDs are a letter followed by two letters or
# digits, so PV1./NK1./GT1./IN1. are recognised as well as PID./MSH.
# (the previous [A-Z]{3} form missed every segment ID containing a digit).
# Arguments: $1 token, $2 left context.
_auto_phi_skip_hl7_fieldref() {
  local v="$1" left="$2"
  local re_fieldref='[A-Z][A-Z0-9]{2}\.$'
  [[ "$v" =~ ^[0-9]+$ ]] || return 1
  [[ "$left" =~ $re_fieldref ]] && return 0
  return 1
}

# Version / date guard: vN.N[.N[.N]], bare N.N.N, ISO YYYY-MM-DD, and
# 8-digit YYYYMMDD values that parse as a plausible 1900-2099 date.
_auto_phi_skip_version() {
  local v="$1"
  # vN.N.N, vN.N
  [[ "$v" =~ ^v[0-9]+(\.[0-9]+){1,3}$ ]] && return 0
  # Bare semver N.N.N
  [[ "$v" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] && return 0
  # ISO date YYYY-MM-DD
  [[ "$v" =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]] && return 0
  # Conservative: only block 8 pure digits when month and day are in range.
  if [[ "$v" =~ ^(19|20)[0-9]{2}(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])$ ]]; then
    return 0
  fi
  return 1
}

# Port guard: all-digit token preceded by ":" or by a port keyword
# ("port NNNN", "tcp:NNNN", "PROTOCOL.PORT=NNNN", "listen NNNN").
# The keyword must start at a word boundary (start of context or a
# non-letter) so words that merely contain it — "report", "support",
# "transport" — no longer suppress detection of the number after them.
# Arguments: $1 token, $2 left context.
_auto_phi_skip_port() {
  local v="$1" left="$2"
  local re_port='(^|[^[:alpha:]])([Pp]ort|PORT|tcp|TCP|udp|UDP|listen|LISTEN)[[:space:]:=]*$'
  [[ "$v" =~ ^[0-9]+$ ]] || return 1
  # :NNNN
  [[ "$left" =~ :$ ]] && return 0
  [[ "$left" =~ $re_port ]] && return 0
  return 1
}

# Error / status-code guard: all-digit token preceded by error, code, HTTP,
# status or rc= (optionally followed by spaces, ":" or "="). Same
# word-boundary rule as the port guard, so "barcode 1234567" or
# "zipcode 90210" is not mistaken for an error code.
# Arguments: $1 token, $2 left context.
_auto_phi_skip_errcode() {
  local v="$1" left="$2"
  local re_err='(^|[^[:alpha:]])([Ee]rror|ERROR|[Cc]ode|CODE|HTTP|http|[Ss]tatus|STATUS|rc=|RC=)[[:space:]:=]*$'
  [[ "$v" =~ ^[0-9]+$ ]] || return 1
  [[ "$left" =~ $re_err ]] && return 0
  return 1
}

# JSON key position: token immediately followed by `":` (or `: ` with a
# JSON-quoted value to the right). We test using the SURROUNDING line.
# JSON-key guard: skip when the text immediately to the right of the token
# starts with `"` or `:`.
# Arguments: $1 token (unused), $2 right context.
# NOTE(review): auto_detect_phi passes the text that follows a
# whitespace-delimited word, so $2 normally starts with whitespace (or is
# empty) and this guard rarely fires. Left as-is: ignoring leading
# whitespace would also suppress `Patient 12345 "John"`-style prose.
_auto_phi_skip_json_key() {
  local v="$1" right="$2"
  case "$right" in
    \"*) return 0 ;;
    :*)  return 0 ;;
  esac
  return 1
}

# Timestamp guard — 13+ digit epoch milliseconds, or 10 digits starting with
# '1' (epoch seconds in the 2001-2286 range). Carry-over from af2ffe8's
# detector.
_auto_phi_skip_timestamp() {
  local v="$1"
  [[ "$v" =~ ^[0-9]+$ ]] || return 1
  local n="${#v}"
  [ "$n" -ge 13 ] && return 0
  if [ "$n" -eq 10 ] && [[ "$v" == 1* ]]; then return 0; fi
  return 1
}

# Already-tokenized form: [[CAT_NNNN]]. Skip — we don't double-tokenize.
_auto_phi_skip_already_token() {
  [[ "$1" =~ ^\[\[[A-Z][A-Z0-9_]*_[0-9]+\]\]$ ]]
}

# Defensive guard for leftover manual-marker shapes (@@…, …@@, {{phi:…).
# preprocess_phi_markers is expected to have replaced real markers before
# auto_detect_phi runs; anything still marker-shaped is left alone.
_auto_phi_skip_marker_residue() {
  local v="$1"
  [[ "$v" == @@* ]] && return 0
  [[ "$v" == *@@ ]] && return 0
  [[ "$v" == \{\{phi:* ]] && return 0
  return 1
}

#######################################
# Tier classifier.
# Arguments: $1 value, $2 left context, $3 right context, $4 full line
# Outputs:   "TIER|CATEGORY" on stdout (TIER is definite|contextual|hl7),
#            or nothing when the value should not be tokenized.
# Returns:   always 0
#
# Classification runs on the value with ONE trailing sentence-punctuation
# character removed; callers that record the value must apply the same trim.
#
# Changes from the first v0.7.3 draft:
#   * Category for the keyword is chosen case-insensitively, so "DoB 19800101"
#     is DOB (it used to fall through to MRN).
#   * An explicit identifier keyword directly before a number (MRN, Patient,
#     Account, …) overrides the timestamp heuristic — "MRN 1234567890" is an
#     MRN, not epoch seconds. Same err-on-caution rule the DOB keyword
#     already applied to the version/date guard.
#######################################
_auto_phi_classify_tiered() {
  local v="$1" left="$2" right="$3" line="$4"
  [ -z "$v" ] && return 0

  local re_dob_kw='([Dd][Oo][Bb]|[Bb]irth|BIRTH)[[:space:]:#=]*$'
  local re_id_kw='([Mm][Rr][Nn]|[Pp]atient|PATIENT|[Dd][Oo][Bb]|[Aa]ccount|ACCOUNT|[Vv]isit|VISIT|[Aa]cct|ACCT|[Rr]ecord|RECORD|[Bb]irth|BIRTH)[[:space:]:#=]*$'
  local re_date='^[0-9]{1,4}[/-][0-9]{1,2}[/-][0-9]{1,4}$'
  local re_hl7_name="^[A-Za-z][A-Za-z'-]*\^[A-Za-z][A-Za-z'-]*"
  local re_hl7_ref='(PID\.(3|5|7|11|13|18)|NK1\.[0-9]+|GT1\.[0-9]+|IN1\.(16|17|18|19|20))'

  # Universal skips (apply before any tier).
  _auto_phi_skip_already_token "$v" && return 0
  _auto_phi_skip_marker_residue "$v" && return 0
  _auto_phi_skip_path_like "$v" && return 0
  _auto_phi_skip_hl7_fieldref "$v" "$left" && return 0
  _auto_phi_skip_json_key "$v" "$right" && return 0
  # Version skip is overridden when a DOB/Birth keyword precedes — an ISO
  # date in DOB context IS the DOB, not a version string. Err on caution.
  if ! [[ "$left" =~ $re_dob_kw ]]; then
    _auto_phi_skip_version "$v" && return 0
  fi

  # Capture the identifier keyword now; later =~ tests clobber BASH_REMATCH.
  local id_kw=""
  if [[ "$left" =~ $re_id_kw ]]; then
    id_kw="${BASH_REMATCH[1]}"
  fi

  # Strip one trailing sentence-grammar char for classification.
  local trimmed="$v"
  case "$trimmed" in
    *[.,\;:\!\?\)]) trimmed="${trimmed%?}" ;;
  esac

  # URL — leave alone.
  case "$trimmed" in
    http://*|https://*|ssh://*|ftp://*|sftp://*|file://*|ws://*|wss://*) return 0 ;;
  esac

  # ── TIER 1: DEFINITE ──
  # Email
  if [[ "$trimmed" =~ ^[^@[:space:]]+@[^@[:space:]]+\.[^@[:space:]]+$ ]]; then
    printf 'definite|EMAIL'; return
  fi
  # SSN with dashes (definite shape)
  if [[ "$trimmed" =~ ^[0-9]{3}-[0-9]{2}-[0-9]{4}$ ]]; then
    printf 'definite|SSN'; return
  fi
  # Phone — formatted variants. The caller splits on whitespace, so the
  # "(NNN) NNN-NNNN" shape only arrives here when written without the space.
  if [[ "$trimmed" =~ ^\([0-9]{3}\)[[:space:]]?[0-9]{3}-[0-9]{4}$ ]] \
     || [[ "$trimmed" =~ ^[0-9]{3}-[0-9]{3}-[0-9]{4}$ ]] \
     || [[ "$trimmed" =~ ^[0-9]{3}\.[0-9]{3}\.[0-9]{4}$ ]]; then
    printf 'definite|PHONE'; return
  fi
  # NPI with explicit context (NPI: NNNNNNNNNN)
  if [[ "$trimmed" =~ ^[0-9]{10}$ ]]; then
    if [[ "$left" =~ (NPI|npi)[[:space:]:=]*$ ]]; then
      printf 'definite|NPI'; return
    fi
  fi

  # Timestamp/port/errcode rejection. The timestamp heuristic is skipped when
  # an identifier keyword directly precedes the value (see header).
  if [ -z "$id_kw" ]; then
    _auto_phi_skip_timestamp "$trimmed" && return 0
  fi
  _auto_phi_skip_port "$trimmed" "$left" && return 0
  _auto_phi_skip_errcode "$trimmed" "$left" && return 0

  # ── TIER 2: CONTEXTUAL ──
  # Numeric value directly preceded by an MRN/Patient/DOB/Account/Visit/
  # Acct/Record/Birth keyword (only spaces, ":", "#" or "=" in between).
  if [[ "$trimmed" =~ ^[0-9]+$ ]] && [ -n "$id_kw" ]; then
    local category
    case "$id_kw" in
      [Dd][Oo][Bb]|[Bb]irth|BIRTH)                         category="DOB" ;;
      [Aa]ccount|ACCOUNT|[Aa]cct|ACCT|[Vv]isit|VISIT)      category="ACCT" ;;
      *)                                                   category="MRN" ;;
    esac
    printf 'contextual|%s' "$category"; return
  fi
  # Date-shaped value preceded by DOB/Birth → DOB tier 2.
  # (Bare M/D/Y outside DOB context is handled by Tier 3 only.)
  if [[ "$trimmed" =~ $re_date ]]; then
    if [[ "$left" =~ $re_dob_kw ]]; then
      printf 'contextual|DOB'; return
    fi
  fi

  # SSN without dashes (9 raw digits) needs context too — too easy to confuse
  # with other 9-digit IDs.
  if [[ "$trimmed" =~ ^[0-9]{9}$ ]]; then
    if [[ "$left" =~ ([Ss][Ss][Nn]|SSN)[[:space:]:#=]*$ ]]; then
      printf 'contextual|SSN'; return
    fi
  fi

  # ── TIER 3: HL7-CONTEXT ──
  # Only kicks in if the surrounding line mentions a known-PHI HL7 field ref.
  if [[ "$line" =~ $re_hl7_ref ]]; then
    # HL7 caret-name (FAMILY^GIVEN^MIDDLE…)
    if [[ "$trimmed" =~ $re_hl7_name ]]; then
      printf 'hl7|NAME'; return
    fi
    # Date-shaped → DOB
    if [[ "$trimmed" =~ $re_date ]]; then
      printf 'hl7|DOB'; return
    fi
    # 6-12 pure digits in HL7 context → MRN
    if [[ "$trimmed" =~ ^[0-9]{6,12}$ ]]; then
      printf 'hl7|MRN'; return
    fi
  fi

  return 0
}

# List the distinct categories present in lookup.tsv (column 2, header
# skipped), one per line. Prints nothing when the table does not exist.
_auto_phi_known_categories() {
  local table="${LARRY_HOME}/sanitize/lookup.tsv"
  [ -f "$table" ] || return 0
  awk -F'\t' 'NR>1 && $2 != "" { print $2 }' "$table" 2>/dev/null | sort -u
}

#######################################
# Tier-4 ("already-known") check: is the value already in lookup.tsv?
# Arguments: $1 value
#            $2 (optional) newline-separated category list; when omitted the
#               categories are read from lookup.tsv. Callers that test many
#               values pass it to avoid re-scanning the table per value.
# Outputs:   "CATEGORY|TOKEN" on a hit, nothing on a miss
# Returns:   0 on a hit, 1 on a miss or when the script/table is unavailable
# Every category actually present in the table is tried, so user-added
# categories are considered, not just a hardcoded shortlist.
#######################################
_auto_phi_check_known() {
  local v="$1" cats="${2:-}"
  local sanitize_script="$LARRY_LIB_DIR/hl7-sanitize.sh"
  [ -x "$sanitize_script" ] || return 1
  local table="${LARRY_HOME}/sanitize/lookup.tsv"
  [ -f "$table" ] || return 1
  if [ -z "$cats" ]; then
    cats=$(_auto_phi_known_categories)
  fi
  [ -z "$cats" ] && return 1
  local category token
  while IFS= read -r category; do
    [ -z "$category" ] && continue
    # </dev/null: the helper must not consume the category list on stdin.
    token=$("$sanitize_script" lookup-original "$v" "$category" 2>/dev/null </dev/null)
    if [ -n "$token" ]; then
      printf '%s|%s' "$category" "$token"
      return 0
    fi
  done <<< "$cats"
  return 1
}

#######################################
# Confirm-mode interactive Y/n.
# Arguments: $1 value, $2 category, $3 tier
# Returns:   0 to proceed with tokenization, 1 to skip
#
# Only Tier 3 ("hl7") and Tier 4 ("known") ever prompt; Tier 1 & 2 return 0
# immediately. That gate now runs BEFORE the cache lookup so a value that
# was declined as a Tier-3 guess cannot suppress a later Tier-1/2 hit.
# Decisions are cached per session under the normalized key.
# The answer is read from /dev/tty: this function is called from loops whose
# stdin is a here-string, so reading stdin would swallow pending hits. (The
# previous `read -r ans /dev/null` was not a valid read invocation and always
# fell through to the default.) With no terminal available the default
# answer (yes) applies.
#######################################
_auto_phi_confirm() {
  local value="$1" category="$2" tier="$3"
  if [ "$tier" != "hl7" ] && [ "$tier" != "known" ]; then return 0; fi

  local sanitize_script="$LARRY_LIB_DIR/hl7-sanitize.sh"
  local mem_key
  mem_key=$("$sanitize_script" normalize-value "$value" "$category" 2>/dev/null </dev/null) || mem_key="$value"
  [ -z "$mem_key" ] && mem_key="$value"

  if _auto_phi_decline_check "$mem_key"; then return 1; fi
  if _auto_phi_accept_check "$mem_key"; then return 0; fi

  printf '%sphi auto>%s tokenize "%s" as %s? [Y/n] ' \
    "$C_YELLOW" "$C_RESET" "$value" "$category" >&2
  local ans=""
  { IFS= read -r ans </dev/tty; } 2>/dev/null || ans=""
  case "$ans" in
    n|N|no|NO|No) _auto_phi_mark_decline "$mem_key"; return 1 ;;
    *)            _auto_phi_mark_accept "$mem_key"; return 0 ;;
  esac
}

# Produce a scan copy of the input in which every line inside a ``` fence
# (and the fence lines themselves) is replaced by an EMPTY line, so nothing
# inside a fenced block is ever classified.
# NOTE: substitutions are later applied to the ORIGINAL input as a global
# literal replace, so a value that is also detected OUTSIDE a fence is
# replaced inside the fence too; only values that occur exclusively inside
# fences stay untouched.
_auto_phi_redact_fences() {
  local s="$1"
  printf '%s' "$s" | awk '
    BEGIN { in_fence=0 }
    /^[[:space:]]*```/ { in_fence = !in_fence; print ""; next }
    { if (in_fence) print ""; else print }
  '
}

#######################################
# Detect HL7 shape — used to gate the tool-result surface.
# Arguments: $1 text
# Returns:   0 when the text starts with MSH|, contains MSH| right after a
#            CR or LF, or has MSH|/PID|/EVN|/PV1| at a segment start within
#            its first 4096 characters; 1 otherwise.
# A segment start is: start of text, a real CR or LF, or a backslash-escaped
# "\r" / "\n" sequence. (The old grep pattern `[\r\n]` matched the LETTERS
# r and n rather than CR/LF, and its printf|head|grep -q pipeline could
# report failure under `set -o pipefail` on large inputs.)
#######################################
_auto_phi_looks_like_hl7() {
  local s="$1"
  case "$s" in
    MSH\|*) return 0 ;;
  esac
  case "$s" in
    *$'\r'MSH\|*) return 0 ;;
    *$'\n'MSH\|*) return 0 ;;
  esac
  # Also accept a segment start near the top (some tools add a header line).
  local cr=$'\r' nl=$'\n'
  local re="(^|[${cr}${nl}]|\\\\[rn])(MSH|PID|EVN|PV1)\\|"
  [[ "${s:0:4096}" =~ $re ]] && return 0
  return 1
}

#######################################
# Main detector.
# Globals:   LARRY_LIB_DIR, LARRY_HOME, AUTO_PHI_MODE, C_DIM, C_RESET (read)
#            AUTO_PHI_SESSION_COUNT (incremented per tokenized value)
# Arguments: $1 surface ("user_input"|"tool_result"), $2 input text
# Outputs:   rewritten input on stdout; status summary on stderr
# Returns:   0
#
# Fixes relative to the first v0.7.3 draft:
#   * Words are split with `read -a`, not unquoted expansion, so "*" or "?"
#     in a prompt is never expanded against the current directory.
#   * Left/right context follows the CURRENT occurrence of each word; it used
#     to be taken from the first occurrence in the line, so a repeated value
#     inherited the wrong context.
#   * The recorded value drops the trailing sentence punctuation the
#     classifier ignores ("123-45-6789." no longer becomes a second,
#     distinct table entry).
#   * Hits are de-duplicated by value, so one value is tokenized, counted and
#     logged once per pass.
#   * bash 3.2 safe: no associative arrays, and the replacement is done with
#     an explicit loop (quotes in the replacement part of ${var//pat/"rep"}
#     are kept literally before bash 4.3).
#   * lookup.tsv categories are read once per pass instead of once per word.
#######################################
auto_detect_phi() {
  local surface="$1"
  local input="$2"
  local sanitize_script="$LARRY_LIB_DIR/hl7-sanitize.sh"
  [ -x "$sanitize_script" ] || { printf '%s' "$input"; return; }

  # Per-turn override (user-input surface only).
  if [ "$surface" = "user_input" ] && [[ "$input" == '!nophi '* ]]; then
    printf '%s' "${input#'!nophi '}"
    return 0
  fi
  if [ "$AUTO_PHI_MODE" = "off" ]; then
    printf '%s' "$input"
    return 0
  fi

  # Build a fence-redacted scan copy. Substitutions still happen on $input.
  local scan
  scan=$(_auto_phi_redact_fences "$input")

  # Categories for the Tier-4 check; empty means "no table, skip Tier 4".
  local known_cats
  known_cats=$(_auto_phi_known_categories)

  # ── Pass 1: collect hits as TIER|CAT|VALUE rows, one per line. ──
  local hits="" line w sub value result known
  local consumed rest before left right
  local -a words=() subs=()
  while IFS= read -r line; do
    words=()
    IFS=$' \t' read -r -a words <<< "$line"
    consumed=""
    rest="$line"
    for w in "${words[@]}"; do
      [ -n "$w" ] || continue
      # $rest starts just after the previous word, so the first match of $w
      # in it is this very occurrence.
      before="${rest%%"$w"*}"
      consumed+="$before"
      rest="${rest:$(( ${#before} + ${#w} ))}"
      # Left context: up to 20 chars before the word. Manual slice for
      # macOS bash 3.2 — `${x: -20}` returns empty when len(x) < 20.
      if [ "${#consumed}" -le 20 ]; then
        left="$consumed"
      else
        left="${consumed:$(( ${#consumed} - 20 ))}"
      fi
      right="${rest:0:20}"
      consumed+="$w"

      # Try comma-split sub-tokens too (e.g. "a@b.com,c@d.com").
      subs=()
      IFS=',' read -r -a subs <<< "$w"
      for sub in "${subs[@]}"; do
        [ -n "$sub" ] || continue
        # Same one-character trim the classifier applies.
        value="$sub"
        case "$value" in
          *[.,\;:\!\?\)]) value="${value%?}" ;;
        esac
        result=$(_auto_phi_classify_tiered "$sub" "$left" "$right" "$line")
        if [ -n "$result" ]; then
          hits+="${result%%|*}|${result##*|}|${value}"$'\n'
        elif [ -n "$known_cats" ]; then
          # Tier 4: exact text first, then the punctuation-trimmed form.
          known=$(_auto_phi_check_known "$sub" "$known_cats" 2>/dev/null)
          if [ -n "$known" ]; then
            hits+="known|${known%%|*}|${sub}"$'\n'
          elif [ -n "$value" ] && [ "$value" != "$sub" ]; then
            known=$(_auto_phi_check_known "$value" "$known_cats" 2>/dev/null)
            if [ -n "$known" ]; then
              hits+="known|${known%%|*}|${value}"$'\n'
            fi
          fi
        fi
      done
    done
  done <<< "$scan"

  [ -z "$hits" ] && { printf '%s' "$input"; return 0; }

  # ── Pass 2: tokenize, substitute, count, audit. ──
  local seen_vals=$'\n'
  local h tier fields category orig token ctx out tail idx found
  local -a cat_names=() cat_counts=()
  while IFS= read -r h; do
    [ -z "$h" ] && continue
    tier="${h%%|*}"; fields="${h#*|}"
    category="${fields%%|*}"; orig="${fields#*|}"
    [ -n "$orig" ] || continue
    # Already tokenized in this pass (first-seen hit wins).
    case "$seen_vals" in
      *$'\n'"$orig"$'\n'*) continue ;;
    esac

    # Confirm mode gating (Tier 3-4 only).
    if [ "$AUTO_PHI_MODE" = "confirm" ]; then
      _auto_phi_confirm "$orig" "$category" "$tier" || continue
    fi

    # Tokenize via the canonical pipeline. </dev/null keeps the helper from
    # consuming the remaining hits on this loop's stdin.
    token=$("$sanitize_script" tokenize-value --category "$category" "$orig" 2>/dev/null </dev/null)
    [ -z "$token" ] && continue
    seen_vals+="$orig"$'\n'

    # Literal replace of every occurrence. Inserted tokens are never
    # re-scanned, so a value that happens to appear inside the token text
    # cannot loop.
    out=""
    tail="$input"
    while [[ "$tail" == *"$orig"* ]]; do
      out+="${tail%%"$orig"*}${token}"
      tail="${tail#*"$orig"}"
    done
    input="${out}${tail}"

    # Bookkeeping (parallel indexed arrays; first-seen category order).
    found=0
    for idx in "${!cat_names[@]}"; do
      if [ "${cat_names[$idx]}" = "$category" ]; then
        cat_counts[$idx]=$(( ${cat_counts[$idx]} + 1 ))
        found=1
        break
      fi
    done
    if [ "$found" = "0" ]; then
      cat_names+=("$category")
      cat_counts+=(1)
    fi
    AUTO_PHI_SESSION_COUNT=$(( AUTO_PHI_SESSION_COUNT + 1 ))

    # Audit. Context: first scanned line containing the value, max 80 bytes.
    ctx=$(printf '%s' "$scan" | grep -F -- "$orig" | head -1 | head -c 80)
    _auto_phi_log "$orig" "$category" "$token" "$tier" "$surface" "$ctx"
  done <<< "$hits"

  # Emit a single status summary if anything was tokenized.
  if [ "${#cat_names[@]}" -gt 0 ]; then
    local summary="" total=0
    for idx in "${!cat_names[@]}"; do
      summary+="${cat_names[$idx]}×${cat_counts[$idx]} "
      total=$(( total + ${cat_counts[$idx]} ))
    done
    summary="${summary% }"
    printf '%sphi>%s auto-tokenized %d value(s) [%s]: %s\n' \
      "$C_DIM" "$C_RESET" "$total" "$surface" "$summary" >&2
  fi

  printf '%s' "$input"
}

# v0.7.3 — auto-PHI on tool results.
# Gating per spec: only HL7-shaped output gets sanitized. We allow-list
# the tool names that can return HL7 (read_file of an .hl7/.txt, hl7_*
# tools, nc_msgs). Generic outputs (list_dir, grep_files, bash_exec,
# glob_files, web search, etc.) are NEVER touched — the spec is
# explicit: false positives there would break legitimate text.
#
# For HL7-shaped results we route through hl7-sanitize.sh (the
# canonical field-aware pipeline) — NOT auto_detect_phi (which is
# designed for prose, not pipe-delimited segment data). The two
# share lookup.tsv so tokens are stable across surfaces.
+ if [ "$AUTO_PHI_MODE" != "off" ]; then + local _ap_eligible=0 + case "$name" in + read_file) + local _ap_path + _ap_path=$(printf '%s' "$input_json" | jq -r '.path // ""' 2>/dev/null) + case "$_ap_path" in + *.hl7|*.HL7|*.txt|*.TXT) _ap_eligible=1 ;; + esac + ;; + nc_msgs|hl7_field|hl7_diff) _ap_eligible=1 ;; + esac + if [ "$_ap_eligible" = "1" ] && _auto_phi_looks_like_hl7 "$result"; then + local _ap_tmp _ap_sanitized _ap_before _ap_after + _ap_tmp=$(mktemp) + printf '%s' "$result" > "$_ap_tmp" + _ap_before=$(bash "$LARRY_LIB_DIR/hl7-sanitize.sh" count 2>/dev/null || echo 0) + _ap_sanitized=$(bash "$LARRY_LIB_DIR/hl7-sanitize.sh" "$_ap_tmp" 2>/dev/null) + if [ -n "$_ap_sanitized" ]; then + _ap_after=$(bash "$LARRY_LIB_DIR/hl7-sanitize.sh" count 2>/dev/null || echo 0) + result="$_ap_sanitized" + local _ap_new=$((_ap_after - _ap_before)) + if [ "$_ap_new" -gt 0 ]; then + printf '%sphi>%s auto-tokenized %d HL7 field(s) in %s result [tool_result]\n' \ + "$C_DIM" "$C_RESET" "$_ap_new" "$name" >&2 + AUTO_PHI_SESSION_COUNT=$(( AUTO_PHI_SESSION_COUNT + _ap_new )) + _auto_phi_log "(hl7-sanitize batch)" "BATCH" "(+${_ap_new} tokens)" "hl7_pipeline" "tool_result" "$name" + fi + fi + rm -f "$_ap_tmp" + fi + fi + _LARRY_LAST_TOOL_RESULT="$result" log_append '```'; log_append "$result"; log_append '```' @@ -2607,6 +3243,41 @@ Slash commands: all collapse to the same token. Category is auto-detected from value shape (MRN/SSN/DOB/NAME/MANUAL). {{phi:VALUE}} / {{phi:CAT:VALUE}} legacy syntax (still works) + + Automatic PHI detection (v0.7.3, supersedes the af2ffe8 prototype): + Larry scans every prompt AND every HL7-shaped tool result for PHI- + shaped values and tokenizes them BEFORE conversation history is sent + to Anthropic. Four-tier confidence model with explicit blacklists for + paths, HL7 field refs (PID.18 is not an MRN), version strings, port + numbers, error codes, JSON keys, and fenced code blocks. 
+ + Tiers (first-match wins, all tokenize unless mode=off): + 1 DEFINITE SSN with dashes, email, formatted phone, NPI (with + "NPI:" prefix). Always. + 2 CONTEXTUAL Numeric value after MRN/Patient/DOB/Account/Visit/ + Acct/Record/Birth keyword. Always. + 3 HL7-CONTEXT Plausible-PHI values when PID.3/PID.5/PID.7/PID.11/ + PID.13/PID.18, NK1.*, GT1.*, IN1.16-20 mentioned in + the same line. Aggressive — prompts in confirm mode. + 4 KNOWN Value matches an existing lookup.tsv entry — Bryan + has seen this value before. Always. + + Tool-result scan applies ONLY to read_file (.hl7/.txt), hl7_*, and + nc_msgs outputs, and ONLY if content is HL7-shaped. Generic outputs + (list_dir, grep_files, bash_exec, web search) are NEVER scanned — the + risk of breaking legitimate text outweighs the catch rate. + + Modes (env LARRY_AUTO_PHI or /phi-auto): + on default — all four tiers always tokenize (caution-first) + confirm Tier 3-4 prompts Y/n once per session per canonical value + off disable auto-detection entirely (manual markers still work) + + Per-turn override: prefix any prompt with "!nophi " to skip the scan + for that turn only. Explicit @@VALUE / {{phi:VALUE}} markers always + win — they are processed first; auto-PHI fills only the gaps. + + Audit: every tokenization writes a JSONL entry to + \$LARRY_HOME/log/auto-phi.log (ts/value/category/token/tier/surface/context). /redetect re-scan for HCIROOT/HCISITE/tools /sites list site dirs under HCIROOT /site switch HCISITE for this session @@ -2742,6 +3413,7 @@ _LARRY_SLASH_CMDS=( /hl7-fields /mouse /origin + /phi-auto ) # _LARRY_SLASH_CMDS_DESC — one-line descriptions for each slash command. @@ -2793,6 +3465,7 @@ _LARRY_SLASH_CMDS_DESC=( [/hl7-fields]=" print component breakdown (e.g. 
/hl7-fields PID.5)" [/mouse]="on|off toggle xterm mouse mode for this session" [/origin]="show/pin auto-update origin (gitea|github|auto|)" + [/phi-auto]="on|off|confirm|status — runtime control for v0.7.3 auto PHI detection" ) # __larry_complete_slash — bound to TAB via `bind -x` (see _install_readline_tab). @@ -3593,6 +4266,30 @@ main_loop() { ;; esac continue ;; + # v0.7.3 — runtime control for automatic PHI detection. + /phi-auto|/phi-auto\ *) + local _arg; _arg=$(_slash_args "/phi-auto" "$input") + case "${_arg:-status}" in + on) + AUTO_PHI_MODE="on" + larry_say "auto-PHI: on (default — Tier 1-4 detections tokenized; err on caution)" + ;; + off) + AUTO_PHI_MODE="off" + larry_say "auto-PHI: off (explicit markers @@VALUE / {{phi:VALUE}} still work)" + ;; + confirm) + AUTO_PHI_MODE="confirm" + larry_say "auto-PHI: confirm (Tier 3-4 matches prompt Y/n; Tier 1-2 still always tokenize)" + ;; + status) + larry_say "auto-PHI: $AUTO_PHI_MODE (this session tokenized: $AUTO_PHI_SESSION_COUNT) log: $AUTO_PHI_LOG" + ;; + *) + err "usage: /phi-auto on|off|confirm (no arg → status)" + ;; + esac + continue ;; /mouse|/mouse\ *) local _arg; _arg=$(_slash_args "/mouse" "$input") case "${_arg:-status}" in @@ -3851,6 +4548,13 @@ EOF input=$(preprocess_phi_markers "$input") fi + # v0.7.3 — Automatic PHI detection. Runs AFTER explicit-marker handling so + # @@VALUE / {{phi:VALUE}} hits take precedence; auto-PHI fills gaps in + # things Bryan didn't manually mark. Per-turn "!nophi " prefix override + # is consumed inside auto_detect_phi. Bypassed entirely when mode=off. + # Supersedes af2ffe8 (reverted with v0.7.1). 
+ input=$(auto_detect_phi user_input "$input") + log_section "user"; log_append "$input" # v0.7.1: render the persistent status line BETWEEN turns — after the # user has submitted real (non-slash, non-empty) input and after all diff --git a/lib/hl7-sanitize.sh b/lib/hl7-sanitize.sh index 347b095..e62ccf4 100755 --- a/lib/hl7-sanitize.sh +++ b/lib/hl7-sanitize.sh @@ -202,6 +202,26 @@ normalize_value() { # it as an option flag. Same caveat applies for any future tr -d call. printf '%s' "$value" | tr -d '[:space:]-' ;; + PHONE) + # v0.7.3: digits-only canonical so "(555) 123-4567", "5551234567", + # "555.123.4567", "555-123-4567" all collapse to one token. + # (Originally added in af2ffe8 — reverted with v0.7.1; re-added per + # v0.7.3 auto-PHI spec.) + printf '%s' "$value" | tr -cd '[:digit:]' + ;; + EMAIL) + # v0.7.3: lowercase + outer-trim. RFC 5321 says the local-part is + # technically case-sensitive but in practice every mainstream provider + # treats it case-insensitively, so collapsing for PHI purposes is safe. + # (Originally added in af2ffe8 — reverted with v0.7.1; re-added per + # v0.7.3 auto-PHI spec.) + printf '%s' "$value" | tr '[:upper:]' '[:lower:]' | awk '{$1=$1; print}' + ;; + NPI) + # v0.7.3: National Provider Identifier — 10 digits. Strip whitespace + # to canonical 10-digit form. + printf '%s' "$value" | tr -d '[:space:]-' + ;; *) printf '%s' "$value" | awk '{$1=$1; print}' ;; @@ -437,6 +457,35 @@ case "$SUB" in count) shift; cmd_count "$@" ;; tokenize-value) shift; cmd_tokenize_value "$@" ;; detokenize-value) shift; cmd_detokenize_value "$@" ;; + normalize-value) + # v0.7.3: normalize-value VALUE [CATEGORY] — emit canonical form without + # tokenizing or touching the table. Used by larry.sh's v0.7.3 auto-PHI + # detection to build per-session memory keys so "John Smith" and + # "JOHN SMITH" share one accept/decline decision. + # (Originally added in af2ffe8 — reverted with v0.7.1; re-added per + # v0.7.3 auto-PHI spec.) 
+ shift + nv_val="${1:-}"; nv_cat="${2:-}" + [ -n "$nv_val" ] || die "normalize-value needs a VALUE" + [ -n "$nv_cat" ] || nv_cat=$(detect_category "$nv_val") + normalize_value "$nv_val" "$nv_cat" + printf '\n' + ;; + lookup-original) + # v0.7.3: lookup-original VALUE CATEGORY — return token if value is + # already present in the lookup table (Tier-4 "already-known" check). + # Empty stdout + exit 0 if not found; non-empty token + exit 0 if found. + shift + lo_val="${1:-}"; lo_cat="${2:-}" + [ -n "$lo_val" ] || die "lookup-original needs a VALUE" + [ -n "$lo_cat" ] || lo_cat=$(detect_category "$lo_val") + lo_canonical=$(normalize_value "$lo_val" "$lo_cat") + [ -z "$lo_canonical" ] && lo_canonical="$lo_val" + [ -f "$DEFAULT_TABLE" ] || { printf ''; exit 0; } + awk -F'\t' -v cat="$lo_cat" -v c="$lo_canonical" -v orig="$lo_val" ' + NR>1 && $2==cat && ($3==c || $4==orig) { print $1; exit } + ' "$DEFAULT_TABLE" + ;; -h|--help) sed -n '2,30p' "$NC_SELF"; exit 0 ;; *) # Default = sanitize mode